代码:
/i', '${1}', $$field);
                            // NOTE(review): the statement above is the tail of a call that begins
                            // outside the visible source. The pattern '/(.*?)/i' below, and the
                            // "rn" / "t" / "n***" literals further down, look as if markup and
                            // backslashes were stripped when this listing was published
                            // (presumably "\r\n", "\t" and "\n") -- confirm against the original
                            // file. They are left exactly as found here.
                            preg_match_all('/(.*?)/i', $$field, $pre_match);
                            if (isset($pre_match[1]) && is_array($pre_match[1]) && !empty($pre_match[1])) {
                                foreach ($pre_match[1] as $pre_val)
                                    $$field = str_replace($pre_val, str_replace("【】", "rn", $pre_val), $$field);
                            }
                            //end
                        }

                        // Before the value is stored, turn the line-break placeholders back.
                        $$field = str_replace('【】', "rn", $$field);

                        // Apply the configured text filter: each key is replaced by its value,
                        // case-insensitively.
                        if (is_array($text_filter) && !empty($text_filter)) {
                            foreach ($text_filter as $tk => $tv)
                                $$field = str_ireplace($tk, $tv, $$field);
                        }

                        if (IS_DEBUG)
                            $this->write('*'."t".'字段:'.$field.' 值:'."n****************************************************n".$$field."n****************************************************");

                        // A 'downurl' value without 'http:' is prefixed with WEB_HOST, inserting
                        // a '/' separator only when the value does not already start with one.
                        if ('downurl' == $field && stripos($$field, 'http:') === false)
                            if (substr($$field, 0, 1) == '/')
                                $$field = WEB_HOST.trim($$field);
                            else
                                $$field = WEB_HOST.'/'.trim($$field);

                        $sth->bindValue(':'.$field, trim($$field));
                    }

                    if (INSERT_DB)
                        $sth->execute();
                    $sth->closeCursor();

                    $this->write('休息,暂停'.SLEEP_TIME.'秒后继续抓取...');
                    sleep(SLEEP_TIME);
                }
            } else {
                $this->write('列表页面没有抓取到内容,所以过滤掉');
            }
        }

        // write() with $end = true terminates the script.
        $this->write('', true);
    }

    /**
     * Append closing tags for elements that are opened in $html but never closed.
     *
     * Opening and closing tag names are collected with regular expressions. If
     * both lists have the same length the markup is returned untouched.
     * Otherwise the opened tags are walked from the last one to the first, and a
     * closing tag is appended for every tag that has no remaining counterpart
     * in the list of closing tags.
     *
     * @param string $html HTML fragment to repair.
     * @return string The fragment, with any missing closing tags appended at the end.
     */
    protected function closetags($html)
    {
        // Tags that never get a closing tag appended.
        $arr_single_tags = array('meta', 'img', 'br', 'link', 'area');

        // Collect opening tags. The negative lookbehind rejects a '>' that is
        // directly preceded by '/', '|' or a space, i.e. self-closed forms
        // such as <br/>.
        preg_match_all('#<([a-z]+)(?: .*)?(?<![/|/ ])>#iU', $html, $result);
        $openedtags = $result[1];

        // Collect closing tags.
        preg_match_all('#</([a-z]+)>#iU', $html, $result);
        $closedtags = $result[1];

        // Same number of opening and closing tags: nothing to repair.
        $len_opened = count($openedtags);
        if (count($closedtags) == $len_opened) {
            return $html;
        }

        // Visit the most recently opened tag first so that the appended closing
        // tags come out in proper nesting order.
        $openedtags = array_reverse($openedtags);

        for ($i = 0; $i < $len_opened; $i++) {
            if (in_array($openedtags[$i], $arr_single_tags, true)) {
                continue;
            }

            if (!in_array($openedtags[$i], $closedtags, true)) {
                // No closing tag left for this element: append one.
                $html .= '</' . $openedtags[$i] . '>';
            } else {
                // Consume one matching closing tag so it cannot be paired twice.
                unset($closedtags[array_search($openedtags[$i], $closedtags, true)]);
            }
        }

        return $html;
    }

    /**
     * Run the start-up checks: cURL availability, then database connectivity.
     *
     * A failed check is reported through write() with $end = true, which
     * terminates the script.
     */
    protected function init_check()
    {
        if (!$this->check_curl_support())
            $this->write('对不起,请先开启CURL的类库的支持,否则无法执行', true);

        $this->check_mysql_connect();

        $this->write('程序初始化检查通过,执行后续的流程...');
    }

    /**
     * Fetch a URL with cURL and return the body converted to UTF-8.
     *
     * @param string $url  Address to fetch.
     * @param array  $data Passed as-is to CURLOPT_HTTPHEADER (request header lines).
     * @return string|false The response body, or false when cURL reports an error.
     *                      iconv() may also yield false when the conversion fails.
     */
    private function get($url, $data = array())
    {
        $this->write('开始执行抓取: '.$url);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        //curl_setopt($ch, CURLOPT_USERAGENT, "Baiduspider+(+http://www.baidu.com/search/spider.htm)");
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
        curl_setopt($ch, CURLOPT_HEADER, 0);          // response headers are not included in the result
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);  // return the body instead of printing it
        curl_setopt($ch, CURLOPT_HTTPHEADER, $data);

        $ret = curl_exec($ch);
        $error = curl_error($ch);
        curl_close($ch);
        unset($ch);

        if (!empty($error)) {
            $this->write('程序抓取URL: '.$url.'发生错误,错误信息: '.$error);
            return false;
        }

        // Normalise the page to UTF-8 when the site is configured with another charset.
        if (WEB_CHARSET != 'utf-8')
            $ret = iconv(WEB_CHARSET, 'utf-8', $ret);

        return $ret;
    }

    /**
     * Verify that the database server is reachable and the database selectable.
     *
     * The connection is opened only for this check and closed again before
     * returning. Each failure is reported through write() with $end = true.
     *
     * NOTE(review): relies on the mysql_* functions (ext/mysql), which were
     * removed in PHP 7 -- confirm the target PHP version or port this check.
     */
    private function check_mysql_connect()
    {
        $con = mysql_connect(DB_HOST, DB_USER, DB_PWD);
        if (!is_resource($con))
            $this->write('程序无法成功链接到数据库,具体的错误信息:'.mysql_error(), true);

        if (!mysql_select_db(DB_NAME, $con))
            $this->write('程序无法链接到数据库: '.DB_NAME.',具体的错误信息: '.mysql_error(), true);

        mysql_close($con);
    }

    /**
     * Report whether the cURL extension is loaded and curl_init() exists.
     *
     * @return bool True when cURL can be used, false otherwise.
     */
    private function check_curl_support()
    {
        if (!extension_loaded('curl') || !function_exists('curl_init'))
            return false;

        return true;
    }

    /**
     * Print a message followed by a blank line, optionally terminating the script.
     *
     * When PATH_SEPARATOR is ':' the text is echoed unchanged; otherwise it is
     * converted from UTF-8 to GBK first (presumably for a Windows console --
     * confirm). Unless the script is terminated, the method then sleeps for
     * OUTPUT_SPEED seconds.
     *
     * @param string $str Message to print.
     * @param bool   $end When true, stop the script with die() after printing.
     */
    private function write($str, $end = false)
    {
        if (PATH_SEPARATOR == ':')
            echo $str,PHP_EOL,PHP_EOL;
        else
            echo iconv('UTF-8', 'GBK', $str),PHP_EOL,PHP_EOL;

        if ($end)
            die("program exit");

        sleep(OUTPUT_SPEED);
    }
}



