栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > PHP

分享下页面关键字抓取components.arrow.com站点代码

PHP 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

分享下页面关键字抓取components.arrow.com站点代码

复制代码 代码如下:
 
 //set_time_limit(0);
 // base function
 function curl_get($url, $data = array(), $header = array(), $timeout = 15, $port = 80, $reffer = '', $proxy = '')
 {
 $ch = curl_init();
 if (!empty($data)) {
 $data = is_array($data)?http_build_query($data): $data;
 $url .= (strpos($url,'?')? '&': "?") . $data;
 }
 curl_setopt($ch, CURLOPT_URL, $url);
 curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
 curl_setopt($ch, CURLOPT_POST, 0);
 curl_setopt($ch, CURLOPT_PORT, $port);
 curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
 curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); //是否抓取跳转后的页面
 $reffer && curl_setopt($ch, CURLOPT_REFERER, $reffer);
 if($proxy) {
 curl_setopt($ch, CURLOPT_PROXY, $proxy);
 curl_setopt($ch, CURLOPT_PROXYPORT, 1723);
 curl_setopt($ch, CURLOPT_PROXYUSERPWD,"andhm001:andhm123");
 }

$result = array();
 $result['result'] = curl_exec($ch);
 if (0 != curl_errno($ch)) {
 $result['error'] = "Error:n" . curl_error($ch);

}
 curl_close($ch);
 return $result;
 }

复制代码 代码如下:
/**
 * Perform an HTTP POST request with cURL.
 *
 * @param string       $url     Target URL.
 * @param array|string $data    Request body, handed to CURLOPT_POSTFIELDS unchanged
 *                              (a URL-encoded string, or an array of fields).
 * @param array        $header  Extra request headers; only set when non-empty.
 * @param int          $timeout Connection timeout in seconds.
 * @param int          $port    Remote port to connect to.
 *
 * @return array 'result' holds the value returned by curl_exec() (the body, or
 *               false on failure); 'error' is present only when cURL reported an error.
 */
function curl_post($url, $data = array(), $header = array(), $timeout = 15, $port = 80)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_PORT, $port);
    if (!empty($header)) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    }
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $data);

    $result = array();
    $result['result'] = curl_exec($ch);
    if (curl_errno($ch) !== 0) {
        // The listing had lost the backslash here ("Error:n"); restored to a real newline.
        $result['error'] = "Error:\n" . curl_error($ch);
    }
    curl_close($ch);

    return $result;
}


 function getListHtml($keywords, $start = 0)
 {
 if ($start < 0)
 {
 return false;
 }

$postData = array(
 'search_token' => $keywords,
 'start' => $start,
 'limit' => 100,
 );

$result = curl_post('http://components.arrow.com/part/search/' . $keywords, http_build_query($postData));
 if ( isset($result['error']) )
 {
 return false;
 //exit($result['error']);
 }
 $result = $result['result'];

return $result;
 }


 // Collect the first capture group of every match of the pattern in $html.
 // Returns the list of captured strings, or an empty array when nothing matches.
 // NOTE(review): the pattern literal below has an unbalanced bracket and parenthesis,
 // so its beginning was presumably lost when the listing was extracted from HTML;
 // restore it from the original source before relying on this function.
 function getListHref($html)
 {
 $pattern = '/]+)">/isU';
 if (preg_match_all($pattern, $html, $matches))
 {
 return $matches[1];
 } else {
 // no matches
 return array();
 }
 }


 function getListNextPage($html)
 {
 $pattern = '/buildPagination('d+','d+','(d+)',d+);/isU';
 if (preg_match($pattern, $html, $matches))
 {
 return intval($matches[1]);
 } else {
 return -1;
 }
 }


 function getListHrefAll($keywords)
 {
 if (empty($keywords))
 {
 return false;
 }

$html = getListHtml($keywords);
 $hrefList = getListHref($html);
 if (empty($hrefList))
 {
 // 没有结果
 return array();
 }
 $nextPage = getListNextPage($html);
 $loop =0;
 while ($nextPage > 0)
 {
 $html = getListHtml($keywords, $nextPage);
 $tmpHrefList = getListHref($html);
 $hrefList = array_merge($hrefList, $tmpHrefList);
 $nextPage = getListNextPage($html);
 $loop ++;
 }
 return $hrefList;
 }


 // Fetch one part detail page and scrape its fields into an associative array.
 // $url is a site-relative path; the host http://components.arrow.com is prepended before the request.
 // Returns false when $url is empty, an empty array when the request reports an error,
 // otherwise the $result array declared below (keys sup_part through page_url).
 // NOTE(review): the regex literals in this function appear damaged by extraction
 // (HTML tags and backslashes are missing, and some statements are fused onto one line);
 // restore them from the original listing before relying on this code.
 function getDetail($url)
 {
 if ( empty($url) )
 {
 return false;
 }
 $host = 'http://components.arrow.com';

$url = $host . $url;
 $result = curl_get($url);
 if ( isset($result['error']) )
 {
 return array();
 //exit($result['error']);
 }
 $html = $result['result'];

$result = array(
 'sup_part' => '', // supplier part number
 'sup_id' => '', // supplier ID
 'mfg_part' => '', // manufacturer part number
 'mfg_name' => '', // manufacturer name
 'cat_name' => '', // category name
 'para' => '', // attributes
 'desc' => '', // description
 'pdf_url' => '', // PDF URL
 'sup_stock' => '', // stock
 'min_purch' => '', // minimum order quantity
 'price' => '', // price
 'img_url' => '', // image URL
 'createtime' => '', // creation time
 'datacode' => '', // batch number
 'package' => '', // package
 'page_url' => '', // page URL
 );

// mfg_part
// NOTE(review): when this pattern does not match, the else branch writes $html to page.txt
// and calls die, so the return statement after it is never reached; looks like a debug leftover.
 $pattern = '/

  • [sn]*Part No:s*(.+)
  • /isU';
     if (preg_match($pattern, $html, $matches))
     {
     $result['mfg_part'] = trim($matches[1]);
     } else {file_put_contents('page.txt', $html);die('xxx');
     return array();
     }

    // mfg_name
     $pattern = '/
  • [sn]*Manufacturer: (.+)
  • /isU';
     if (preg_match($pattern, $html, $matches))
     {
     $result['mfg_name'] = trim($matches[1]);
     }

    // cat_name
     $pattern = '/displayCategory('(.[^']+)');/isU';
     if (preg_match($pattern, $html, $matches))
     {
     $result['cat_name'] = trim($matches[1]);
     $result['cat_name'] = str_replace('|', '>', $result['cat_name']);
     }

    // para
     $tablepattern = '/]*>(.+)/isU';
     if (preg_match($tablepattern, $html, $matches))
     {
     $pattern = '/[sn]*(.+)(.+)[sn]*/isU';
     if (preg_match_all($pattern, $matches[1], $matches))
     {
     foreach($matches[1] as $k=>$v)
     {
     $v = trim($v);
     if ('Package Type' == $v)
     {
     $result['package'] = trim($matches[2][$k]);
     continue;
     }
     $result['para'][$v] = trim($matches[2][$k]);
     }
     }
     }

    // desc
     $pattern = '/.+(.+)[sn]*
    /isU';
     if (preg_match($pattern, $html, $matches))
     {
     $result['desc'] = trim($matches[1]);
     }

    // pdf_url
     $pattern = '/[sn]*Datasheet: if (preg_match($pattern, $html, $matches))
     {
     $result['pdf_url'] = $host . trim($matches[1]);
     }

    // sup_stock
     $pattern = '/([d,]+)/isU';
     if (preg_match($pattern, $html, $matches))
     {
     $result['sup_stock'] = trim($matches[1]);
     $result['sup_stock'] = str_replace(',', '', $result['sup_stock']);
     }

    // min_purch
     $pattern = '/[sn]*Multiple:s*(.+)/isU';
     if (preg_match($pattern, $html, $matches))
     {
     $result['min_purch'] = trim($matches[1]);
     }

    // price
     $pattern = '/(.[^<]+)
    /isU';
     if (preg_match($pattern, $html, $matches))
     {
     $result['price'][1] = trim($matches[1]);
     }
     $pattern = '/[sn]*]+title="(.[^"]+)">/isU';
     if (preg_match($pattern, $html, $matches))
     {
     $priceurl = str_replace('&', '&', $matches[1]);
     $json = curl_get($priceurl);
     $json = $json['result'];
     if (! empty($json))
     {
     $jsonresult = json_decode($json, true);
     foreach ($jsonresult['parts'][0]['webprice']['resale'] as $k=>$v)
     {
     $result['price'][$v['minqty']] = $v['price'];
     }
     }
     }

    // img_url
     $pattern = '/[sn]* if (preg_match($pattern, $html, $matches))
     {
     $result['img_url'] = trim($matches[1]);
     }

    // page_url
     $result['page_url'] = $url;

    return $result;
     }


     function getData($keywords)
     {
     $hrefList = getListHrefAll($keywords);
     $result = array();

    foreach ($hrefList as $k=>$v)
     {
     $result[] = getDetail($v);
     }

    return $result;
     }

    // Test script: scrape the parts matching the "keywords" query parameter and dump the result.
     // A missing or non-string parameter is treated as an empty search instead of
     // raising an undefined-index notice or a type error in trim().
     $keywords = (isset($_GET['keywords']) && is_string($_GET['keywords'])) ? trim($_GET['keywords']) : '';
     $result = getData($keywords);

    print_r($result);

    转载请注明:文章转载自 www.mshxw.com
    本文地址:https://www.mshxw.com/it/48760.html

    PHP相关栏目本月热门文章

    我们一直用心在做
    关于我们 文章归档 网站地图 联系我们

    版权所有 (c)2021-2022 MSHXW.COM

    ICP备案号:晋ICP备2021003244-6号