百度dict 采集样本
这是我写的一个采集百度 dict 词典翻译结果全部数据的程序，附带了 13.5 万条单词库和一个简单的采集案例。这里把主要的类 dict.class.php 放出来，项目地址：http://github.com/widuu/baidu_dict ，有需要的直接 fork 就可以了~么么哒。这东西用的人很少，所以有用的兄弟拿走了哈~
word = $word;
$symbol = $this -> Pronounced();
$pro = $this->getSay();
$example = $this -> getExample();
$explain = $this -> getExplain();
$synonym = $this -> getSynonym();
$phrase = $this -> getPhrase();
$result = array(
"symbol" => $symbol, //音标
"pro" => $pro, //发音
"example"=> $example, //例句
"explain"=> $explain, //简明释义
"synonym"=> $synonym, //同反义词
"phrase" => $phrase //短语数组
);
return $result;
}
/**
 * Fetch the Baidu dictionary result page for the current word.
 *
 * Sends an HTTP GET to dict.baidu.com with a browser-like User-Agent,
 * follows redirects and gives up after 30 seconds.
 *
 * @return string|false Response body, or false when the transfer failed
 *                      (the cURL error message is echoed in that case).
 */
private function getContent(){
    $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
    $ch = curl_init();
    // NOTE(review): the word is concatenated as-is; confirm callers pass a
    // URL-safe value (no spaces / non-ASCII) or encode it before this point.
    $url = "http://dict.baidu.com/s?wd=".$this->word;
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
    // Return the body from curl_exec() instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_HTTPGET, 1);
    curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    $result = curl_exec($ch);
    // Inspect the handle that was actually used: the previous code passed an
    // undefined $curl variable, so transfer errors were never reported.
    if (curl_errno($ch)) {
        echo 'Errno'.curl_error($ch);
    }
    curl_close($ch);
    return $result;
}
/**
 * Extract the phonetic symbols from the fetched result page.
 *
 * Runs one pattern over the page and returns its first capture under 'en'
 * and its second capture under 'us'.
 *
 * NOTE(review): the pattern literal below contains unescaped double quotes
 * and has nothing after (.*), so it does not parse as written. It looks like
 * the escaping backslashes and the trailing markup were lost when this
 * listing was published - restore the pattern from the upstream repository
 * before using this method.
 *
 * @return array Map with the keys 'en' and 'us'.
 */
private function Pronounced(){
$data = $this -> getContent();
preg_match_all("/"EN-US">(.*)/Ui",$data,$pronounced);
// No guard here: fewer than two matches leaves these offsets undefined.
return array(
'en' => $pronounced[1][0],
'us' => $pronounced[1][1]
);
}
/**
 * Extract the pronunciation URLs from the fetched result page.
 *
 * Collects every url="..." attribute value in the page; the first value is
 * returned under 'en' and the second under 'us'.
 *
 * @return array Map with the keys 'en' and 'us'; an entry is null when the
 *               page did not contain a corresponding match.
 */
private function getSay(){
    $data = $this -> getContent();
    // Single-quoted pattern: the previous double-quoted literal contained
    // unescaped double quotes and therefore did not parse.
    preg_match_all('/url="(.*)"/Ui', $data, $pronounced);
    return array(
        // Guard the offsets so a page with fewer than two matches yields
        // null instead of an undefined-offset warning.
        'en' => isset($pronounced[1][0]) ? $pronounced[1][0] : null,
        'us' => isset($pronounced[1][1]) ? $pronounced[1][1] : null
    );
}
/**
 * Extract the example sentences from the fetched result page.
 *
 * Reads the JavaScript literal assigned to `var example_data`, splits it on
 * its "[[[" and "[[" group markers, and joins the quoted tokens of each
 * group with single spaces to form one string per group.
 *
 * @return array List of chunks holding at most two strings each, in page
 *               order (presumably a sentence followed by its translation -
 *               confirm against the live page). Empty when the page holds
 *               no example data.
 */
private function getExample(){
    $data = $this -> getContent();
    // Bail out early instead of reading an undefined offset when the page
    // has no example_data assignment (or the fetch failed).
    if (!preg_match("/var example_data = (.*)];/Us", $data, $example)) {
        return array();
    }
    // Normalise the leading brackets so every top-level group is introduced
    // by exactly one "[[[" marker before splitting.
    $groups = explode("[[[", "[[[".ltrim($example[1], "["));
    $sentences = array();
    foreach ($groups as $group) {
        foreach (explode("[[", "[[".$group) as $fragment) {
            // Capture the quoted token that opens each ["token", ...] entry.
            // The bracket and quotes must be literal, hence the escaping and
            // the single-quoted PHP string.
            preg_match_all('/\["(.*)",/Us', "[".$fragment, $match);
            if (!empty($match[1])) {
                // Separator first: the (array, separator) argument order was
                // removed in PHP 8.0. Collecting into an array also avoids
                // the old "@" join/split round trip, which broke on
                // sentences that contain "@".
                $sentences[] = implode(" ", $match[1]);
            }
        }
    }
    return array_chunk($sentences, 2);
}
/**
 * Extract the concise definitions from the fetched result page.
 *
 * Isolates the markup that follows id="en-simple-means", then runs two
 * patterns over it: one filling the "adj"/"name" captures and one filling
 * the "tag"/"word" captures.
 *
 * NOTE(review): the three pattern literals below contain unescaped double
 * quotes and `(?P` groups without names, so they do not parse as written.
 * Judging by the keys read further down, the groups were presumably
 * (?P<adj>...), (?P<name>...), (?P<tag>...) and (?P<word>...), and the
 * surrounding HTML tags appear to have been stripped when this listing was
 * published - restore the patterns from the upstream repository.
 *
 * @return array Map with two entries: "x" maps each "adj" capture to its
 *               "name" capture, "b" maps each "tag" capture to its "word"
 *               capture with HTML tags stripped.
 */
private function getExplain(){
$data = $this -> getContent();
// Narrow the page down to the block following id="en-simple-means".
preg_match_all("/id="en-simple-means">(.*)/Us",$data,$explain);
// No guard here: a page without that block leaves this offset undefined.
$r_data = $explain[1][0];
preg_match_all("/(?P.*)(?P.*)
/Us", $r_data, $a_data);
preg_match_all("/(?P[^>]+):(?P.*)/Us", $r_data, $b_data);
// Pair the captures by match index: "adj" => "name".
$result = array();
foreach ($a_data["adj"] as $key => $value) {
$result[$value] = $a_data["name"][$key];
}
// Pair "tag" => "word", dropping any markup left inside the word capture.
$word_b = array();
foreach ($b_data["tag"] as $key => $value) {
$word_b[$value] = strip_tags($b_data["word"][$key]);
}
$result_data = array("x" => $result,"b" => $word_b);
return $result_data;
}
private function getSynonym(){
$data = $this -> getContent();
preg_match_all("/id="en-syn-ant">(.*)/Us",$data,$synonym);
$content = $synonym[1][0];
$data1 = explode("", $content);
$result = array();
$data2 = array();
foreach ($data1 as $key => $value) {
preg_match_all("/(?P.*) 


