注意:运行本程序前,请先正确配置 MongoDB 数据库。
#coding:utf-8
import requests,pymongo,threading,logging,time
from bs4 import BeautifulSoup
class Spider:
    """Scrape commodity price listings from 100ppi.com into MongoDB.

    Pages http://www.100ppi.com/mprice/plist-1-505-{1..10}.html are fetched,
    each table row is parsed into a dict and inserted into the
    ``price.result`` collection of a local MongoDB instance.
    """

    def __init__(self):
        """Connect to MongoDB on localhost and select the target collection."""
        client = pymongo.MongoClient(host='localhost')
        db = client.price
        self.collection = db.result

    def get_all_pages(self):
        """Build the URL for each of the 10 listing pages and scrape it."""
        base_url = "http://www.100ppi.com/mprice/plist-1-505-"
        for page_no in range(1, 11):
            self.getPage(base_url + str(page_no) + '.html')

    # NOTE: camelCase name kept for backward compatibility with existing callers.
    def getPage(self, url):
        """Fetch one listing page, parse every data row and store each row.

        Rows that do not contain the expected number of cells are skipped
        instead of aborting the whole page.
        """
        # Fix: a timeout so a stalled server cannot hang the scraper forever.
        r = requests.get(url, timeout=10)
        r.encoding = "utf-8"
        soup = BeautifulSoup(r.text, 'lxml')
        rows = soup.select('.lp-table tr')[1:]  # first <tr> is the header row
        for tr in rows:
            try:
                result = {}
                result['商品名称'] = tr.select('.p-name a')[0].text
                result['规格'] = tr.select('.width100')[0].text.strip()
                result['品牌产地'] = tr.select('.txtc')[0].text
                result['报价'] = tr.select('.txtc')[1].text.strip()
                result['报价类型'] = tr.select('.txtc')[2].text.strip()
                result['交货地'] = tr.select('.txtc')[3].text.strip()
                result['交易商'] = tr.select('.txtc')[4].text.strip()
                result['日期'] = tr.select('.txtc')[5].text
            except IndexError:
                # Malformed/short row — skip it rather than crash the page.
                logging.warning("skipping malformed row on %s", url)
                continue
            # warning level is used deliberately so the output shows under
            # the default logging configuration (root level = WARNING).
            logging.warning(f"本次数据输出为:{result}")
            time.sleep(0.1)  # throttle requests to be polite to the server
            self.store(result)

    def store(self, result):
        """Insert one parsed row (dict) into the MongoDB collection."""
        self.collection.insert_one(result)
if __name__ == '__main__':
    myspider = Spider()
    # Bug fix: the original wrote target=myspider.get_all_pages(), which
    # CALLED the method synchronously on the main thread and then started
    # a Thread with target=None. Pass the bound method itself instead.
    t = threading.Thread(target=myspider.get_all_pages)
    t.start()
    t.join()  # wait for the scrape to finish before the script exits
运行后,结果打印输出正常:
再查看mongodb数据库。结果正常10页300条数据。
希望这篇文章对大家有所帮助。也请大家提出宝贵的意见!我将继续努力,下一篇写得更好。



