源代码文件:
import scrapy
import json
from..items import Db250Item
class W666Spider(scrapy.Spider):
    """Crawl the Douban Top-250 movie list and yield one item per movie.

    Follows pagination by incrementing ``page_num`` and requesting the next
    ``?start=N`` offset until a page yields no movie nodes.
    """
    name = 'w666'
    allowed_domains = ['movie.douban.com']
    start_urls = ['http://movie.douban.com/top250']
    # Page counter used to compute the next page's ``start`` offset (25/page).
    page_num = 0

    def parse(self, response):
        # NOTE(review): the original selectors lost their attribute tests in
        # the paste (``//div[@]`` / ``//span[@]``); the class names below are
        # the standard Douban Top-250 markup — TODO confirm against the page.
        node_list = response.xpath('//div[@class="info"]')
        if node_list:
            for node in node_list:
                movies_name = node.xpath('.//div/a/span/text()').get()
                # .get() returns None on unexpected markup; guard the strip
                # so one malformed entry does not abort the whole page.
                director = (node.xpath('./div/p/text()').get() or '').strip()
                # Bug fix: the original used an absolute ``//span[...]/text()``
                # query, which matches the first rating on the whole page for
                # every movie; the query must be relative to the current node.
                score = node.xpath('.//span[@class="rating_num"]/text()').get()
                item = Db250Item()
                item["movies_name"] = movies_name
                item["director"] = director
                item["score"] = score
                yield item
            self.page_num += 1
            new_url = 'https://movie.douban.com/top250?start={}&filter='.format(self.page_num * 25)
            yield scrapy.Request(new_url, callback=self.parse)
        else:
            # An empty page means we ran past the last Top-250 page; stop.
            return
items.py:
import scrapy


class Db250Item(scrapy.Item):
    """Container for one scraped Douban Top-250 movie entry."""

    movies_name = scrapy.Field()  # movie title
    director = scrapy.Field()     # director / cast line from the info block
    score = scrapy.Field()        # rating score as scraped text
管道pipelines.py:
import json class Db250Pipeline: def open_spider(self,spider): self.f = open('wxin.txt','w',encoding='utf-8') def process_item(self, item, spider): json_str=json.dumps(dict(item),ensure_ascii=False)+'n' self.f.write(json_str) return item def close_spider(self): self.f.close()
settings.py需要修改以及激活的内容:
ROBOTSTXT_OBEY =False #这个默认的是True,必须改成False,要不然就爬取不到数据DEFAULT_REQUEST_HEADERS = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en', 'User-Agent':' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36' } #这个必须要去network里找到user-agent,然后添加进去,如果不加大概率是爬不到数据的ITEM_PIPELINES = { 'db250.pipelines.Db250Pipeline': 300, } #这个储存管道设定后必须激活的,才能进行持久化保存



