Python 使用 urllib 爬虫

# -*- coding: utf-8 -*-
import csv
import os
from http.cookiejar import CookieJar
from urllib.request import urlopen, Request, build_opener, HTTPCookieProcessor

from bs4 import BeautifulSoup

# Scrape the second-hand housing listing page, keep a local copy of the raw
# HTML, and export one CSV row per listing (synopsis, total price, price per
# square meter).

URL = "https://cs.5i5j.com/ershoufang/"
HTML_CACHE_PATH = 'download/textInfo.html'
CSV_PATH = 'HouseInfo.csv'
CSV_HEADER = ('synopsis', 'totalPrice', 'priceSquare')
REQUEST_HEADERS = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
}


def fetch_page(url, cache_path):
    """Download *url* and save the raw response bytes to *cache_path*.

    The parent directory of *cache_path* is created when it does not exist.

    Raises:
        urllib.error.URLError: if the request fails.
        OSError: if the cache file cannot be written.
    """
    request = Request(url, None, REQUEST_HEADERS)
    # The original author notes that this site needs cookies to be crawled;
    # the CookieJar keeps whatever cookies the server sets on this opener.
    opener = build_opener(HTTPCookieProcessor(CookieJar()))
    with opener.open(request) as response:
        payload = response.read()

    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    # Keep a local copy so the page can be parsed without re-downloading.
    with open(cache_path, 'wb') as cache_file:
        cache_file.write(payload)


def parse_listings(html):
    """Return a list of (synopsis, total_price, price_per_square) text tuples.

    Args:
        html: markup of the listing page (bytes or str).

    Raises:
        RuntimeError: if the page has no ``<ul class="pList">`` container,
            i.e. the markup is not the listing page this script expects.
    """
    soup = BeautifulSoup(html, "html.parser")
    container = soup.find("ul", class_="pList")
    if container is None:
        raise RuntimeError('listing container <ul class="pList"> not found in page')

    rows = []
    for item in container.find_all("li"):
        title = item.find("h3", class_="listTit")
        link = title.find("a") if title is not None else None
        price_box = item.find("div", class_="jia")
        prices = price_box.find_all("p") if price_box is not None else []
        # Skip <li> elements that do not carry a full listing (no title link
        # or fewer than two price paragraphs) instead of crashing on them.
        if link is None or len(prices) < 2:
            continue
        synopsis = link.get_text()            # listing synopsis
        total_price = prices[0].get_text()    # total price
        price_square = prices[1].get_text()   # price per square meter
        rows.append((synopsis, total_price, price_square))
    return rows


def write_listings(rows, csv_path):
    """Write the header row followed by *rows* to *csv_path* as UTF-8 CSV."""
    with open(csv_path, 'wt', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(CSV_HEADER)
        writer.writerows(rows)


def main():
    """Fetch the listing page, parse it from the local copy, and export the CSV."""
    fetch_page(URL, HTML_CACHE_PATH)
    with open(HTML_CACHE_PATH, 'rb') as cache_file:
        html = cache_file.read()
    # Parse before opening the CSV so a parse failure does not leave a
    # header-only output file behind.
    write_listings(parse_listings(html), CSV_PATH)


if __name__ == '__main__':
    main()
Python 使用 urllib 爬虫

Python相关栏目本月热门文章