# -*- coding: utf-8 -*-
import csv
import os
from http.cookiejar import CookieJar
from urllib.request import HTTPCookieProcessor, Request, build_opener, urlopen

from bs4 import BeautifulSoup
# Listing page to scrape and the local files this script produces.
_LISTING_URL = "https://cs.5i5j.com/ershoufang/"
_CACHE_PATH = 'download/textInfo.html'
_CSV_PATH = 'HouseInfo.csv'

# Browser-like headers sent with the request.
_REQUEST_HEADERS = {
    'Connection': 'Keep-Alive',
    # The original value started with 'textml', which is not a valid media
    # type; 'text/html' is what was intended.
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
}


def _fetch_page(url, cache_path):
    """Download *url* with a cookie-aware opener and cache the raw bytes.

    The response body is written to *cache_path* so the page can be
    inspected or re-parsed later without issuing another request, and the
    same bytes are returned to the caller.

    Args:
        url: Address of the page to download.
        cache_path: File the raw response body is saved to. Its parent
            directory is created if it does not exist yet.

    Returns:
        The response body as ``bytes``.
    """
    request = Request(url, None, _REQUEST_HEADERS)
    # Per the original author's note, this site needs cookie handling, so the
    # request goes through an opener backed by a cookie jar.
    opener = build_opener(HTTPCookieProcessor(CookieJar()))
    with opener.open(request) as response:
        html = response.read()

    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(cache_path, 'wb') as cache_file:
        cache_file.write(html)
    return html


def _extract_listings(html):
    """Parse the listing page and return one row per house.

    Args:
        html: Raw page content (``bytes`` or ``str``).

    Returns:
        A list of ``(synopsis, totalPrice, priceSquare)`` tuples, where the
        values are the text of the title link and of the first two ``<p>``
        elements inside ``div.jia``.

    Raises:
        ValueError: If the page contains no ``<ul class="pList">`` element.
    """
    soup = BeautifulSoup(html, "html.parser")
    listing_container = soup.find("ul", class_="pList")
    if listing_container is None:
        raise ValueError('no <ul class="pList"> element found in the page')

    rows = []
    for item in listing_container.find_all("li"):
        title = item.find("h3", class_="listTit")
        link = title.find("a") if title is not None else None
        price_box = item.find("div", class_="jia")
        # Skip <li> elements that do not have the expected listing structure
        # instead of crashing on them.
        if link is None or price_box is None:
            continue
        prices = price_box.find_all("p")
        if len(prices) < 2:
            continue
        # House synopsis, total price, price per square metre.
        rows.append((link.get_text(), prices[0].get_text(), prices[1].get_text()))
    return rows


def main():
    """Scrape the listing page and store the results in ``HouseInfo.csv``."""
    html = _fetch_page(_LISTING_URL, _CACHE_PATH)
    # Parse before opening the CSV so a failed parse does not truncate a
    # previously written file.
    rows = _extract_listings(html)

    with open(_CSV_PATH, 'wt', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        # Header row.
        writer.writerow(('synopsis', 'totalPrice', 'priceSquare'))
        writer.writerows(rows)


if __name__ == '__main__':
    main()