from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
import xlwt
# 该程序用来爬取链家的数据,房山的数据,把网址换一换就能改成其他的
def main(area, pageNum):
    """Crawl Lianjia second-hand-house listings for *area* and save them as .xls.

    area:    district slug used in the URL, e.g. "fangshan", "huairou".
    pageNum: number of listing pages to fetch (30 listings per page).
    """
    # e.g. https://bj.lianjia.com/ershoufang/fengtai/pg
    base_url = "https://bj.lianjia.com/ershoufang/" + area + "/pg"
    out_path = "./data/链家-" + area + ".xls"
    rows = getData(pageNum, base_url)
    saveData(pageNum, rows, out_path)
# Pre-compiled patterns that pull fields out of each listing's HTML fragment.
#
# NOTE(review): in the original code the trailing unanchored '(.*?)' groups of
# findName/findHouseInfo could only ever match the EMPTY string, and
# findSumPrice was referenced in getData() but never defined (NameError at
# runtime).  The patterns below assume Lianjia's usual markup, e.g.
#   <div class="title"><a ... target="_blank">小区名 户型</a></div>
#   <div class="totalPrice"><span>598</span>万</div>
#   <span class="houseIcon"></span>2室1厅 | 88.5平米 | 南 北 | ...</div>
# -- confirm against the live page before trusting the extracted values.
findName = re.compile(r'target="_blank">(.*?)</a>')        # listing title text
findSumPrice = re.compile(r'<span>(.*?)</span>万')          # total price, in 万 (10k CNY)
findSinglePrice = re.compile(r'data-price="(.*?)"')        # unit price per square meter
findHouseInfo = re.compile(r'houseIcon"></span>(.*?)</div>', re.S)  # "|"-separated attributes
def getData(pageNum, baseurl):
    """Fetch *pageNum* listing pages from *baseurl* and return the parsed rows.

    The returned flat list holds, per page: 30 one-element name lists, then
    30 [total_price, unit_price] lists, then 30 house-attribute lists padded
    to 7 fields.  saveData() relies on this exact 90-entries-per-page layout.
    """
    datalist = []
    for page in range(1, pageNum + 1):
        print("正在爬取第" + str(page) + "页")
        html = askURL(baseurl + str(page))
        soup = BeautifulSoup(html, "html.parser")
        # Only the first 30 "title" divs are real listings; anything after
        # that belongs to other parts of the page, so cap at 30.
        for title_div in soup.find_all('div', class_="title")[:30]:
            datalist.append([re.findall(findName, str(title_div))[0]])
        for price_div in soup.find_all('div', class_='priceInfo'):
            fragment = str(price_div)
            datalist.append([
                re.findall(findSumPrice, fragment)[0],
                re.findall(findSinglePrice, fragment)[0],
            ])
        for info_div in soup.find_all('div', class_='houseInfo'):
            # Attributes arrive as one "|"-separated string.  The middle
            # fields are optional on the site, so pad/trim until every row
            # carries exactly 7 attributes (aligned with saveData's columns).
            fields = re.findall(findHouseInfo, str(info_div))[0].split('|')
            if len(fields) == 4:
                fields.insert(3, "空")
                fields.insert(4, "空")
                fields.insert(5, "空")
            if len(fields) == 5:
                fields.insert(4, " ")
                fields.insert(5, "空")
            if len(fields) == 6:
                fields.insert(5, " ")
            if len(fields) == 8:
                fields.pop()
            datalist.append(fields)
    return datalist
def askURL(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns "" when the request fails; HTTP status code and/or failure
    reason are printed for diagnostics instead of raising.
    """
    # Present a desktop-Chrome User-Agent so the site serves the normal page
    # rather than blocking the default urllib client.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # BUG FIX: the original never closed the response object; the
        # context manager guarantees the connection is released.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        # HTTPError carries .code; plain URLError only carries .reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def saveData(pageNum, datalist, savepath):
    """Write the scraped listings into an .xls workbook at *savepath*.

    Expects the datalist layout produced by getData(): per page of 30
    listings, 30 name entries, then 30 [total, unit] price entries, then
    30 seven-field house-attribute lists (90 entries per page in total).
    """
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('房山二手房', cell_overwrite_ok=True)
    col = ('房源名称', "总价/万", "单价/元", "几居室", "面积", "朝向", "装修", "楼层", "建筑年代", "板楼or塔楼")
    print(datalist)
    print("存储数据ing......")
    # Header row: the 10 column titles above.
    for c in range(0, 10):
        sheet.write(0, c, col[c])
    for i in range(0, pageNum):          # page index
        for j in range(0, 30):           # listing index within the page
            # Each page contributes 90 consecutive entries:
            # [0..29] names, [30..59] prices, [60..89] house attributes.
            name_idx = 90 * i + j
            price_idx = name_idx + 30
            house_idx = name_idx + 60
            # Guard: a short final page would otherwise raise IndexError.
            if house_idx >= len(datalist):
                break
            row = 30 * i + j + 1
            # BUG FIX: the original wrote the raw one-element name list into
            # the cell, which xlwt cannot serialize.  Unwrap to a plain
            # string (re.findall may yield a tuple when the pattern has
            # multiple groups).
            name = datalist[name_idx]
            if isinstance(name, (list, tuple)) and name:
                name = name[0]
            if isinstance(name, (list, tuple)):
                name = " ".join(str(part) for part in name).strip()
            sheet.write(row, 0, str(name))
            # Total price (万) and unit price (元/㎡).
            sheet.write(row, 1, float(datalist[price_idx][0]))
            sheet.write(row, 2, int(datalist[price_idx][1]))
            # 7 house attributes -> columns 3..9; write only what exists so
            # a malformed row degrades to blank cells instead of crashing.
            house = datalist[house_idx]
            for k, value in enumerate(house[:7]):
                sheet.write(row, 3 + k, value)
    book.save(savepath)
if __name__ == "__main__":
    # Script entry point: scrape 36 pages of the Huairou district and
    # store them under ./data/.
    target_area = "huairou"
    total_pages = 36
    main(target_area, total_pages)
    print("爬取并存储数据完毕")