栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

【Python 多线程+ip代理+正则+pyquery】爬取链家二手房信息

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

【Python 多线程+ip代理+正则+pyquery】爬取链家二手房信息

现在的lj已经不再公示房屋交易记录,所以只能爬一爬房屋基本信息。

 

用到的模块还是挺多的,一波导入

import requests 
import random
from pyquery import PyQuery as pq 
import re 
import pymysql 
from pymysql.converters import escape_string
import threading 
import time
# Pool of User-Agent strings; fill this in yourself before running.
# NOTE: random.choice() raises IndexError while the list is still empty.
uas = []
ua = random.choice(uas)

# Request headers sent with every page fetch
headers = {
	'User-Agent':ua,
	'Host':'bj.lianjia.com',
	'Referer':'https://bj.lianjia.com/ershoufang/dongcheng/pg2/'
}

# Put proxy addresses here, each in the form ip:port (must be non-empty,
# for the same random.choice() reason as above).
li = []
# The text before '@' is a placeholder for the proxy credentials
# (account:password) and has to be replaced with real values.
proxy = f'账号:密码@{random.choice(li)}'
proxies = {
	'http': 'http://' + proxy,
	'https': 'http://' + proxy,
}

# NOTE: the database and the target table must be created beforehand.
# A single connection and cursor are created here at module level.
conn = pymysql.connect(user='root',password='123123123',host='localhost',port=3306,database='lianjia')
cursor = conn.cursor()

根据首页获取最大页数pagesize。这个过程非常麻烦,包含页码的a节点无法提取出来,最后不得不另辟蹊径用pyquery+re组合提取出来 

# get page size
def get_pagesize():
	"""Return the total number of result pages for the listing at ``url``.

	Fetches the module-level ``url`` with the module-level ``headers``,
	reads the ``page-data`` attribute of the pagination element and pulls
	the ``totalPage`` value out of it with a regular expression.

	Returns:
		str: the page count as a string of digits (callers convert it
		with ``int()``).

	Raises:
		ValueError: if the pagination element or its ``totalPage`` value
			cannot be found in the response.
	"""
	res = requests.get(url=url,headers=headers)
	res.encoding = 'utf8'
	doc = pq(res.text)
	page_data = doc('.page-box.house-lst-page-box').attr('page-data')
	# attr() yields None when the element is missing; str() keeps the
	# search below from raising TypeError in that case.
	# The pattern needs \d (digit class) -- a bare 'd' only matches the
	# literal letter and never finds the page count.
	match = re.search(r'"totalPage":(\d+)', str(page_data))
	if match is None:
		raise ValueError(f'totalPage not found in page-data: {page_data!r}')
	return match.group(1)

接下来定义解析函数。

【注意】因为sql语句没有直接用%拼接变量,而是在execute中把(sql,values)分开传入,所以sql语句中的%s不要加引号,否则会报错syntax error

另【注意】lock.acquire的位置千万别放进for li 循环中,否则报错。

# Column order here must match the tuple built by _parse_listing().
# Placeholders are left unquoted: cursor.execute() quotes and escapes the
# parameters itself.
_INSERT_SQL = (
	"insert into bj_chaoyang(title,street,region,layout,area,orientation,"
	"deco,floor,built_time,built_structure,tag,price,unit_price) "
	"values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)


def _split_padded(text, sep, count, filler='None'):
	"""Split *text* on *sep*, strip each piece and pad with *filler* to *count* items."""
	parts = [piece.strip() for piece in text.split(sep)]
	parts.extend([filler] * (count - len(parts)))
	return parts


def _parse_listing(item):
	"""Return the 13 column values of one listing node, in INSERT order."""
	title = item('.title').text()

	# position text has the form "street - region"
	street, region = _split_padded(item('.flood').text(), '-', 2)[:2]

	# house info is a '|'-separated line with up to 7 parts; trailing
	# parts that are absent are stored as the string 'None'
	(layout, area, orientation, deco, floor,
		built_time, built_structure) = _split_padded(item('.address').text(), '|', 7)[:7]

	tag = item('.tag').text()
	price = item('.totalPrice.totalPrice2').text()
	unit_price = item('.unitPrice').text()

	return (title, street, region, layout, area, orientation, deco,
		floor, built_time, built_structure, tag, price, unit_price)


def parse_listpage(url):
	"""Fetch one listing page, print every listing on it and insert it into MySQL.

	Relies on the module-level ``headers``, ``proxies``, ``semaphore``,
	``lock``, ``cursor`` and ``conn``. The caller is expected to have
	acquired ``semaphore`` once before starting this function; it is
	released here as soon as the HTTP request has finished (or failed).

	Args:
		url: address of the listing page to scrape.

	Raises:
		requests.RequestException: if both the request and its single
			retry fail.
	"""
	time.sleep(0.25)
	try:
		try:
			res = requests.get(url=url,headers=headers,proxies=proxies)
		except requests.RequestException:
			# one retry on network/proxy failure
			print('Error')
			res = requests.get(url=url,headers=headers,proxies=proxies)
	finally:
		# Always hand the slot back, otherwise a failed retry would leak
		# it and the spawning loop would eventually block forever.
		semaphore.release()

	res.encoding = 'utf8'
	doc = pq(res.text)
	sell_list = doc('.sellListContent')
	listings = sell_list('.clear.LOGVIEWDATA.LOGCLICKDATA').items()

	# Parsing touches no shared state, so do it before taking the lock.
	rows = [_parse_listing(item) for item in listings]

	# The cursor/connection are shared by all threads; `with` guarantees
	# the lock is released even if an insert raises.
	with lock:
		for row in rows:
			print(*row, sep=' -- ')
			# Values are passed as parameters, so no manual escape_string()
			# is applied (that would double-escape the stored text).
			cursor.execute(_INSERT_SQL, row)
			conn.commit()

主函数。

注意pagesize在列表解析中要int一下,因为re提取出来是str

另外用rt.join()让主线程等待所有子线程结束,免得主线程提前执行conn.close()把连接关掉,导致后面的数据没法写进数据库。

# Script entry point: look up the page count, then scrape every listing page
# in its own thread, with at most 5 requests in flight at a time.
# `url`, `lock` and `semaphore` are deliberately left as module-level names
# because get_pagesize() and parse_listpage() read them as globals.
if __name__ == '__main__':
	url = 'https://bj.lianjia.com/ershoufang/chaoyang/'
	li_rt = []
	try:
		# number of the last page (returned as str, hence the int() below)
		pagesize = get_pagesize()
		# NOTE(review): the range starts at 2, so the first page (`url`
		# itself) is never scraped -- confirm whether that is intended.
		urls = [f'https://bj.lianjia.com/ershoufang/chaoyang/pg{i}/' for i in range(2,int(pagesize)+1)]
		print(urls)
		lock = threading.Lock()
		semaphore = threading.BoundedSemaphore(5)
		# A separate loop name avoids rebinding the module-level `url`.
		for page_url in urls:
			print(page_url,'\n')
			# Block until one of the 5 slots is free; the worker releases
			# it once its HTTP request is done.
			semaphore.acquire()
			t = threading.Thread(target=parse_listpage,args=(page_url,))
			t.start()
			li_rt.append(t)
	finally:
		# Wait for every started worker before closing the shared
		# connection, and close it even if something above failed.
		for rt in li_rt:
			rt.join()
		conn.close()

成果展示

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/488315.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号