import requests
from pyquery import PyQuery as pq
import json
import pandas as pd
from multiprocessing.pool import Pool
# Names and order of the DataFrame columns that get_a_page writes to the CSV.
columns = [
    'title',
    'msg',
    'price',
    'per_meter',
]
def get_a_page(url):
    """Scrape one listing page and append its rows to ``xaesf.csv``.

    The page at *url* is downloaded and parsed with PyQuery. For every
    listing element found under ``.sellListContent`` the title, place,
    house info, total price and unit price are extracted, printed as one
    JSON line prefixed with a 1-based counter, and collected as a row.

    The collected rows are appended to ``xaesf.csv`` without a header and
    without the index. Only the fields named in the module-level
    ``columns`` list are written; ``place`` is extracted and printed but is
    not in that list, so it does not reach the CSV.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        None. The results are the printed lines and the appended CSV rows.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds the timeout.
    """
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(url, timeout=30)
    doc = pq(response.text)
    listing_root = doc('.sellListContent')
    listings = listing_root.children('.clear .info.clear').items()

    rows = []
    for index, listing in enumerate(listings, start=1):
        row = {
            'title': listing.children('.title a').text(),
            'place': listing.children(
                '.address .flood .positionInfo a').text(),
            'msg': listing.children('.address .houseInfo').text(),
            'price': listing.children(
                '.address .priceInfo .totalPrice span').text(),
            # attr() yields None when the attribute is missing; json.dumps
            # renders that as null and the CSV cell is left empty.
            'per_meter': listing.children(
                '.address .priceInfo .unitPrice').attr('data-price'),
        }
        rows.append(row)
        print(f'{index}:{json.dumps(row, ensure_ascii=False)}')

    # `columns` selects (and orders) the fields that are written; keys that
    # are not listed there are dropped by the DataFrame constructor.
    df = pd.DataFrame(data=rows, columns=columns)
    # Append mode: this function is mapped over many pages by a process
    # pool, so every call adds its rows to the same file.
    df.to_csv('xaesf.csv', mode='a', index=False, header=False)
if __name__ == '__main__':
    # Build the URLs for listing pages pg1 through pg100.
    page_urls = []
    for page_number in range(1, 101):
        page_urls.append(f'https://xa.ke.com/ershoufang/pg{page_number}')

    # Fan the pages out over ten worker processes; map() blocks until every
    # page has been handled, after which the pool is shut down.
    workers = Pool(10)
    workers.map(get_a_page, page_urls)
    workers.close()
    workers.join()