栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

使用tkinter爬取二手房交易网站信息

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

使用tkinter爬取二手房交易网站信息

此程序经过测试非常的稳定,无网络状态下会提示,网络正常就可以爬取,每次最多50页,可根据需要选择。问题就是代码过于冗长,看得人眼花缭乱,估计只有我自己能看懂,水平有限,见谅了。做这个程序是想练习下综合的实战应用,界面页相对简洁友好,整个程序也都做个异常处理,当然一定会有不够完善的地方,也请各位同仁批评指正。 界面如下:

 

 代码如下(不含代理IP的API):
from tkinter import *
import requests
from lxml import html
import random
import xlwt
import time
from requests.adapters import HTTPAdapter
from retry import *
from tkinter.filedialog import askdirectory
import threading
import tkinter.messagebox  # 弹窗库

# Root Tk window; all widgets below attach to it.
root = Tk()

# NOTE(review): the original had a module-level `global list_title` here,
# which is a no-op at module scope and has been removed.

import os  # kept here to preserve the original import order

# File-path selection helpers follow.

# Number of pages to scrape (1-50), bound to the Spinbox below.
xVariable = tkinter.IntVar()


def selectPath():
    """Ask the user for an output directory and store it in `path`.

    If the dialog is cancelled (empty result), the current value is kept.
    """
    path_ = askdirectory(title="路径选择")

    if path_ == "":
        path.get()
    else:
        # Tk returns forward slashes; the Windows commands used elsewhere
        # expect backslashes, so convert.  (The scraped original had the
        # broken literal "\" — the escape lost its backslash.)
        path_ = path_.replace("/", "\\")
        path.set(path_)


def openPath():
    """Open the currently selected output directory in Windows Explorer."""
    # Append a backslash so dirname() keeps the full selected directory.
    # (Renamed from `dir`, which shadowed the builtin; literal restored
    # from the scrape-broken "\".)
    folder = os.path.dirname(path.get() + "\\")
    os.system('start ' + folder)  # `start ` is the cmd.exe way to open a folder


# Shared UI state: `path` holds the chosen output directory (defaults to the
# current working directory); `text` is the scrolling log Listbox that all
# progress and error messages are appended to.
path = StringVar()
path.set(os.path.abspath("."))
text = Listbox(root, font=('微软雅黑', 15), width=52, height=13)
text.grid_configure(row=1, columnspan=6)
# Vertical scrollbar for the log.
sc = Scrollbar(root, command=text.yview)
sc.grid_configure(row=1, column=7, sticky=S + E + N)
# Horizontal scrollbar for long log lines (URLs, file paths).
sc1 = Scrollbar(root, orient="horizontal", command=text.xview,)
sc1.grid_configure(row=2, columnspan=6, sticky=EW)
text.config(yscrollcommand=sc.set)
text.config(xscrollcommand=sc1.set)


def download():
    """Scrape up to `xVariable` pages of 58.com second-hand-house listings
    and save them to an .xls file in the directory held by `path`.

    Runs on a worker thread (see thread_it). Disables the UI controls while
    running and re-enables them when finished; all progress and errors are
    appended to the `text` log Listbox.
    """
    try:
        # Quick connectivity probe; any failure falls through to the generic
        # handler at the bottom ("网络异常").
        response = requests.get("http://www.xiongmaodaili.cn/").status_code
        if response == 200:
            lujing["state"] = DISABLED
            spbox["state"] = DISABLED
            button["text"] = "正在爬取"
            button["state"] = DISABLED
            savapath = path.get()
            page = xVariable.get()

            def get_ip():
                """Fetch proxy addresses from the provider; returns a list of them."""
                try:
                    url1 = '代理IP的地址,这里删除了...'
                    response = requests.get(url1)
                    if response.status_code == 200:
                        # While rate-limited the provider answers with a '{...}'
                        # JSON error (or an empty body) -- wait and retry.
                        while not response.text or response.text[0] == '{':
                            time.sleep(2)
                            response = requests.get(url1)
                        # The body is one address per line; split it into a list.
                        # NOTE(review): the scraped original read split('r');
                        # assumed to be a mangled '\r\n' -- confirm against the API.
                        return [x for x in response.text.split('\r\n')]
                    else:
                        text.insert(END, '请求失败')
                except Exception as e:
                    text.insert(END, e)

            @retry(tries=4, delay=1, backoff=1, jitter=0, max_delay=1)
            def my_request(url):
                """GET `url` through a fresh proxy, retrying once inline on failure.

                Stores the response in the module-level `r` and returns it.
                """
                requests.adapters.DEFAULT_RETRIES = 15
                s = requests.session()
                s.keep_alive = False  # close old connections to limit open sockets
                global r
                try:
                    ips = get_ip()
                    proxy = {'https': ips[0]}
                    text.insert(END, proxy)
                    r = requests.get(url, headers=head, proxies=proxy, verify=False, timeout=5)
                    r.encoding = 'utf-8'
                except Exception:  # was the undefined name `baseException`
                    # Original passed `url`/`proxy` as the Listbox index --
                    # both inserts need END as the first argument.
                    text.insert(END, "请求失败,开始重试")
                    ips = get_ip()
                    proxy = {'https': ips[0]}
                    text.insert(END, proxy)
                    r = requests.get(url, headers=head, proxies=proxy, verify=False, timeout=5)
                    r.encoding = 'utf-8'
                return r

            global r
            head = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
                "accept": "application/json, text/plain, */*", "accept-encoding": "gzip, deflate, br",
                "accept-language": "zh-CN,zh;q=0.9"}

            # One XPath that extracts all 14 fields of every listing, hoisted so
            # the empty-result retry below uses the exact same expression.
            listing_xpath = ("//div[@class='property-content-title']/h3/text()|//p["
                             "@class='property-content-info-comm-name']/text()|//p[ "
                             "@class='property-content-info-comm-address']//span/text()|//span[ "
                             "@class='property-price-total-num']/text()|//p["
                             "@class='property-price-average']/text()|//p["
                             "@class='property-content-info-text'][1]/text()|//p["
                             "@class='property-content-info-text property-content-info-attribute']//span//text()")

            work_book = xlwt.Workbook(encoding="utf-8")
            sheet = work_book.add_sheet("巴州二手房信息")
            sheet.write(0, 0, "标题")
            sheet.write(0, 1, "房型")
            sheet.write(0, 2, "房子大小(㎡)")
            sheet.write(0, 3, "小区名称")
            sheet.write(0, 4, "区域1")
            sheet.write(0, 5, "区域2")
            sheet.write(0, 6, "地址")
            sheet.write(0, 7, "总价(万元)")
            sheet.write(0, 8, "单价(元/㎡)")
            row_num = 1
            z = 0  # pages scraped successfully
            v = 0  # pages attempted

            for i in range(1, page + 1):
                v += 1
                try:
                    url = "https://bygl.58.com/ershoufang/p" + str(i) + "/"
                    text.insert(END, url)
                    text.see(END)
                    text.update()
                    my_request(url)
                except Exception as e:
                    text.insert(END, e)
                    text.insert(END, "第" + str(i) + "页出错!")
                    text.insert(END, "--------------------------")
                    continue
                else:
                    preview_html = html.fromstring(r.text)
                    list_title = [str(x) for x in preview_html.xpath(listing_xpath)]
                    while not list_title:
                        text.insert(END, "列表为空,重新获取代理IP:")
                        my_request(url)
                        # Re-parse the NEW response; the original re-queried the
                        # stale tree, so this retry could never succeed.
                        preview_html = html.fromstring(r.text)
                        list_title = [str(x) for x in preview_html.xpath(listing_xpath)]
                        time.sleep(random.random() * 2)
                        break

                    text.insert(END, "成功爬取第" + str(i) + "页数据")
                    z += 1
                    text.insert(END, "抓取成功率:{:.2%}".format(z / v))
                    text.insert(END, "----------------------------")
                    text.see(END)
                    text.update()

                # Each listing is a consecutive run of 14 extracted strings.
                for j in range(0, len(list_title), 14):
                    biaoti = list_title[j]
                    house_type = (list_title[j + 1] + list_title[j + 2] + list_title[j + 3]
                                  + list_title[j + 4] + list_title[j + 5] + list_title[j + 6])
                    size = list_title[j + 7].strip().strip('\n')
                    title = list_title[j + 8]
                    area1 = list_title[j + 9]
                    area2 = list_title[j + 10]
                    area3 = list_title[j + 11]
                    totalnum = list_title[j + 12]
                    avg = list_title[j + 13]

                    sheet.write(row_num, 0, biaoti)
                    sheet.write(row_num, 1, house_type)
                    sheet.write(row_num, 2, size)
                    sheet.write(row_num, 3, title)
                    sheet.write(row_num, 4, area1)
                    sheet.write(row_num, 5, area2)
                    sheet.write(row_num, 6, area3)
                    sheet.write(row_num, 7, totalnum)
                    sheet.write(row_num, 8, avg)
                    row_num += 1
                time.sleep(3)  # pause between pages to avoid hammering the site

            # Build a unique output path: append (1), (2), ... while a file with
            # the same name already exists.  rsplit keeps any dots in the
            # directory path intact (the original split(".") broke on them).
            file_name = savapath + "\\" + time.strftime("%Y-%m-%d") + "共" + str(page) + "页数据.xls"
            name, suffix = file_name.rsplit(".", 1)
            k = 0
            while os.path.exists(file_name):
                k += 1
                file_name = name + "(" + str(k) + ")" + "." + suffix
            work_book.save(file_name)

            lujing["state"] = ACTIVE
            text.insert(END, "文件生成成功:" + file_name)
            text.insert(END, "\n")
            tkinter.messagebox.showinfo("提示", "文件生成成功:" + file_name)
            button["text"] = "开始爬虫"
            button["state"] = ACTIVE
            spbox["state"] = "readonly"

    except Exception:
        # Connectivity probe (or anything unexpected) failed -- tell the user
        # instead of killing the worker thread silently.
        text.insert(END, "网络异常,请检查后重试")
        text.see(END)


def spiderok():
    """Confirm the scrape (pages + target directory) and start download().

    The "\\n" in the message had been mangled to a bare "n" by the scrape;
    restored so the path appears on its own line in the dialog.
    """
    page = xVariable.get()
    if tkinter.messagebox.askokcancel("", "确定爬取巴州房产" + str(page) + "页的数据\n并保存至" + path.get(), default="cancel"):
        download()


def thread_it(func):
    """Run `func` on a daemon thread so the Tk event loop stays responsive.

    Daemon so the thread does not keep the process alive after the window
    closes.  (`setDaemon` is deprecated since Python 3.10; use the attribute.)
    """
    t = threading.Thread(target=func)
    t.daemon = True
    t.start()


# Main window: title bar text and a fixed 820x600 window placed at (600, 230).
root.title("巴音郭楞蒙古自治州58同城二手房数据爬取")
root.geometry("820x600+600+230")


def tuichu():
    """Ask for confirmation, then leave the Tk main loop if the user agrees."""
    confirmed = tkinter.messagebox.askokcancel(title="提示", message="您确定要退出吗?", default="cancel")
    if confirmed:
        root.quit()


# Route the window-close button through the same confirmation dialog.
root.protocol("WM_DELETE_WINDOW", tuichu)

# Start button: runs the confirm-then-scrape flow on a worker thread so the
# UI does not freeze during network requests.
button = Button(root, text="开始爬虫", font=("微软雅黑", 10), command=lambda: thread_it(spiderok))
button.grid(row=3, sticky=W)
lb = Label(root, text="页数选择:")
lb.grid(row=4, column=0, sticky=W)

# Page-count selector (1-50), read-only so only the arrows change the value.
spbox = Spinbox(root, from_=1, to=50, increment=1, justify="center", textvariable=xVariable, state="readonly", width=8)
spbox.grid(row=4, column=1, sticky=W)
button1 = Button(root, text="退出", padx=10, font=("微软雅黑", 10), command=tuichu)
button1.grid(row=3, column=4, sticky=E)
Label(root, text="存储路径:").grid(row=5, column=0, sticky=W)
# Read-only display of the chosen output directory (changed via selectPath).
Entry(root, textvariable=path, state="readonly").grid(row=5, column=1, ipadx=170)

lujing = Button(root, text="更改路径", command=selectPath)
lujing.grid(row=5, column=3, sticky=E)
Button(root, text="显示路径", command=openPath).grid(row=5, column=4, sticky=E)
root.update()
root.mainloop()

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/299737.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号