#!/usr/bin/env python
import sys
import random
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sqlalchemy import create_engine
# MySQL connection URL for the local `flush` database (PyMySQL driver, utf8).
# NOTE(review): credentials are hardcoded ("root"/"123456") — move them into
# config alongside the headers; confirm the target DB name with the owner.
connect_info = 'mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8'.format("root", "123456", "localhost", 3306, "flush")
# SQLAlchemy engine shared by the whole module (used by DataFrame.to_sql below).
engine = create_engine(connect_info)
from config import config
class Flush(object):
    """Scrape the concept-board index from 10jqka and persist it to MySQL.

    ``downloader`` fetches one page, extracts every board link under
    ``.cate_group .cate_items`` and appends the rows to the
    ``concept_bord`` table via the module-level SQLAlchemy ``engine``.
    """

    def __init__(self):
        # Paging/proxy state. NOTE(review): only PAGE_TRACK is read by
        # downloader today; the rest look reserved for a multi-page /
        # proxy-pool version — confirm before removing.
        self.PAGE_TRACK = 1
        self.MAX_PAGE = 5
        self.PROXY_POOL_API = "http://127.0.0.1:5555/random"
        self.PAGE_LIST = []
        self.proxy_con = 0
        self.MAX_PAGE_flag = True

    def downloader(self, url, num_retries=1):
        """Download *url*, parse board name/id pairs, write them to MySQL.

        :param url: page to fetch (10jqka serves GBK-encoded HTML).
        :param num_retries: retries left after a failure (any exception).
        """
        headers = config.get_headers()
        bord_list = []
        try:
            time.sleep(random.random() * 1)  # random delay: be polite to the site
            respons = requests.get(url, headers=headers, timeout=4)
            # The site is GBK-encoded; decode explicitly instead of trusting
            # requests' charset guess.
            html = str(respons.content, encoding="gbk")
            soup = BeautifulSoup(html, 'html.parser')
            print(soup.title)
            cate_group = soup.select(".cate_group .cate_items a")
            for group in cate_group:
                bord_dic = {}
                bord_dic["name"] = group.string
                # Board id is the second-to-last path segment of the href,
                # e.g. ".../gn/<id>/" -> "<id>".
                bord_dic["bord"] = group['href'].split("/")[-2]
                bord_list.append(bord_dic)
            # BUG FIX: the class is pd.DataFrame, not pd.Dataframe — the old
            # spelling raised AttributeError on every call, the broad except
            # swallowed it, and nothing was ever written to the database.
            df1 = pd.DataFrame(bord_list)
            print(bord_list)
            df1.to_sql('concept_bord', engine, if_exists='append', index=False)
            print('PAGGE is {} , URL is:{} to Mysql table successfully!'.format(self.PAGE_TRACK, url))
        except Exception as e:
            print("异常{}, 重新下载{}".format(e, url))
            # BUG FIX: honour num_retries — the message announces a
            # re-download but the original never actually retried.
            if num_retries > 0:
                self.downloader(url, num_retries - 1)
def main():
    """Entry point: scrape the 10jqka concept-board index page once.

    Any exception escaping the scraper is caught and printed so the
    script exits quietly rather than with a traceback.
    """
    try:
        Flush().downloader("http://q.10jqka.com.cn/gn/")
    except Exception as err:
        print(err)
# Script entry point. NOTE(review): the bare sys.exit() in finally is
# redundant on the success path (the interpreter exits anyway) but does
# guarantee SystemExit is raised even if main() itself raised — confirm
# whether a non-zero exit code was intended for the failure case.
if __name__ == "__main__":
    try:
        main()
    finally:
        sys.exit()