import re
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Optional value for the "cookie" request header. Leave it empty to send no
# cookie at all; paste a browser session cookie here if one is needed.
# NOTE(review): the original script had a 'cookie' key with no value (a
# syntax error); whether the site actually requires a cookie is unconfirmed.
COOKIE = ""

# Headers sent with every HTTP request made by this script.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
}
if COOKIE:
    header["cookie"] = COOKIE

# Directory page whose `a.narr_01` anchors list the sites to look up.
url = "https://www.wdzj.com/daohang.html"

# Seconds to wait for each HTTP response before giving up, so that a single
# unresponsive server cannot hang the whole run.
REQUEST_TIMEOUT = 10


def _fetch_soup(page_url):
    """Download *page_url* and return its HTML parsed with the lxml parser.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-success HTTP status.
    """
    response = requests.get(page_url, headers=header, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Let requests guess the charset from the body rather than trusting the
    # HTTP header, as the original script did.
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'lxml')


def _list_sites():
    """Return ``(site name, detail page URL)`` pairs from the directory page.

    Anchors without an ``href`` are ignored. Relative links are resolved
    against the directory page URL.
    """
    soup = _fetch_soup(url)
    sites = []
    for tag in soup.find_all('a', class_='narr_01'):
        href = tag.get('href')
        if href:
            sites.append((tag.get_text(), urljoin(url, href)))
    return sites


def _find_site_url(detail_url):
    """Return the href of the first ``a.tag.tag-site`` link on a detail page.

    Returns ``None`` when the page contains no such link.
    """
    soup = _fetch_soup(detail_url)
    anchor = soup.find('a', class_="tag tag-site")
    if anchor is None:
        return None
    return anchor.get('href')


def main():
    """Print ``<final site URL>   <site name>`` for every listed site.

    Each name is paired with its URL as soon as the URL is found, so the two
    can never get out of step. Sites whose detail page cannot be fetched or
    has no site link are reported on stderr and skipped instead of aborting
    the run.
    """
    for name, detail_url in _list_sites():
        try:
            site_url = _find_site_url(detail_url)
        except requests.RequestException as exc:
            print('skipped', name, '- request failed:', exc, file=sys.stderr)
            continue
        if not site_url:
            print('skipped', name, '- no site link found', file=sys.stderr)
            continue
        print(site_url, ' ', name)


if __name__ == "__main__":
    main()
"""
http://www.hzed.com ***
https://s.growingio.com/YqyeRg ***
https://www.rjs.com/index.html?rjspromote=wdzjadnav ***
http://www.penging.com/ ***
"""