栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

【python数据分析】2.爬虫

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

【python数据分析】2.爬虫

1.爬虫介绍



2.准备工作







3.构建流程

# coding:utf-8
# @Time : 22/4/29 11:20
# @Author : Justha
# @File : spider.py.py
# @Software: PyCharm

import bs4
import re
import urllib.request,urllib.error
import xlwt
import sqlite3

def main():
    """Entry point: crawl the Douban Top250 listing and save the result.

    Step 2 (parsing) is not implemented yet; getData/savaData are stubs.
    """
    print("这是main")
    base_url = "http://movie.douban.com/top250?start="
    # 1. crawl the pages
    crawled = getData(base_url)
    out_file = ".\豆瓣电影Top250.xls"
    # 3. save the data
    savaData(out_file)
    
#爬取数据
#爬取数据
def getData(baseurl):
    """Crawler stub: returns an empty result list for now."""
    return []

def savaData(savapath):
    """Saver stub: only emits a blank line; *savapath* is unused for now."""
    print("")


if __name__=="__main__":
    main()













4.urllib

getData&askURL

# coding:utf-8
# @Time : 22/5/1 15:00
# @Author : Justha
# @File : testurllib.py
# @Software: PyCharm
# 想放弃了就去给我做运动!

import urllib.request

url = "http://www.douban.com/"
headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
}
req = urllib.request.Request(url=url,headers=headers)
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
5.获取数据
# coding:utf-8
# @Time : 22/4/29 11:20
# @Author : Justha
# @File : spider.py.py
# @Software: PyCharm

import bs4
import re
import urllib.request,urllib.error
import xlwt
import sqlite3

def main():
    """Entry point: download the Top250 pages; saving is still a stub."""
    print("这是main")
    base_url = "http://movie.douban.com/top250?start="
    # 1. crawl the pages (raw HTML per page)
    pages = getData(base_url)
    out_file = ".\豆瓣电影Top250.xls"
    # 3. save the data
    savaData(out_file)
    
#爬取数据
#爬取数据
def getData(baseurl):
    """Fetch the 10 Top250 result pages (25 films each); return raw HTML list."""
    pages = []
    for page_index in range(10):
        page_url = "%s%d" % (baseurl, page_index * 25)
        print(page_url)
        pages.append(askURL(page_url))
    # debug: dump the last page's HTML
    print(pages[9])
    return pages


def savaData(savapath):
    """Saver stub: only emits a blank line; *savapath* is unused for now."""
    print("")

# 获取网页内容
# 获取网页内容
def askURL(url):
    """Fetch *url* and return its body decoded as UTF-8.

    Sends a desktop-browser User-Agent header, presumably so Douban does
    not reject the request (TODO confirm against the live site).

    Returns "" on failure; the HTTP status code and/or reason are printed.
    (Fix: the original left ``html`` unassigned when ``urlopen`` raised,
    so ``return html`` crashed with UnboundLocalError instead of
    reporting the network error.)
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
    }
    request = urllib.request.Request(url,headers=head)
    html = ""  # defined fallback so the error branch still returns cleanly
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError has .code; plain URLError has only .reason
        if hasattr(e,"code"):
            print(e.code)
        if hasattr(e,"reason"):
            print(e.reason)
    return html

if __name__=="__main__":
    main()













6.beautifulsoup
# coding:utf-8
# @Time : 22/5/1 16:04
# @Author : Justha
# @File : testBs4.py
# @Software: PyCharm
# 想放弃了就去给我做运动!
import re

from bs4 import BeautifulSoup

file = open("./baidu.html","rb")
html=file.read()
bs=BeautifulSoup(html,"html.parser")

# # 1.Tag 为获取到的第一个标签及其内容
# print(type(bs))
# print(type(bs.meta))    #类型为Tag
# print(bs.meta)
#
# # 2.NavigableString 标签里的内容(字符串)
# print(bs.a.string)
# print(bs.a.attrs)       #拿到标签的属性
#
# # 3.BeautifulSoup 整个文件
# # print(bs)
#
# # 4.注释(会将string里的注释去掉,拿到内容)
# print(bs.a.string)      #这里输出为新闻1,而不是
#
# # 文档遍历
# print(bs.head.contents)
# print(bs.head.contents[1])

# 文档搜索

# # 1)find_all        findAll???
# # 字符串过滤,只会查找完全一致的内容
# a_list=bs.find_all("a",limit=3)       #标签   limit限制数量
# print(a_list)
# a_list=bs.find_all(id="head")   #id
# print(a_list)
# a_list=bs.find_all(class_=True)     #class
# print(a_list)
# a_list=bs.find_all(text=re.compile("d"))     #标签里所有包含数字的内容
# print(a_list)


# # 2)正则表达式
# a_list2=bs.find_all(re.compile("a"))
# print(a_list2)

# 3)select
# list3=bs.select("a")    #标签
# list3=bs.select(".mnav")    #类
# list3=bs.select("#head")    #id
# list3=bs.select("a[class='bri']")    #  a.bri
# list3=bs.select("head>title")    #  head title
list3=bs.select(".mnav ~ .bri']")    #  mnav的所有同级兄弟里的br

for item in list3:
    print(item)

7.正则表达式


至保存sql前代码
# coding:utf-8
# @Time : 22/4/29 11:20
# @Author : Justha
# @File : spider.py.py
# @Software: PyCharm

import bs4
import re
import urllib.request,urllib.error
import xlwt
import sqlite3

def main():
    """Entry point: crawl and parse Douban Top250, then write the workbook."""
    print("这是main")
    base_url = "http://movie.douban.com/top250?start="
    # 1. crawl and parse the pages into rows
    movies = getData(base_url)
    # savapath = ".\豆瓣电影Top250.xls"
    # 3. save the data (savaData hard-codes its output file name)
    savaData(movies)
    
#爬取数据
def getData(baseurl):
    """Crawl the 10 Top250 pages; return one [title, score, num, inq, link, img] row per film.

    NOTE(review): the regex literals in this copy were corrupted when the
    page was scraped (all angle-bracketed HTML was stripped, and "\\n"/"\\d"
    lost their backslashes).  They are reconstructed below from the
    surrounding parsing code — verify against a live Top250 page.
    """
    # field-extraction patterns, applied to each <div class="item"> block
    findTitle = re.compile(r'<span class="title">(.*)</span>')
    findLink = re.compile(r'<a href="(.*?)">')
    # "?" makes the match non-greedy (stop at the first quote);
    # re.S lets "." also match newlines inside the tag
    findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)
    findScore = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
    findCommentNum = re.compile(r'<span>(\d*)人评价</span>')
    findInq = re.compile(r'<span class="inq">(.*?)</span>')
    findBd = re.compile(r'<p class="">(.*?)</p>', re.S)

    datalist = []
    for i in range(0, 10):
        url = baseurl + str(i * 25)
        print(url)
        html = askURL(url)
        soup = bs4.BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            # pick the first regex match for each field
            title = re.findall(findTitle, item)[0]
            score = re.findall(findScore, item)[0]
            commentNum = re.findall(findCommentNum, item)[0]
            try:
                inq = re.findall(findInq, item)[0]
            except IndexError as e:
                print(e)
                # Fix: without this, a film with no blurb silently reused
                # the previous iteration's ``inq`` value.
                inq = ""
            bd = re.findall(findBd, item)[0]
            link = re.findall(findLink, item)[0]
            imgSrc = re.findall(findImgSrc, item)[0]
            # progress output
            print("%s %s %s人评价 %s\n%s\n%s\n%s" % (title, score, commentNum, inq, link, imgSrc, bd))
            # download the poster image (disabled in the original)
            # savePlace = "./downloadFile/" + title + ".jpg"
            # urllib.request.urlretrieve(imgSrc, savePlace)
            # collect the row in the column order savaData writes
            data.append(title)
            data.append(score)
            data.append(commentNum)
            data.append(inq)
            data.append(link)
            data.append(imgSrc)
            # data.append(bd)
            datalist.append(data)
    return datalist


def savaData(datalist):
    """Write *datalist* rows to 豆瓣Top250.xls, with a header row on top."""
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet('sheet666', cell_overwrite_ok=True)
    title = ('电影名', '分数', '评论人数', '描述', '电影链接', '电影图片链接')
    for i in range(0, 6):
        worksheet.write(0, i, title[i])
    # Fix: iterate over the rows actually crawled; the original hard-coded
    # range(0, 250) and raised IndexError when fewer rows came back.
    for j in range(len(datalist)):
        for k in range(0, 6):
            worksheet.write(j + 1, k, datalist[j][k])
    workbook.save('豆瓣Top250.xls')


# 获取网页内容
def askURL(url):
    """Fetch *url* with a browser User-Agent; return its UTF-8 body, or "" on error."""
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""  # Fix: defined fallback; original crashed with UnboundLocalError on failure
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries .code; plain URLError only .reason
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


if __name__ == "__main__":
    main()
    print("爬取完毕!")
转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/866827.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号