1 爬取搜狗首页的页面数据
import requests

# Crawl the Sogou homepage and persist the raw HTML to disk.
home_url = 'https://www.sogou.com/'  # Sogou homepage
request_headers = {
    'User-Agent': 'Mozilla/4.0 (compatible;MSIE 5.5;Windows NT)',
}
# Fetch the page and decode the response body as text (HTML).
response = requests.get(url=home_url, headers=request_headers)
page_text = response.text
# Persist the HTML; `with` closes the handle automatically.
with open('./sogou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print('data saved successfully')
2 爬取搜狗指定词条对应的搜索结果页面
import requests

# Crawl the Sogou search-result page for a user-supplied query term.
base_url = 'https://www.sogou.com/web'  # base URL; the query goes in the query string
request_headers = {
    'User-Agent': 'Mozilla/4.0 (compatible;MSIE 5.5;Windows NT)',
}
kw = input('enter a word:')  # term to search for
# requests URL-encodes `params` and appends it to the base URL,
# effectively constructing a brand-new request URL.
response = requests.get(url=base_url, params={'query': kw}, headers=request_headers)
page_text = response.text
# Persist the result page as ./<kw>.html in the current directory.
fileName = './' + kw + '.html'
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print('data saved successfully')
3 爬取百度翻译
import requests
import json

# Scrape Baidu Translate's suggestion endpoint and persist the JSON response.
post_url = 'https://fanyi.baidu.com/sug'  # POST endpoint
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible;MSIE 5.5;Windows NT)'
}
word = input('enter a word:')  # word to translate
data = {'kw': word}
# For a POST, requests sends `data` as the form-encoded request body.
response = requests.post(url=post_url, data=data, headers=headers)
# The endpoint returns JSON; decode it into a Python object.
dic_obj = response.json()
# Persist the result as ./<word>.json; `with` guarantees the file handle
# is closed (the original opened it and never closed it).
fileName = './' + word + '.json'
with open(fileName, 'w', encoding='utf-8') as fp:
    json.dump(dic_obj, fp=fp, ensure_ascii=False)  # keep Chinese text readable
print('data saved successfully')
4 爬取豆瓣电影分类排行榜
import requests
import json

# Scrape Douban's movie chart Ajax endpoint and persist the JSON list.
url = 'https://movie.douban.com/j/chart/top_list'
param = {
    'type': '20',
    'interval_id': '100:90',
    'action': '',
    'start': '0',   # offset of the first record to return
    'limit': '20',  # number of records per page
}
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible;MSIE 5.5;Windows NT)'
}
response = requests.get(url=url, params=param, headers=headers)
# The Ajax endpoint returns JSON, not rendered HTML.
list_data = response.json()
# Persist; `with` closes the file handle (the original leaked it).
with open('./douban.json', 'w', encoding='utf-8') as fp:
    json.dump(list_data, fp=fp, ensure_ascii=False)
print('data saved successfully')
5 爬取肯德基餐厅位置
import requests
import json

# Scrape KFC's store-locator endpoint for restaurants matching a keyword.
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible;MSIE 5.5;Windows NT)'
}
kw = input('enter a word:')  # location keyword (e.g. a city name)
data = {
    'cname': '',
    'pid': '',
    'keyword': kw,
    'pageIndex': '1',  # page number
    'pageSize': '10',  # results per page
}
# Decode the JSON body directly. The original dumped `.text` (a str),
# which double-encoded the payload into one big quoted JSON string.
kfc_data = requests.post(url=url, data=data, headers=headers).json()
# Persist; `with` closes the file handle (the original leaked it).
with open('./kfccda.json', 'w', encoding='utf-8') as fp:
    json.dump(kfc_data, fp=fp, ensure_ascii=False)
print('data saved successfully')
6 爬取国家药品管理监察管理总局化妆品生产许可相关数据
import requests
import json

"""Scrape the NMPA (National Medical Products Administration) cosmetics
production-licence data:
1. collect every company ID from the paginated list endpoint;
2. fetch each company's detail record by ID.
"""
url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'  # Ajax-rendered list endpoint
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible;MSIE 5.5;Windows NT)'
}
# Step 1: gather company IDs from the first two list pages.
company_ids = []
for page in range(1, 3):
    data = {
        'on': 'true',
        'page': str(page),
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': '',
    }
    list_data = requests.post(url=url, data=data, headers=headers).json()
    # `item` instead of `id` — avoid shadowing the builtin.
    for item in list_data['list']:
        company_ids.append(item['ID'])
# Step 2: fetch the detail record for every collected ID.
all_data = []
detail_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'  # detail-page endpoint
for company_id in company_ids:
    id_data = {'id': company_id}
    one_data = requests.post(url=detail_url, data=id_data, headers=headers).json()
    print(one_data)
    all_data.append(one_data)
# Persist; `with` closes the file handle (the original leaked it).
with open('./huazhuangping.json', 'w', encoding='utf-8') as fp:
    json.dump(all_data, fp=fp, ensure_ascii=False)
print('data saved successfully')



