- 前期准备工作
所需要的依赖
import pytesseract
from PIL import Image
import requests
from lxml import etree
import urllib.request
from io import StringIO, BytesIO
import re
import datetime
import json
import mysql.connector
import time
import base64
import os
pytesseract重点依赖引入方法
废话不多说,直接开干!
首先安装库
然后按照tesseract程序下载安装
tesseract下载地址:https://digi.bib.uni-mannheim.de/tesseract/ //请依据自己的操作系统下载exe文件安装
用户变量,系统变量都添加:PATH C:\Program Files (x86)\Tesseract-OCR; //这是tesseract的安装目录
系统变量添加:TESSDATA_PREFIX C:\Program Files (x86)\Tesseract-OCR
pip install pytesseract pytesseract依赖安装命令
再找到pytesseract.py文件
修改添加tesseract.exe
# Tell pytesseract where the Tesseract executable lives. The path separators
# were missing from the original value, so it did not name a real file.
# NOTE(review): adjust the directory if Tesseract was installed under
# "C:\Program Files (x86)\Tesseract-OCR" instead.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
- 创建一个保持登录凭据和mysql初始化
# requests.session: create a session instance that keeps the login
# credentials between requests.
login_session = requests.session()

# Open the MySQL connection and a cursor for the inserts further down.
# The "######" values are redacted placeholders and must be filled in.
mydb = mysql.connector.connect(
    host="######",
    user="######",
    passwd="######",
    database="######",
)
mycursor = mydb.cursor()
- 首页url并获取Set-cookie
# Request headers for the home page.
# NOTE(review): token_value is not defined anywhere in the visible code; it is
# presumably the Set-Cookie value returned by the home-page request -- confirm
# and define it before this dict is built.
header = {
    "Host": "#############",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.50",
    # "*/*" had been garbled to "**", which is not a valid media range.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Referer": "##############",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "cookie": token_value[0:43],  # cookie returned by the home-page URL
}
# Request the captcha image through the shared session, sending the home-page
# headers (the dict is named `header`; the original referenced an undefined
# `headers`).
imgbase64str = login_session.get(url="验证码url", headers=header)

# Take the raw response body and save it to a local file.
request = imgbase64str.content
with open(r"D:\文件\7.png", 'wb') as f:
    f.write(request)
# login_session.get(url="http://222.134.6.165:8090/GenerateImage.jsp", headers=headers)
# items = html.xpath("//div[@class='ui-form-explain']//a//img//@src" )
# print(items[0])
# r = urllib.request.urlopen(r"D:\文件\7.png")
# f = open('VCode.jpg', 'wb') #这里是将验证码图片写入到本地文件
# f.write(r.read())
# f.close()
# imgBuf = BytesIO(r.read()) # 采用StringIO直接将验证码文件写到内存,省去写入硬盘
验证码图片的背景处理
# Clean up the captcha background so OCR sees black glyphs on pure white.
img = Image.open(r"D:\文件\7.png")  # load the downloaded captcha with PIL
img = img.convert('RGBA')  # convert to RGBA
pix = img.load()  # pixel access object
width, height = img.size

# Paint the top and bottom border rows white.
for x in range(width):
    pix[x, 0] = pix[x, height - 1] = (255, 255, 255, 255)

# Paint the left and right border columns white.
for y in range(height):
    pix[0, y] = pix[width - 1, y] = (255, 255, 255, 255)

# Binarize: a pixel with any of R, G, B below 95 becomes opaque black,
# every other pixel becomes opaque white.
for y in range(height):
    for x in range(width):
        red, green, blue = pix[x, y][:3]
        if red < 95 or green < 95 or blue < 95:
            pix[x, y] = (0, 0, 0, 255)
        else:
            pix[x, y] = (255, 255, 255, 255)

# Save the processed image; the OCR step reads it back by this path.
img.save(r"D:\文件\5.png")
使用tesseract去识别处理后的图片
# Run Tesseract on the processed captcha image saved to disk.
text = pytesseract.image_to_string(r"D:\文件\5.png")
# Strip surrounding whitespace instead of dropping exactly one character:
# slicing with [:-1] leaves a stray newline when the output ends in more than
# one control character, and eats a real character when it ends in none.
print(text.strip())
验证码保存到本地处理之前
处理之后
获取验证码文字
- 拿到验证码进登录url并获取Set-cookie
# Request headers for the XHR-style POST made after the captcha step.
# NOTE(review): token_value is not defined anywhere in the visible code; it is
# presumably the Set-Cookie value returned by the login request -- confirm and
# define it before this dict is built.
headerse = {
    "Host": "#############",
    "Connection": "keep-alive",
    # NOTE(review): hard-coded length copied from a captured request; requests
    # normally derives Content-Length from the body -- confirm it is needed.
    "Content-Length": "79",
    # "*/*" had been garbled to "**", which is not a valid media range.
    "Accept": "*/*; q=0.01",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Mobile Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "############",
    "Referer": "######################",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "cookie": token_value[0:43],  # cookie returned by the login URL
}
def getYesterday() -> datetime.date:
    """Return yesterday's date, based on the local calendar date.

    Returns:
        The date one day before ``datetime.date.today()``.
    """
    return datetime.date.today() - datetime.timedelta(days=1)
def getyes() -> datetime.date:
    """Return the first day of the current month, based on the local date.

    Returns:
        A ``datetime.date`` with today's year and month and ``day == 1``.
    """
    return datetime.date.today().replace(day=1)
login_url = "数据url"

# Form fields for the data query. The literal dates are placeholders; the
# start and end dates are overwritten just below.
data = {
    "startTime": "2022-02-10",
    "endTime": "2022-02-10",
    "monthTime": "2022-01",
    "dateRange": "day",
    "outfallId": "",
    "rgnCode": "",
    "yearTime": 2022,
    "isShowAllData": 1,
    "quarterTime": 1,
    "orderType": "fromSmallToBig",
    "outfallIds": "177511,21,176152,177508,177503,176151,177506,177510,177504,177505,177501,177509,177500,177512",
    "halfYearTime": 1,
}

# Query from the first day of the current month through yesterday, sent as
# ISO "YYYY-MM-DD" strings.
# NOTE(review): on the 1st of a month the start date falls after the end date
# (yesterday is in the previous month) -- confirm how the server handles that.
data['startTime'] = getyes().isoformat()
data['endTime'] = getYesterday().isoformat()
print(data)

# Send the POST request through the logged-in session. The payload carries the
# query parameters above, not a username/password. The header dict is named
# `headerse`; the original referenced an undefined `headerss`.
login_response = login_session.post(url=login_url, headers=headerse, data=data)
# Fail on an HTTP error status before trying to parse the body as JSON.
login_response.raise_for_status()
print(login_response.text)

# From here on `data` holds the parsed JSON response (indexed like a list of
# dicts below), no longer the request payload.
data = json.loads(login_response.text)

# Build one positional tuple per record for executemany.
# NOTE(review): this relies on every record's key order matching the column
# order in the INSERT statement below -- verify against the actual response.
val = []
for i, row in enumerate(data):
    print("序号%s 值%s" % (i + 1, row))
    record = tuple(row.values())
    val.append(record)
    print(record)

# Insert the new rows.
print(val)
sql = "INSERT INTO 表名称 (loadSo2,code,outfallId,dataCountO3,chromaSo2,chromaO3,dataCountPm25,invalidSo2,invalidNo2,apiO3,invalidCo,maxApi,loadNo2,dataCountCo,chromaPm25,invalidPm10,loadPm10,apiPm10,invalidO3,totalApi,dataCountNo2,outfallName,loadCo,apiCo,orderzh,apiPm25,dataCountPm10,chromaNo2,chromaCo,apiNo2,rgnName,chromaPm10,invalidPm25,loadPm25,mainPollution,apiSo2,dataCountSo2,loadO3) VALUES (%s, %s, %s,%s, %s, %s,%s, %s, %s, %s,%s, %s, %s,%s, %s, %s,%s, %s, %s, %s,%s, %s, %s,%s, %s, %s,%s, %s, %s, %s,%s, %s, %s,%s, %s, %s,%s,%s)"
mycursor.executemany(sql, val)
mydb.commit()  # the table was modified, so the commit is required to persist it
参考链接
https://www.cnblogs.com/chenlove/p/14038580.html
https://blog.csdn.net/qiushi_1990/article/details/78041375
https://www.jb51.net/article/187678.htm
https://blog.csdn.net/xiaxianba/article/details/89450855
https://blog.csdn.net/purvispanwu/article/details/107099452
https://www.polarxiong.com/archives/python-tesseract-verification-code.html
https://www.cnblogs.com/zhangb8042/articles/10410263.html



