__author__ = 'Jeff.xie'
# _*_ encoding:utf-8 _*_
import os,time,sys
import fitz
import xlsxwriter
import openpyxl
import shutil
class Ref:
    """Bundle one source file with its customer ID and reference list.

    Attributes:
        source_file: Value passed as ``source_file``, stored unchanged.
        Customer_ID: Value passed as ``Customer_ID``, stored unchanged.
        Ref_list: Value passed as ``Ref_list``, stored unchanged.
    """

    def __init__(self, source_file, Customer_ID, Ref_list):
        # Plain value holder: the three arguments are stored as given.
        self.source_file, self.Customer_ID, self.Ref_list = (
            source_file,
            Customer_ID,
            Ref_list,
        )
class Shot():
    """Crop highlighted reference snippets out of PDFs and collect them in Excel.

    ``main_shot`` is the entry point: it reads the ``Result`` sheet of a
    workbook, renders one PNG per reference into an ``Image`` directory next
    to the workbook, and writes ``picture_export.xlsx`` with one worksheet
    per customer.

    NOTE(review): ``pageCount``, ``searchFor``, ``addHighlightAnnot``,
    ``getPixmap`` and ``writePNG`` are the camelCase PyMuPDF names; newer
    PyMuPDF releases use snake_case names - confirm the installed version.
    """

    # Class-level defaults are kept so code reading ``Shot.ref_summary`` /
    # ``Shot.sheet_list`` / ``Shot.Image_path`` keeps working; ``__init__``
    # gives every instance its own lists so state is not shared.
    ref_summary = []
    sheet_list = []
    Image_path = ""

    def __init__(self):
        # References already captured by this instance (used for de-duplication).
        self.ref_summary = []
        # Customer IDs collected while reading the result sheet.
        self.sheet_list = []

    @staticmethod
    def _parse_refs(cell_text):
        """Split a bracketed, comma-separated cell such as ``"[A, B]"`` into ``["A", "B"]``.

        The first and last characters (the brackets) are dropped, each item is
        stripped of surrounding whitespace, and empty items are discarded.
        """
        inner = cell_text.strip()[1:-1]
        return [item.strip() for item in inner.split(",") if item.strip()]

    def get_file_shot(self, sourcefile, customer_id, ref_list, image_dir):
        """Capture every not-yet-seen reference of ``ref_list`` from one PDF.

        Args:
            sourcefile: Path of the PDF opened with ``fitz.open``.
            customer_id: String prefix used for the generated PNG file names.
            ref_list: Iterable of reference strings to search for.
            image_dir: Directory the PNG files are written to.
        """
        doc = fitz.open(sourcefile)
        try:
            for ref in ref_list:
                # References containing "eive money" are skipped and never
                # recorded; references already captured are skipped as well.
                if "eive money" in ref or ref in self.ref_summary:
                    continue
                self.ref_summary.append(ref)
                self.get_ref_shot(doc, customer_id, ref, image_dir)
        finally:
            # Close the document even when rendering fails part-way.
            doc.close()

    def get_ref_shot(self, doc, customer_id, ref_text, image_dir):
        """Highlight each occurrence of ``ref_text`` in ``doc`` and save a cropped PNG.

        The PNG is named ``<customer_id>_<ref_text stripped>.png``. Every
        occurrence writes to that same file name, so the last occurrence found
        is the one that remains on disk.
        """
        file_name = customer_id + "_" + ref_text.strip() + ".png"
        target = os.path.join(image_dir, file_name)
        # The larger the zoom factor, the sharper the captured image.
        zoom_mat = fitz.Matrix(3.2, 3.2)
        for pi in range(doc.pageCount):
            page = doc[pi]
            page_rect = page.rect
            page_height = page_rect.br.y - page_rect.tl.y
            # The smaller these factors, the less height is captured around the hit.
            top_padding = page_height * 0.02
            bottom_padding = page_height * 0.015
            for inst in page.searchFor(ref_text):
                page.addHighlightAnnot(inst)
                # Crop box: page width minus fixed side margins, vertically
                # padded around the highlighted text and clamped to the page.
                tl_pt = fitz.Point(page_rect.tl.x + 60,
                                   max(page_rect.tl.y, inst.tl.y - top_padding))
                br_pt = fitz.Point(page_rect.br.x - 50,
                                   min(page_rect.br.y, inst.br.y + bottom_padding))
                hl_clip = fitz.Rect(tl_pt, br_pt)
                pix = page.getPixmap(matrix=zoom_mat, clip=hl_clip)
                pix.writePNG(target)

    def write_picture(self, excel_dir, image_dir, customer_list):
        """Write ``picture_export.xlsx`` with one worksheet of images per customer.

        Args:
            excel_dir: Directory that receives ``picture_export.xlsx``.
            image_dir: Directory holding the PNG files to embed.
            customer_list: Customer IDs; blanks and repeated IDs are ignored so
                each customer gets exactly one worksheet.
        """
        # Sorted so image placement does not depend on directory listing order.
        picture_files = sorted(os.listdir(image_dir))

        customers = []
        seen = set()
        for raw in customer_list:
            if raw is None:
                continue
            name = str(raw).strip()
            # xlsxwriter rejects duplicate sheet names ignoring case.
            if name and name.lower() not in seen:
                seen.add(name.lower())
                customers.append(name)

        # Output file name; the context manager closes (and saves) the workbook.
        with xlsxwriter.Workbook(os.path.join(excel_dir, "picture_export.xlsx")) as book:
            book.add_worksheet("Test Result")
            for customer in customers:
                sheet_pic = book.add_worksheet(customer)
                # Image names are "<customer_id>_<ref>.png"; matching on the
                # trailing underscore keeps an ID that is a prefix of another
                # ID from picking up the other customer's images.
                prefix = customer + "_"
                matches = [f for f in picture_files if f.startswith(prefix)]
                for index, f in enumerate(matches):
                    self.write_picture_to_excel(sheet_pic, os.path.join(image_dir, f), index)

    def generate_picture(self, sheet, refs_list, wk, image_dir):
        """Read the result sheet into ``refs_list`` and render all reference images.

        Columns used per data row (the first row is skipped as a header):
        0 = customer ID, 1 = file name, 2 = directory, 3 = bracketed
        comma-separated reference list.

        Args:
            sheet: Worksheet whose rows are read.
            refs_list: List that receives one ``Ref`` per data row.
            wk: Workbook owning ``sheet``; it is closed once the rows are read.
            image_dir: Directory the PNG files are written to.
        """
        try:
            # Materialise the rows once instead of once per iteration.
            for cells in list(sheet.rows)[1:]:
                row = [cell.value for cell in cells]
                if all(value is None for value in row[:4]):
                    # Completely blank row: nothing to process.
                    continue
                customer_id = row[0].strip()
                # NOTE(review): forward slashes become backslashes, which
                # presumably targets Windows paths - confirm before running elsewhere.
                p = row[2].replace("/", "\\")
                abs_path = os.path.join(p, row[1]).strip()
                refs_list.append(Ref(abs_path, customer_id, self._parse_refs(row[3])))
                self.sheet_list.append(customer_id)
        finally:
            wk.close()
        for refs in refs_list:
            self.get_file_shot(refs.source_file, refs.Customer_ID, refs.Ref_list, image_dir)

    def write_picture_to_excel(self, sheet, picture_file, index):
        """Insert ``picture_file`` into column A, ten rows below the previous image.

        Cell rows are 1-based (there is no ``A0``), so image ``index`` lands in
        ``A{index * 10 + 1}``.
        """
        sheet.insert_image('A{}'.format(index * 10 + 1), picture_file)

    def main_shot(self, path):
        """Run the whole export for the result workbook at ``path``.

        The ``Image`` directory and ``picture_export.xlsx`` are created in the
        directory containing ``path``; an existing ``Image`` directory is
        deleted and recreated first.
        """
        refs_list = []
        wk = openpyxl.load_workbook(path)
        # Index access is used because wk.get_sheet_by_name('Result') emits a warning.
        sheet = wk['Result']
        first_column = [cell.value for cell in list(sheet.columns)[0]]
        customer_list = first_column[1:]  # drop the header cell

        # Resolve against the absolute path so a bare file name maps to the
        # current directory instead of the filesystem root.
        excel_dir = os.path.dirname(os.path.abspath(path))
        image_dir = os.path.join(excel_dir, "Image", "")

        if os.path.exists(image_dir):
            shutil.rmtree(image_dir)
            # NOTE(review): the short pauses presumably give the filesystem
            # time to settle after deletion - confirm they are still needed.
            time.sleep(0.2)
            os.mkdir(image_dir)
            print("image_dir is exist")
        else:
            os.mkdir(image_dir)
        time.sleep(0.2)

        self.generate_picture(sheet, refs_list, wk, image_dir)
        self.write_picture(excel_dir, image_dir, customer_list)
if __name__ == '__main__':
    # Script entry point: ``python <script> <path to result workbook>``.
    # Example path: r"D:/Project/e-Statement/estatement_pdf/2021_07_27_estatement/Result_fail_Refs.xlsx"
    import traceback

    start_time = time.time()
    shot = Shot()
    if len(sys.argv) < 2:
        # Missing argument: say so instead of hiding it behind a generic message.
        print("did not execute")
        print("usage: python {} <path to result workbook>".format(sys.argv[0]))
    else:
        ref_file_path = sys.argv[1]
        try:
            shot.main_shot(ref_file_path)
        except Exception:
            # Still best-effort (the elapsed time is printed below), but the
            # actual failure is reported rather than silently swallowed.
            print("did not execute")
            traceback.print_exc()
    end_time = time.time()
    print("cost time: {}".format(end_time - start_time))