# shuang
# 开发时间:2023/12/9 22:03
import pdfplumber
import re
import os
import pandas as pd
import datetime
def re_text(bt, text):
    """Search *text* for the pattern *bt* and return the cleaned-up match.

    Args:
        bt: A regular-expression pattern (string or compiled pattern)
            passed straight to ``re.search``.
        text: The string to search.

    Returns:
        The whole matched substring after it has been passed through
        ``re_block``, or ``None`` when the pattern does not match.
    """
    found = re.search(bt, text)
    # group(0) is the entire matched substring.
    return None if found is None else re_block(found.group(0))
# Single-pass cleanup table used by re_block.  Full-width characters are
# written as escapes so they cannot be silently normalised to their ASCII
# look-alikes by an editor or a copy/paste.
_RE_BLOCK_TABLE = str.maketrans(
    {
        " ": None,        # ASCII space
        "\u3000": None,   # ideographic (full-width) space
        ")": None,        # ASCII closing parenthesis
        "\uff09": None,   # full-width closing parenthesis
        "\uff1a": ":",    # full-width colon -> ASCII colon
    }
)


def re_block(text):
    """Normalise a matched text fragment.

    Removes ASCII and full-width (U+3000) spaces, removes ASCII and
    full-width (U+FF09) closing parentheses, and converts the full-width
    colon (U+FF1A) to an ASCII colon.  Opening parentheses are left
    untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string with the characters above removed or replaced.
    """
    return text.translate(_RE_BLOCK_TABLE)
def get_pdf(dir_path):
    """Recursively collect the paths of the ``.pdf`` files under *dir_path*.

    Args:
        dir_path: Directory to walk (sub-directories included).

    Returns:
        A list of paths, each built by joining the containing directory
        with the file name, in the order ``os.walk`` yields them.  Only
        names ending in lowercase ``.pdf`` are included.
    """
    return [
        os.path.join(folder, entry)
        for folder, _sub_dirs, entries in os.walk(dir_path)
        for entry in entries
        if entry.endswith('.pdf')
    ]
def read(xlsx_path, pdf_root):
    """Extract trip rows from every PDF under *pdf_root* into one Excel file.

    For each PDF found by ``get_pdf`` the first table on the first page is
    read.  The first table row is treated as a header and skipped; for every
    other row the first cell is split on single spaces (newlines inside each
    piece are removed) and the pieces are stored by position in the output
    columns.

    Args:
        xlsx_path: Path of the Excel workbook to write.
        pdf_root: Directory that is searched recursively for ``.pdf`` files.

    Returns:
        None.  The result is written to *xlsx_path*.

    Notes:
        PDFs without pages or without a table on the first page, and rows
        whose first cell is empty or has fewer than eight fields, are
        skipped with a printed message instead of aborting the whole run.
    """
    # Every column is collected separately; the insertion order here is the
    # column order of the resulting sheet.
    all_fields = {
        "序号": [],
        "服务商": [],
        "车型": [],
        "上车日期": [],
        "上车时间": [],
        "城市": [],
        "起点": [],
        "终点": [],
        "金额(元)": [],
    }

    for filename in get_pdf(pdf_root):
        print(f"正在读取:{filename}")
        with pdfplumber.open(filename) as pdf:
            if not pdf.pages:
                print(f"跳过(PDF 没有页面):{filename}")
                continue
            first_page = pdf.pages[0]
            pdf_text = first_page.extract_text()
            print(pdf_text)
            tables = first_page.extract_tables()

        if not tables:
            print(f"跳过(第一页没有表格):{filename}")
            continue

        for row_no, row in enumerate(tables[0]):
            if row_no == 0:
                # Row 0 is the table header.
                continue
            if not row or not row[0]:
                print(f"跳过空行:{filename} 第{row_no}行")
                continue

            curr = [piece.replace('\n', '') for piece in row[0].split(' ')]
            if len(curr) < 8:
                # Indices 0..7 are required below; a shorter row cannot be
                # mapped onto the columns.
                print(f"跳过字段不足的行:{filename} 第{row_no}行 {curr}")
                continue

            all_fields["序号"].append(curr[0])
            all_fields["服务商"].append(curr[1])
            all_fields["车型"].append(curr[2])
            # NOTE(review): field 3 is stored as 上车时间 and field 4 as
            # 上车日期, exactly as before.  If the PDF prints the date before
            # the time these two are swapped — confirm against a sample PDF.
            all_fields["上车时间"].append(curr[3])
            all_fields["上车日期"].append(curr[4])
            all_fields["城市"].append(curr[5])
            all_fields["起点"].append(curr[6])
            all_fields["终点"].append(curr[7])
            # With only eight fields the amount falls back to field 7 (the
            # same value stored as 终点), preserving the original behaviour.
            all_fields["金额(元)"].append(curr[8] if len(curr) > 8 else curr[7])

    df = pd.DataFrame(all_fields)
    # Open the writer only once the data is ready and let the context manager
    # save and close it (public API, unlike the private ``_save``).
    with pd.ExcelWriter(xlsx_path) as writer:
        df.to_excel(writer)
# Script entry: read every PDF under the itinerary folder and write the rows
# to a workbook whose name carries today's date.
pdf_root = r"行程单"
# Local date as YYYY-MM-DD (the date part of the current timestamp).
now = datetime.datetime.now().date().isoformat()
print(now)
xlsx_path = f"行程单{now}.xlsx"
read(xlsx_path, pdf_root)