获取多个 PDF 文件的内容并保存到 Excel 中

发布时间:2024年01月12日

# shuang
# 开发时间:2023/12/9 22:03

import pdfplumber
import re
import os
import pandas as pd
import datetime


def re_text(bt, text):
    """Search *text* for the pattern *bt* and return the cleaned-up match.

    Args:
        bt: Regular expression (string or compiled pattern) passed to
            ``re.search``.
        text: The string to search.

    Returns:
        The whole matched substring after it has been passed through
        ``re_block``, or ``None`` when the pattern does not match.
    """
    match = re.search(bt, text)
    return None if match is None else re_block(match.group(0))


# Single-pass translation table used by re_block. The full-width characters
# are written as escapes so they cannot be silently folded into their ASCII
# look-alikes by an editor or a copy/paste round trip.
_RE_BLOCK_TABLE = str.maketrans({
    ' ': None,         # ASCII space
    '\u3000': None,    # ideographic (full-width) space
    ')': None,         # ASCII closing parenthesis
    '\uff09': None,    # full-width closing parenthesis
    '\uff1a': ':',     # full-width colon -> ASCII colon
})


def re_block(text):
    """Normalise a matched text fragment.

    Removes ASCII and full-width spaces, removes ASCII and full-width
    closing parentheses, and converts the full-width colon to an ASCII
    colon. Opening parentheses are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string with the characters above removed or replaced.
    """
    return text.translate(_RE_BLOCK_TABLE)


def get_pdf(dir_path):
    """Return the paths of all PDF files under *dir_path*, searched recursively.

    The extension check ignores case, so ``.pdf``, ``.PDF`` and ``.Pdf`` all
    match. Directories and file names are visited in sorted order so the
    result is the same on every run.

    Args:
        dir_path: Root directory to search.

    Returns:
        A list of file paths (each one ``os.path.join(root, name)``). The
        list is empty when nothing matches or *dir_path* does not exist.
    """
    pdf_files = []
    for root, sub_dirs, file_names in os.walk(dir_path):
        # Sorting in place steers the order in which os.walk descends.
        sub_dirs.sort()
        pdf_files.extend(
            os.path.join(root, name)
            for name in sorted(file_names)
            if name.lower().endswith('.pdf')
        )
    return pdf_files


def read(xlsx_path, pdf_root):
    """Collect table rows from every PDF under *pdf_root* into one Excel file.

    For each PDF returned by ``get_pdf``, the first table on the first page
    is read. The first table row is treated as a header and skipped. For
    every other row, the text of the first cell is split on spaces and the
    pieces are mapped by position onto the output columns.

    PDFs without pages or tables, rows whose first cell is empty, and rows
    with fewer than eight pieces are skipped with a printed notice, so the
    output columns always stay the same length.

    Args:
        xlsx_path: Path of the Excel workbook to create.
        pdf_root: Directory searched recursively for PDF files.

    Returns:
        None. The collected rows are written to *xlsx_path*.
    """
    # Output columns; the dict's insertion order is the sheet's column order.
    all_fields = {
        "序号": [],
        "服务商": [],
        "车型": [],
        "上车日期": [],
        "上车时间": [],
        "城市": [],
        "起点": [],
        "终点": [],
        "金额(元)": [],
    }

    for filename in get_pdf(pdf_root):
        print(f"正在读取:{filename}")
        with pdfplumber.open(filename) as pdf:
            if not pdf.pages:
                print(f"跳过(没有页面):{filename}")
                continue
            first_page = pdf.pages[0]
            # Echo the raw page text so the user can inspect what was read.
            print(first_page.extract_text())

            tables = first_page.extract_tables()
            if not tables:
                print(f"跳过(未找到表格):{filename}")
                continue

            # Row 0 is the header; data rows start at index 1.
            for row in tables[0][1:]:
                cell = row[0] if row else None
                if not cell:
                    # The first cell may be None or empty; nothing to parse.
                    continue
                curr = [piece.replace('\n', '') for piece in cell.split(' ')]
                if len(curr) < 8:
                    # Too few pieces to fill every column; skipping keeps
                    # all column lists aligned.
                    print(f"跳过(字段不足):{curr}")
                    continue

                all_fields["序号"].append(curr[0])
                all_fields["服务商"].append(curr[1])
                all_fields["车型"].append(curr[2])
                # NOTE(review): piece 3 feeds 上车时间 and piece 4 feeds
                # 上车日期, exactly as in the original code - confirm this
                # order against a real PDF.
                all_fields["上车时间"].append(curr[3])
                all_fields["上车日期"].append(curr[4])
                all_fields["城市"].append(curr[5])
                all_fields["起点"].append(curr[6])
                all_fields["终点"].append(curr[7])
                # NOTE(review): with exactly eight pieces the last one is
                # used for both 终点 and 金额(元); kept from the original.
                all_fields["金额(元)"].append(curr[8] if len(curr) > 8 else curr[7])

    df = pd.DataFrame(all_fields)
    # Open the workbook only once all data is ready, so a parsing failure
    # cannot leave a truncated file behind; the with-block saves and closes.
    with pd.ExcelWriter(xlsx_path) as writer:
        df.to_excel(writer)



# Directory holding the PDFs to read (a relative path).
pdf_root = "行程单"
# Today's date as YYYY-MM-DD, used to stamp the output file name.
now = datetime.datetime.now().date().isoformat()
print(now)
xlsx_path = "行程单" + now + ".xlsx"

# Parse every PDF under pdf_root and write the combined sheet.
read(xlsx_path, pdf_root)

文章来源:https://blog.csdn.net/weixin_64974855/article/details/135561842
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。