Python 实现 PDF 表格与文本分别导出至 Excel
现需将 PDF 转换为 Excel。
目前的实现方式:将 PDF 中的表格部分与非表格部分分别导出至 Excel 的两个 sheet 中,步骤如下:
1)识别 PDF 中的表格块
2)将 PDF 转换为 Word 格式
3)提取 Word 中非表格的文本数据
4)对文本中与表格重复的行进行去重
5)合并导出至 Excel 的不同 sheet 页中
# coding=UTF8
import datetime
from docx import Document
from pdf2docx import Converter
import pandas as pd
import numpy as np
import pdfplumber
import os
import fitz
def extractTables(filepath):
    """Collect every table found in a PDF into a single DataFrame.

    Each page is scanned with pdfplumber's ``page.extract_tables()``. Every
    table becomes its own DataFrame and the frames are stacked vertically in
    page order, each one preceded by a small all-NaN frame that acts as a
    visual separator in the exported sheet.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        A DataFrame containing all tables, or an empty DataFrame when the
        PDF contains no table. The row index is not reset, so index labels
        repeat from one table to the next.
    """
    with pdfplumber.open(filepath) as pdf:
        tables_per_page = [page.extract_tables() for page in pdf.pages]

    # Two NaN rows in a single column; inserted before every table.
    separator = pd.DataFrame([np.nan, np.nan])

    # Gather all pieces first and concatenate once: calling pd.concat inside
    # the loop would copy the accumulated frame again for every table.
    frames = []
    for page_tables in tables_per_page:
        for table in page_tables:
            frames.append(separator)
            frames.append(pd.DataFrame(table))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
def extractWord(pdffilepath, wordfilepath):
    """Convert a PDF file into a Word (.docx) file using pdf2docx.

    Args:
        pdffilepath: Path of the source PDF file.
        wordfilepath: Path of the .docx file to write.
    """
    cv = Converter(pdffilepath)
    try:
        cv.convert(wordfilepath)
    finally:
        # Always release the converter, even when the conversion fails.
        cv.close()
def getDocLines(wordfilepath):
    """Read the non-empty paragraph texts of a Word document.

    Only the paragraphs exposed by ``Document(...).paragraphs`` are read.
    Each paragraph text is stripped of surrounding whitespace and blank
    paragraphs are skipped.

    Args:
        wordfilepath: Path of the .docx file to read.

    Returns:
        A single-column DataFrame (column label ``0``) with one row per
        non-empty paragraph. The column exists even when the document has no
        text, so callers can always access ``df[0]``.
    """
    doc = Document(wordfilepath)
    stripped = (paragraph.text.strip() for paragraph in doc.paragraphs)
    lines = [text for text in stripped if text]
    # Naming the column explicitly keeps column 0 present for an empty list.
    return pd.DataFrame(lines, columns=[0])
def txt(tabledf, txtdf):
    """Remove text rows that duplicate content of the table's first column.

    A row of ``txtdf`` is treated as a duplicate when its column-0 string is
    contained in the string form of any non-null value in column 0 of
    ``tabledf``. Only the first table column is consulted.

    Args:
        tabledf: DataFrame of table data; column ``0`` is compared against.
        txtdf: DataFrame of text lines; column ``0`` holds the text.

    Returns:
        ``txtdf`` without the duplicated rows (original index labels are
        kept). ``txtdf`` is returned unchanged when either frame has no
        column ``0``, e.g. when the PDF contained no table.
    """
    if 0 not in tabledf.columns or 0 not in txtdf.columns:
        return txtdf

    # Skip NaN/None cells (separator rows, empty cells): stringifying them
    # would yield 'nan'/'None' and wrongly match text such as "an".
    table_cells = [str(cell) for cell in tabledf[0].dropna()]

    def _in_table(text):
        # Non-string values cannot be substring-matched; keep those rows.
        if not isinstance(text, str):
            return False
        return any(text in cell for cell in table_cells)

    duplicated = txtdf[0].map(_in_table).astype(bool)
    return txtdf[~duplicated]
if __name__ == '__main__':
    # Script entry point: export the tables and the remaining text of one PDF
    # into two sheets of an Excel workbook.
    # Backslashes are escaped explicitly so the Windows paths contain no
    # invalid escape sequences; the resulting values are unchanged.
    filepath = 'D:\\develop_python\\Python_Demo\\PDF_TO_EXCEL\\YM2021\\'
    outpath = 'D:\\develop_python\\Python_Demo\\PDF_TO_EXCEL\\YM2021\\'
    pdffile = '南通-2.21'
    pdffilepath = filepath + pdffile + '.pdf'
    # Generated files are written to the output directory created below.
    excelfilepath = outpath + pdffile + '.xlsx'
    wordfilepath = outpath + pdffile + '.docx'
    try:
        starttime = datetime.datetime.now()
        print('执行开始', starttime)
        os.makedirs(outpath, exist_ok=True)
        print()

        # Step 1: read the table data from the PDF.
        print('正在读取表格数据........')
        tabledf = extractTables(pdffilepath)
        print('表格数据读取完成........')
        print()

        # Step 2: convert the PDF to a Word document.
        print('正在转换Word......')
        extractWord(pdffilepath, wordfilepath)
        print('Word转换完成......')
        print()

        # Step 3: read the text lines and drop those repeated in the tables.
        print('正在解析非表格文本数据......')
        txtdf = getDocLines(wordfilepath)
        df = txt(tabledf, txtdf)
        print('非表格数据解析完成......')
        print()

        # Step 4: write both frames to separate sheets. The context manager
        # saves and closes the workbook (ExcelWriter.save() no longer exists
        # in pandas 2.x).
        print('正在输出Excel文件......')
        with pd.ExcelWriter(excelfilepath) as writer:
            tabledf.to_excel(writer, sheet_name='EXCEL', index=False)
            df.to_excel(writer, sheet_name='txt', index=False)
        print('Excel文件输出成功......')
        print()

        endtime = datetime.datetime.now()
        print('执行结束', endtime)
        print('耗时', endtime - starttime)
    except Exception as e:
        # Top-level boundary: report the actual exception type and arguments.
        print(type(e).__name__, e.args)