pdf文件中的表格无损提取方案(pdf转Excel),非OCR
非OCR方案,基于java:
aspose 21.11版本(网上有破解方法,或者参考我另外一篇文章)
转换pdf(含表格)为excel文件,然后可以使用poi对得到的excel文件进行微调。
但是上述方案无法处理pdf表格中存在较多横向、纵向合并单元格的情况,例如下图这种pdf中的复杂表格。
网上找到github上一位大拿的方法,使用python,对表格数据进行了识别,识别边框,单元格,然后重新构造出原始的表格内容,包括合并单元格的信息(这也会导致表格的样式,尤其是列宽和行高,并不能完全跟原表格保持一致,这里重点关注单元格和单元格的数据)
识别算法会先恢复出原始(即合并单元格之前)的单元格网格,识别结果的示意图如下:
最后会构造合并单元格,然后这里的例子是输出为图片的,若是输出为excel或其他的表格格式的数据,还需要做处理,这里未给出具体实现代码。但是就这个能够完全还原出原来的复杂表格的单元格和数据,就感觉已经非常NB了。
python实现的,单元格合并识别算法参考,输出图片格式
Handling merged cells (possible solution) · Issue #84 · jsvine/pdfplumber · GitHub
识别算法图示:
使用如下的字符来标记每个顶点的状态:它处于单元格边框的哪个位置(上下左右四个角),以及它与临近单元格的关系(向哪个方向延展可以得到下一个单元格)。通过定义这种数据结构,实现对表格结构的定义和存储。
上述定义的,不同数据的图示
主要代码:
https://github.com/shuratn/py_pdf_stm/blob/master/TableExtractor.py
import math
from operator import itemgetter
import pdfplumber
from PIL import ImageDraw, ImageFont, Image
from pdfplumber.table import TableFinder
from DataSheetParsers.DataSheet import *
def almost_equals(num1, num2, precision=5.0):
    """Tell whether two numbers differ by strictly less than ``precision``.

    :param num1: first value.
    :param num2: second value.
    :param precision: exclusive upper bound on the absolute difference.
    :return: ``True`` when ``|num1 - num2| < precision``.
    """
    gap = abs(num1 - num2)
    return gap < precision
class Point:
    """A table vertex: integer coordinates plus four edge-direction flags.

    The coordinates are rounded up with ``math.ceil`` on construction.
    ``up``/``down``/``left``/``right`` record whether an edge leaves the
    vertex in that direction (``y`` grows downwards, see ``is_above``).
    ``Line.__init__`` sets these flags on the endpoints it is given.

    Equality is tolerant (``almost_equals`` on both axes) while ``__hash__``
    uses the exact coordinates, so two points that compare equal may still
    hash differently; ``set``-based de-duplication therefore only merges
    points whose coordinates are identical.
    """

    r = 4          # marker diameter used by draw()
    hr = r / 2     # marker radius
    tail = 5       # length of the direction ticks drawn by draw()

    # Glyph for every (up, down, right, left) flag combination.
    _SYMBOLS = {
        (False, False, False, False): '◦',
        (True, False, False, False): '↑',
        (False, True, False, False): '↓',
        (True, True, False, False): '↕',
        (True, True, True, False): '⊢',
        (True, True, False, True): '⊣',
        (False, False, True, False): '→',
        (False, False, False, True): '←',
        (False, False, True, True): '↔',
        (True, False, True, True): '⊥',
        (False, True, True, True): '⊤',
        (True, True, True, True): '╋',
        (True, False, True, False): '┗',
        (True, False, False, True): '┛',
        (False, True, True, False): '┏',
        # down + left is the top-right corner glyph; it used to duplicate '┛'.
        (False, True, False, True): '┓',
    }

    def __init__(self, *xy):
        """Accept either ``Point(x, y)`` or ``Point((x, y))``."""
        if len(xy) == 1:
            xy = xy[0]
        self.x, self.y = xy
        self.x = math.ceil(self.x)
        self.y = math.ceil(self.y)
        self.down = False
        self.up = False
        self.left = False
        self.right = False

    @property
    def symbol(self):
        """Return the box-drawing glyph matching the current direction flags."""
        return self._SYMBOLS[(self.up, self.down, self.right, self.left)]

    def __repr__(self):
        return "Point<X:{} Y:{}>".format(self.x, self.y)

    def distance(self, other: 'Point'):
        """Return the Euclidean distance to ``other``."""
        return math.sqrt(((self.x - other.x) ** 2) + ((self.y - other.y) ** 2))

    @property
    def as_tuple(self):
        """Return the coordinates as an ``(x, y)`` tuple."""
        return self.x, self.y

    def draw(self, canvas: 'ImageDraw.ImageDraw', color='red'):
        """Draw the point as a filled dot with a blue tick per set direction flag."""
        canvas.ellipse((self.x - self.hr, self.y - self.hr, self.x + self.hr, self.y + self.hr), fill=color)
        if self.down:
            canvas.line(((self.x, self.y), (self.x, self.y + self.tail)), 'blue')
        if self.up:
            canvas.line(((self.x, self.y), (self.x, self.y - self.tail)), 'blue')
        if self.left:
            canvas.line(((self.x, self.y), (self.x - self.tail, self.y)), 'blue')
        if self.right:
            canvas.line(((self.x, self.y), (self.x + self.tail, self.y)), 'blue')

    def points_to_right(self, other_points: List['Point']):
        """Return the points on (almost) the same ``y`` with a larger ``x``, ordered by ``x``."""
        by_x = sorted(other_points, key=lambda other: other.x)
        return [o for o in by_x
                if almost_equals(o.y, self.y) and o != self and o.x > self.x]

    def points_below(self, other_points: List['Point']):
        """Return the points on (almost) the same ``x`` with a larger ``y``, ordered by ``y``."""
        by_y = sorted(other_points, key=lambda other: other.y)
        return [o for o in by_y
                if almost_equals(o.x, self.x) and o != self and o.y > self.y]

    def on_same_line(self, other: 'Point'):
        """Return True when ``other`` is a different point sharing (almost) the same x or y."""
        if self == other:
            return False
        return almost_equals(self.x, other.x) or almost_equals(self.y, other.y)

    def is_above(self, other: 'Point'):
        return self.y < other.y

    def is_to_right(self, other: 'Point'):
        return self.x > other.x

    def is_below(self, other: 'Point'):
        return self.y > other.y

    def is_to_left(self, other: 'Point'):
        return self.x < other.x

    def get_right(self, others: List['Point']):
        """Return the nearest point to the right whose ``down`` flag is set, or None."""
        for point in self.points_to_right(others):
            if point.down:
                return point
        return None

    def get_bottom(self, others: List['Point'], left=False, right=False):
        """Return the nearest point below whose ``up`` flag is set, or None.

        With ``left=True`` the candidate must also have its ``right`` flag set;
        with ``right=True`` it must also have its ``left`` flag set.
        """
        for point in self.points_below(others):
            if not point.up:
                continue
            if left and not point.right:
                continue
            if right and not point.left:
                continue
            return point
        return None

    def has_above(self, others: List['Point']):
        """Return True when the top-most ``up``-flagged point of ``others`` is above this one.

        Returns False when no point in ``others`` has its ``up`` flag set
        (this used to raise ``IndexError``).
        """
        candidates = [p for p in others if p.up]
        if not candidates:
            return False
        topmost = min(candidates, key=lambda p: p.y)
        return topmost.is_above(self)

    def copy(self, other: 'Point'):
        """Overwrite this point's direction flags with those of ``other``."""
        self.down = other.down
        self.up = other.up
        self.left = other.left
        self.right = other.right

    def merge(self, other: 'Point'):
        """OR the direction flags of ``other`` into this point."""
        self.up |= other.up
        self.down |= other.down
        self.left |= other.left
        self.right |= other.right

    def __eq__(self, other: 'Point'):
        # Comparing with a foreign type is "not equal", not an AttributeError.
        if not isinstance(other, Point):
            return NotImplemented
        if not almost_equals(self.x, other.x):
            return False
        return almost_equals(self.y, other.y)

    def __hash__(self):
        return hash((self.x, self.y))
class Line:
def __init__(self, p1: 'Point', p2: 'Point'):
self.p1 = p1
self.p2 = p2
self.vertical = almost_equals(self.x, self.cx)
if self.vertical:
if self.p1.is_above(self.p2):
pass
else:
self.p1, self.p2 = self.p2, self.p1
else:
if self.p2.is_to_right(self.p1):
pass
else:
self.p1, self.p2 = self.p2, self.p1
if self.vertical:
self.p1.down = True
self.p2.up = True
else:
self.p1.right = True
self.p2.left = True
def __hash__(self):
return hash((self.p1, self.p2, self.vertical))
@property
def x(self):
return self.p1.x
@property
def y(self):
return self.p1.y
@property
def cx(self):
return self.p2.x
@property
def cy(self):
return self.p2.y
@property
def length(self):
return self.p1.distance(self.p2)
def __repr__(self):
return 'Line<p1:{} p2:{} {}>'.format(self.p1, self.p2, 'vertical' if self.vertical else 'horizontal')
def draw(self, canvas: ImageDraw.ImageDraw, color='blue'):
x, y = self.x, self.y
cx, cy = self.cx, self.cy
canvas.line(((x, y), (cx, cy)), color, width=2)
@property
def as_tuple(self):
return (self.x, self.y), (self.cx, self.cy)
def infite_intersect(self, other: 'Line'):
line1 = self.as_tuple
line2 = other.as_tuple
x_diff = (line1[0][0] - line1[1][0], line2[0][0] - line2[1][0])
y_diff = (line1[0][1] - line1[1][1], line2[0][1] - line2[1][1]) # Typo was here
def det(point_a, point_b):
return point_a[0] * point_b[1] - point_a[1] * point_b[0]
div = det(x_diff, y_diff)
if div == 0:
return None, None
d = (det(*line1), det(*line2))
x = det(d, x_diff) / div
y = det(d, y_diff) / div
return x, y
def intersect(self, other: 'Line', print_fulness=False) -> bool:
""" this returns the intersection of Line(pt1,pt2) and Line(ptA,ptB)
returns a tuple: (xi, yi, valid, r, s), where
(xi, yi) is the intersection
r is the scalar multiple such that (xi,yi) = pt1 + r*(pt2-pt1)
s is the scalar multiple such that (xi,yi) = pt1 + s*(ptB-ptA)
valid == 0 if there are 0 or inf. intersections (invalid)
valid == 1 if it has a unique intersection ON the segment """
point_1 = self.x, self.y
point_2 = self.cx, self.cy
point_a = other.x, other.y
point_b = other.cx, other.cy
if self.vertical:
if self.y > self.cy:
if self.y >= other.y >= self.cy:
pass
else:
return False
else:
if other.y > other.cy:
if other.y >= self.y >= other.cy:
pass
else:
return False
det_tolerance = 0.0001
x1, y1 = point_1
x2, y2 = point_2
dx1 = x2 - x1
dy1 = y2 - y1
x, y = point_a
xb, yb = point_b
dx = xb - x
dy = yb - y
det = (-dx1 * dy + dy1 * dx)
if math.fabs(det) < det_tolerance:
return False
det_inv = 1.0 / det
r = det_inv * (-dy * (x - x1) + dx * (y - y1))
s = det_inv * (-dy1 * (x - x1) + dx1 * (y - y1))
if print_fulness:
print('self segment', r)
print('other segment', s)
if r > 1 or s > 1: # can't be higher than 1, 1 means they are NOT intersecting
return False
if r > -0.1 and s > -0.1: # This can happen on edges, so we allow small inaccuracy
return True
return False
def intersection(self, other: 'Line', print_fulness=False) -> (int, int):
""" this returns the intersection of Line(pt1,pt2) and Line(ptA,ptB)
returns a tuple: (xi, yi, valid, r, s), where
(xi, yi) is the intersection
r is the scalar multiple such that (xi,yi) = pt1 + r*(pt2-pt1)
s is the scalar multiple such that (xi,yi) = pt1 + s*(ptB-ptA)
valid == 0 if there are 0 or inf. intersections (invalid)
valid == 1 if it has a unique intersection ON the segment """
point_1 = self.x, self.y
point_2 = self.cx, self.cy
point_a = other.x, other.y
point_b = other.cx, other.cy
det_tolerance = 1
x1, y1 = point_1
x2, y2 = point_2
dx1 = x2 - x1
dy1 = y2 - y1
x, y = point_a
xb, yb = point_b
dx = xb - x
dy = yb - y
det = (-dx1 * dy + dy1 * dx)
if math.fabs(det) < det_tolerance:
return None, None
det_inv = 1.0 / det
r = det_inv * (-dy * (x - x1) + dx * (y - y1))
s = det_inv * (-dy1 * (x - x1) + dx1 * (y - y1))
xi = (x1 + r * dx1 + x + s * dx) / 2.0
yi = (y1 + r * dy1 + y + s * dy) / 2.0
if print_fulness:
print('self segment', r)
print('other segment', s)
return (round(xi), round(yi)), round(r, 4), round(s, 4)
def is_between(self, point: 'Point'):
pt1 = self.p1
pt2 = self.p2
cross_product = (point.y - pt1.y) * (pt2.x - pt1.x) - (point.x - pt1.x) * (pt2.y - pt1.y)
# compare versus epsilon for floating point values, or != 0 if using integers
if abs(cross_product) > math.e:
return False
dot_product = (point.x - pt1.x) * (pt2.x - pt1.x) + (point.y - pt1.y) * (pt2.y - pt1.y)
if dot_product < 0:
return False
squared_length_ba = (pt2.x - pt1.x) * (pt2.x - pt1.x) + (pt2.y - pt1.y) * (pt2.y - pt1.y)
if dot_product > squared_length_ba:
return False
return True
def on_line(self, point: 'Point'):
if self.vertical:
if almost_equals(self.p1.x, point.x):
return True
else:
if almost_equals(self.p1.y, point.y):
return True
return False
def __contains__(self, other: {'Line', 'Point'}):
if type(other) == Line:
if self.vertical == other.vertical:
return False
return self.intersect(other)
if type(other) == Point:
return self.is_between(other)
pass
def on_same_line(self, other: 'Line'):
if other.vertical != self.vertical:
return False
if self.vertical:
return self.x == other.x
else:
return self.y == other.y
def __eq__(self, other: 'Line'):
return self.on_same_line(other)
def corner(self, other: 'Line'):
if self.p1 == other.p1 or self.p2 == other.p2 or self.p1 == other.p2:
return True
return False
def connected(self, other: 'Line'):
return other.p1 in self or other.p2 in self
def parallel(self, other: 'Line'):
return self.vertical == other.vertical
def on_corners(self, other: 'Point'):
return other == self.p1 or other == self.p2
def test_intersection(self, other: 'Line'):
""" prints out a test for checking by hand... """
print('Testing intersection of:')
print('\t', self)
print('\t', other)
result = self.intersection(other, True)
print("\t Intersection result =", Point(result[0]))
print()
class Cell:
    """A rectangular table cell described by its four corner points.

    P1-------P2
    |         |
    |         |
    |         |
    |         |
    P4-------P3
    """

    # Label font used by draw().  Fall back to Pillow's built-in font when
    # Arial cannot be loaded; only the failures truetype() is expected to
    # raise are caught instead of swallowing everything.
    try:
        font = ImageFont.truetype('arial', size=9)
    except (OSError, ImportError):
        font = ImageFont.load_default()

    def __init__(self, p1, p2, p3, p4):
        self.p1: Point = p1
        self.p2: Point = p2
        self.p3: Point = p3
        self.p4: Point = p4
        self.text = ''
        # Word records assigned by the owner; get_text() reads their 'text' key.
        self.words = []

    def __repr__(self):
        return 'Cell <"{}"> '.format(self.text.replace('\n', ' '))

    def get_text(self):
        """Concatenate the ``'text'`` entries of ``self.words`` without a separator."""
        return ''.join(map(itemgetter('text'), self.words))

    @property
    def clean_text(self) -> str:
        """``self.text`` with newlines replaced by spaces."""
        return self.text.replace('\n', ' ')

    def __hash__(self):
        # NOTE: depends on the mutable ``text``; the hash changes when text does.
        return hash(self.text) + hash(self.as_tuple)

    def on_same_line(self, other: 'Cell'):
        """Delegate to ``Point.on_same_line`` on the two P1 corners."""
        return self.p1.on_same_line(other.p1)

    def on_same_row(self, other: 'Cell'):
        """Return True when both P1 corners have exactly the same ``y``."""
        return self.p1.y == other.p1.y

    @property
    def as_tuple(self):
        """Return the four corners as ``(x, y)`` tuples in P1..P4 order."""
        return self.p1.as_tuple, self.p2.as_tuple, self.p3.as_tuple, self.p4.as_tuple

    def __eq__(self, other: 'Cell'):
        """Cells are equal when their corners match under any of the four rotations.

        Always returns a bool for Cell operands (the no-match case used to
        fall through and return ``None``).
        """
        if not isinstance(other, Cell):
            return NotImplemented
        mine = (self.p1, self.p2, self.p3, self.p4)
        theirs = (other.p1, other.p2, other.p3, other.p4)
        for shift in range(4):
            rotated = theirs[shift:] + theirs[:shift]
            if all(a == b for a, b in zip(mine, rotated)):
                return True
        return False

    @property
    def center(self):
        """Return the centroid of the four corners as a new ``Point``."""
        corners = [self.p1, self.p2, self.p3, self.p4]
        x = [p.x for p in corners]
        y = [p.y for p in corners]
        return Point(sum(x) / 4, sum(y) / 4)

    def draw(self, canvas: 'ImageDraw.ImageDraw', color='black', width=1, text_color='black'):
        """Outline the cell on ``canvas`` and, if set, write its text near P1."""
        canvas.line((self.p1.as_tuple, self.p2.as_tuple), color, width)
        canvas.line((self.p2.as_tuple, self.p3.as_tuple), color, width)
        canvas.line((self.p3.as_tuple, self.p4.as_tuple), color, width)
        canvas.line((self.p4.as_tuple, self.p1.as_tuple), color, width)
        if self.text:
            canvas.text((self.p1.x + 3, self.p1.y + 3), self.text, fill=text_color, font=self.font)

    def print_cell(self):
        """Print the cell text framed with box-drawing characters."""
        text_lines = self.text.split('\n')
        longest = max(len(text_line) for text_line in text_lines)
        border = '┼' + "─" * longest + '┼\n'
        body = ''.join("│" + text_line + ' ' * (longest - len(text_line)) + "│\n"
                       for text_line in text_lines)
        print(border + body + border)

    def point_inside_polygon(self, point: 'Point', include_edges=True):
        """Return True when ``point`` lies strictly inside the P1-P3 bounding box.

        This is a plain axis-aligned box test, not a general polygon test.
        ``include_edges`` is accepted for compatibility but is currently
        ignored: points on the border are always reported as outside.
        """
        x, y = point.as_tuple
        x1, y1 = self.p1.as_tuple
        x2, y2 = self.p3.as_tuple
        return x1 < x < x2 and y1 < y < y2
class Table:
    """Maps a regular skeleton grid onto the (possibly merged) cells of a table.

    :param cells: the real cells of the table (merged cells appear once).
    :param skeleton: regular grid of cells, as a list of rows.
    :param ugly_table: text grid, zipped row-by-row with ``skeleton``.
    :param words: word records; ``build_table`` reads their ``'x0'`` and
        ``'top'`` keys.
    :param canvas: optional drawing canvas; when given, ``build_table`` draws
        every cell on it.

    After ``build_table`` runs, ``global_map[row][col]`` holds the real cell
    covering that grid position.
    """

    def __init__(self, cells: 'List[Cell]', skeleton: 'List[List[Cell]]', ugly_table: 'List[List[str]]', words,
                 canvas=None):
        self.cells = cells
        self.canvas = canvas
        self.words = words
        self.skeleton = skeleton
        self.ugly_table = ugly_table
        self.global_map = {}

    def build_table(self):
        """Fill ``global_map``, accumulate cell text and attach words to cells."""
        for y, (text_row, skeleton_row) in enumerate(zip(self.ugly_table, self.skeleton)):
            self.global_map[y] = {}
            for x, (text, cell) in enumerate(zip(text_row, skeleton_row)):
                center = cell.center  # invariant for the inner loop
                for t_cell in self.cells:
                    if t_cell.point_inside_polygon(center):
                        t_cell.text += text if text else ''
                        self.global_map[y][x] = t_cell
        processed_cells = []
        for cell in tqdm(self.cells, desc='Analyzing cells', unit='cells'):
            if cell in processed_cells:
                continue
            # A word belongs to the cell that contains its top-left corner.
            cell.words = [char for char in self.words
                          if cell.point_inside_polygon(Point(char['x0'], char['top']))]
            processed_cells.append(cell)
        if self.canvas:
            for cell in self.cells:
                cell.draw(self.canvas)

    def get_col(self, col_id) -> 'List[Cell]':
        """Return the cell at ``col_id`` from every row of ``global_map``."""
        return [row[col_id] for row in self.global_map.values()]

    def get_row(self, row_id) -> 'List[Cell]':
        """Return the cells of row ``row_id`` in column order of insertion."""
        return list(self.global_map[row_id].values())

    def get_cell(self, x, y) -> 'Cell':
        """Return the cell at column ``x``, row ``y``."""
        return self.global_map[y][x]

    def get_cell_span(self, cell):
        """Return ``(row_span, col_span)`` of ``cell`` within ``global_map``.

        ``row_span`` is the number of rows containing the cell and
        ``col_span`` the number of columns it occupies in the first such row.
        Returns ``(0, 0)`` when the cell does not occur in the map (this used
        to raise ``IndexError``).
        """
        hits = {}
        for row_id, row in self.global_map.items():
            for col_id, t_cell in row.items():
                if t_cell == cell:
                    hits.setdefault(row_id, {})[col_id] = True
        if not hits:
            return 0, 0
        row_span = len(hits)
        col_span = len(next(iter(hits.values())))
        return row_span, col_span
class TableExtractor:
    """Extracts tables, including merged cells, from the pages of a PDF.

    :param path: passed to ``pdfplumber.open``.

    Set ``draw`` to save debug PNG images while parsing and ``debug`` to
    print progress messages.  The instance can be used as a context manager
    so the underlying PDF is closed when done.
    """

    def __init__(self, path):
        self.pdf = pdfplumber.open(path)
        self.draw = False
        self.debug = False

    def close(self):
        """Close the underlying PDF."""
        self.pdf.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    @staticmethod
    def filter_lines(lines: 'List[Line]'):
        """Drop lines that compare equal to one already kept (see ``Line.__eq__``)."""
        new_lines = []
        for line1 in tqdm(list(set(lines)), desc='Filtering lines', unit='lines'):
            if line1 in new_lines:
                continue
            new_lines.append(line1)
        return list(set(new_lines))

    @staticmethod
    def add_skeleton_points(points, line):
        """Append both endpoints of ``line`` to ``points``."""
        points.append(line.p1)
        points.append(line.p2)

    def build_skeleton(self, lines):
        """Build the regular grid (skeleton) spanned by ``lines``.

        Every vertical line is intersected with every horizontal line (as
        infinite lines) to obtain the grid points, and a cell is then formed
        from each point that has a right neighbour and bottom neighbours.

        :return: ``(skeleton_points, skeleton)`` -- the de-duplicated grid
            points and the list of grid cells.
        """
        skeleton_points = []
        skeleton = []
        # Template with every direction set; copied onto the grid points below.
        temp_point = Point(0, 0)
        temp_point.down = temp_point.up = temp_point.left = temp_point.right = True
        vertical = [line for line in lines if line.vertical]
        horizontal = [line for line in lines if not line.vertical]
        for line1 in tqdm(vertical, desc='Building table skeleton', unit='lines'):
            sys.stdout.flush()
            if line1.length < 3.0:
                continue
            self.add_skeleton_points(skeleton_points, line1)
            for line2 in horizontal:
                if line1 == line2:
                    continue
                self.add_skeleton_points(skeleton_points, line2)
                # infite_intersect() signals "no crossing" with (None, None),
                # which is truthy -- test the components, and compute it once.
                ix, iy = line1.infite_intersect(line2)
                if ix is None or iy is None:
                    continue
                p1 = Point(ix, iy)
                if p1 not in skeleton_points:
                    skeleton_points.append(p1)
                for n, p in enumerate(skeleton_points):
                    skeleton_points[n].copy(temp_point)
                    if p == p1:
                        p1.copy(p)
                        skeleton_points[n] = p1
        skeleton_points = list(set(skeleton_points))
        sorted_y_points = sorted(skeleton_points, key=lambda other: other.y)
        for p1 in tqdm(sorted_y_points, desc='Building skeleton cells', unit='point'):
            p2 = p1.get_right(skeleton_points)
            if not p2:
                continue
            p3 = p2.get_bottom(skeleton_points, right=True)
            p4 = p1.get_bottom(skeleton_points, left=True)
            if not (p3 and p4):
                continue
            cell = Cell(p1, p2, p3, p4)
            if cell not in skeleton:
                skeleton.append(cell)
        return skeleton_points, skeleton

    @staticmethod
    def skeleton_to_2d_table(skeleton: 'List[Cell]') -> 'List[List[Cell]]':
        """Arrange skeleton cells into rows.

        Cells whose P1 corners share exactly the same ``y`` form one row;
        rows are ordered by that ``y`` and cells within a row by ``p1.x``.
        """
        rows_by_y = {}
        for cell in tqdm(skeleton, desc='Analyzing cell positions', unit='cells'):
            rows_by_y.setdefault(cell.p1.y, []).append(cell)
        return [sorted(rows_by_y[y], key=lambda c: c.p1.x) for y in sorted(rows_by_y)]

    def parse_page(self, page_n):
        """Extract every table found on page index ``page_n``.

        :return: a list of built ``Table`` objects.  The list is empty when
            more than five tables are detected on the page; tables for which
            no skeleton points could be built are skipped.
        """
        if self.debug:
            print('Parsing page', page_n)
        page = self.pdf.pages[page_n]
        if self.debug:
            print('Rendering page')
        if self.debug:
            print('Finding tables')
        tables = TableFinder(page, {'snap_tolerance': 3, 'join_tolerance': 3})
        if self.debug:
            print('Found', len(tables.tables), 'tables')
        beaut_tables = []
        if self.draw:
            p_im = page.to_image(resolution=100)
            p_im.draw_lines(page.lines)
            p_im.save('page-{}-lines.png'.format(page_n + 1))
        if len(tables.tables) > 5:
            return []
        for n, table in enumerate(tables.tables):
            if self.draw:
                p_im.reset()
                im = Image.new('RGB', (page.width, page.height), (255,) * 3)
                canvas = ImageDraw.ImageDraw(im)
            ugly_table = table.extract()
            lines = []
            cells = []
            for cell in tqdm(table.cells, desc='Parsing cells', unit='cells'):
                # Turn the (x1, y1, x2, y2) box into four flagged corners,
                # four border lines and one Cell.
                x1, y1, x2, y2 = cell
                p1 = Point(x1, y1)
                p1.right = True
                p1.down = True
                p2 = Point(x2, y1)
                p2.left = True
                p2.down = True
                p3 = Point(x2, y2)
                p3.up = True
                p3.left = True
                p4 = Point(x1, y2)
                p4.up = True
                p4.right = True
                lines.append(Line(p1, p2))
                lines.append(Line(p2, p3))
                lines.append(Line(p3, p4))
                lines.append(Line(p4, p1))
                cells.append(Cell(p1, p2, p3, p4))
            lines = self.filter_lines(lines)
            if self.draw:
                p_im.save('page-{}-{}_im.png'.format(page_n + 1, n))
                im.save('page-{}-{}.png'.format(page_n + 1, n))
            skeleton_points, skeleton = self.build_skeleton(lines.copy())
            if not skeleton_points:
                continue
            skeleton = self.skeleton_to_2d_table(skeleton)
            beaut_table = Table(cells, skeleton, ugly_table, page.extract_words())
            beaut_table.build_table()
            if self.draw:
                for cell in beaut_table.cells:
                    cell.draw(canvas)
            if self.debug:
                print('Saving rendered table')
            if self.draw:
                p_im.save('page-{}-{}_im.png'.format(page_n + 1, n))
                im.save('page-{}-{}.png'.format(page_n + 1, n))
            if self.draw:
                # Wipe the canvas, then render the skeleton grid with
                # "row-col" labels (this overwrites the skeleton cells' text).
                canvas.rectangle((0, 0, page.width, page.height), fill='white')
                for row_id, row in enumerate(skeleton):
                    for cell_id, cell in enumerate(row):
                        cell.text = '{}-{}'.format(row_id, cell_id)
                        cell.draw(canvas, color='green', text_color='red')
                im.save('page-{}-{}-skeleton.png'.format(page_n + 1, n))
            beaut_tables.append(beaut_table)
        return beaut_tables
# def pdfplumber_table_to_table():
if __name__ == '__main__':
    # Manual smoke run against a local sample datasheet: parse a single page
    # with debug output and image dumps enabled, then print the tables found.
    pdf_interpreter = TableExtractor(r"D:\PYTHON\py_pdf_stm\datasheets\STM32F\stm32f777.pdf")
    pdf_interpreter.debug = True
    pdf_interpreter.draw = True
    tables = pdf_interpreter.parse_page(16)
    print(tables)