tesseract-ocr 文本识别开发指南
简介
Tesseract 是一款开源的光学字符识别(OCR)引擎,最初由惠普(HP)实验室开发,后来由 Google 资助并维护。(有说法称 chat-gpt 底层也使用了 Tesseract,但该说法未见官方资料佐证,仅供参考。)本人在项目中将该引擎与百度的 PaddleOCR 配合使用进行文字识别,Tesseract 在其中的作用是识别文字的倾斜度和旋转角度,如下:
参考资料:
Tesseract 安装、使用、训练模型教程简介
Tesseract 安装与环境变量配置
Linux环境搭建OpenCV运行java-cv代码
tesseract-ocr 的使用
Tesseract java
Python的调用:
上面是环境的搭建和使用,下面是关于Tesseract的Python调用方法,以获取图片的旋转角度为例:
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
print(sys.path)
import uvicorn
import cv2
from pytesseract import Output
import pytesseract
from fastapi import FastAPI, Request, Form, UploadFile, File
from paddleocr import PaddleOCR, PPStructure
import numpy as np
from starlette.responses import FileResponse, StreamingResponse
from fastapi.responses import JSONResponse
import uuid
# FastAPI application object; the route decorators in this file register on it.
ocr_sever = FastAPI()
# Scaling factors handed to rotate_bound():
# 0.95 is used for the OCR pass, 1 for the layout (structure) analysis pass.
scaling_ocr = 0.95
scaling_structure = 1
def rotate_bound(image, angle, scaling):
    """Rotate ``image`` by ``angle`` degrees on a canvas resized to fit it.

    The rotation is performed around the image centre with ``-angle`` passed
    to OpenCV (i.e. clockwise for positive ``angle``), scaled by ``scaling``.
    The output canvas is resized to the bounding box of the rotated image and
    any uncovered area is filled with white.

    Args:
        image: Image array whose first two dimensions are (height, width).
        angle: Rotation angle in degrees.
        scaling: Scale factor forwarded to ``cv2.getRotationMatrix2D``.

    Returns:
        The rotated image as returned by ``cv2.warpAffine``.
    """
    height, width = image.shape[:2]
    center_x, center_y = width / 2, height / 2

    # Build the affine matrix, then read the magnitudes of its rotation
    # components (these already include the scale factor).
    matrix = cv2.getRotationMatrix2D((center_x, center_y), -angle, scaling)
    abs_cos = np.abs(matrix[0, 0])
    abs_sin = np.abs(matrix[0, 1])

    # Size of the bounding box that encloses the rotated image.
    new_width = int((height * abs_sin) + (width * abs_cos))
    new_height = int((height * abs_cos) + (width * abs_sin))

    # Shift the transform so the rotated image is centred on the new canvas.
    matrix[0, 2] += (new_width / 2) - center_x
    matrix[1, 2] += (new_height / 2) - center_y

    # Apply the rotation; pad exposed regions with white.
    return cv2.warpAffine(
        image,
        matrix,
        (new_width, new_height),
        borderValue=(255, 255, 255),
    )
# Whole pipeline in one endpoint: orientation detection, rotation, OCR and
# layout analysis.
@ocr_sever.post("/imgInfos/")
async def img_infos(fileName: str = Form(...)):
    """Run orientation correction, PaddleOCR and PP-Structure on a stored image.

    Args:
        fileName: Name of the image relative to ``./doc/imgs/`` (form field).

    Returns:
        A JSON response ``{"ocr": ..., "structure": ...}`` on success.
        HTTP 400 if ``fileName`` resolves outside ``./doc/imgs/``;
        HTTP 404 if the image cannot be read.
    """
    print("输入文件名为:{}".format(fileName))

    # fileName comes from the client: resolve it and refuse anything that
    # escapes the image directory (e.g. "../../etc/passwd").
    base_dir = os.path.realpath("./doc/imgs/")
    file_path = os.path.realpath("./doc/imgs/" + fileName)
    if os.path.commonpath([base_dir, file_path]) != base_dir:
        return JSONResponse(status_code=400,
                            content={"error": "invalid fileName"})

    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(file_path)
    if image is None:
        return JSONResponse(status_code=404,
                            content={"error": "image not found or unreadable"})

    # OpenCV loads images as BGR; convert to RGB before handing to Tesseract.
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = pytesseract.image_to_osd(rgb, output_type=Output.DICT)

    # Rotate the image by the detected angle, once per downstream consumer
    # because each uses its own scaling factor.
    rotated_ocr = rotate_bound(image, angle=results["rotate"], scaling=scaling_ocr)
    rotated_structure = rotate_bound(image, angle=results["rotate"], scaling=scaling_structure)

    # Text recognition with PaddleOCR.
    # NOTE(review): the engines are constructed on every request; consider
    # creating them once at module level if construction is expensive.
    ocr = PaddleOCR(use_angle_cls=True, lang="ch")
    result = ocr.ocr(rotated_ocr, cls=True)
    print(result)

    # Layout analysis with PP-Structure.
    table_engine = PPStructure(show_log=True, type='structure', image_orientation=True)
    structResult = table_engine(rotated_structure)
    struct = []
    for line in structResult:
        # Drop the 'img' entry before serialising; tolerate its absence.
        line.pop('img', None)
        print(line)
        struct.append(line)

    # Bundle both results into a single JSON payload.
    data = {"ocr": result, "structure": struct}
    return JSONResponse(data)
# Return the orientation information of an uploaded image.
@ocr_sever.post("/imgAngle/")
async def img_angle(file: UploadFile = File(...)):
    """Save an uploaded image under ``./doc/imgs/`` and return Tesseract's OSD result.

    Args:
        file: The uploaded image.

    Returns:
        The dict produced by ``pytesseract.image_to_osd`` on success.
        HTTP 400 if the upload has no usable file name or is not a
        readable image.
    """
    print("imgAngle 输入文件名为:{}".format(file.filename))

    # The file name is client-controlled: keep only its final component so
    # the upload cannot be written outside ./doc/imgs/.
    safe_name = os.path.basename(file.filename or "")
    if safe_name in ("", ".", ".."):
        return JSONResponse(status_code=400,
                            content={"error": "missing or invalid file name"})
    file_path = "./doc/imgs/" + safe_name
    with open(file_path, 'wb') as f:
        f.write(await file.read())

    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(file_path)
    if image is None:
        return JSONResponse(status_code=400,
                            content={"error": "uploaded file is not a readable image"})

    # OpenCV loads images as BGR; convert to RGB before handing to Tesseract.
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = pytesseract.image_to_osd(rgb, output_type=Output.DICT)
    return results
# Return the uploaded image after rotating it by the detected angle.
@ocr_sever.post("/imgRotate/")
async def img_rotate(file: UploadFile = File(...)):
    """Save an uploaded image, rotate it by Tesseract's detected angle and stream it back.

    Args:
        file: The uploaded image.

    Returns:
        A streaming response carrying the rotated image as JPEG.
        HTTP 400 if the upload has no usable file name or is not a
        readable image.
    """
    print("输入文件名为:{}".format(file.filename))

    # The file name is client-controlled: keep only its final component so
    # the upload cannot be written outside ./doc/imgs/.
    safe_name = os.path.basename(file.filename or "")
    if safe_name in ("", ".", ".."):
        return JSONResponse(status_code=400,
                            content={"error": "missing or invalid file name"})
    file_path = "./doc/imgs/" + safe_name
    with open(file_path, 'wb') as f:
        f.write(await file.read())

    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(file_path)
    if image is None:
        return JSONResponse(status_code=400,
                            content={"error": "uploaded file is not a readable image"})

    # OpenCV loads images as BGR; convert to RGB before handing to Tesseract.
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = pytesseract.image_to_osd(rgb, output_type=Output.DICT)

    # Rotate by the detected angle and write the result under a unique name.
    rotated = rotate_bound(image, angle=results["rotate"], scaling=scaling_ocr)
    newFilePath = "./doc/imgs/" + str(uuid.uuid1()) + ".jpg"
    cv2.imwrite(newFilePath, rotated)

    # The file was written with a .jpg extension, so declare it as JPEG.
    response = StreamingResponse(get_file_byte(newFilePath), media_type="image/jpeg")
    return response
def get_file_byte(filename, chunk_size=1024):
    """Yield the raw bytes of ``filename`` in consecutive chunks.

    The file is opened in binary mode, so it may be a regular file or an
    archive; the content is not interpreted.

    Args:
        filename: Path of the file to read.
        chunk_size: Maximum number of bytes per yielded chunk (default 1024).
            Must be non-zero.

    Yields:
        ``bytes`` objects of at most ``chunk_size`` bytes until end of file.

    Raises:
        ValueError: If ``chunk_size`` is 0 (a zero-byte read would look like
            end of file and silently yield nothing).
    """
    if chunk_size == 0:
        raise ValueError("chunk_size must be non-zero")
    with open(filename, "rb") as f:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        yield from iter(lambda: f.read(chunk_size), b"")
# 获取 paddleocr的解析结果 就是原先的接口
if __name__ == "__main__":
    # Script entry point: start the uvicorn server for the ocr_sever app.
    print('开始加载orc')
    host = '0.0.0.0'
    port = 9999
    workers = 1
    # Build the "<module>:<attribute>" import string from this file's own
    # name so the script keeps working after being renamed, instead of
    # requiring a hand-edited module name.
    module_name = os.path.splitext(os.path.basename(__file__))[0]
    uvicorn.run(app='{}:ocr_sever'.format(module_name),
                host=host,
                port=port,
                workers=workers)
其中核心代码其实就一段,其他的是对它的综合应用
pytesseract.image_to_osd(rgb, output_type=Output.DICT)