PaddleOCR 截图自动文字识别
春节假期在家无聊,撸了三个小工具:PC截图+编辑/PC录屏(用于meeting录屏)/PC截屏文字识别。因为感觉这三个小工具是工作中常常需要用到的,github上也有很多开源的,不过总有点或多或少的小问题,不利于自己的使用。脚本的编写尽量减少对三方库的使用。
已全部完成,这是其中的一个,后续会将三个工具集成在一个工具中。
import tkinter as tk
from tkinter import ttk, messagebox, font
from PIL import Image, ImageTk, ImageGrab
import sys
import logging
import tempfile
import threading
from pathlib import Path
import ctypes # 导入 ctypes 库
import logging.handlers # 用于日志轮转
# Minimise the console window that hosts the script (Windows only).
def minimize_console():
    """Minimise the attached console window, if there is one.

    ``ctypes.windll`` only exists on Windows, so on every other platform
    this function is a no-op instead of raising ``AttributeError``.
    It is also a no-op when the process has no console window.
    """
    windll = getattr(ctypes, "windll", None)
    if windll is None:
        # Not running on Windows: there is no Win32 console to minimise.
        return
    console_hwnd = windll.kernel32.GetConsoleWindow()
    if console_hwnd:
        # 6 is the Win32 SW_MINIMIZE show command.
        windll.user32.ShowWindow(console_hwnd, 6)


minimize_console()  # run once at start-up
# Locate the folder that contains this script.
def get_script_directory():
    """Return the directory holding this source file as a ``Path``."""
    script_file = Path(__file__)
    return script_file.parent
# Configure the log file location, level and rotation.
log_file_path = get_script_directory() / 'ocr_errors.log'
# Use ONE rotating handler for the file. Attaching both a plain FileHandler
# (via basicConfig(filename=...)) and a RotatingFileHandler to the root logger
# writes every record twice, and the second open handle can prevent the
# rotation rename on Windows.
handler = logging.handlers.RotatingFileHandler(
    log_file_path,
    maxBytes=1024 * 1024 * 5,  # rotate after 5 MiB
    backupCount=3,
    encoding='utf-8',  # log messages contain Chinese text
)
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[handler],  # basicConfig applies the format to this handler
)
logger = logging.getLogger()
# Persist an in-memory image to a temporary file on disk.
def save_temp_image(image, suffix='.png'):
    """Save *image* to a new temporary file and return its path.

    Args:
        image: Any object exposing ``save(filename)`` (e.g. a PIL image).
        suffix: File-name suffix for the temporary file.

    Returns:
        ``Path`` of the file that was written. The caller is responsible
        for deleting it.

    Raises:
        Whatever ``image.save`` raises; the temporary file is removed first
        so a failed save does not leave a stray file behind.
    """
    # Reserve a unique name, then close the handle before writing so the
    # image is never saved into a file that is still open elsewhere.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_path = Path(temp_file.name)
    try:
        image.save(str(temp_path))
    except Exception:
        if temp_path.exists():
            temp_path.unlink()
        raise
    return temp_path
class OCRApp:
    """Screenshot OCR tool: pick a screen region, recognise its text, show it.

    Flow: a translucent fullscreen overlay lets the user drag a rectangle;
    the region is grabbed with ``ImageGrab``; the PaddleOCR model (loaded in
    a background thread) recognises the text; the result is shown in the
    main window together with the screenshot.
    """

    # Interval (ms) between checks on the background model load.
    MODEL_POLL_MS = 100
    # Selections narrower or shorter than this many pixels are rejected.
    MIN_SELECTION_PX = 10

    def __init__(self):
        """Create the hidden root window, start model loading, open the overlay."""
        try:
            self.root = tk.Tk()
            self.root.withdraw()
            self.screenshot = None
            self.ocr_model = None  # set by the background loader thread
            self.ocr_load_error = None  # set instead when loading fails
            self.recognized_text = ""
            self.main_frame = None
            self.selection_win = None
            self.load_win = None
            # Load the OCR model in the background so the user can start
            # selecting a region immediately after launch.
            self._start_model_loading()
            # Enter screenshot selection right away.
            self.start_selection()
        except Exception as e:
            self.show_crash_message(f"程序启动失败: {str(e)}")
            sys.exit(1)

    def _start_model_loading(self):
        """Start (or restart) the daemon thread that loads the OCR model."""
        self.ocr_load_error = None
        threading.Thread(target=self.load_ocr_model, daemon=True).start()

    def load_ocr_model(self):
        """Import PaddleOCR and build the model; runs on a worker thread.

        On failure the exception is stored in ``self.ocr_load_error`` so the
        polling code on the Tk thread can report it instead of waiting
        forever for ``self.ocr_model``.
        """
        try:
            # Imported lazily (and inside the try) so that a missing or
            # broken installation is reported like any other load failure.
            from paddleocr import PaddleOCR
            self.ocr_model = PaddleOCR(use_angle_cls=True, show_log=False, lang='ch')
        except Exception as e:
            self.ocr_load_error = e
            logger.exception(f"OCR模型加载失败: {str(e)}")

    def _destroy_window(self, attr_name):
        """Destroy the Toplevel stored in attribute *attr_name* and clear it."""
        window = getattr(self, attr_name, None)
        setattr(self, attr_name, None)
        if window is None:
            return
        try:
            window.destroy()
        except tk.TclError:
            # The window (or the whole Tk application) is already gone.
            pass

    # Start selecting the screenshot region.
    def start_selection(self):
        """Open the translucent fullscreen overlay used to drag a selection."""
        # Remove any previous overlay first so that retries do not stack
        # fullscreen windows on top of each other.
        self._destroy_window("selection_win")
        self.selection_win = tk.Toplevel()
        self.selection_win.attributes("-fullscreen", True)
        self.selection_win.attributes("-alpha", 0.3)
        self.selection_win.bind("<Escape>", self.on_escape)
        self.canvas = tk.Canvas(
            self.selection_win,
            cursor="cross",
            bg="gray30",
            highlightthickness=0
        )
        self.canvas.pack(fill=tk.BOTH, expand=True)
        self.start_x = self.start_y = 0
        self.rect_id = None
        self.crosshair_ids = []
        self.canvas.bind("<Button-1>", self.on_mouse_down)
        self.canvas.bind("<B1-Motion>", self.on_mouse_drag)
        self.canvas.bind("<ButtonRelease-1>", self.on_mouse_up)
        self.canvas.bind("<Motion>", self.on_mouse_move)
        self.escape_label = tk.Label(
            self.selection_win,
            text="按ESC键退出截图",
            fg="yellow",
            bg="gray20",
            font=("Helvetica", 12, "bold")
        )
        self.escape_label.place(x=10, y=10)
        self.update_crosshair(0, 0)

    # Mouse button pressed.
    def on_mouse_down(self, event):
        """Record the drag origin and discard any previous rectangle."""
        self.start_x = event.x
        self.start_y = event.y
        self.clear_crosshair()
        if self.rect_id:
            self.canvas.delete(self.rect_id)
            self.rect_id = None

    # Mouse dragged with the button held.
    def on_mouse_drag(self, event):
        """Create or resize the selection rectangle to follow the pointer."""
        current_x = event.x
        current_y = event.y
        if self.rect_id:
            self.canvas.coords(self.rect_id, self.start_x, self.start_y, current_x, current_y)
        else:
            self.rect_id = self.canvas.create_rectangle(
                self.start_x, self.start_y,
                current_x, current_y,
                outline="blue", width=2, fill="gray75", tags="rect"
            )

    # Mouse button released.
    def on_mouse_up(self, event):
        """Validate the selection, grab the region and start OCR.

        An invalid selection or a failed grab shows an error dialog and
        reopens the selection overlay.
        """
        try:
            x1 = min(self.start_x, event.x)
            y1 = min(self.start_y, event.y)
            x2 = max(self.start_x, event.x)
            y2 = max(self.start_y, event.y)
            if (x2 - x1) < self.MIN_SELECTION_PX or (y2 - y1) < self.MIN_SELECTION_PX:
                raise ValueError("选区过小,请选择更大的区域")
            if (x2 - x1) > self.canvas.winfo_width() or (y2 - y1) > self.canvas.winfo_height():
                raise ValueError("选区过大,请选择更小的区域")
            self.screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))
        except Exception as e:
            logger.error(f"截图错误: {str(e)}")
            messagebox.showerror("截图错误", str(e))
            self.restart_selection()
            return
        # OCR problems are reported by the OCR code itself, so it runs
        # outside the screenshot error handler above.
        self._destroy_window("selection_win")
        self.initialize_ocr_and_process()

    # Make sure the OCR engine is ready, then process the screenshot.
    def initialize_ocr_and_process(self):
        """Run OCR now if the model is loaded, otherwise wait for it.

        While the model is still loading a progress window is shown and
        ``check_ocr_model`` is scheduled to poll for completion.
        """
        try:
            if self.ocr_model is None:
                if self.load_win is None:
                    self.load_win = self.show_loading("OCR模型正在加载中,请稍后...")
                self.root.after(self.MODEL_POLL_MS, self.check_ocr_model)
            else:
                self._show_result()
        except Exception as e:
            logger.exception(f"OCR初始化失败: {str(e)}")
            # The loading window may never have been created on this path.
            self._destroy_window("load_win")
            self.handle_ocr_init_error(str(e))

    def check_ocr_model(self):
        """Poll the background load; continue, report failure, or keep waiting."""
        if self.ocr_model is not None:
            self._destroy_window("load_win")
            self._show_result()
        elif self.ocr_load_error is not None:
            # The loader thread gave up: stop polling and let the user decide.
            self._destroy_window("load_win")
            self.handle_ocr_init_error(str(self.ocr_load_error))
        else:
            self.root.after(self.MODEL_POLL_MS, self.check_ocr_model)

    def _show_result(self):
        """Run OCR and, only if it succeeded, build and show the main window."""
        if self.process_ocr():
            self.setup_main_ui()
            self.root.deiconify()

    @staticmethod
    def _extract_text(result):
        """Join the recognised lines of the first image in *result*.

        ``result`` is expected to be a list with one entry per image, each
        entry being a list of ``[box, (text, score)]`` items. A missing
        or empty entry (nothing recognised) yields an empty string.
        """
        if not result:
            return ""
        lines = result[0]
        if not lines:
            return ""
        return "\n".join(line[1][0] for line in lines)

    # Run OCR on the captured screenshot.
    def process_ocr(self):
        """Recognise text in ``self.screenshot`` into ``self.recognized_text``.

        Returns:
            True on success. On failure an error dialog is shown, selection
            is restarted and False is returned so callers do not go on to
            display the result window.
        """
        try:
            temp_image_path = save_temp_image(self.screenshot)
            try:
                result = self.ocr_model.ocr(str(temp_image_path), cls=True)
            finally:
                # Remove the temporary file even when recognition raises.
                if temp_image_path.exists():
                    temp_image_path.unlink()
            self.recognized_text = self._extract_text(result)
        except Exception as e:
            logger.exception(f"OCR处理失败: {str(e)}")
            messagebox.showerror("识别错误", f"OCR处理失败: {str(e)}")
            self.restart_selection()
            return False
        return True

    def _build_main_widgets(self):
        """Create the result window widgets (image, text box, buttons) once."""
        self.main_frame = ttk.Frame(self.root, padding=20)
        self.main_frame.pack(fill=tk.BOTH, expand=True)
        self.img_label = ttk.Label(self.main_frame)
        self.img_label.pack(pady=10)
        # Font used for the recognised text.
        custom_font = font.Font(family="Microsoft YaHei", size=9)
        self.text_area = tk.Text(
            self.main_frame,
            height=15,
            wrap=tk.WORD,
            font=custom_font
        )
        self.text_area.pack(pady=10, expand=True, fill=tk.BOTH)
        btn_frame = ttk.Frame(self.main_frame)
        btn_frame.pack(pady=10)
        ttk.Button(
            btn_frame,
            text="重新选择",
            command=self.restart_selection
        ).pack(side=tk.LEFT, padx=5)
        ttk.Button(
            btn_frame,
            text="复制结果",
            command=self.copy_result
        ).pack(side=tk.LEFT, padx=5)
        ttk.Button(
            btn_frame,
            text="退出",
            command=self.safe_exit
        ).pack(side=tk.RIGHT, padx=5)

    # Build (first call) and refresh the main window.
    def setup_main_ui(self):
        """Show the current screenshot and recognised text in the main window."""
        if self.main_frame is None:
            self._build_main_widgets()
        # Window title.
        self.root.title("文字识别")
        self.update_image_display()
        self.text_area.delete(1.0, tk.END)
        self.text_area.insert(tk.END, self.recognized_text.strip())
        # Keep the result window above other windows.
        self.root.attributes('-topmost', True)

    # Refresh the screenshot preview.
    def update_image_display(self):
        """Display ``self.screenshot`` in the image label, if there is one."""
        if self.screenshot:
            photo = ImageTk.PhotoImage(self.screenshot)
            self.img_label.config(image=photo)
            # Keep a reference on the widget so the image is not
            # garbage-collected while it is displayed.
            self.img_label.image = photo

    # Show the "loading" window.
    def show_loading(self, message):
        """Open a small window with *message* and a running progress bar.

        Returns:
            The created ``Toplevel`` so the caller can destroy it later.
        """
        load_win = tk.Toplevel()
        load_win.title("请稍候")
        frame = ttk.Frame(load_win, padding=20)
        frame.pack()
        ttk.Label(frame, text=message).pack(pady=10)
        progress = ttk.Progressbar(frame, mode='indeterminate')
        progress.pack(pady=5)
        progress.start()
        return load_win

    # Handle an OCR initialisation error.
    def handle_ocr_init_error(self, error_msg):
        """Ask the user whether to retry after an OCR initialisation failure.

        Retry restarts the model load when no model is available and resumes
        processing on the Tk thread; cancel exits the program.
        """
        choice = messagebox.askretrycancel(
            "OCR初始化失败",
            f"{error_msg}\n\n是否重试?",
            icon='error'
        )
        if choice:
            if self.ocr_model is None:
                self._start_model_loading()
            # Stay on the Tk thread: this path creates and updates widgets.
            self.initialize_ocr_and_process()
        else:
            self.safe_exit()

    # Start a fresh selection.
    def restart_selection(self):
        """Hide the main window, reset state and reopen the selection overlay."""
        if self.root.winfo_exists():
            self.root.withdraw()
        self.screenshot = None
        self.recognized_text = ""
        self.clear_ui()
        self.start_selection()

    # Reset the result widgets.
    def clear_ui(self):
        """Clear the preview image and the text box if they have been created."""
        if hasattr(self, 'img_label'):
            self.img_label.config(image='')
            self.img_label.image = None
        if hasattr(self, 'text_area'):
            self.text_area.delete(1.0, tk.END)

    # Copy the recognised text to the clipboard.
    def copy_result(self):
        """Replace the clipboard contents with ``self.recognized_text``."""
        self.root.clipboard_clear()
        self.root.clipboard_append(self.recognized_text)
        messagebox.showinfo("成功", "已复制到剪贴板")

    # Exit the program cleanly.
    def safe_exit(self):
        """Destroy the root window if it still exists, then exit with status 0."""
        try:
            if self.root.winfo_exists():
                self.root.destroy()
        except tk.TclError:
            # The Tk application was already destroyed; nothing left to close.
            pass
        sys.exit(0)

    # Report a fatal start-up error.
    def show_crash_message(self, message):
        """Show *message* in an error dialog backed by a temporary hidden root."""
        crash_win = tk.Tk()
        crash_win.withdraw()
        messagebox.showerror("致命错误", message)
        crash_win.destroy()

    # ESC pressed on the overlay: quit.
    def on_escape(self, event):
        """Close the selection overlay and exit the program."""
        self._destroy_window("selection_win")
        self.safe_exit()

    # Pointer moved over the overlay.
    def on_mouse_move(self, event):
        """Move the crosshair to the pointer position."""
        current_x = event.x
        current_y = event.y
        self.update_crosshair(current_x, current_y)

    # Redraw the crosshair.
    def update_crosshair(self, x, y):
        """Draw horizontal and vertical guide lines through ``(x, y)``."""
        self.clear_crosshair()
        self.crosshair_ids.append(
            self.canvas.create_line(0, y, self.canvas.winfo_width(), y,
                                    tags="crosshair", fill="yellow", width=2))
        self.crosshair_ids.append(
            self.canvas.create_line(x, 0, x, self.canvas.winfo_height(),
                                    tags="crosshair", fill="yellow", width=2))

    # Remove the crosshair.
    def clear_crosshair(self):
        """Delete the crosshair lines currently on the canvas."""
        for crosshair_id in self.crosshair_ids:
            self.canvas.delete(crosshair_id)
        self.crosshair_ids = []

    # Enter the Tk main loop.
    def run(self):
        """Block in the Tk event loop until the application exits."""
        self.root.mainloop()
# Script entry point: build the application and hand control to Tk.
if __name__ == "__main__":
    OCRApp().run()