当前位置: 首页 > article >正文

PaddleOCR 截图自动文字识别

春节假期在家无聊,撸了三个小工具:PC截图+编辑/PC录屏(用于meeting录屏)/PC截屏文字识别。因为感觉这三个小工具是工作中常常需要用到的,github上也有很多开源的,不过总有点或多或少的小问题,不利于自己的使用。脚本的编写尽量减少对三方库的使用。

已全部完成,这是其中的一个,后续将三个集成在在一个工具中。

import tkinter as tk
from tkinter import ttk, messagebox, font
from PIL import Image, ImageTk, ImageGrab
import sys
import logging
import tempfile
import threading
from pathlib import Path
import ctypes  # 导入 ctypes 库
import logging.handlers  # 用于日志轮转

# 最小化控制台窗口
def minimize_console():
    ctypes.windll.user32.ShowWindow(ctypes.windll.kernel32.GetConsoleWindow(), 6)

minimize_console()  # 调用最小化函数

# 获取脚本所在目录路径
def get_script_directory():
    return Path(__file__).parent

# 配置日志文件路径和日志级别
log_file_path = get_script_directory() / 'ocr_errors.log'
logging.basicConfig(
    filename=log_file_path,
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# 添加日志轮转
handler = logging.handlers.RotatingFileHandler(log_file_path, maxBytes=1024*1024*5, backupCount=3)
logger = logging.getLogger()
logger.addHandler(handler)

# 保存临时图片到磁盘
def save_temp_image(image, suffix='.png'):
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        image.save(temp_file.name)
        return Path(temp_file.name)

class OCRApp:
    def __init__(self):
        try:
            self.root = tk.Tk()
            self.root.withdraw()

            self.screenshot = None
            self.ocr_model = None  # 延迟初始化
            self.recognized_text = ""
            self.main_frame = None

            # 启动后台线程加载OCR模型以优化性能,使run脚本后能马上进入截图状态
            threading.Thread(target=self.load_ocr_model, daemon=True).start()

            # 立即开始截图选择
            self.start_selection()

        except Exception as e:
            self.show_crash_message(f"程序启动失败: {str(e)}")
            sys.exit(1)

    def load_ocr_model(self):
        from paddleocr import PaddleOCR
        try:
            self.ocr_model = PaddleOCR(use_angle_cls=True, show_log=False, lang='ch')
        except Exception as e:
            logger.error(f"OCR模型加载失败: {str(e)}")

    # 开始截图选择区域
    def start_selection(self):
        self.selection_win = tk.Toplevel()
        self.selection_win.attributes("-fullscreen", True)
        self.selection_win.attributes("-alpha", 0.3)

        self.selection_win.bind("<Escape>", self.on_escape)

        self.canvas = tk.Canvas(
            self.selection_win,
            cursor="cross",
            bg="gray30",
            highlightthickness=0
        )
        self.canvas.pack(fill=tk.BOTH, expand=True)

        self.start_x = self.start_y = 0
        self.rect_id = None
        self.crosshair_ids = []

        self.canvas.bind("<Button-1>", self.on_mouse_down)
        self.canvas.bind("<B1-Motion>", self.on_mouse_drag)
        self.canvas.bind("<ButtonRelease-1>", self.on_mouse_up)
        self.canvas.bind("<Motion>", self.on_mouse_move)

        self.escape_label = tk.Label(
            self.selection_win,
            text="按ESC键退出截图",
            fg="yellow",
            bg="gray20",
            font=("Helvetica", 12, "bold")
        )
        self.escape_label.place(x=10, y=10)

        self.update_crosshair(0, 0)

    # 鼠标按下事件处理
    def on_mouse_down(self, event):
        self.start_x = event.x
        self.start_y = event.y
        self.clear_crosshair()
        if self.rect_id:
            self.canvas.delete(self.rect_id)
            self.rect_id = None

    # 鼠标拖动事件处理
    def on_mouse_drag(self, event):
        current_x = event.x
        current_y = event.y

        if self.rect_id:
            self.canvas.coords(self.rect_id, self.start_x, self.start_y, current_x, current_y)
        else:
            self.rect_id = self.canvas.create_rectangle(
                self.start_x, self.start_y,
                current_x, current_y,
                outline="blue", width=2, fill="gray75", tags="rect"
            )

    # 鼠标释放事件处理
    def on_mouse_up(self, event):
        try:
            x1 = min(self.start_x, event.x)
            y1 = min(self.start_y, event.y)
            x2 = max(self.start_x, event.x)
            y2 = max(self.start_y, event.y)

            if (x2 - x1) < 10 or (y2 - y1) < 10:
                raise ValueError("选区过小,请选择更大的区域")
            if (x2 - x1) > self.canvas.winfo_width() or (y2 - y1) > self.canvas.winfo_height():
                raise ValueError("选区过大,请选择更小的区域")

            self.screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))
            self.selection_win.destroy()
            self.initialize_ocr_and_process()

        except Exception as e:
            logger.error(f"截图错误: {str(e)}")
            messagebox.showerror("截图错误", str(e))
            self.restart_selection()

    # 初始化OCR引擎并处理截图
    def initialize_ocr_and_process(self):
        try:
            if self.ocr_model is None:
                self.load_win = self.show_loading("OCR模型正在加载中,请稍后...")
                self.root.after(100, self.check_ocr_model)  # 每100毫秒检查一次
            else:
                self.process_ocr()
                self.setup_main_ui()
                self.root.deiconify()

        except Exception as e:
            logger.error(f"OCR初始化失败: {str(e)}")
            self.load_win.destroy()
            self.handle_ocr_init_error(str(e))

    def check_ocr_model(self):
        if self.ocr_model is None:
            self.root.after(100, self.check_ocr_model)  # 每100毫秒检查一次
        else:
            self.load_win.destroy()
            self.process_ocr()
            self.setup_main_ui()
            self.root.deiconify()

    # 执行OCR处理
    def process_ocr(self):
        try:
            temp_image_path = save_temp_image(self.screenshot)
            result = self.ocr_model.ocr(str(temp_image_path), cls=True)
            self.recognized_text = "\n".join([line[1][0] for line in result[0]])
            temp_image_path.unlink()  # 确保临时文件被删除
        except Exception as e:
            logger.error(f"OCR处理失败: {str(e)}")
            messagebox.showerror("识别错误", f"OCR处理失败: {str(e)}")
            self.restart_selection()

    # 设置主界面UI
    def setup_main_ui(self):
        if self.main_frame is None:
            self.main_frame = ttk.Frame(self.root, padding=20)
            self.main_frame.pack(fill=tk.BOTH, expand=True)

            self.img_label = ttk.Label(self.main_frame)
            self.img_label.pack(pady=10)

            # 定义字体
            custom_font = font.Font(family="Microsoft YaHei", size=9)  #

            self.text_area = tk.Text(
                self.main_frame,
                height=15,
                wrap=tk.WORD,
                font=custom_font  # 设置字体
            )
            self.text_area.pack(pady=10, expand=True, fill=tk.BOTH)

            btn_frame = ttk.Frame(self.main_frame)
            btn_frame.pack(pady=10)

            ttk.Button(
                btn_frame,
                text="重新选择",
                command=self.restart_selection
            ).pack(side=tk.LEFT, padx=5)

            ttk.Button(
                btn_frame,
                text="复制结果",
                command=self.copy_result
            ).pack(side=tk.LEFT, padx=5)

            ttk.Button(
                btn_frame,
                text="退出",
                command=self.safe_exit
            ).pack(side=tk.RIGHT, padx=5)

        # 设置窗口标题
        self.root.title("文字识别")

        self.update_image_display()
        self.text_area.delete(1.0, tk.END)
        self.text_area.insert(tk.END, self.recognized_text.strip())

        # 设置窗口总是最顶层
        self.root.attributes('-topmost', True)

    # 更新图片显示
    def update_image_display(self):
        if self.screenshot:
            photo = ImageTk.PhotoImage(self.screenshot)
            self.img_label.config(image=photo)
            self.img_label.image = photo

    # 显示加载中的窗口
    def show_loading(self, message):
        load_win = tk.Toplevel()
        load_win.title("请稍候")

        frame = ttk.Frame(load_win, padding=20)
        frame.pack()

        ttk.Label(frame, text=message).pack(pady=10)
        progress = ttk.Progressbar(frame, mode='indeterminate')
        progress.pack(pady=5)
        progress.start()

        return load_win

    # 处理OCR初始化错误
    def handle_ocr_init_error(self, error_msg):
        choice = messagebox.askretrycancel(
            "OCR初始化失败",
            f"{error_msg}\n\n是否重试?",
            icon='error'
        )
        if choice:
            threading.Thread(target=self.initialize_ocr_and_process).start()
        else:
            self.safe_exit()

    # 重新开始截图选择
    def restart_selection(self):
        if self.root.winfo_exists():
            self.root.withdraw()
        self.screenshot = None
        self.recognized_text = ""
        self.clear_ui()
        self.start_selection()

    # 清理UI界面
    def clear_ui(self):
        if hasattr(self, 'img_label'):
            self.img_label.config(image='')
            self.img_label.image = None
        if hasattr(self, 'text_area'):
            self.text_area.delete(1.0, tk.END)

    # 复制识别结果到剪贴板
    def copy_result(self):
        self.root.clipboard_clear()
        self.root.clipboard_append(self.recognized_text)
        messagebox.showinfo("成功", "已复制到剪贴板")

    # 安全退出程序
    def safe_exit(self):
        if self.root.winfo_exists():
            self.root.destroy()
        sys.exit(0)

    # 显示程序崩溃错误信息
    def show_crash_message(self, message):
        crash_win = tk.Tk()
        crash_win.withdraw()
        messagebox.showerror("致命错误", message)
        crash_win.destroy()

    # 按下ESC键时退出程序
    def on_escape(self, event):
        self.selection_win.destroy()
        self.safe_exit()

    # 鼠标移动事件处理
    def on_mouse_move(self, event):
        current_x = event.x
        current_y = event.y
        self.update_crosshair(current_x, current_y)

    # 更新十字线位置
    def update_crosshair(self, x, y):
        self.clear_crosshair()
        self.crosshair_ids.append(
            self.canvas.create_line(0, y, self.canvas.winfo_width(), y,
                                   tags="crosshair", fill="yellow", width=2))
        self.crosshair_ids.append(
            self.canvas.create_line(x, 0, x, self.canvas.winfo_height(),
                                    tags="crosshair", fill="yellow", width=2))

    # 清除十字线
    def clear_crosshair(self):
        for crosshair_id in self.crosshair_ids:
            self.canvas.delete(crosshair_id)
        self.crosshair_ids = []

    # 运行主循环
    def run(self):
        self.root.mainloop()

if __name__ == "__main__":
    app = OCRApp()
    app.run()


http://www.kler.cn/a/529960.html

相关文章:

  • VSCode插件Live Server
  • React
  • 三、js笔记
  • 2025 年,链上固定收益领域迈向新时代
  • hexo部署到github page时,hexo d后page里面绑定的个人域名消失的问题
  • 动态规划DP 背包问题 完全背包问题(题目分析+C++完整代码)
  • JavaWeb入门-请求响应(Day3)
  • Kafka SASL/SCRAM介绍
  • 缓存的今生今世
  • python-leetcode-二叉树的右视图
  • 【算法】回溯算法专题② ——组合型回溯 + 剪枝 python
  • 31.Word:科技论文的译文审交稿【31】
  • Vue - Suspense的使用
  • AWS EMR使用Apache Kylin快速分析大数据
  • 第三篇:模型压缩与量化技术——DeepSeek如何在边缘侧突破“小而强”的算力困局
  • 《Origin画百图》之脊线图
  • 精品PPT | 企业大数据治理平台统一指标库建设方案
  • IM 即时通讯系统-51-MPush开源实时消息推送系统
  • 手写单层RNN网络,后续更新
  • K8S集群架构及主机准备
  • SQL索引优化_提高系统响应速度的秘诀
  • Deepseek R1 本地化部署指南:跨平台实战
  • react redux监测值的变化
  • 硕成C语言1笔记
  • Linux - 进程间通信(3)
  • IOC三种实现方式的区别