一款基于Python的从常规文档里提取图片的简单工具开发方案

1. 环境准备
安装必需库
pip install python-docx PyMuPDF openpyxl beautifulsoup4 pillow requests
pip install pdfplumber
(tkinter 随 Python 官方安装包自带,无需通过 pip 安装;PyPI 上的 tk 包并不是 tkinter。Linux 下如缺失可执行 sudo apt install python3-tk)
工具选择
- 开发环境:VSCode + Python插件
- 调试工具:Python IDLE(初学者友好)
- 打包工具:pyinstaller(可选,用于生成exe)
2. 项目架构设计
image-extractor/
├── main.py # 主程序入口
├── core/
│ ├── docx_extractor.py
│ ├── pdf_extractor.py
│ ├── excel_extractor.py
│ └── html_extractor.py
└── outputs/ # 默认输出目录
3. 核心功能实现
(1) Word文档提取 (docx_extractor.py)
import zipfile
import os
from PIL import Image
def extract_docx_images(file_path, output_dir):
    """Copy every image embedded in a .docx file into ``output_dir``.

    A .docx file is a ZIP archive; this function selects the members stored
    under ``word/media/`` and writes each one into ``output_dir`` under its
    base name.

    Args:
        file_path: Path to the .docx file.
        output_dir: Directory that receives the images. It is created when
            it does not exist yet.

    Returns:
        The number of image files written.

    Raises:
        zipfile.BadZipFile: If ``file_path`` is not a valid ZIP archive.
        OSError: If the archive cannot be read or a target cannot be written.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        image_files = [
            name
            for name in zip_ref.namelist()
            # Skip directory entries such as "word/media/" itself, which
            # are not images and must not be counted.
            if name.startswith('word/media/') and not name.endswith('/')
        ]
        for img_file in image_files:
            # Write the bytes straight to the final location. Extracting and
            # then renaming would leave an empty word/media/ tree behind and
            # fail on Windows when the target already exists (e.g. when the
            # same document is processed twice).
            dst = os.path.join(output_dir, os.path.basename(img_file))
            with open(dst, 'wb') as out_file:
                out_file.write(zip_ref.read(img_file))
    return len(image_files)
(2) PDF文件提取 (pdf_extractor.py)
import fitz
import os
def extract_pdf_images(file_path, output_dir):
    """Save the images referenced by each page of a PDF into ``output_dir``.

    Pages are visited in order; every image reported by
    ``page.get_images(full=True)`` is extracted by its xref and written as
    ``pdf_page<page>_img<index>.<ext>``.

    Args:
        file_path: Path to the PDF file.
        output_dir: Existing directory that receives the images.

    Returns:
        The number of image files written.
    """
    doc = fitz.open(file_path)
    img_count = 0
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                img_data = base_image.get("image") if base_image else None
                if not img_data:
                    # Nothing extractable behind this xref; skip it rather
                    # than writing an empty file.
                    continue
                # The extracted bytes keep their embedded format (often
                # JPEG), so use the extension PyMuPDF reports instead of
                # always labelling the file ".png".
                ext = base_image.get("ext") or "png"
                img_path = os.path.join(
                    output_dir, f"pdf_page{page_num}_img{img_index}.{ext}"
                )
                with open(img_path, "wb") as f:
                    f.write(img_data)
                img_count += 1
    finally:
        # Release the file handle even when extraction fails midway.
        doc.close()
    return img_count
(3) Excel文件提取 (excel_extractor.py)
from openpyxl import load_workbook
import os
def extract_excel_images(file_path, output_dir):
    """Save the images attached to every worksheet of an .xlsx workbook.

    Files are named ``excel_<sheet title>_img<n>.<ext>`` where ``n`` is a
    running counter across the whole workbook.

    Args:
        file_path: Path to the .xlsx file.
        output_dir: Existing directory that receives the images.

    Returns:
        The number of image files written.
    """
    wb = load_workbook(file_path)
    img_count = 0
    try:
        for sheet in wb.worksheets:
            # NOTE(review): ``_images`` and ``_data`` are private openpyxl
            # members; re-check them when upgrading openpyxl.
            for image in sheet._images:
                payload = image._data
                if callable(payload):
                    # In openpyxl ``Image._data`` is a method returning the
                    # image bytes; writing the bound method itself raises
                    # TypeError.
                    payload = payload()
                # Presumably openpyxl keeps gif/jpeg/png bytes as they are
                # and converts other formats to PNG -- verify against the
                # installed version. Anything unexpected falls back to png.
                fmt = str(getattr(image, "format", "") or "").lower()
                ext = fmt if fmt in ("gif", "jpeg", "png") else "png"
                img_path = os.path.join(
                    output_dir, f"excel_{sheet.title}_img{img_count}.{ext}"
                )
                with open(img_path, "wb") as f:
                    f.write(payload)
                img_count += 1
    finally:
        wb.close()
    return img_count
(4) HTML文件提取 (html_extractor.py)
import requests
from bs4 import BeautifulSoup
import os
import base64
def extract_html_images(html_path, output_dir):
    """Save the base64 ``data:image`` URIs found in an HTML document.

    Only inline images (``<img src="data:image/...;base64,...">``) are
    handled; ``<img>`` tags that point at external files or URLs are ignored.

    Args:
        html_path: Path to a local HTML file, or an ``http://`` /
            ``https://`` URL to download.
        output_dir: Existing directory that receives the images.

    Returns:
        The number of image files written.

    Raises:
        requests.RequestException: If downloading the page fails or the
            server answers with an error status.
        OSError: If the local file cannot be read or an image cannot be
            written.
    """
    if html_path.startswith(('http://', 'https://')):
        # A timeout keeps a stalled server from hanging the caller forever.
        response = requests.get(html_path, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
    else:
        # Read bytes and let BeautifulSoup detect the encoding instead of
        # relying on the platform default (which breaks UTF-8 pages on
        # Windows).
        with open(html_path, 'rb') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

    img_count = 0
    for img in soup.find_all('img'):
        src = img.get('src')
        # <img> without src, or with a normal URL/path, is not inline data.
        if not src or not src.startswith('data:image/'):
            continue
        header, sep, data = src.partition(',')
        if not sep or ';base64' not in header.lower():
            # Not a base64 payload (e.g. URL-encoded SVG); decoding it as
            # base64 would fail or produce garbage.
            continue
        # "data:image/svg+xml;base64" -> "svg". The value comes from
        # untrusted markup, so keep only alphanumerics for the extension.
        subtype = header.split('/')[1].split(';')[0].split('+')[0]
        img_format = ''.join(ch for ch in subtype if ch.isalnum()) or 'bin'
        try:
            img_data = base64.b64decode(data)
        except ValueError:
            # One malformed image should not abort the whole extraction.
            continue
        img_path = os.path.join(output_dir, f"html_img{img_count}.{img_format}")
        with open(img_path, 'wb') as f:
            f.write(img_data)
        img_count += 1
    return img_count
4. 交互界面开发 (main.py)
import tkinter as tk
from tkinter import filedialog, messagebox
from core import docx_extractor, pdf_extractor, excel_extractor, html_extractor
import os
class ImageExtractorApp:
    """Tkinter front end that dispatches a chosen file to an extractor.

    The window holds a file field, an output-directory field (defaulting to
    ``outputs``), a start button and a text area used as a result log.
    """

    def __init__(self, root):
        """Bind the application to ``root`` and build its widgets."""
        self.root = root
        self.root.title("多格式图片提取工具")
        self.file_path = tk.StringVar()
        self.output_dir = tk.StringVar(value="outputs")
        self.create_widgets()

    def create_widgets(self):
        """Lay out the input rows, the start button and the log area."""
        tk.Label(self.root, text="选择文件:").grid(row=0, column=0, padx=5, pady=5)
        tk.Entry(self.root, textvariable=self.file_path, width=40).grid(row=0, column=1)
        tk.Button(self.root, text="浏览", command=self.select_file).grid(row=0, column=2)
        tk.Label(self.root, text="输出目录:").grid(row=1, column=0)
        tk.Entry(self.root, textvariable=self.output_dir, width=40).grid(row=1, column=1)
        tk.Button(self.root, text="选择目录", command=self.select_output_dir).grid(row=1, column=2)
        tk.Button(self.root, text="开始提取", command=self.start_extraction).grid(row=2, column=1, pady=10)
        self.log_text = tk.Text(self.root, height=10, width=50)
        self.log_text.grid(row=3, column=0, columnspan=3)

    def select_file(self):
        """Ask for an input file and store the choice in the file field."""
        file_types = [
            ('支持的文件类型', '*.docx *.pdf *.xlsx *.html'),
            ('Word文档', '*.docx'),
            ('PDF文件', '*.pdf'),
            ('Excel文件', '*.xlsx'),
            ('网页文件', '*.html')
        ]
        chosen = filedialog.askopenfilename(filetypes=file_types)
        # A cancelled dialog yields an empty value; keep the previous
        # selection instead of wiping it.
        if chosen:
            self.file_path.set(chosen)

    def select_output_dir(self):
        """Ask for an output directory and store it in the directory field."""
        chosen = filedialog.askdirectory()
        # Same as above: cancelling must not blank the field.
        if chosen:
            self.output_dir.set(chosen)

    def start_extraction(self):
        """Run the extractor matching the file extension and log the result.

        Problems are reported through an error dialog; nothing is raised to
        the Tk event loop.
        """
        file_path = self.file_path.get().strip()
        # Fall back to the default folder when the field was cleared, since
        # os.makedirs('') would raise.
        output_dir = self.output_dir.get().strip() or "outputs"

        if not file_path:
            messagebox.showerror("错误", "请先选择文件")
            return

        extractors = {
            '.docx': docx_extractor.extract_docx_images,
            '.pdf': pdf_extractor.extract_pdf_images,
            '.xlsx': excel_extractor.extract_excel_images,
            '.html': html_extractor.extract_html_images,
        }
        ext = os.path.splitext(file_path)[1].lower()
        extract = extractors.get(ext)
        if extract is None:
            # Checked before touching the file system so an unsupported file
            # does not leave an empty output directory behind.
            messagebox.showerror("错误", "不支持的文件类型")
            return

        try:
            # Directory creation sits inside the try block so permission or
            # path errors reach the user as a dialog as well.
            os.makedirs(output_dir, exist_ok=True)
            count = extract(file_path, output_dir)
        except Exception as e:
            # Top-level UI boundary: surface any failure to the user.
            messagebox.showerror("错误", f"提取失败: {str(e)}")
            return

        self.log_text.insert(tk.END, f"成功提取 {count} 张图片到 {output_dir}\n")
        self.log_text.see(tk.END)
if __name__ == "__main__":
    # Script entry point: create the top-level Tk window, attach the
    # application to it, then hand control to the Tk event loop until the
    # window is closed.
    window = tk.Tk()
    application = ImageExtractorApp(window)
    window.mainloop()
5. 使用说明
操作步骤
- 运行 main.py
- 点击 浏览 选择文件 (支持.docx/.pdf/.xlsx/.html)
- 选择输出目录(默认 outputs)
- 点击 开始提取
- 查看底部日志区域的提取结果
效果示例
成功提取 5 张图片到 outputs/
成功提取 3 张图片到 outputs/
6. 常见问题解决
Q1: Excel图片无法提取?
- 原因:openpyxl只能提取嵌入式图片,无法提取浮动图片
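- 解决方案:.xlsx 文件本质上是 ZIP 压缩包,可参照 Word 的做法用 zipfile 直接读取其中 xl/media/ 目录下的图片(注意:xlrd 2.0 起仅支持 .xls,不能读取 .xlsx;如需图片位置信息则要结合图像坐标识别,处理更复杂)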
Q2: PDF提取的图片模糊?
- 原因:PDF内嵌低分辨率图片
- 解决方案:使用 pdfplumber 的更高精度提取模式
Q3: 程序无响应?
- 原因:大文件处理耗时阻塞主线程
- 解决方案:改用多线程处理(参考 threading 模块)
7. 项目扩展建议
- 增加批量处理:支持文件夹批量导入
- 添加图片预览:在界面中显示缩略图
- 支持压缩包:直接解压ZIP/RAR文件并处理内容
- 增加格式转换:自动转换HEIC/WEBP等特殊格式