当前位置: 首页 > article >正文

Java实现将PDF转换成Word文档

pom.xml 文件:

 

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the jackDemo project; inherits from spring-boot-starter-parent. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
       <groupId>org.springframework.boot</groupId>
       <artifactId>spring-boot-starter-parent</artifactId>
<!--       <version>3.2.1</version>-->
       <version>2.3.9.RELEASE</version>
       <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.jack</groupId>
    <artifactId>jackDemo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>jackDemo</name>
    <description>jackDemo</description>
    <properties>
       <java.version>1.8</java.version>
    </properties>
    <dependencies>
       <dependency>
          <groupId>org.springframework.boot</groupId>
          <artifactId>spring-boot-starter-data-mongodb</artifactId>
       </dependency>
       <dependency>
          <groupId>org.springframework.boot</groupId>
          <artifactId>spring-boot-starter-web</artifactId>
       </dependency>

       <dependency>
          <groupId>org.springframework.boot</groupId>
          <artifactId>spring-boot-devtools</artifactId>
          <scope>runtime</scope>
          <optional>true</optional>
       </dependency>
       <dependency>
          <groupId>org.springframework.boot</groupId>
          <artifactId>spring-boot-starter-test</artifactId>
          <scope>test</scope>
       </dependency>

       <dependency>
          <groupId>com.alibaba</groupId>
          <artifactId>fastjson</artifactId>
          <version>1.2.75</version>
       </dependency>


       <dependency>
          <groupId>org.openpnp</groupId>
          <artifactId>opencv</artifactId>
          <version>4.5.3-4</version>
       </dependency>

       <!-- Apache POI for Excel files -->
       <dependency>
          <groupId>org.apache.poi</groupId>
          <artifactId>poi-ooxml</artifactId>
          <version>5.2.3</version> <!-- check for and use the latest version -->
       </dependency>

       <!-- Apache POI dependencies (these may be included automatically by Maven, but it's good to be explicit) -->
       <dependency>
          <groupId>org.apache.poi</groupId>
          <artifactId>poi</artifactId>
          <version>5.2.3</version> <!-- keep in sync with the poi-ooxml version -->
       </dependency>

       <!-- Apache Commons Collections (required by POI) -->
       <dependency>
          <groupId>org.apache.commons</groupId>
          <artifactId>commons-collections4</artifactId>
          <version>4.4</version> <!-- make sure the version is compatible with your project -->
       </dependency>

       <!-- Apache Commons IO (optional, but useful for file handling) -->
       <dependency>
          <groupId>commons-io</groupId>
          <artifactId>commons-io</artifactId>
          <version>2.11.0</version> <!-- make sure the version is compatible with your project -->
       </dependency>

       <!-- PDFBox for reading PDF files -->
       <dependency>
          <groupId>org.apache.pdfbox</groupId>
          <artifactId>pdfbox</artifactId>
          <version>2.0.24</version>
       </dependency>

       <!-- docx4j for creating Word documents -->
       <dependency>
          <groupId>org.docx4j</groupId>
          <artifactId>docx4j</artifactId>
          <version>3.2.1</version>
       </dependency>

       <!-- JAXB API and runtime declared explicitly. NOTE(review): presumably needed by docx4j when
            running on a JDK that no longer bundles JAXB; confirm against the target JDK. -->
       <dependency>
          <groupId>javax.xml.bind</groupId>
          <artifactId>jaxb-api</artifactId>
          <version>2.3.1</version>
       </dependency>
       <dependency>
          <groupId>org.glassfish.jaxb</groupId>
          <artifactId>jaxb-runtime</artifactId>
          <version>2.3.1</version>
       </dependency>

    </dependencies>


    <build>
       <plugins>
          <plugin>
             <groupId>org.springframework.boot</groupId>
             <artifactId>spring-boot-maven-plugin</artifactId>
          </plugin>
       </plugins>
    </build>

</project>

Java 文件:
 

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;
import org.docx4j.wml.Body;
import org.docx4j.wml.P;
import org.docx4j.wml.R;
import org.docx4j.wml.Text;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * Converts the text content of a PDF file into a Word (.docx) document.
 *
 * <p>Text is extracted with PDFBox's {@code PDFTextStripper} and written out with docx4j, one Word
 * paragraph per extracted line. Only plain text is carried over; nothing in this class transfers
 * images, fonts or page layout.
 */
public class PdfToWordConverter {

    /**
     * Command-line entry point.
     *
     * @param args optional; {@code args[0]} is the source PDF path and {@code args[1]} the target
     *     .docx path. Either one falls back to the built-in sample path when absent.
     * @throws Exception if building or saving the Word document fails with a non-I/O error
     */
    public static void main(String[] args) throws Exception{
        // Path of the PDF to read (first argument, or the sample path).
        String pdfFilePath = (args != null && args.length > 0)
                ? args[0]
                : "D:\\word\\何以为父影响彼此一生的父子关系.pdf";
        // Path of the Word file to generate (second argument, or the sample path).
        String wordFilePath = (args != null && args.length > 1)
                ? args[1]
                : "D:\\word\\何以为父影响彼此一生的父子关系.docx";

        try {
            // Read the text content of the PDF.
            String pdfText = extractTextFromPdf(pdfFilePath);

            // Write that text into a new Word document.
            createWordDocument(wordFilePath, pdfText);

            System.out.println("PDF to Word conversion completed successfully!");
        } catch (IOException e) {
            // I/O failures are reported on stderr; any other exception propagates out of main.
            e.printStackTrace();
        }
    }

    /**
     * Extracts the text of a PDF file.
     *
     * @param filePath path of the PDF file to read
     * @return the text returned by {@code PDFTextStripper.getText} for the whole document
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String extractTextFromPdf(String filePath) throws IOException {
        // try-with-resources closes the document (and its underlying file handle) even when
        // text extraction throws.
        try (PDDocument document = PDDocument.load(new File(filePath))) {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            return pdfStripper.getText(document);
        }
    }

    /**
     * Creates a Word document containing the given text, one paragraph per line.
     *
     * <p>The content is split on {@code \n} / {@code \r\n}. Because {@link String#split(String)}
     * discards trailing empty strings, blank lines at the very end of the content do not produce
     * paragraphs.
     *
     * @param filePath path of the .docx file to write
     * @param content text to put into the document; must not be {@code null}
     * @throws Exception if docx4j fails to create or save the package, or the file cannot be
     *     written
     */
    public static void createWordDocument(String filePath, String content) throws Exception {
        WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
        MainDocumentPart mainDocumentPart = wordMLPackage.getMainDocumentPart();
        Body body = mainDocumentPart.getContents().getBody();

        // Split the content into lines and add each one as its own paragraph.
        String[] paragraphs = content.split("\\r?\\n");
        for (String paragraph : paragraphs) {
            Text text = new Text();
            // setValue stores the run's text; setParent only records the owning JAXB node and
            // would leave the run empty.
            text.setValue(paragraph);
            // Keep leading/trailing whitespace of the extracted line (xml:space="preserve").
            text.setSpace("preserve");

            R r = new R();
            r.getContent().add(text);

            P p = new P();
            p.getContent().add(r);

            body.getContent().add(p);
        }

        // Save the Word document.
        try (FileOutputStream out = new FileOutputStream(new File(filePath))) {
            wordMLPackage.save(out);
        }
    }
}


http://www.kler.cn/a/411928.html

相关文章:

  • ubuntu设置程序开机自启动
  • 【AI系统】Tensor Core 架构演进
  • godot游戏引擎_瓦片集和瓦片地图介绍
  • 数据库编程(sqlite3)
  • DAY133权限提升-Windows权限提升篇溢出漏洞土豆家族通杀全系补丁对比EXP筛选
  • scala模式匹配
  • 【Z2400011】基于Java+SpringBoot+MySQL实现的自习室预订系统
  • <javascript><html>在两个html页面间跳转时使用浏览器本地存储localStorage传递共享参数
  • C# 调用系统级方法复制、移动和删除等操作界面
  • STM32 外设简介
  • 前端开发工程师需要学什么?
  • podman 源码 5.3.1编译
  • ollama教程——在Linux上运行大型语言模型的完整指南
  • C#.Net筑基 - 常见类型
  • 基于FPGA的FM调制(载波频率、频偏、峰值、DAC输出)-带仿真文件-上板验证正确
  • 使用Python 在Excel中创建和取消数据分组 - 详解
  • Vue框架开发一个简单的购物车(Vue.js)
  • 零基础学安全--蓝队基础知识学习
  • Java设计模式 —— 【创建型模式】工厂模式(简单工厂、工厂方法模式、抽象工厂)详解
  • 【大模型】LLaMA-Factory的环境配置、微调模型与测试
  • 【论文复现】偏标记学习+图像分类
  • [游戏开发]【unity】角色设计1- 从概念到3D:主角Shelley的设计与制作流程
  • Linux入门攻坚——39、Nginx入门
  • ubuntu22开机自动登陆和开机自动运行google浏览器自动打开网页
  • Vue项目练习之简单的小相册
  • 三、计算机视觉_07YOLO图像分类