当前位置: 首页 > article >正文

Java 实现将Word 转换成markdown

日常开发中,经常需要将 Word 等文档的内容转换为结构化的标记语言。本文介绍如何使用 Java(基于 Apache POI)将 Word(.docx)文档转换为 Markdown。

1、引入 jar包

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>5.2.3</version>
</dependency>

2、使用以下代码进行编写(示例中的 main 方法使用 fastjson 输出结果,FileMarkdownDTO 为自定义的返回对象,需自行引入或定义)

package com.nd.ai.test.service.utils;

import com.alibaba.fastjson.JSONObject;
import com.nd.ai.test.service.dto.apitest.FileMarkdownDTO;
import org.apache.commons.compress.utils.Lists;
import org.apache.poi.xwpf.usermodel.*;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.UUID;

/**
 * Converts a Word document into Markdown using the Apache POI XWPF model.
 *
 * <p>The generated Markdown file ({@code <uuid>.md}) and an image directory
 * ({@code <uuid>-images}) are created in the same directory as the source document.
 * Headings, list items, tables, embedded pictures and basic run styles
 * (bold / italic / underline) are translated; other body elements are ignored.
 *
 * @ClassName WordToMarkdownConverter
 * @Author Administrator
 */
public class WordToMarkdownConverter {

    /** Style-id prefix that identifies a heading paragraph, e.g. {@code Heading1}. */
    private static final String HEADING_STYLE_PREFIX = "Heading";

    /** Deepest heading level that Markdown supports. */
    private static final int MAX_HEADING_LEVEL = 6;

    /** Number of spaces emitted per list nesting level. */
    private static final int LIST_INDENT_WIDTH = 4;

    /**
     * Converts the Word document at {@code wordPath} to Markdown.
     *
     * @param wordPath path of the Word document to read
     * @return a DTO holding the Markdown file path, the Markdown text, the parser status
     *         ({@code "success"}) and the paths of all extracted images
     * @throws IOException if the document cannot be read, the image directory cannot be
     *                     created, or the Markdown file / an image cannot be written
     */
    public static FileMarkdownDTO convertWordToMarkdown(String wordPath) throws IOException {
        List<String> imagePathList = Lists.newArrayList();
        FileMarkdownDTO dto = new FileMarkdownDTO();

        File file = new File(wordPath);
        String uuid = UUID.randomUUID().toString();
        // File(parent, child) tolerates a null parent, so a bare file name resolves against the
        // working directory instead of producing a literal "null/<uuid>.md" path.
        File outputFile = new File(file.getParent(), uuid + ".md");
        File imageDirFile = new File(file.getParent(), uuid + "-images");
        String imageDir = imageDirFile.getPath();

        // The Markdown is assembled in memory so it can be both written to disk and returned
        // without re-reading the file.
        StringWriter markdown = new StringWriter();

        // try-with-resources guarantees the stream and the document are closed on every path.
        try (FileInputStream fis = new FileInputStream(file);
             XWPFDocument document = new XWPFDocument(fis)) {

            if (!imageDirFile.isDirectory() && !imageDirFile.mkdirs()) {
                throw new IOException("Unable to create image directory: " + imageDir);
            }

            for (IBodyElement element : document.getBodyElements()) {
                if (element instanceof XWPFParagraph) {
                    processParagraph((XWPFParagraph) element, markdown, imageDir, imagePathList);
                } else if (element instanceof XWPFTable) {
                    processTable((XWPFTable) element, markdown, imageDir, imagePathList);
                }
            }
        }

        String markdownContent = markdown.toString();
        // Explicit UTF-8: the platform default charset may not be able to encode the document text.
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(outputFile), StandardCharsets.UTF_8)) {
            writer.write(markdownContent);
        }

        dto.setMdPath(outputFile.getPath());
        dto.setMarkdownContent(markdownContent);
        dto.setParserStatus("success");
        dto.setFileImagePathList(imagePathList);
        return dto;
    }

    /**
     * Writes one body paragraph as a heading, a list item or a plain paragraph.
     * Paragraphs without any content are skipped.
     */
    private static void processParagraph(XWPFParagraph paragraph, Writer writer, String imageDir,
                                         List<String> imageNamePath) throws IOException {
        String content = processParagraphContent(paragraph, imageDir, imageNamePath);
        if (content.isEmpty()) {
            return;
        }

        int level = headingLevel(paragraph.getStyle());
        if (level > 0) {
            StringBuilder heading = new StringBuilder();
            for (int i = 0; i < level; i++) {
                heading.append('#');
            }
            heading.append(' ').append(content).append("\n\n");
            writer.write(heading.toString());
        } else if (isListParagraph(paragraph)) {
            writer.write(getListMark(paragraph) + " " + content + "\n");
        } else {
            writer.write(content + "\n\n");
        }
    }

    /**
     * Extracts the heading level from a style id such as {@code Heading2}.
     *
     * @return a level between 1 and {@link #MAX_HEADING_LEVEL}, or 0 when the style is not a
     *         heading style followed by a digit of at least 1
     */
    private static int headingLevel(String style) {
        if (style == null || !style.startsWith(HEADING_STYLE_PREFIX)) {
            return 0;
        }
        int digitIndex = HEADING_STYLE_PREFIX.length();
        if (style.length() <= digitIndex) {
            return 0;
        }
        // Character.digit yields -1 for non-digits, which is rejected below.
        int digit = Character.digit(style.charAt(digitIndex), 10);
        if (digit < 1) {
            return 0;
        }
        return Math.min(digit, MAX_HEADING_LEVEL);
    }

    /**
     * Builds the Markdown list marker for a list paragraph: indentation for the nesting level
     * followed by {@code 1.} for ordered lists or {@code -} for every other numbering format.
     */
    private static String getListMark(XWPFParagraph para) {
        int indentLevel = para.getNumIlvl() != null ? para.getNumIlvl().intValue() : 0;
        StringBuilder mark = new StringBuilder();
        for (int i = 0; i < indentLevel * LIST_INDENT_WIDTH; i++) {
            mark.append(' ');
        }
        // Markdown renumbers ordered items itself, so a constant "1." is sufficient.
        mark.append(isOrderedList(para) ? "1." : "-");
        return mark.toString();
    }

    /** Returns whether the paragraph uses one of the recognised ordered or unordered formats. */
    private static boolean isListParagraph(XWPFParagraph paragraph) {
        return isOrderedList(paragraph) || isUnorderedList(paragraph);
    }

    /** Returns whether the paragraph's numbering format is a decimal, roman or letter format. */
    private static boolean isOrderedList(XWPFParagraph paragraph) {
        String numFmt = paragraph.getNumFmt();
        return "decimal".equals(numFmt) || "upperRoman".equals(numFmt) || "lowerRoman".equals(numFmt)
                || "upperLetter".equals(numFmt) || "lowerLetter".equals(numFmt);
    }

    /** Returns whether the paragraph's numbering format is {@code bullet}. */
    private static boolean isUnorderedList(XWPFParagraph paragraph) {
        return "bullet".equals(paragraph.getNumFmt());
    }

    /**
     * Writes a table as a Markdown pipe table; the first row is treated as the header row.
     * Paragraphs inside one cell are joined with {@code <br>}.
     */
    private static void processTable(XWPFTable table, Writer writer, String imageDir,
                                     List<String> imageNamePath) throws IOException {
        List<XWPFTableRow> rows = table.getRows();
        if (rows.isEmpty()) {
            return;
        }

        StringBuilder mdTable = new StringBuilder();
        for (int i = 0; i < rows.size(); i++) {
            List<XWPFTableCell> cells = rows.get(i).getTableCells();
            mdTable.append("|");

            for (XWPFTableCell cell : cells) {
                StringBuilder cellContent = new StringBuilder();
                for (XWPFParagraph para : cell.getParagraphs()) {
                    String text = processParagraphContent(para, imageDir, imageNamePath);
                    if (text.isEmpty()) {
                        continue;
                    }
                    if (isListParagraph(para)) {
                        // Nesting indentation has no meaning inside a table cell, so drop it.
                        text = getListMark(para).trim() + " " + text;
                    }
                    if (cellContent.length() > 0) {
                        cellContent.append("<br>");
                    }
                    // A raw newline or pipe would terminate the row / cell prematurely.
                    cellContent.append(text.replace("\n", "<br>").replace("|", "\\|"));
                }
                mdTable.append(cellContent.toString().trim()).append("|");
            }
            mdTable.append("\n");

            // Header separator line, emitted once after the first row.
            if (i == 0) {
                mdTable.append("|");
                for (int j = 0; j < cells.size(); j++) {
                    mdTable.append(" --- |");
                }
                mdTable.append("\n");
            }
        }
        writer.write(mdTable + "\n\n");
    }

    /**
     * Renders the inline content of a paragraph: embedded pictures are saved and replaced by
     * image links, and the first text node of each run is emitted with its styles applied.
     * No heading or list prefix is added here; callers add it exactly once.
     *
     * @return the trimmed inline Markdown, possibly empty
     */
    private static String processParagraphContent(XWPFParagraph paragraph, String imageDir,
                                                  List<String> imageNamePath) throws IOException {
        StringBuilder sb = new StringBuilder();

        for (XWPFRun run : paragraph.getRuns()) {
            for (XWPFPicture picture : run.getEmbeddedPictures()) {
                String imageMarkdown = saveImage(picture, imageDir, imageNamePath);
                if (!imageMarkdown.isEmpty()) {
                    sb.append(imageMarkdown).append(" ");
                }
            }

            String text = run.getText(0);
            if (text == null) {
                continue;
            }
            sb.append(applyTextStyles(run, text));
        }

        return sb.toString().trim();
    }

    /**
     * Wraps the run text in Markdown emphasis markers according to the run's style.
     * Surrounding whitespace is kept outside the markers, and whitespace-only text is returned
     * unchanged, because emphasis delimiters adjacent to whitespace do not render.
     */
    private static String applyTextStyles(XWPFRun run, String text) {
        int start = 0;
        int end = text.length();
        while (start < end && Character.isWhitespace(text.charAt(start))) {
            start++;
        }
        while (end > start && Character.isWhitespace(text.charAt(end - 1))) {
            end--;
        }
        if (start == end) {
            return text;
        }

        String core = text.substring(start, end);
        if (run.isBold()) {
            core = "**" + core + "**";
        }
        if (run.isItalic()) {
            core = "*" + core + "*";
        }
        if (run.getUnderline() != UnderlinePatterns.NONE) {
            // "__text__" is strong emphasis in Markdown, so underline is expressed as inline HTML.
            core = "<u>" + core + "</u>";
        }
        return text.substring(0, start) + core + text.substring(end);
    }

    /**
     * Saves an embedded picture into {@code imageDir} under a random file name and records the
     * saved path in {@code imageNamePath}.
     *
     * @return the Markdown image link, or an empty string when the picture has no data
     */
    private static String saveImage(XWPFPicture picture, String imageDir,
                                    List<String> imageNamePath) throws IOException {
        XWPFPictureData picData = picture.getPictureData();
        if (picData == null) {
            return "";
        }

        String fileName = "img_" + UUID.randomUUID() + "." + picData.suggestFileExtension();
        File output = new File(imageDir, fileName);
        try (FileOutputStream fos = new FileOutputStream(output)) {
            fos.write(picData.getData());
        }
        // Record the path only after the image has actually been written.
        imageNamePath.add(output.getPath());

        return "![" + fileName + "](" + imageDir + "/" + fileName + ")";
    }

    /**
     * Command-line entry point: converts the document given as the first argument (or the
     * placeholder path when none is given) and prints the result DTO as JSON.
     */
    public static void main(String[] args) throws Exception {
        String wordPath = args.length > 0 ? args[0] : "word path";
        System.out.println(JSONObject.toJSONString(convertWordToMarkdown(wordPath)));
    }
}

获得信息

{
"fileImagePathList":["文件中图片路径1","文件中图片路径2"],
"markdownContent": "markdown 信息",
"mdPath": "markdown文件地址"
}

运行上方的程序将会得到
1、解析文件中的所有图片,并保存到源文件同级目录下的 `<uuid>-images` 文件夹中
2、将word 文档转换成markdown
3、获取markdown 文件


http://www.kler.cn/a/613467.html

相关文章:

  • fyrox 2D和3D游戏的制作
  • uvm factory
  • android studio调试aosp手机userdebug版本无法查看局部变量和参数问题如何解决?
  • 设置github 代理
  • 解决PLC通信会断然后报错的问题
  • 相对位置2d矩阵和kron运算的思考
  • MFC中的窗口线程安全性与CWnd类
  • 从 YOLO11 模型格式导出到TF.js 模型格式 ,环境爬坑,依赖关系已经贴出来了
  • 智慧养老时代:老年人慢性病预防与生活方式优化
  • 【今日EDA行业分析】2025年3月28日
  • 基于扩散模型的光照编辑新突破:IC-Light方法解析与优化
  • DeepSeek大模型应用开发新模式
  • 智能舵机:AI融合下的自动化新纪元
  • ADZS-ICE-2000和AD-ICE2000仿真器在线升级固件
  • Error:Flash Download failed
  • AIGC-广告助手创作智能体完整指令(DeepSeek,豆包,千问,Kimi,GPT)
  • Ubuntu与CentOS操作指令的主要区别详解
  • 【力扣hot100题】(004)盛水最多的容器
  • 【go微服务】如何快速掌握grpc开发
  • 计算机二级WPS Office第十二套WPS演示