当前位置: 首页 > article >正文

开发手账(一)

一、 关于设计

(一)数据库

  1. 确定外键标识时,需先判断该字段是否有可能被修改。例如菜单id、菜单code、菜单名三者中,前两者一经创建通常不再变化,都可以作为外键;菜单名可能随业务调整而被修改,不应作为外键。

二、关于组件

(一)POI

1. 文档页数统计

import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.ofdrw.reader.OFDReader;
import org.springframework.web.multipart.MultipartFile;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
@Slf4j
public class LvDocPageCounter {
    public static final String DOCUMENT_PAGE_TEMP = "DOCUMENT_PAGE_TEMP";
    public static int getPageCount(String filePath) {
        String fileType = getFileType(filePath);
        try {
            switch (fileType) {
                case "pdf":
                    return getPdfPageCount(filePath);
                case "docx":
                    return getDocxPageCount(filePath);
                case "doc":
                    return getDocPageCount(filePath);
                case "ofd":
                    return getOfdPageCount(filePath);
                // Add more cases for other document types as needed
                default:
                    log.warn("不支持的文件类型:{}", filePath);
                    return 1;
//                throw new IllegalArgumentException("Unsupported file type");
            }
        } catch (Exception e) {
            log.warn("读取文件异常:{},{}", filePath,e);
            return 0;
        }
    }

    /**
     * 文件类型
     * @param filePath
     * @return
     */
    private static String getFileType(String filePath) {
        int dotIndex = filePath.lastIndexOf('.');
        if (dotIndex == -1 || dotIndex == filePath.length() - 1) {
            log.warn("文件名中没有找到扩展名:{}", filePath);
            return "";
        }
        return filePath.substring(dotIndex + 1).toLowerCase();
    }
    /**
     * 获取PDF文档页数
     * @param filePath
     * @return
     * @throws IOException
     */
    private static int getPdfPageCount(String filePath) throws IOException {
        try (PDDocument document = Loader.loadPDF(new File(filePath))) {
//            PDDocument document = new PDDocument();
            int numberOfPages = document.getNumberOfPages();
            document.close();
            return numberOfPages;
        }
    }

    /**
     * 获取doc文档页数
     * @param filePath
     * @return
     * @throws IOException
     */
    private static int getDocPageCount(String filePath) throws IOException {
//        try (InputStream inputStream = new FileInputStream(filePath);
//             HWPFDocument document = new HWPFDocument(inputStream)) {
//            int pageCount = document.getSummaryInformation().getPageCount();
//            document.close();
//            return pageCount;
//        }
        try (InputStream inputStream = new FileInputStream(filePath)) {
            com.aspose.words.Document doc = new com.aspose.words.Document(inputStream);
            int num = doc.getPageCount();
            doc.cleanup();
            return num;
        } catch (Exception e) {
            e.printStackTrace();
            return 0;
        }
    }

    /**
     * 获取docx页数
     * @param filePath
     * @return
     * @throws IOException
     */
    private static int getDocxPageCount(String filePath) throws IOException {
//        try (InputStream inputStream = new FileInputStream(filePath);
//             XWPFDocument document = new XWPFDocument(inputStream)) {
//            int pages = document.getProperties().getExtendedProperties().getUnderlyingProperties().getPages();
//            document.close();
//            return pages;
//        }
        try (InputStream inputStream = new FileInputStream(filePath)) {
            com.aspose.words.Document doc = new com.aspose.words.Document(inputStream);
            int num = doc.getPageCount();
            doc.cleanup();
            return num;
        } catch (Exception e) {
            e.printStackTrace();
            return 0;
        }

    }

    /**
     * pdf页数
     * @param filePath
     * @return
     * @throws IOException
     */
    private static int getOfdPageCount(String filePath) throws IOException {
        Path ofdFile = Paths.get(filePath);
        OFDReader ofdReader = new OFDReader(ofdFile);
        int numberOfPages = ofdReader.getNumberOfPages();
        ofdReader.close();
        return numberOfPages;
    }


    /**
     * 获取缓存文件页数
     * @param inputStream
     * @param originalFilename
     * @return
     */
    public static Integer getPageCount(MultipartFile inputStream, String originalFilename) {
        try (InputStream inputStream1 = inputStream.getInputStream()) {
            return getPageCount(inputStream1,originalFilename);
        } catch (IOException e) {
            log.warn("读取文件异常:{},{}", originalFilename,e);
            return 0;
        }
    }

// Add methods for other document types as needed
}

2. 文本提取

import cn.hutool.core.io.FileUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FilenameUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.ofdrw.converter.export.TextExporter;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * @author yilv
 * @version 1.0
 * @description: TODO
 * @date 2023/11/16 16:12
 */
@Slf4j
public class LvDocTxTHunter {
    private static AtomicInteger  UPPER_LIMIT=new AtomicInteger(50);
    /**
     * 读取文档内容
     * @param filePath
     * @return
     */
    public static String readText(String filePath) {

        int pageCount = LvDocPageCounter.getPageCount(filePath);
        if (pageCount >UPPER_LIMIT.get()) {
            log.warn("文件过大:{},{}", filePath,pageCount);
            return "";
        }
        String fileType = getFileType(filePath);
        try {
            switch (fileType) {
                case "pdf":
                    return readPdfText(filePath);
                case "doc":
                    return readDocText(filePath);
                case "docx":
                    return readDocxText(filePath);
                case "ofd":
                    return readOfdText(filePath);
                // Add more cases for other document types as needed
                default:
                    log.warn("不支持的文件类型:{}", filePath);
                    return "";
            }
        } catch (IOException e) {
            log.warn("读取文件异常:{},{}", filePath,e);
            return "";
        }

    }

    /**
     * 获取文件类型
     * @param filePath
     * @return
     */
    private static String getFileType(String filePath) {
        int dotIndex = filePath.lastIndexOf('.');
        if (dotIndex == -1 || dotIndex == filePath.length() - 1) {
            log.warn("文件名中没有找到扩展名:{}", filePath);
            return "";
        }
        return filePath.substring(dotIndex + 1).toLowerCase();
    }

    /**
     * 获取pdf文本
     * @param filePath
     * @return
     * @throws IOException
     */
    private static String readPdfText(String filePath) throws IOException {
        try (PDDocument document = Loader.loadPDF(filePath)) {
            String text = new PDFTextStripper().getText(document);
            document.close();
            return text;
        }
    }

    /**
     * 获取doc文本
     * @param filePath
     * @return
     * @throws IOException
     */
    private static String readDocText(String filePath) throws IOException {
        try (InputStream inputStream = new FileInputStream(filePath);
             HWPFDocument document = new HWPFDocument(inputStream)) {
            WordExtractor extractor = new WordExtractor(document);
            String text = extractor.getText();
            document.close();
            return text;
        }
    }

    /**
     * 获取docx文本
     * @param filePath
     * @return
     * @throws IOException
     */
    private static String readDocxText(String filePath) throws IOException {
        try (InputStream inputStream = new FileInputStream(filePath);
             XWPFDocument document = new XWPFDocument(inputStream)) {
            XWPFWordExtractor extractor = new XWPFWordExtractor(document);
            String text = extractor.getText();
            document.close();
            return text;
        }
    }
    /**
     * pdf页数
     * @param filePath
     * @return
     * @throws IOException
     */
    private static String readOfdText(String filePath) throws IOException {
        Path txtPath = Paths.get("DOCUMENT_PAGE_TEMP", FilenameUtils.getBaseName(filePath) + ".txt");
        TextExporter textExporter = new TextExporter(Paths.get(filePath), txtPath);
        textExporter.export();
        String s = FileUtil.readUtf8String(txtPath.toFile());
        textExporter.close();
        return s;
    }

    /**
     * 获取文件文本
     * @param tempFile
     * @return
     */
    public static String readText(File tempFile) {
        return readText(tempFile.getPath());
    }
    // Add methods for other document types as needed
}

3. 文档转换

  • ofd转换
    • ①启动加载字体
    /**
     * Start-up data loading: scans the local {@code font} directory with the
     * preloaded {@code FontLoader} and prints the names of the fonts it now knows.
     *
     * <p>The name-to-path mapping is read reflectively from the loader's
     * {@code fontNamePathMapping} field.
     */
    private static void systemInit() {
        FontLoader fontLoader = FontLoader.Preload();
        fontLoader.scanFontDir(Paths.get(FileUtil.local, "font"));

        // Look the field up first, then read its value from the loader instance.
        Object rawMapping = ReflectUtil.getFieldValue(
                fontLoader, ReflectUtil.getField(FontLoader.class, "fontNamePathMapping"));
        Map<String, String> pathsByFontName = (Map<String, String>) rawMapping;

        String loadedFontNames = JSONUtil.toJsonStr(pathsByFontName.keySet());
        System.out.println("加载字体:" + loadedFontNames);
    }
    • ②使用ofdrw进行pdf转换
    /**
     * Converts an OFD document to PDF.
     *
     * @param ofdPath  path of the source OFD file
     * @param distPath output directory; not used by the conversion and kept only so
     *                 existing callers keep compiling
     * @param pdfPath  path of the PDF file to write
     * @throws IOException if the exporter cannot be opened or closed
     */
    public static void convertOfdToPDFByBridge(String ofdPath, String distPath, String pdfPath) throws IOException {
        log.debug("解析文件:{}", ofdPath);
        // try-with-resources releases the exporter even when export() throws.
        try (PDFExporterIText exporter = new PDFExporterIText(Paths.get(ofdPath), Paths.get(pdfPath))) {
            exporter.export();
        }
    }

http://www.kler.cn/a/137367.html

相关文章:

  • vs2022编译opencv 4.10.0
  • Oracle库锁表处理
  • 快速掌握Elasticsearch检索之二:滚动查询(scrool)获取全量数据(golang)
  • 原生js封装ajax请求以及css实现提示效果和禁止点击效果
  • 安装教程:慧集通集成平台(DataLinkX)智能体客户端安装操作(Linux/windows/mac)
  • springboot+vue实现SSE服务器发送事件
  • 为什么STM32在中国这么出名?
  • SpringBoot 注解开发
  • 深入了解Java 8 新特性:Stream流的实践应用(二)
  • FFmpeg常用命令行讲解及实战一
  • redis的高可用
  • 循环链表3
  • 数据类型扩展02
  • Java修仙记之记录一次与前端女修士论道的经历
  • 数据治理之springboot项目入门
  • 3.6 Windows驱动开发:内核进程汇编与反汇编
  • 安装和初步使用 nn-Meter
  • 通过AppLink把拼多多热门榜单商品同步至小红书
  • Python基础学习__测试报告
  • 单元测试实战(四)MyBatis-Plus 的测试
  • 人机交互复习专题
  • 【阿里云】图像识别
  • 猫罐头牌子哪个好一点?精选5款口碑好的猫罐头推荐!
  • 在Python中调用imageJ开发
  • 【算法基础】高精度运算
  • 虾皮网同行数据丨虾皮数据工具-知虾:监控竞争对手数据的利器