java通过ocr实现识别pdf中的文字
需求:识别pdf文件中的中文
基于 GitHub 上 mymonstercat 的 RapidOcr-Java 项目改造:先将 PDF 的每一页渲染为 PNG 图片并存入临时文件夹,再通过 RapidOcr 识别图片中的文字,最后删除临时文件夹。
1、引入依赖
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>3.0.3</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.3</version>
</dependency>
<!-- ocr图片识别 -->
<dependency>
<groupId>io.github.mymonstercat</groupId>
<artifactId>rapidocr</artifactId>
<version>0.0.7</version>
</dependency>
<dependency>
<groupId>io.github.mymonstercat</groupId>
<artifactId>rapidocr-onnx-platform</artifactId>
<version>0.0.7</version>
</dependency>
<!-- 本地测试可不引入;部署到 Linux x86_64 服务器时需要引入。其他运行环境请在 Maven 仓库中搜索对应平台的依赖 -->
<dependency>
<groupId>io.github.mymonstercat</groupId>
<artifactId>rapidocr-onnx-linux-x86_64</artifactId>
<version>1.2.2</version>
</dependency>
2、工具类
import org.springframework.util.StringUtils;
import com.benjaminwan.ocrlibrary.OcrResult;
import com.benjaminwan.ocrlibrary.TextBlock;
import io.github.mymonstercat.Model;
import io.github.mymonstercat.ocr.InferenceEngine;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.springframework.stereotype.Service;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;
import java.util.UUID;
@Service
public class PdfOCRConverter {
//临时输出png文件路径
private static final String outputDirs = "D:/pdfToImg/temp/";
public static void main(String[] args) throws IOException {
List<String> fileNameList = getWords("D:/Download/123.pdf");
for (String fileName : fileNameList) {
System.out.println(fileName);
}
}
public static List<String> getWords(String pdfFilePath) throws IOException {
String outputDir = outputDirs + UUID.randomUUID().toString().replace("-", "");
List<String> fileNameList = convertPdfToImage(pdfFilePath, outputDir);
List<String> wordsList = new ArrayList<>();
for (String fileName : fileNameList) {
System.out.println("识别图片:"+fileName);
if (StringUtils.isEmpty(fileName)){break;}
List<String> words = runOcr(fileName);
for (String word : words) {
System.out.println(word);
wordsList.add(word);
}
}
deleteDirectory(outputDir);
return wordsList;
}
public static List<String> runOcr(String path) {
List<String> results = new ArrayList<>();
InferenceEngine engine = InferenceEngine.getInstance(Model.ONNX_PPOCR_V3);
OcrResult ocrResult = engine.runOcr(path);
for (TextBlock textBlock : ocrResult.getTextBlocks()) {
results.add(textBlock.getText());
}
return results;
}
public static List<String> convertPdfToImage(String pdfFilePath, String outputDir) {
// 设置DPI(越高图片越清晰,但文件也会更大)
int dpi = 300;
List<String> fileNameList = new ArrayList<>();
File file = new File(pdfFilePath);
try (PDDocument document = Loader.loadPDF(file)) {
PDFRenderer pdfRenderer = new PDFRenderer(document);
String pdfFileName = file.getName().replace(".pdf", "");
String name = pdfFileName;
for (int page = 0; page < document.getNumberOfPages(); page++) {
BufferedImage bim = pdfRenderer.renderImageWithDPI(page, dpi);
String folder = createFolder(outputDir + "/" + name);
String fileName = folder + "/" + pdfFileName + "_page_" + (page + 1) + ".png";
ImageIO.write(bim, "png", new File(fileName));
fileNameList.add(fileName);
System.out.println("生成图片:"+fileName);
}
} catch (IOException e) {
e.printStackTrace();
}
return fileNameList;
}
public static void deleteDirectory(String path) throws IOException {
// 如果路径不指向一个目录,则抛出异常
Path directory = Paths.get(path);
if (!Files.isDirectory(directory)) {
throw new IOException("The provided path is not a directory.");
}
// 遍历目录中的所有文件和子目录
Files.walkFileTree(directory, new SimpleFileVisitor<Path>() {
@Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
// 删除文件
Files.delete(file);
return FileVisitResult.CONTINUE;
}
@Override
public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
// 所有内容被删除后删除目录本身
Files.delete(dir);
return FileVisitResult.CONTINUE;
}
@Override
public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
// 如果访问文件失败,则抛出异常
throw exc;
}
});
}
public static String createFolder(String folderPath) {
String txt = folderPath;
try {
File myFilePath = new File(txt);
txt = folderPath;
if (!myFilePath.exists()) {
myFilePath.mkdirs();
}
} catch (Exception e) {
e.printStackTrace();
}
return txt;
}
public static List<String> getWordsByBase64(String base64) throws IOException {
List<String> words = new ArrayList<>();
if (StringUtils.isEmpty(base64)) {
return null;
}
String outputDir = outputDirs + UUID.randomUUID().toString().replace("-", "");
// 解码Base64字符串
byte[] decodedBytes = Base64.getDecoder().decode(base64);
createFolder(outputDir);
// 输出的PDF文件名
String outputFilePath = outputDir+"/output.pdf";
try (FileOutputStream fos = new FileOutputStream(outputFilePath)) {
// 将解码后的字节数组写入文件
fos.write(decodedBytes);
System.out.println("PDF文件已成功生成: " + outputFilePath);
words = getWords(outputFilePath);
} catch (Exception e) {
e.printStackTrace();
}
deleteDirectory(outputDir);
return words;
}
}