Spring Boot 集成 PDFBox 解析 PDF 文件
- 1、引入依赖
- 2、获取PDF文件
- 3、获取需要的字段信息
- 4、获取多字段时,步骤3需要优化,以下为优化后代码
1、引入依赖
<!-- Apache PDFBox: supplies the PDDocument and PDFTextStripper classes used by the Java snippets below. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.24</version> <!-- Check for the latest 2.0.x release. NOTE(review): the snippets below call PDDocument.load, which is the 2.x API; PDFBox 3.x loads files through Loader.loadPDF instead. -->
</dependency>
2、获取PDF文件
// Path of the PDF to parse (sample Windows path).
String filePath = "C:\\xx\\a.pdf";
String text;
// try-with-resources closes the document even when text extraction throws,
// so the underlying file handle is never leaked.
try (PDDocument document = PDDocument.load(new File(filePath))) {
    PDFTextStripper pdfStripper = new PDFTextStripper();
    // Request position-sorted output from the stripper before extracting.
    pdfStripper.setSortByPosition(true);
    text = pdfStripper.getText(document);
}
3、获取需要的字段信息
/** Matches the invoice-number label; capture group 1 holds the digits that follow it. */
private static final Pattern INVOICE_NUMBER_PATTERN = Pattern.compile("发票号码:(\\d+)");

/**
 * Extracts the invoice number from text taken out of an invoice PDF.
 *
 * <p>Full-width parentheses (U+FF08 / U+FF09) are converted to ASCII parentheses before
 * matching, then the first run of digits following the invoice-number label is captured.
 *
 * @param invoiceInfo text of the invoice; {@code null} is treated as empty input
 * @return a new mutable map containing the key {@code invoiceNumber} when the label is found,
 *         otherwise an empty map
 */
private Map<String,String> pdfStr(String invoiceInfo) {
    Map<String, String> map = new HashMap<>();
    if (invoiceInfo == null) {
        return map;
    }
    // String.replace is a literal replacement. replaceAll would compile "(" and ")" as
    // regular expressions, which are invalid patterns and throw PatternSyntaxException.
    String normalized = invoiceInfo.replace('\uFF08', '(').replace('\uFF09', ')');

    Matcher matcherInvoiceNumber = INVOICE_NUMBER_PATTERN.matcher(normalized);
    if (matcherInvoiceNumber.find()) {
        map.put("invoiceNumber", matcherInvoiceNumber.group(1));
    }
    return map;
}
4、获取多字段时,步骤3需要优化,以下为优化后代码
/** Single-value fields: result key mapped to a pattern whose capture group 1 is the value. */
private static final Map<String, Pattern> FIELD_PATTERNS = buildFieldPatterns();

/** Buyer name (group 1) and seller name (group 2), matched together by one pattern. */
private static final Pattern PARTY_NAMES_PATTERN = dotAll("购 名称:(.+?) 销 名称:(.+?)\n");

/** Separator between the columns of the line-item text: any run of whitespace. */
private static final Pattern COLUMN_SEPARATOR = Pattern.compile("\\s+");

/**
 * Extracts invoice fields from text taken out of an invoice PDF.
 *
 * <p>Full-width parentheses (U+FF08 / U+FF09) are converted to ASCII parentheses first, then
 * each field pattern is searched independently; a field whose pattern does not match is simply
 * absent from the result. Possible keys: {@code invoiceNumber}, {@code invoiceDate},
 * {@code buyerName}, {@code sellerName}, {@code itemDetails}, {@code total},
 * {@code batchNumber}, {@code productionDate}, {@code expirationDate},
 * {@code taxIncludedPrice}, {@code manufacturer}, {@code approvalNumber}, {@code issuer}.
 * When {@code itemDetails} holds at least eight whitespace-separated columns, the keys
 * {@code productName}, {@code specification}, {@code unit}, {@code quantity},
 * {@code unitPrice}, {@code amount}, {@code taxRate} and {@code taxAmount} are added as well.
 *
 * @param invoiceInfo text of the invoice; {@code null} is treated as empty input
 * @return a new mutable map of the fields that were found, never {@code null}
 */
public static Map<String, String> pdfStr(String invoiceInfo) {
    Map<String, String> result = new HashMap<>();
    if (invoiceInfo == null) {
        return result;
    }
    // String.replace is a literal replacement. replaceAll would compile "(" and ")" as
    // regular expressions, which are invalid patterns and throw PatternSyntaxException.
    String text = invoiceInfo.replace('\uFF08', '(').replace('\uFF09', ')');

    for (Map.Entry<String, Pattern> entry : FIELD_PATTERNS.entrySet()) {
        Matcher matcher = entry.getValue().matcher(text);
        if (matcher.find()) {
            result.put(entry.getKey(), matcher.group(1).trim());
        }
    }

    // The party pattern captures two values, so it is handled outside the group-1-only loop.
    Matcher parties = PARTY_NAMES_PATTERN.matcher(text);
    if (parties.find()) {
        result.put("buyerName", parties.group(1).trim());
        result.put("sellerName", parties.group(2).trim());
    }

    addItemColumns(result);
    return result;
}

/** Splits the captured {@code itemDetails} text into its individual column entries. */
private static void addItemColumns(Map<String, String> result) {
    String itemDetails = result.get("itemDetails");
    if (itemDetails == null) {
        return;
    }
    // Splitting on whitespace runs (not on a single space) keeps repeated spaces and
    // "\r\n" line ends from producing empty tokens that would shift every column index.
    String[] details = COLUMN_SEPARATOR.split(itemDetails);
    if (details.length < 8) {
        return;
    }
    // A ninth token is appended to the product name (presumably a name wrapped onto the
    // next line - TODO confirm against real invoices).
    String nameTail = details.length > 8 ? details[8] : "";
    result.put("productName", details[0] + nameTail);
    result.put("specification", details[1]);
    result.put("unit", details[2]);
    result.put("quantity", details[3]);
    result.put("unitPrice", details[4]);
    result.put("amount", details[5]);
    result.put("taxRate", details[6]);
    result.put("taxAmount", details[7]);
}

/** Builds the table of single-value field patterns once, at class initialization. */
private static Map<String, Pattern> buildFieldPatterns() {
    Map<String, Pattern> patterns = new HashMap<>();
    patterns.put("invoiceNumber", dotAll("发票号码:(\\d+)"));
    patterns.put("invoiceDate", dotAll("开票日期:(\\d{4}年\\d{1,2}月\\d{1,2}日)"));
    patterns.put("itemDetails", dotAll("税 额\\s+(.*?)合 计"));
    patterns.put("total", dotAll("\\(小写\\)¥(\\d+(\\.\\d+)?)"));
    patterns.put("batchNumber", dotAll("批号:(.+?)/"));
    patterns.put("productionDate", dotAll("生产日期:(\\d{4}-\\d{1,2}-\\d{1,2})/"));
    patterns.put("expirationDate", dotAll("有效期至:(\\d{4}-\\d{1,2}-\\d{1,2})/"));
    patterns.put("taxIncludedPrice", dotAll("含税单价:(\\d+(\\.\\d+)?)"));
    patterns.put("manufacturer", dotAll("生产厂家:(.+?)/"));
    patterns.put("approvalNumber", dotAll("批准文号:(.+?)/"));
    // Limited to the rest of the line: a greedy ".+" under DOTALL would capture
    // everything up to the end of the document.
    patterns.put("issuer", dotAll("开票人:([^\\r\\n]+)"));
    return patterns;
}

/** Compiles a field pattern with DOTALL so that "." may span line breaks. */
private static Pattern dotAll(String regex) {
    return Pattern.compile(regex, Pattern.DOTALL);
}