Java解析word中的表格或者文本
接到一个需求:需要把 Word 文档中表格的某些列解析出来。这些列的列名是固定的,但表格之后可能会增加列,因此不能按固定的列索引取值,而要根据列名动态定位列的位置。
整体思路:先解析出 Word 中的所有表格(可能有多个),找到包含所需列名的那个表格;再根据列名确定这些列的索引;最后遍历表格的数据行,按索引取值并设置到对象属性上。
1、maven依赖
<!-- Apache POI scratchpad module; presumably needed for the legacy .doc (HWPF) table helper - confirm -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.17</version>
</dependency>
<!-- Apache POI OOXML module, used here to read .docx documents -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.17</version>
</dependency>
2、读取流数据
/**
 * Accepts an uploaded Word file and passes its content stream to {@code handlerDocx}.
 *
 * @param multipartFile the uploaded file, bound from the multipart part named {@code file}
 * @throws IOException if the upload stream cannot be opened or the document cannot be read
 */
@PostMapping(value = "/read")
public void upload(@RequestPart("file") MultipartFile multipartFile) throws IOException {
    // Release the upload stream as soon as parsing finishes, even when parsing fails.
    try (InputStream inputStream = multipartFile.getInputStream()) {
        handlerDocx(inputStream);
    }
}
3、处理逻辑
//解析字段关联的对象字段
private static final Map<String, String> needColumnsMap = new HashMap<>();
static {
needColumnsMap.put("姓名", "name");
needColumnsMap.put("单位", "unit");
needColumnsMap.put("月数", "month");
}
/**
 * Reads a .docx document from the given stream, logs every paragraph and table, and for each
 * table that contains all required column titles builds one {@code UserInfo} per data row.
 *
 * <p>Columns are located by title rather than by fixed position, so extra columns in the table
 * do not break the mapping.
 *
 * @param inputStream stream positioned at the start of a .docx document
 * @throws IOException if the document cannot be read or closed
 */
private void handlerDocx(InputStream inputStream) throws IOException {
    // The document holds the underlying package resources; close it once processing is done.
    try (XWPFDocument xwpf = new XWPFDocument(inputStream)) {
        // Body elements are the paragraphs and tables of the document.
        for (IBodyElement element : xwpf.getBodyElements()) {
            if (element instanceof XWPFParagraph) {
                String paragraphText = DocUtils.getParagraphText((XWPFParagraph) element);
                log.info("paragraphText = {}", paragraphText);
            } else if (element instanceof XWPFTable) {
                List<List<String>> tableText = DocUtils.getTableText((XWPFTable) element);
                log.info("tableText = {}", tableText);

                // Column index -> object field name, derived from the header row(s).
                Map<Integer, String> needColumnIndexRelaName = new HashMap<>();
                for (List<String> row : tableText) {
                    if (row.containsAll(needColumnsMap.keySet())) {
                        for (Map.Entry<String, String> needColumn : needColumnsMap.entrySet()) {
                            needColumnIndexRelaName.put(
                                    row.indexOf(needColumn.getKey()), needColumn.getValue());
                        }
                    }
                }

                // Tables without the required header are ignored.
                if (!needColumnIndexRelaName.isEmpty()) {
                    List<UserInfo> userInfos = new ArrayList<>();
                    for (List<String> row : tableText) {
                        // Skip header rows; only data rows become objects.
                        if (row.containsAll(needColumnsMap.keySet())) {
                            continue;
                        }
                        UserInfo info = new UserInfo();
                        for (Map.Entry<Integer, String> column : needColumnIndexRelaName.entrySet()) {
                            int columnIndex = column.getKey();
                            // Rows can be shorter than the header (e.g. merged cells);
                            // leave the field unset instead of failing on a missing cell.
                            if (columnIndex < row.size()) {
                                ReflectUtil.setFieldValue(info, column.getValue(), row.get(columnIndex));
                            }
                        }
                        userInfos.add(info);
                    }
                    log.info("{}", userInfos);
                }
            } else {
                log.info("其他内容");
            }
        }
    }
}
}
/**
 * Holds the values extracted from one data row of the parsed Word table.
 *
 * <p>The field names must stay in sync with the values of {@code needColumnsMap}, because
 * {@code handlerDocx} assigns them reflectively by name.
 */
@Data
class UserInfo {
// Value of the "姓名" column.
@TableField(value = "姓名")
private String name;
// Value of the "单位" column.
@TableField(value = "单位")
private String unit;
// Value of the "月数" column; kept as text exactly as read from the cell.
@TableField(value = "月数")
private String month;
}
4、工具类
/**
 * Static helpers that extract plain text from Word tables and paragraphs.
 */
public class DocUtils {

    /**
     * Extracts the text of a .docx table as a list of rows, each row being the list of its
     * cell texts in column order.
     *
     * <p>Only the paragraphs directly inside each cell are read (formatting is ignored). Empty
     * cells are returned as {@code null} and every {@code ","} is removed from cell text.
     *
     * @param table the table to read
     * @return one list per table row, containing one entry per cell
     */
    public static List<List<String>> getTableText(XWPFTable table) {
        List<List<String>> result = new ArrayList<>();
        for (XWPFTableRow row : table.getRows()) {
            List<String> cellTexts = new ArrayList<>();
            for (XWPFTableCell cell : row.getTableCells()) {
                // A cell may hold several paragraphs; concatenate them.
                StringBuilder sb = new StringBuilder();
                for (XWPFParagraph paragraph : cell.getParagraphs()) {
                    sb.append(getParagraphText(paragraph));
                }
                cellTexts.add(normalizeCellText(sb.toString()));
            }
            result.add(cellTexts);
        }
        return result;
    }

    /**
     * Returns the text of a .docx paragraph by concatenating the text of all its runs.
     *
     * @param paragraph the paragraph to read
     * @return the concatenated run text, or an empty string when the paragraph has no runs
     */
    public static String getParagraphText(XWPFParagraph paragraph) {
        StringBuilder runText = new StringBuilder();
        for (XWPFRun run : paragraph.getRuns()) {
            runText.append(run.text());
        }
        return runText.toString();
    }

    /**
     * Extracts the text of a legacy .doc table, keyed by the text of the first cell of each row.
     *
     * <p>The remaining cells of the row form the value list; empty cells become {@code null}
     * and every {@code ","} is removed from them. Rows that share the same first-cell text
     * overwrite each other, and a row without cells is stored under a {@code null} key.
     *
     * @param tb the table to read
     * @return map from first-cell text to the texts of the remaining cells of that row
     */
    public static Map<String, List<String>> getTabelDocText(Table tb) {
        Map<String, List<String>> result = new HashMap<>();
        for (int i = 0; i < tb.numRows(); i++) {
            TableRow tr = tb.getRow(i);
            List<String> list = new ArrayList<>();
            String key = null;
            for (int j = 0; j < tr.numCells(); j++) {
                TableCell td = tr.getCell(j);
                StringBuilder sb = new StringBuilder();
                for (int k = 0; k < td.numParagraphs(); k++) {
                    Paragraph paragraph = td.getParagraph(k);
                    sb.append(stripTrailingMark(paragraph.text()));
                }
                if (j == 0) {
                    // The first cell of the row is the map key and is stored as-is.
                    key = sb.toString();
                } else {
                    list.add(normalizeCellText(sb.toString()));
                }
            }
            result.put(key, list);
        }
        return result;
    }

    // Converts empty cell text to null and removes "," from non-empty text.
    private static String normalizeCellText(String text) {
        return text == null || text.isEmpty() ? null : text.replace(",", "");
    }

    // Drops a single trailing control character (presumably the paragraph/cell end mark that
    // .doc paragraph text carries - confirm), leaving text that ends in a normal character intact.
    private static String stripTrailingMark(String text) {
        if (text == null || text.isEmpty()) {
            return text;
        }
        int last = text.length() - 1;
        return Character.isISOControl(text.charAt(last)) ? text.substring(0, last) : text;
    }
}
5、最后表格里的字段映射到实体类的结果
[UserInfo(name=王, unit=电科院, month=4), UserInfo(name=邵, unit=电科院, month=4), UserInfo(name=王, unit=电科院, month=12), UserInfo(name=张, unit=有限公司, month=4) ]