全文检索工具 Lucene 入门
最近在了解 Halo 博客后端源码,而全文检索是 Halo 做得比较差的一块内容,仅通过数据库的模糊查询来实现文章检索。我之前对搜索引擎了解得也不多,所以开始入门 Lucene 检索引擎,如果可以的话,准备将该引擎应用于 Halo 之上。
整体而言全文检索是一件很费资源的事。
一、Demo 案例
这里举一个用 Lucene 实现文章查询的例子。
Maven 配置如下:
<!-- Lucene core: index writing, index reading and searching -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>7.2.0</version>
</dependency>
<!-- Common Lucene analyzers -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>7.2.0</version>
</dependency>
<!-- Chinese analyzer (provides SmartChineseAnalyzer, used by the sample code) -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-smartcn</artifactId>
<version>7.2.0</version>
</dependency>
<!-- Query parser: turns a query string into a Query over the analyzed index -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>7.2.0</version>
</dependency>
<!-- Highlighting of matched keywords in search results -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>7.2.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
<!-- NOTE: using this analyzer with Lucene 7.2.0 fails with the AbstractMethodError shown in section 2.2; the sample code does not use it -->
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
代码实现:
package com.nineya.lucene;
import com.nineya.lucene.entity.Post;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
/**
 * Minimal Lucene demo: builds an on-disk index from a few in-memory posts and
 * then answers interactive queries read from standard input.
 *
 * <p>Input format per line: {@code <title keywords>[ <category>]}. When the line
 * contains a space, the text after the last space is used as an exact category
 * filter and the rest is parsed as the title query.
 */
public class LuceneMain {
    private static final List<Post> POSTS = new ArrayList<>();
    // Location of the on-disk index
    private static final String INDEX_PATH = "./data/lucene/indexData";

    /**
     * Populates {@link #POSTS} with the sample data that gets indexed.
     */
    private static void buildData() {
        POSTS.add(new Post(1L, "Dream 主题之 Halo 2.0 适配,以及适配前后的一些异同", "我的项目",
                new String[]{"dream"}, "经过一段时间的适配,目前 Dream 已经发布了基于 Halo 2.x 的第一个预发版本。"));
        POSTS.add(new Post(2L, "互联网新理念,对于WEB 3.0 你怎么看?", "生活",
                new String[]{"IDEA", "区块链"}, "WEB 3.0 这个名词走进大众视野已经有一段时间了,也曾在各个圈子里火热一时,至今各大互联网企业任旧在 WEB 3.0 上不断探索。"));
        POSTS.add(new Post(3L, "GCC编译环境升级部署", "运维",
                new String[]{"应用部署"}, "近期经常遇到使用源码编译的部署方式进行应用部署,在 GCC 编译环境上遇到各种问题,本文对升级部署 GCC 编译环境的流程以及遇到的一些问题进行记录。"));
        POSTS.add(new Post(4L, "有一片草原", "生活",
                new String[]{"故事"}, "从前,有一片广阔无垠的大草原,和风旭日,青草芳美。"));
    }

    /**
     * Writes every entry of {@link #POSTS} into the index at {@link #INDEX_PATH}.
     *
     * <p>The index is opened in create-or-append mode. If it already contains
     * documents, nothing is written and a warning is printed, so running the demo
     * repeatedly does not duplicate the data. I/O failures are printed, not rethrown.
     */
    public static void createIndex() {
        // The analyzer is a closeable resource too; it is declared first so that it is
        // closed last, after the writer that uses it.
        try (SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
             Directory directory = FSDirectory.open(Paths.get(INDEX_PATH));
             IndexWriter indexWriter = new IndexWriter(directory,
                     new IndexWriterConfig(analyzer)
                             // Create the index if it is missing, otherwise open the existing one
                             .setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND))) {
            if (indexWriter.numDocs() != 0) {
                System.out.println("WARN: 已经初始化过数据");
                return;
            }
            for (Post post : POSTS) {
                // Convert the post into a Lucene document
                Document document = new Document();
                document.add(new TextField("title", post.getTitle(), Field.Store.YES));
                // StringField is not tokenized: the whole value is indexed as one term
                document.add(new StringField("categories", post.getCategories(), Field.Store.YES));
                document.add(new TextField("content", post.getContent(), Field.Store.YES));
                indexWriter.addDocument(document);
            }
            // Flush the pending documents to disk
            indexWriter.commit();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Searches the index and prints the total hit count plus the top two hits.
     *
     * @param keys       query string for the {@code title} field, in classic
     *                   {@link QueryParser} syntax
     * @param categories exact value the {@code categories} field must have, or
     *                   {@code null} to search across all categories
     */
    public static void search(String keys, String categories) {
        // The directory is opened as its own resource: closing the reader alone
        // does not close the directory it was opened on.
        try (Directory directory = FSDirectory.open(Paths.get(INDEX_PATH));
             DirectoryReader reader = DirectoryReader.open(directory);
             SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer()) {
            IndexSearcher searcher = new IndexSearcher(reader);
            System.out.println(searcher.getTopReaderContext().reader().numDocs());
            // Combine the individual conditions into one boolean query
            BooleanQuery.Builder builder = new BooleanQuery.Builder();
            // The title must match the parsed keywords
            builder.add(new QueryParser("title", analyzer).parse(keys), BooleanClause.Occur.MUST);
            if (categories != null) {
                // The category field is untokenized, so it is matched with an exact TermQuery
                builder.add(new TermQuery(new Term("categories", categories)), BooleanClause.Occur.MUST);
            }
            BooleanQuery query = builder.build();
            // Fetch only the two best-scoring matches
            TopDocs docs = searcher.search(query, 2);
            System.out.println("符合条件的条数为:" + docs.totalHits);
            // Print the stored fields of every returned hit
            for (ScoreDoc scoreDoc : docs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                System.out.println("title = " + doc.get("title"));
                System.out.println("- categories = " + doc.get("categories"));
                System.out.println("- content = " + doc.get("content"));
            }
        } catch (ParseException | IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the sample data and index, then runs searches for each line typed
     * on standard input until end of input.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        buildData();
        createIndex();
        // Deliberately not closed: closing the Scanner would also close System.in
        Scanner in = new Scanner(System.in);
        System.out.print("请输入查询关键词:");
        // hasNextLine() pairs with nextLine(); hasNext() would skip over blank lines
        // and let nextLine() hand an empty string to the query parser
        while (in.hasNextLine()) {
            String keys = in.nextLine().trim();
            if (!keys.isEmpty()) {
                String categories = null;
                int index = keys.lastIndexOf(" ");
                if (index >= 0) {
                    // Text after the last space is the category filter
                    categories = keys.substring(index + 1);
                    keys = keys.substring(0, index).trim();
                }
                System.out.printf("keys = %s, categories = %s\n", keys, categories);
                search(keys, categories);
            }
            System.out.print("请输入查询关键词:");
        }
    }
}
二、问题解决
2.1 数据检索不到
需要注意 StringField 是不会做分词的,整个字段值会作为一个完整的词项(Term)写入索引。因此这个字段不能通过带分词器的 QueryParser 进行检索,而需要通过 TermQuery 用完整的字段值进行精确匹配。
2.2 IKAnalyzer 报错
报错内容如下。查了一下,这是 IKAnalyzer 长期没有维护、与当前 Lucene 版本不兼容导致的,网上有通过继承 IKAnalyzer 类来解决该报错的方案。
Exception in thread "main" java.lang.AbstractMethodError: org.apache.lucene.analysis.Analyzer.createComponents(Ljava/lang/String;)Lorg/apache/lucene/analysis/Analyzer$TokenStreamComponents;
at org.apache.lucene.analysis.Analyzer.tokenStream(Analyzer.java:198)
at org.apache.lucene.document.Field.tokenStream(Field.java:505)
at org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:730)
at org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430)
at org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392)
at org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:240)
at org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:496)
at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1729)
at org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1464)
at com.nineya.lucene.LuceneMain.createIndex(LuceneMain.java:69)
at com.nineya.lucene.LuceneMain.main(LuceneMain.java:116)