大家好,本算法采用AC自动机(Aho-Corasick automaton)进行实现是在笔者,为解决主副关键词组匹配问题速度问题临时编写的程序,如果速度还慢可以引入多线程进行解决此问题,大家可以在此基础上进行扩展。
import java.util.*;
import java.util.concurrent.*;
/**
* Trie树节点类
*/
class TrieNode {
Map<Character, TrieNode> children; // 子节点映射表
boolean isEndOfWord; // 是否是关键词结尾
TrieNode fail; // 失败指针,指向其他节点
Set<String> mainKeywords; // 主关键词集合
Set<String> subKeywords; // 副关键词集合
public TrieNode() {
children = new HashMap<>();
isEndOfWord = false;
fail = null;
mainKeywords = new HashSet<>();
subKeywords = new HashSet<>();
}
}
/**
* Trie树类
*/
class Trie {
private TrieNode root;
public Trie() {
root = new TrieNode();
}
/**
* 插入主关键词到Trie树
* @param mainKeywords 主关键词,逗号分隔
*/
public void insertMainKeywords(String mainKeywords) {
String[] keywordsArray = mainKeywords.split(", ");
TrieNode current = root;
// 插入主关键词
for (String keyword : keywordsArray) {
for (char ch : keyword.toCharArray()) {
current.children.putIfAbsent(ch, new TrieNode());
current = current.children.get(ch);
}
current.isEndOfWord = true;
current.mainKeywords.add(keyword);
// 重置到根节点,以插入下一个关键词
current = root;
}
}
/**
* 插入副关键词到Trie树
* @param subKeywords 副关键词,逗号分隔
*/
public void insertSubKeywords(String subKeywords) {
String[] keywordsArray = subKeywords.split(", ");
TrieNode current = root;
// 插入副关键词
for (String keyword : keywordsArray) {
for (char ch : keyword.toCharArray()) {
current.children.putIfAbsent(ch, new TrieNode());
current = current.children.get(ch);
}
current.isEndOfWord = true;
current.subKeywords.add(keyword);
// 重置到根节点,以插入下一个关键词
current = root;
}
}
/**
* 构建Trie树的失败指针,用于KMP算法
*/
public void buildFailPointers() {
Queue<TrieNode> queue = new LinkedList<>();
for (TrieNode child : root.children.values()) {
child.fail = root;
queue.add(child);
}
while (!queue.isEmpty()) {
TrieNode current = queue.poll();
for (Map.Entry<Character, TrieNode> entry : current.children.entrySet()) {
char ch = entry.getKey();
TrieNode child = entry.getValue();
TrieNode failNode = current.fail;
while (failNode != null && !failNode.children.containsKey(ch)) {
failNode = failNode.fail;
}
if (failNode == null) {
child.fail = root;
} else {
child.fail = failNode.children.get(ch);
if (!child.mainKeywords.isEmpty()) {
child.mainKeywords.addAll(child.fail.mainKeywords);
}
if (!child.subKeywords.isEmpty()) {
child.subKeywords.addAll(child.fail.subKeywords);
}
}
queue.add(child);
}
}
}
/**
* 在文本中搜索主关键词,并返回匹配的主关键词集合
* @param text 要匹配的文本串
* @return 匹配的主关键词集合
*/
public Set<String> searchMainKeywords(String text) {
TrieNode current = root;
Set<String> matchedMainKeywords = new HashSet<>();
StringBuilder matchedMainKeyword = new StringBuilder();
for (char ch : text.toCharArray()) {
while (current != root && !current.children.containsKey(ch)) {
current = current.fail;
}
if (current.children.containsKey(ch)) {
current = current.children.get(ch);
matchedMainKeyword.append(ch);
if (current.isEndOfWord) {
matchedMainKeywords.addAll(current.mainKeywords);
}
} else {
current = root;
matchedMainKeyword.setLength(0);
}
}
return matchedMainKeywords;
}
/**
* 在文本中搜索副关键词,并返回匹配的副关键词集合
* @param text 要匹配的文本串
* @return 匹配的副关键词集合
*/
public Set<String> searchSubKeywords(String text) {
TrieNode current = root;
Set<String> matchedSubKeywords = new HashSet<>();
StringBuilder matchedSubKeyword = new StringBuilder();
for (char ch : text.toCharArray()) {
while (current != root && !current.children.containsKey(ch)) {
current = current.fail;
}
if (current.children.containsKey(ch)) {
current = current.children.get(ch);
matchedSubKeyword.append(ch);
if (current.isEndOfWord) {
matchedSubKeywords.addAll(current.subKeywords);
}
} else {
current = root;
matchedSubKeyword.setLength(0);
}
}
return matchedSubKeywords;
}
/**
* 在文本中搜索主副关键词,并返回匹配的主副关键词集合
* @param text 要匹配的文本串
* @return 匹配的主副关键词集合
*/
public Set<Map.Entry<String, String>> searchMainSubPairs(String text) {
Set<String> matchedMainKeywords = searchMainKeywords(text);
Set<String> matchedSubKeywords = searchSubKeywords(text);
Set<Map.Entry<String, String>> matchedPairs = new HashSet<>();
// 主副关键词都必须匹配到
for (String mainKeyword : matchedMainKeywords) {
for (String subKeyword : matchedSubKeywords) {
matchedPairs.add(new AbstractMap.SimpleEntry<>(mainKeyword, subKeyword));
}
}
return matchedPairs;
}
}
public class ChineseKeywordMatcher1 {
public static void main(String[] args) throws InterruptedException, ExecutionException {
// 定义主关键词
String mainKeywords = "人工智能, 隐私计算";
// 定义副关键词
String subKeywords = "AI, 联邦学习, 可信执行环境";
// 创建Trie树并插入主关键词和副关键词
Trie trie = new Trie();
trie.insertMainKeywords(mainKeywords);
trie.insertSubKeywords(subKeywords);
// 构建Trie树的失败指针,用于KMP算法
trie.buildFailPointers();
// 获取用户输入的文本
Scanner scanner = new Scanner(System.in);
System.out.print("请输入中文文本:");
String userInput = scanner.nextLine();
scanner.close();
// 搜索主副关键词,并返回匹配
Set<Map.Entry<String, String>> matchedPairs = trie.searchMainSubPairs(userInput);
// 输出匹配的主副关键词
if (!matchedPairs.isEmpty()) {
System.out.println("匹配的主副关键词:");
for (Map.Entry<String, String> pair : matchedPairs) {
System.out.println(pair.getKey() + " - " + pair.getValue());
}
} else {
System.out.println("没有匹配到主副关键词。");
}
}
}