java主副关键词组匹配

发布时间:2024年01月23日

大家好,本算法采用AC自动机(Aho-Corasick automaton)进行实现是在笔者,为解决主副关键词组匹配问题速度问题临时编写的程序,如果速度还慢可以引入多线程进行解决此问题,大家可以在此基础上进行扩展。

import java.util.*;
import java.util.concurrent.*;

/**
 * Trie树节点类
 */
class TrieNode {
    Map<Character, TrieNode> children; // 子节点映射表
    boolean isEndOfWord; // 是否是关键词结尾
    TrieNode fail; // 失败指针,指向其他节点
    Set<String> mainKeywords; // 主关键词集合
    Set<String> subKeywords; // 副关键词集合

    public TrieNode() {
        children = new HashMap<>();
        isEndOfWord = false;
        fail = null;
        mainKeywords = new HashSet<>();
        subKeywords = new HashSet<>();
    }
}

/**
 * Trie树类
 */
class Trie {
    private TrieNode root;

    public Trie() {
        root = new TrieNode();
    }

    /**
     * 插入主关键词到Trie树
     * @param mainKeywords 主关键词,逗号分隔
     */
    public void insertMainKeywords(String mainKeywords) {
        String[] keywordsArray = mainKeywords.split(", ");
        TrieNode current = root;

        // 插入主关键词
        for (String keyword : keywordsArray) {
            for (char ch : keyword.toCharArray()) {
                current.children.putIfAbsent(ch, new TrieNode());
                current = current.children.get(ch);
            }
            current.isEndOfWord = true;
            current.mainKeywords.add(keyword);

            // 重置到根节点,以插入下一个关键词
            current = root;
        }
    }

    /**
     * 插入副关键词到Trie树
     * @param subKeywords 副关键词,逗号分隔
     */
    public void insertSubKeywords(String subKeywords) {
        String[] keywordsArray = subKeywords.split(", ");
        TrieNode current = root;

        // 插入副关键词
        for (String keyword : keywordsArray) {
            for (char ch : keyword.toCharArray()) {
                current.children.putIfAbsent(ch, new TrieNode());
                current = current.children.get(ch);
            }
            current.isEndOfWord = true;
            current.subKeywords.add(keyword);

            // 重置到根节点,以插入下一个关键词
            current = root;
        }
    }

    /**
     * 构建Trie树的失败指针,用于KMP算法
     */
    public void buildFailPointers() {
        Queue<TrieNode> queue = new LinkedList<>();
        for (TrieNode child : root.children.values()) {
            child.fail = root;
            queue.add(child);
        }

        while (!queue.isEmpty()) {
            TrieNode current = queue.poll();
            for (Map.Entry<Character, TrieNode> entry : current.children.entrySet()) {
                char ch = entry.getKey();
                TrieNode child = entry.getValue();

                TrieNode failNode = current.fail;
                while (failNode != null && !failNode.children.containsKey(ch)) {
                    failNode = failNode.fail;
                }

                if (failNode == null) {
                    child.fail = root;
                } else {
                    child.fail = failNode.children.get(ch);
                    if (!child.mainKeywords.isEmpty()) {
                        child.mainKeywords.addAll(child.fail.mainKeywords);
                    }
                    if (!child.subKeywords.isEmpty()) {
                        child.subKeywords.addAll(child.fail.subKeywords);
                    }
                }

                queue.add(child);
            }
        }
    }

    /**
     * 在文本中搜索主关键词,并返回匹配的主关键词集合
     * @param text 要匹配的文本串
     * @return 匹配的主关键词集合
     */
    public Set<String> searchMainKeywords(String text) {
        TrieNode current = root;
        Set<String> matchedMainKeywords = new HashSet<>();
        StringBuilder matchedMainKeyword = new StringBuilder();

        for (char ch : text.toCharArray()) {
            while (current != root && !current.children.containsKey(ch)) {
                current = current.fail;
            }

            if (current.children.containsKey(ch)) {
                current = current.children.get(ch);
                matchedMainKeyword.append(ch);
                if (current.isEndOfWord) {
                    matchedMainKeywords.addAll(current.mainKeywords);
                }
            } else {
                current = root;
                matchedMainKeyword.setLength(0);
            }
        }

        return matchedMainKeywords;
    }

    /**
     * 在文本中搜索副关键词,并返回匹配的副关键词集合
     * @param text 要匹配的文本串
     * @return 匹配的副关键词集合
     */
    public Set<String> searchSubKeywords(String text) {
        TrieNode current = root;
        Set<String> matchedSubKeywords = new HashSet<>();
        StringBuilder matchedSubKeyword = new StringBuilder();

        for (char ch : text.toCharArray()) {
            while (current != root && !current.children.containsKey(ch)) {
                current = current.fail;
            }

            if (current.children.containsKey(ch)) {
                current = current.children.get(ch);
                matchedSubKeyword.append(ch);
                if (current.isEndOfWord) {
                    matchedSubKeywords.addAll(current.subKeywords);
                }
            } else {
                current = root;
                matchedSubKeyword.setLength(0);
            }
        }

        return matchedSubKeywords;
    }

    /**
     * 在文本中搜索主副关键词,并返回匹配的主副关键词集合
     * @param text 要匹配的文本串
     * @return 匹配的主副关键词集合
     */
    public Set<Map.Entry<String, String>> searchMainSubPairs(String text) {
        Set<String> matchedMainKeywords = searchMainKeywords(text);
        Set<String> matchedSubKeywords = searchSubKeywords(text);
        Set<Map.Entry<String, String>> matchedPairs = new HashSet<>();

        // 主副关键词都必须匹配到
        for (String mainKeyword : matchedMainKeywords) {
            for (String subKeyword : matchedSubKeywords) {
                matchedPairs.add(new AbstractMap.SimpleEntry<>(mainKeyword, subKeyword));
            }
        }

        return matchedPairs;
    }
}

public class ChineseKeywordMatcher1 {
    public static void main(String[] args) throws InterruptedException, ExecutionException {
        // 定义主关键词
        String mainKeywords = "人工智能, 隐私计算";

        // 定义副关键词
        String subKeywords = "AI, 联邦学习, 可信执行环境";

        // 创建Trie树并插入主关键词和副关键词
        Trie trie = new Trie();
        trie.insertMainKeywords(mainKeywords);
        trie.insertSubKeywords(subKeywords);

        // 构建Trie树的失败指针,用于KMP算法
        trie.buildFailPointers();

        // 获取用户输入的文本
        Scanner scanner = new Scanner(System.in);
        System.out.print("请输入中文文本:");
        String userInput = scanner.nextLine();
        scanner.close();

        // 搜索主副关键词,并返回匹配
        Set<Map.Entry<String, String>> matchedPairs = trie.searchMainSubPairs(userInput);

        // 输出匹配的主副关键词
        if (!matchedPairs.isEmpty()) {
            System.out.println("匹配的主副关键词:");
            for (Map.Entry<String, String> pair : matchedPairs) {
                System.out.println(pair.getKey() + " - " + pair.getValue());
            }
        } else {
            System.out.println("没有匹配到主副关键词。");
        }
    }
}

文章来源:https://blog.csdn.net/weixin_43114209/article/details/135684363
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。