利用文本挖掘技术来找出网络中的“小鲜词”
发布时间:2021-01-20 10:34:36  所属栏目:大数据  来源:网络整理 
            导读:开始之前,先看一下从人人网中发现的90后用户爱用的词,是不是很好玩,哈哈。写这篇文章就是让你简单地、自动地从文本中找出新的词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的是很有用,呜呜)。项目结构 当然,text.da
                
                
                
            | 抽词程序 package grid.text.evolution;
import grid.common.TextUtils;
import grid.text.dic.CnDictionary;
import grid.text.index.CnPreviewTextIndexer;
import grid.text.index.TextIndexer;
import grid.text.selector.CnTextSelector;
import grid.text.selector.TextSelector;
import java.util.HashSet;
import java.util.Set;
public class NewWordDiscover {
    /** Dictionary queried by {@code discover} to recognise candidates that are already known. */
    private CnDictionary dictionary;

    /** Minimum candidate word length handed to the text selector. */
    private static final int MIN_CANDIDATE_LEN = 2;

    /** Maximum candidate word length handed to the text selector. */
    private static final int MAX_CANDIDATE_LEN = 6;

    /**
     * Characters (pronouns, particles, interjections) that disqualify a
     * candidate when they appear as its first or last character.
     */
    private static final char[] structuralLetters = { '我','你','您','他','她','谁','哪','那','这','的','了','着','也','是','有','不','在','与','呢','啊','呀','吧','嗯','哦','哈','呐' };

    /** Lookup set built once from {@link #structuralLetters} for membership tests. */
    private static final Set<Character> structuralLetterSet = new HashSet<Character>();

    static {
        // Copy every structural character into the set (boxed to Character).
        for (int i = 0; i < structuralLetters.length; i++) {
            structuralLetterSet.add(structuralLetters[i]);
        }
    }
    /**
     * Creates a discoverer that uses the dictionary returned by
     * {@code CnDictionary.Instance()}.
     */
    public NewWordDiscover() {
        this.dictionary = CnDictionary.Instance();
    }
    /**
     * Discovers new words in the given document.
     *
     * <p>Discovery is based on statistics and entropy. Per the original
     * author's guidance, the document should be around the 100 KB level,
     * otherwise the result may be unsatisfactory.
     *
     * <p>Candidates produced by the selector are skipped when they are blank
     * or when their first or last character is a structural letter. A
     * candidate already present in the dictionary or in the result set causes
     * {@code selector.select()} to be called; any other candidate is added to
     * the result when the entropy judger accepts it.
     *
     * @param document the text to scan for new words
     * @return the set of candidates accepted by the entropy judger
     */
    public Set<String> discover(String document) {
        final Set<String> discovered = new HashSet<String>();
        final TextIndexer indexer = new CnPreviewTextIndexer(document);
        final TextSelector selector =
                new CnTextSelector(document, MIN_CANDIDATE_LEN, MAX_CANDIDATE_LEN);
        final EntropyJudger judger = new EntropyJudger(indexer);

        while (!selector.end()) {
            final String word = selector.next();
            if (TextUtils.isBlank(word) || hasStructuralBoundary(word)) {
                continue;
            }

            // To find new words without any dictionary, reduce this condition
            // to "discovered.contains(word)".
            final boolean alreadyKnown =
                    dictionary.contains(word) || discovered.contains(word);
            if (alreadyKnown) {
                selector.select();
                continue;
            }

            if (judger.judge(word)) {
                discovered.add(word);
            }
        }
        return discovered;
    }

    /**
     * Tells whether the first or the last character of {@code word} is a
     * structural letter. The caller must pass a non-empty string.
     */
    private static boolean hasStructuralBoundary(String word) {
        final char first = word.charAt(0);
        if (structuralLetterSet.contains(first)) {
            return true;
        }
        final char last = word.charAt(word.length() - 1);
        return structuralLetterSet.contains(last);
    }
}index(编辑:我爱故事小小网_铜陵站长网) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意中侵犯到您的权利,请及时联系站长删除相关内容! | 


 浙公网安备 33038102330570号
 浙公网安备 33038102330570号