利用文本挖掘技术来找出网络中的“小鲜词”

发布时间：2021-01-20 10:34:36　所属栏目：大数据　来源：网络整理

导读：开始之前，先看一下从人人网中发现的90后用户爱用的词，是不是很好玩，哈哈。写这篇文章就是为了让你简单、自动地从文本中找出新的词，这样就知道现在的年轻人喜欢什么了（对于博主这种上了年纪的人来说，真的是很有用，呜呜）。项目结构：当然，text.da…

用来做文本处理，如判断是否为空、匹配字符等

package grid.common;


/**
 * Static helpers for character classification and simple string checks.
 */
public class TextUtils {

    /**
     * Tells whether {@code c} lies in the range U+4E00..U+9FCB (inclusive),
     * which is the range this class treats as Chinese letters.
     *
     * @param c the character to test
     * @return {@code true} if {@code c} is inside that range
     */
    public static boolean isCnLetter(char c) {
        return c >= 0x4E00 && c <= 0x9FCB;
    }

    /**
     * Tells whether {@code c} is one of the ASCII digits {@code '0'}..{@code '9'}.
     *
     * @param c the character to test
     * @return {@code true} for an ASCII digit
     */
    public static boolean isNumeric(char c) {
        return c >= '0' && c <= '9';
    }

    /**
     * Tells whether {@code c} is an ASCII letter, lower or upper case.
     *
     * @param c the character to test
     * @return {@code true} for {@code a-z} or {@code A-Z}
     */
    public static boolean isEnLetter(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /**
     * Checks whether {@code dest} occurs in {@code src} starting exactly at
     * index {@code off}.
     *
     * <p>An empty {@code dest} matches at any offset. If any compared position
     * falls outside {@code src} (before its start or past its end) the result
     * is {@code false}; no exception is thrown for an out-of-range offset.
     *
     * @param src  the text to look into; must not be {@code null}
     * @param off  index in {@code src} where the comparison starts
     * @param dest the text expected at {@code off}; must not be {@code null}
     * @return {@code true} if every character of {@code dest} equals the
     *         character of {@code src} at the corresponding position
     */
    public static boolean match(String src, int off, String dest) {
        final int len = dest.length();
        final int srcLen = src.length();
        for (int i = 0; i < len; i++) {
            final int idx = off + i;
            // A negative index (negative off, or int overflow of off + i) is
            // rejected the same way as an index past the end of src.
            if (idx < 0 || idx >= srcLen) {
                return false;
            }
            if (dest.charAt(i) != src.charAt(idx)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether {@code str} is {@code null}, empty, or becomes empty after
     * {@link String#trim()}.
     *
     * @param str the string to test; may be {@code null}
     * @return {@code true} if there is no non-trimmable content
     */
    public static boolean isBlank(String str) {
        // trim() of an empty string is still empty, so no separate isEmpty() check is needed.
        return null == str || str.trim().isEmpty();
    }
}

Tree.java

语法树

package grid.common;


/**
 * A {@code Node} subclass that declares no members of its own; it only gives
 * the node type a distinct name.
 *
 * @param <T> type of the value handed to the {@code Node} constructor
 */
public class Tree<T> extends Node<T> {

    /**
     * Creates a tree by passing {@code value} straight to the {@code Node}
     * constructor.
     *
     * @param value the value forwarded to the superclass
     */
    public Tree(final T value) {
        super(value);
    }

}

dic

里边包含CnDictionary类

这里写图片描述

CnDictionary.java

词典处理

package grid.text.dic;

import grid.common.CountMap;
import grid.common.TextDatReader;
import grid.common.TextUtils;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;


/**
 * Word dictionary plus per-character frequency statistics.
 *
 * <p>Both data sets are read through {@code TextDatReader} when the single
 * instance is created by {@link #Instance()}.
 */
public class CnDictionary {

    /** Resource holding the known words, separated by line breaks. */
    private static final String COMMON_WORD_DIC_PATH = "common.dic";

    /**
     * This text data is for character statistic. Change to your own if you
     * like.
     */
    private static final String COMMON_LETTER_RESOURCE_PATH = "text.dat";

    /** Every non-blank, trimmed word read from the word resource. */
    private final Set<String> dictionary = new HashSet<String>();

    /** Occurrence counter for each Chinese character of the letter resource. */
    private final CountMap<Character> letterCountMap = new CountMap<Character>();

    /** Value of {@code letterCountMap.count()} taken after loading. */
    private int totalLetterCount;

    private static CnDictionary instance;

    /**
     * Returns the shared instance, creating it on first use.
     *
     * <p>If loading the resources throws an {@link IOException}, the stack
     * trace is printed and {@code null} is returned; the next call tries
     * again. Callers therefore have to cope with a {@code null} result.
     *
     * @return the shared dictionary, or {@code null} if it could not be loaded
     */
    public static CnDictionary Instance() {
        if (null == instance) {
            try {
                instance = new CnDictionary();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return instance;
    }

    /**
     * Loads the word dictionary first, then the character statistics.
     *
     * @throws IOException if either resource cannot be read
     */
    private CnDictionary() throws IOException {
        initWordDic();
        initLetterCountMap();
    }

    /**
     * Reads the letter resource and counts every character accepted by
     * {@code TextUtils.isCnLetter}; all other characters are skipped.
     *
     * @throws IOException if the resource cannot be read
     */
    private void initLetterCountMap() throws IOException {
        final String letterResource = TextDatReader.read(COMMON_LETTER_RESOURCE_PATH);
        final int len = letterResource.length();
        for (int i = 0; i < len; i++) {
            final char c = letterResource.charAt(i);
            if (TextUtils.isCnLetter(c)) {
                letterCountMap.increase(c);
            }
        }
        totalLetterCount = letterCountMap.count();
    }

    /**
     * Reads the word resource and stores each line as one dictionary word.
     *
     * <p>A word ends at a line feed, a carriage return or a NUL character;
     * reading stops at the first NUL. Blank lines are ignored and words are
     * stored trimmed. A final word without a trailing terminator is stored
     * as well.
     *
     * @throws IOException if the resource cannot be read
     */
    private void initWordDic() throws IOException {
        final String content = TextDatReader.read(COMMON_WORD_DIC_PATH);
        final int len = content.length();
        final StringBuilder word = new StringBuilder();
        for (int i = 0; i < len; i++) {
            final char c = content.charAt(i);
            // The separators are the control characters '\n' and '\r', not
            // the letters 'n' and 'r'.
            if ('\n' == c || '\r' == c || 0 == c) {
                flushWord(word);
                if (0 == c) {
                    break;
                }
            } else {
                word.append(c);
            }
        }
        // The resource may end without a line break; keep its last word.
        flushWord(word);
    }

    /** Adds the buffered word to the dictionary unless it is blank, then empties the buffer. */
    private void flushWord(StringBuilder word) {
        final String candidate = word.toString();
        if (!TextUtils.isBlank(candidate)) {
            dictionary.add(candidate.trim());
        }
        word.setLength(0);
    }

    /**
     * Tells whether {@code word} is one of the loaded dictionary words.
     *
     * @param word the exact word to look up
     * @return {@code true} if the dictionary holds {@code word}
     */
    public boolean contains(String word) {
        return dictionary.contains(word);
    }

    /**
     * Returns the count recorded for {@code c} divided by the total count
     * taken after loading.
     *
     * <p>NOTE(review): if no Chinese character was counted the divisor is 0;
     * what {@code CountMap.get} yields for an uncounted character is not
     * visible here — confirm before relying on the result in those cases.
     *
     * @param c the character to look up
     * @return the relative frequency of {@code c} in the letter resource
     */
    public double rate(char c) {
        return (double) letterCountMap.get(c) / totalLetterCount;
    }

    /**
     * Returns the number of distinct words in the dictionary.
     *
     * @return the dictionary size
     */
    public int size() {
        return dictionary.size();
    }
}

evolution

这里写图片描述

EntropyJudger.java

计算熵值

package grid.text.evolution;

import grid.common.CountMap;
import grid.common.TextUtils;
import grid.text.index.Pos;
import grid.text.index.TextIndexer;


/**
 * Decides whether a candidate string qualifies as a word, using two measures
 * computed from a {@code TextIndexer}: a "solid rate" and the entropy of the
 * characters found directly before and after the candidate.
 */
public class EntropyJudger {

    private TextIndexer indexer;

    /** Minimum number of occurrences a candidate needs to be considered at all. */
    private static final int LEAST_COUNT_THRESHOLD = 5;

    /**
     * Threshold for solid rate calculated by word appeared count and every
     * single letter.
     *
     * The smaller this value is, more new words you will get, but with less
     * accuracy. The greater this value is, less new words you will get, but
     * with high accuracy.
     */
    private static final double SOLID_RATE_THRESHOLD = 0.018;

    /**
     * Threshold for entropy value calculated by candidate word prefix
     * character count and suffix character count.
     *
     * Candidates whose entropy is below this value are rejected, so a smaller
     * value accepts more candidates and a greater value accepts fewer.
     */
    private static final double ENTROPY_THRESHOLD = 1.92;

    /**
     * @param indexer the index used for all lookups
     */
    public EntropyJudger(TextIndexer indexer) {
        this.indexer = indexer;
    }

    /**
     * Accepts {@code candidate} unless its solid rate or its entropy is below
     * the respective threshold. The entropy is only computed when the solid
     * rate check has passed.
     *
     * @param candidate the string to evaluate
     * @return {@code false} if either measure is below its threshold,
     *         {@code true} otherwise
     */
    public boolean judge(String candidate) {
        if (getSolidRate(candidate) < SOLID_RATE_THRESHOLD) {
            return false;
        }
        // Written as a negated '<' so that the outcome matches the threshold
        // test exactly for every double value.
        return !(getEntropy(candidate) < ENTROPY_THRESHOLD);
    }

    /**
     * Walks over every occurrence of {@code candidate} reported by the
     * indexer, counts the Chinese characters directly before and directly
     * after it, and returns the smaller of the two resulting entropies.
     */
    private double getEntropy(String candidate) {
        final Pos pos = new Pos(candidate);
        final CountMap<Character> frontCountMap = new CountMap<Character>();
        final CountMap<Character> backCountMap = new CountMap<Character>();
        final int candidateLen = candidate.length();

        while (indexer.find(pos).isFound()) {
            final int off = pos.getPos();

            // NOTE(review): for an occurrence at offset 0 this asks for index
            // -1, and for one at the very end it asks for an index past the
            // text; this relies on TextIndexer.charAt tolerating both —
            // confirm against its implementation.
            final char before = indexer.charAt(off - 1);
            if (TextUtils.isCnLetter(before)) {
                frontCountMap.increase(before);
            }
            final char after = indexer.charAt(off + candidateLen);
            if (TextUtils.isCnLetter(after)) {
                backCountMap.increase(after);
            }
        }

        final double frontEntropy = entropyOf(frontCountMap);
        final double backEntropy = entropyOf(backCountMap);
        return frontEntropy > backEntropy ? backEntropy : frontEntropy;
    }

    /**
     * Computes {@code -sum(p * ln p)} over all keys of {@code counts}, where
     * {@code p} is the key's count divided by {@code counts.count()}.
     * Returns 0 when there are no keys.
     */
    private static double entropyOf(CountMap<Character> counts) {
        final int total = counts.count();
        double entropy = 0;
        for (char key : counts.keySet()) {
            final double rate = (double) counts.get(key) / total;
            entropy -= rate * Math.log(rate);
        }
        return entropy;
    }

    /**
     * Computes the solid rate of {@code candidate}.
     *
     * <p>For every character the ratio "occurrences of the candidate /
     * occurrences of that character" is taken; the result is the geometric
     * mean of those ratios multiplied by the square root of the candidate
     * length.
     *
     * @param candidate the string to evaluate
     * @return 1 for candidates shorter than two characters, 0 for candidates
     *         occurring fewer than {@code LEAST_COUNT_THRESHOLD} times,
     *         otherwise the value described above
     */
    public double getSolidRate(String candidate) {
        final int candidateLen = candidate.length();
        if (candidateLen < 2) {
            return 1;
        }

        final int count = indexer.count(candidate);
        if (count < LEAST_COUNT_THRESHOLD) {
            return 0;
        }

        double rate = 1;
        for (int i = 0; i < candidateLen; i++) {
            final String letter = String.valueOf(candidate.charAt(i));
            rate *= (double) count / indexer.count(letter);
        }

        return Math.pow(rate, 1D / candidateLen) * Math.sqrt(candidateLen);
    }

    /**
     * Replaces the index used for all subsequent lookups.
     *
     * @param indexer the new index
     */
    public void setIndexer(TextIndexer indexer) {
        this.indexer = indexer;
    }

}

NewWordDiscover.java

（编辑：我爱故事小小网_铜陵站长网）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时联系站长删除相关内容！

3/13

首页

尾页

善用企业数据策略无惧	未来已来 Cloudera拥抱
MPP与Hadoop 两种主流	数据科学家应对的几大