ECNU Natural Language Processing Assignment 4: Naive Bayes Classification

Published: January 20, 2024

Naive Bayes Sentiment Classification

Task description:

Use the movie reviews in the NLTK corpus to train a Naive Bayes sentiment classifier.

Importing the movie reviews:

from nltk.corpus import movie_reviews
The files live under …\nltk_data\corpora\movie_reviews and are already labeled: 1000 negative and 1000 positive reviews.
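A quick sanity check of the corpus layout (this assumes the corpus has already been fetched, e.g. via nltk.download('movie_reviews')):

import nltk
# nltk.download('movie_reviews')   # uncomment on first run
from nltk.corpus import movie_reviews
print(movie_reviews.categories())            # ['neg', 'pos']
print(len(movie_reviews.fileids('neg')),
      len(movie_reviews.fileids('pos')))     # 1000 1000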

Steps:

Run text preprocessing (noise removal, sentence splitting, tokenization, stopword removal, stemming, trimming) and feature selection to build a feature word list, then train a Naive Bayes model. (Ideally, comment each step.)
Use the first 80% (i.e., the first 800 negative and the first 800 positive reviews) as the training set and the remaining 20% as the test set.

Output:

Report Accuracy, Precision, Recall, and the F1 score, rounded to two decimal places,
where F1 = (2 * Precision * Recall) / (Precision + Recall).

For example:

Accuracy = 0.98
Precision = 0.67
Recall = 0.32
F1 = 0.43
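A quick arithmetic check of the F1 formula on the example numbers above:

p, r = 0.67, 0.32
print("F1 = %.2f" % (2 * p * r / (p + r)))   # F1 = 0.43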


Step 1: Load the data
from nltk.corpus import movie_reviews
all_words = movie_reviews.words()
# all_words
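all_words is a lazy view over every token in the corpus; a quick peek (exact counts depend on the installed corpus version, and len() iterates the whole corpus, so it may take a moment):

print(len(all_words))    # total number of tokens in the corpus
print(all_words[:10])    # first few raw tokens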

Step 2: Text preprocessing (cf. Assignment 2)

Normalization: convert everything to lowercase, strip punctuation, remove stopwords, and stem the remaining tokens.

import re
import string
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

def prepro(token):
    # Lowercase every token
    text_lower = [word.lower() for word in token]

    # Strip punctuation characters and drop tokens that become empty
    x = re.compile('[%s]' % re.escape(string.punctuation))
    text_punc = []
    for tok in text_lower:
        new_token = x.sub(u'', tok)
        if new_token != u'':
            text_punc.append(new_token)

    # Remove English stopwords
    stops = set(stopwords.words('english'))
    text_norm = [w for w in text_punc if w not in stops]

    # Stem with the Porter stemmer (instantiate it once, outside the loop)
    stemmer = PorterStemmer()
    text_stem = [stemmer.stem(word) for word in text_norm]
    return text_stem

pro_words = prepro(all_words)

# pro_words
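A quick illustration of what prepro does on a hand-made token list (the tokens here are invented for illustration):

print(prepro(['The', 'Movies', ',', 'are', 'Running', '!']))
# expected: ['movi', 'run']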

Step 3: Feature selection
import nltk
# Feature selection: keep the top 5% of words by frequency (or, alternatively, the top 2000).
freq_words = nltk.FreqDist(pro_words)
n = int( 0.05 * len(freq_words.keys() ))
words_features = freq_words.most_common(n)
words_features = [x[0] for x in words_features]
# words_features = list(freq_words.keys())[:2000]
# words_features
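For reference, nltk.FreqDist simply counts token frequencies; a tiny illustration on an invented list:

toy = nltk.FreqDist(['film', 'good', 'film'])
print(toy.most_common(1))   # [('film', 2)]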

Step 4: Generate document feature dictionaries
def doc_features(doc):
    # Deduplicate the document's tokens
    doc_words = set(doc)
    features = {}
    for word in words_features:
        # Vectorize: is this feature word present in the document?
        features['contains(%s)' % word] = (word in doc_words)
    return features

documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()  # category ('neg' or 'pos')
            for fileid in movie_reviews.fileids(category)] # file ids in that category

featuresets = [(doc_features(d),c) for (d,c) in documents]
    
featuresets

Output:

[({'contains(film)': True,
   'contains(movi)': False,
   'contains(one)': True,
   'contains(like)': True,
   'contains(charact)': False,
   'contains(make)': True,
   'contains(get)': True,
   'contains(time)': False,
   'contains(scene)': False,
   'contains(even)': True,
...
   'contains(fairli)': False,
   'contains(seven)': False,
   ...},
  'neg'),
 ...]

Step 5: Split the dataset
train_set = featuresets[0:800] + featuresets[1000:1800]
test_set = featuresets[800:1000] + featuresets[1800:2000]
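Because movie_reviews.categories() lists 'neg' before 'pos', the first 1000 entries of featuresets are negative reviews and the last 1000 are positive; a minimal sanity check of the split:

assert [c for _, c in train_set].count('neg') == 800
assert [c for _, c in test_set].count('pos') == 200
print(len(train_set), len(test_set))   # 1600 400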

Step 6: Train the model
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs
from collections import defaultdict

"""
A classifier based on the Naive Bayes algorithm.  In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label):

|                       P(label) * P(features|label)
|  P(label|features) = ------------------------------
|                              P(features)

The algorithm then makes the 'naive' assumption that all features are
independent, given the label:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                                         P(features)

Rather than computing P(features) explicitly, the algorithm just
calculates the numerator for each label, and normalizes them so they
sum to one:

|                       P(label) * P(f1|label) * ... * P(fn|label)
|  P(label|features) = --------------------------------------------
|                        SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
"""

def custom_train(labeled_featuresets, estimator=ELEProbDist):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:
            label_freqdist[label] += 1
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname][fval] += 1
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None.'  This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                # Only add a None key when necessary, i.e. if there are
                # any samples with feature 'fname' missing.
                if num_samples - count > 0:
                    feature_freqdist[label, fname][None] += num_samples - count
                    feature_values[fname].add(None)

        # Create the P(label) distribution
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distribution
        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return label_probdist, feature_probdist
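As a small illustration of what custom_train returns (toy featuresets invented here, not the assignment data), it estimates the P(label) and P(feature value | label, feature name) distributions:

toy = [({'contains(good)': True}, 'pos'),
       ({'contains(good)': False}, 'neg')]
lp, fp = custom_train(toy)
print(lp.prob('pos'))                           # expected 0.5 under ELE (add-0.5) smoothing
print(fp['pos', 'contains(good)'].prob(True))   # expected 0.75 under ELE smoothing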
def custom_prob_classify(featureset, _label_probdist, _feature_probdist):
        # Discard any feature names that we've never seen before.
        # Otherwise, we'll just assign a probability of 0 to
        # everything.
        _labels = list(_label_probdist.samples())
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in _labels:
                if (label, fname) in _feature_probdist:
                    break
            else:
                # print('Ignoring unseen feature %s' % fname)
                del featureset[fname]

        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}
        for label in _labels:
            logprob[label] = _label_probdist.logprob(label)

        # Then add in the log probability of features given labels.
        for label in _labels:
            for (fname, fval) in featureset.items():
                if (label, fname) in _feature_probdist:
                    feature_probs = _feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # nb: This case will never come up if the
                    # classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([])  # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)

def custom_classify(featureset, label_probdist, feature_probdist):
        return custom_prob_classify(featureset, label_probdist, feature_probdist).max()

_label_probdist, _feature_probdist = custom_train(train_set)
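With the distributions estimated, a single test document can be classified; a minimal usage sketch:

feats, gold = test_set[0]
pred = custom_classify(feats, _label_probdist, _feature_probdist)
print(gold, pred)   # gold label vs. predicted label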

Step 7: Evaluate the model
import collections
from nltk import precision,recall,f_measure

# Accuracy
print("Accuracy = %.2f" % nltk.classify.accuracy(nltk.NaiveBayesClassifier(_label_probdist, _feature_probdist), test_set))
# Precision, Recall, F1
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = custom_classify(feats, _label_probdist, _feature_probdist)
    testsets[observed].add(i)

# positive
print('-' * 20)
print('positive class scores')
print('Precision = %.2f' % precision(refsets['pos'], testsets['pos']))
print('Recall = %.2f' % recall(refsets['pos'], testsets['pos']))
print('F1 = %.2f' % f_measure(refsets['pos'], testsets['pos']))
# negative
print('-' * 20)
print('negative class scores')
print('Precision = %.2f' % precision(refsets['neg'], testsets['neg']))
print('Recall = %.2f' % recall(refsets['neg'], testsets['neg']))
print('F1 = %.2f' % f_measure(refsets['neg'], testsets['neg']))

Output:

Accuracy = 0.78
--------------------
positive class scores
Precision = 0.80
Recall = 0.76
F1 = 0.78
--------------------
negative class scores
Precision = 0.77
Recall = 0.81
F1 = 0.79
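As a cross-check, accuracy can also be derived from the same refsets/testsets bookkeeping (a minimal sketch, equivalent in principle to nltk.classify.accuracy):

correct = sum(len(refsets[c] & testsets[c]) for c in ('pos', 'neg'))
print("Accuracy = %.2f" % (correct / len(test_set)))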
Source: https://blog.csdn.net/Yushan_Ji/article/details/135708989