机器学习---垃圾邮件分类案例

发布时间:2023年12月17日

1、Python贝叶斯案例

依次需要使用pip命令安装numpy、scipy、scikit-learn、matplotlib模块。
python代码:

import os
import sys
import codecs
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

def load_sms_corpus(path, test_start=5550):
    """Read a ``label,text`` SMS file and build the training and test lists.

    The first line of the file is treated as a header and skipped.  Every
    following line is split on its first comma only, so commas inside the
    message text are kept.  Lines without a comma, and lines whose label is
    neither ``ham`` nor ``spam``, are skipped so that the sentence and label
    lists always stay the same length.

    :param path: path of the text file to read.
    :param test_start: rows whose 1-based line number (header is line 1) is
        greater than this value are additionally copied into the test lists.
    :return: ``(corpus, labels, corpus_test, labels_test)`` where labels are
        0 for ``ham`` and 1 for ``spam``.
    """
    # Local import so the helper does not depend on the file-level imports.
    import io

    label_ids = {"ham": 0, "spam": 1}
    corpus, labels = [], []
    corpus_test, labels_test = [], []

    # newline="\n" splits on "\n" only, like the original binary readline();
    # the text is decoded as UTF-8, which is also CountVectorizer's default
    # encoding for byte input.  The with-block guarantees the file is closed.
    with io.open(path, "r", encoding="utf-8", newline="\n") as f:
        for line_no, line in enumerate(f, 1):
            if line_no == 1:
                continue  # header row
            parts = line.rstrip("\r\n").split(",", 1)
            if len(parts) != 2:
                continue  # blank or malformed line
            label_id = label_ids.get(parts[0].strip())
            if label_id is None:
                continue  # unknown label: skip sentence and label together
            sentence = parts[1]
            corpus.append(sentence)
            labels.append(label_id)
            if line_no > test_start:
                # The test set must hold the sentence itself, not a placeholder.
                corpus_test.append(sentence)
                labels_test.append(label_id)
    return corpus, labels, corpus_test, labels_test


if __name__ == '__main__':

    # NOTE: rows in the test lists are also part of the training corpus, so
    # the predictions printed below are not a held-out evaluation.
    # labels_test holds the ground truth for corpus_test; it is not used below.
    corpus, labels, corpus_test, labels_test = load_sms_corpus("i:/ML/sms_spam.text")

    # Learn the vocabulary from the training corpus and build its count matrix.
    vectorizer = CountVectorizer()
    fea_train = vectorizer.fit_transform(corpus)
    print(vectorizer.get_feature_names())
    print(fea_train.toarray())

    # Re-use the training vocabulary so the test matrix has the same columns.
    vectorizer2 = CountVectorizer(vocabulary=vectorizer.vocabulary_)
    fea_test = vectorizer2.fit_transform(corpus_test)
    print(vectorizer2.get_feature_names())
    print(fea_test.toarray())

    # Create the Multinomial Naive Bayes classifier.
    # alpha = 1: Laplace smoothing, i.e. add 1 to every word count.
    clf = MultinomialNB(alpha=1)
    clf.fit(fea_train, labels)

    pred = clf.predict(fea_test)
    for p in pred:
        if p == 0:
            print("正常邮件")
        else:
            print("垃圾邮件")

2、Scala贝叶斯案例

/** Trains, evaluates, saves and reloads a Spark MLlib Naive Bayes model. */
object Naive_bayes {
    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint

    /**
     * Reads `sample_naive_bayes_data.txt`, trains a Naive Bayes classifier on
     * 90% of the rows, prints predictions and accuracy for the remaining rows,
     * then saves the model to `naive_bayes_model` and loads it back.
     *
     * @param args command-line arguments (not used)
     */
    def main(args: Array[String]): Unit = {
        // 1. Build the Spark context (local master).
        val conf = new SparkConf().setAppName("Naive_bayes").setMaster("local")
        val sc = new SparkContext(conf)

        try {
            // Read the sample data: each line is "<label>,<f1> <f2> ...".
            val data = sc.textFile("sample_naive_bayes_data.txt")
            val parsedData = data.map { line =>
                val parts = line.split(',')
                LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
            }

            // Split the samples into training (90%) and test (10%) sets.
            val splits = parsedData.randomSplit(Array(0.9, 0.1), seed = 11L)
            val training = splits(0)
            val test = splits(1)

            // Train the Naive Bayes model; lambda is the Laplace smoothing parameter.
            val model = NaiveBayes.train(training, lambda = 1.0)

            // Predict on the test samples and pair each prediction with its true label.
            val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
            val printPredict = predictionAndLabel.take(10020)
            println("prediction\tlabel")
            printPredict.foreach { case (predicted, actual) =>
                println(s"$predicted\t$actual")
            }

            // Fraction of test samples whose prediction equals the label.
            val correct = predictionAndLabel.filter { case (predicted, actual) => predicted == actual }.count()
            val accuracy = 1.0 * correct / test.count()
            println(accuracy)

            // Save the model, then load it back from the same path.
            val ModelPath = "naive_bayes_model"
            model.save(sc, ModelPath)
            val sameModel = NaiveBayesModel.load(sc, ModelPath)
        } finally {
            // Release the Spark context even if a step above fails.
            sc.stop()
        }
    }
}

文章来源:https://blog.csdn.net/yaya_jn/article/details/134921245
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。