# Tokenize every document with jieba's search-engine mode; the "/" join
# followed by split reproduces the pipeline's intermediate string form and
# yields one token list per document.
words = document.map(
    lambda doc: "/".join(jieba.cut_for_search(doc))
).map(
    lambda joined: joined.split("/")
)
# Hash each token list into a fixed-size term-frequency vector. The TF RDD
# is cached because it is consumed twice below (IDF fit and transform).
hashingTF = HashingTF()
tf = hashingTF.transform(words)
tf.cache()
# Fit an inverse-document-frequency model over the TF vectors, then
# rescale them into TF-IDF feature vectors.
idfModel = IDF().fit(tf)
tfidf = idfModel.transform(tf)
# Pair each label with its TF-IDF vector and wrap the pair as a
# LabeledPoint so the MLlib classifiers below can consume it, then split
# 60/40 into training and test sets (fixed seed for reproducibility).
# NOTE(review): the original listing was garbled here — the lambda body
# was lost to extraction ("lambda line ![]()"). LabeledPoint(label,
# features) is the standard MLlib pattern for rate.zip(tfidf); confirm
# against the original article.
zipped = rate.zip(tfidf)
data = zipped.map(lambda line: LabeledPoint(line[0], line[1]))
training, test = data.randomSplit([0.6, 0.4], seed=0)
# Train a multinomial Naive Bayes classifier (smoothing lambda = 1.0) and
# measure its accuracy on the held-out test split.
NBmodel = NaiveBayes.train(training, 1.0)
predictionAndLabel = test.map(
    lambda p: (NBmodel.predict(p.features), p.label)
)
# filter() expects a boolean predicate. The original passed
# "1.0 if x[0] == x[1] else 0.0", which only works via float truthiness
# and reads like a map; the plain comparison is equivalent and clear.
accuracy = 1.0 * predictionAndLabel.filter(
    lambda x: x[0] == x[1]
).count() / test.count()
# Classify a single comment typed by the user, reusing the fitted
# HashingTF and IDF models so the feature space matches training.
yourDocument = input("输入待分类的评论:")
yourwords = "/".join(jieba.cut_for_search(yourDocument)).split("/")
yourtf = hashingTF.transform(yourwords)
yourtfidf = idfModel.transform(yourtf)
# NOTE(review): the original print call was truncated mid-string in the
# scraped listing; the closing quote and parenthesis are reconstructed.
print('NaiveBayes Model Predict:', NBmodel.predict(yourtfidf))
# Flatten all per-document token lists into one token stream, count each
# token, sort by count descending, and peek at the top ten.
text = words.flatMap(lambda tokens: tokens)
wordCounts = (
    text.map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: pair[1], ascending=False)
)
wordCounts.take(10)
def filterStopWords(line, stop_set=None):
    """Return *line* with stop words removed.

    Parameters
    ----------
    line : list of str
        Tokens for one document.
    stop_set : collection of str, optional
        Stop words to drop; defaults to the module-level ``stopwords``.

    The original implementation called ``list.remove`` while iterating the
    same list, which advances past the element following each removal, so
    adjacent stop words survived filtering — a classic
    mutate-while-iterating bug. It also mutated the caller's list in
    place. Building a fresh list fixes both.
    """
    if stop_set is None:
        stop_set = stopwords
    return [token for token in line if token not in stop_set]

words = words.map(lambda w: filterStopWords(w))
# Re-tokenize every document, this time with jieba's full mode
# (cut_all=True); as before, the "/"-joined intermediate string is split
# back into one token list per document.
words = document.map(
    lambda doc: "/".join(jieba.cut(doc, cut_all=True))
).map(
    lambda joined: joined.split("/")
)
# Train a linear SVM with SGD (100 iterations) and evaluate its accuracy
# on the held-out test split, mirroring the Naive Bayes evaluation.
SVMmodel = SVMWithSGD.train(training, iterations=100)
predictionAndLabel = test.map(
    lambda p: (SVMmodel.predict(p.features), p.label)
)
# filter() expects a boolean predicate; the original's
# "1.0 if x[0] == x[1] else 0.0" relied on float truthiness — the plain
# comparison is equivalent and clearer.
accuracy = 1.0 * predictionAndLabel.filter(
    lambda x: x[0] == x[1]
).count() / test.count()
欢迎光临 电子技术论坛_中国专业的电子工程师学习交流社区-中电网技术论坛 (http://bbs.eccn.com/) | Powered by Discuz! 7.0.0 |