# Load the raw text file, drop exact duplicate lines, then split every line on
# tabs and keep only the records that yield exactly two fields.
# `sc` is presumably the active SparkContext -- it is created outside this snippet.
originData = sc.textFile('YOUR_FILE_PATH')
originDistinctData = originData.distinct()
rateDocument = (
    originDistinctData
    .map(lambda raw: raw.split('\t'))
    # Records without exactly two tab-separated fields are discarded.
    .filter(lambda fields: len(fields) == 2)
)
# Keep the records whose first field parses to the integer 5.
fiveRateDocument = rateDocument.filter(lambda fields: int(fields[0]) == 5)
# Action that triggers the computation; the count is not stored, so it is only
# visible when run in an interactive shell.
fiveRateDocument.count()
# Combine the three lower-rating subsets into one RDD.
# oneRateDocument / twoRateDocument / threeRateDocument are defined outside
# this snippet -- presumably built the same way as fiveRateDocument; confirm.
negRateDocument = (
    oneRateDocument
    .union(twoRateDocument)
    .union(threeRateDocument)
)
# RDDs are immutable: repartition() returns a NEW RDD and leaves the receiver
# untouched, so the result has to be rebound for the call to have any effect.
negRateDocument = negRateDocument.repartition(1)
# Take as many rating-5 records as there are records in negRateDocument.
# take() returns a plain list on the driver, so parallelize() turns it back
# into an RDD before it is collapsed to a single partition.
posRateDocument = sc.parallelize(
    fiveRateDocument.take(negRateDocument.count())
).repartition(1)

allRateDocument = negRateDocument.union(posRateDocument)
# repartition() returns a new RDD rather than modifying the receiver; rebind
# the name so the single-partition layout is actually applied.
allRateDocument = allRateDocument.repartition(1)

# Split each two-field record into its label and its text.
# ReduceRate is defined outside this snippet and maps field 0 to a label.
# NOTE(review): rate and document are computed independently from
# allRateDocument; if they are paired up again later (e.g. with zip), consider
# caching allRateDocument so both lineages see the same ordering -- confirm.
rate = allRateDocument.map(lambda s: ReduceRate(s[0]))
document = allRateDocument.map(lambda s: s[1])
# Tokenise every document into a list of terms with jieba's search-mode
# segmenter. cut_for_search() yields the tokens lazily, so materialise them
# with list() directly. Joining on "/" and splitting again would be a
# redundant round-trip that also turns a literal "/" token into empty strings
# and yields [""] for an empty document.
words = document.map(lambda text: list(jieba.cut_for_search(text)))
# Convert each token list into a term-frequency vector via feature hashing.
hashingTF = HashingTF()
# cache() marks the RDD for persistence and returns the same RDD, so it can be
# chained; tf is read again by the IDF fit/transform step.
tf = hashingTF.transform(words).cache()
# Fit the inverse-document-frequency model on the term-frequency vectors...
idfModel = IDF().fit(tf)
# ...then apply it to the same vectors to obtain the TF-IDF features.
tfidf = idfModel.transform(tf)
欢迎光临 电子技术论坛_中国专业的电子工程师学习交流社区-中电网技术论坛 (http://bbs.eccn.com/) | Powered by Discuz! 7.0.0 |