# Environment setup — run these in a shell, NOT inside Python:
#   pip install jieba
#   pip install matplotlib
#   pip install wordcloud
#   pip install snownlp
# -*- coding: utf-8 -*-
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Load the user-defined segmentation dictionary so domain terms stay intact.
jieba.load_userdict('news.txt')

# Corpus to analyse.
corpos = '美媒稱,鑒于全球石油市場(chǎng)過(guò)度供給的情況,中國(guó)原油需求下滑是其首要擔(dān)憂之一。過(guò)量生產(chǎn)拉低了石油價(jià)格,但是中國(guó)過(guò)去一年左右的疲弱需求引發(fā)了緩慢的回彈。'

# jieba.cut returns a one-shot generator, so segment twice: once for the
# word-cloud text, once for the frequency count.
seg_list = jieba.cut(corpos)
seg_list2 = jieba.cut(corpos)
text = ' '.join(seg_list)

# Word-frequency statistics.
segStat = {}
for seg in seg_list2:
    if seg in segStat:
        # BUG FIX: the original assigned 1 here as well, so every word's
        # count was stuck at 1; increment the existing count instead.
        segStat[seg] += 1
    else:
        segStat[seg] = 1
# Python 3 print() — the original used the Python 2 print statement.
print(segStat)

# Build and display the word cloud. font_path must point to a CJK-capable
# font (SimHei here), otherwise Chinese glyphs render as boxes.
wordcloud = WordCloud(font_path='D:\\PDM\\2.1\\simhei.ttf',
                      background_color='black').generate(text)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# -*- coding: utf-8 -*-
import jieba.analyse

# Corpus to analyse.
corpos = '美媒稱,鑒于全球石油市場(chǎng)過(guò)度供給的情況,中國(guó)原油需求下滑是其首要擔(dān)憂之一。過(guò)量生產(chǎn)拉低了石油價(jià)格,但是中國(guó)過(guò)去一年左右的疲弱需求引發(fā)了緩慢的回彈。'

# Register the stop-word list consulted by the keyword extractors.
jieba.analyse.set_stop_words('stop_words.txt')

# Extract the top-5 keywords with TextRank, restricted to place names (ns),
# nouns (n), verbal nouns (vn) and verbs (v).
# TF-IDF alternative: jieba.analyse.extract_tags(corpos, topK=5)
tags = jieba.analyse.textrank(
    corpos,
    topK=5,
    withWeight=False,
    allowPOS=('ns', 'n', 'vn', 'v'),
)
print(','.join(tags))
# -*- coding: utf-8 -*-
import jieba.analyse
from snownlp import SnowNLP

corpos = u'美媒稱,鑒于全球石油市場(chǎng)過(guò)度供給的情況,中國(guó)原油需求下滑是其首要擔(dān)憂之一。過(guò)量生產(chǎn)拉低了石油價(jià)格,但是中國(guó)過(guò)去一年左右的疲弱需求引發(fā)了緩慢的回彈。'

# Extract the article's top-5 keywords.
tags = jieba.analyse.extract_tags(corpos, topK=5)
# Idiomatic: list(tags) replaces the original manual append loop.
text1 = list(tags)
# Python 3 print() — the original used the Python 2 print statement.
print(text1)

# Document list: [[doc1], [doc2], [doc3] ...]
text = [text1, [u'文章', u'doc2'], [u'這是doc3']]
text2 = text1
s = SnowNLP(text)
# s.sim scores text2 against every document in text; larger = more similar.
print(s.sim(text2))
# [1.8325582915371863, 0, 0]
# -*- coding: utf-8 -*-
from snownlp import SnowNLP

text1 = u'美媒稱,鑒于全球石油市場(chǎng)過(guò)度供給的情況,中國(guó)原油需求下滑是其首要擔(dān)憂之一。過(guò)量生產(chǎn)拉低了石油價(jià)格,但是中國(guó)過(guò)去一年左右的疲弱需求引發(fā)了緩慢的回彈。'
s = SnowNLP(text1)
# Print the 3-sentence extractive summary.
# Python 3 print() — the original used the Python 2 print statement.
print(s.summary(3))
# 聯(lián)系客服 — leftover webpage boilerplate ("contact customer service"); not code.