Fetch the novel list from the first 50 pages of the collection ranking. The first page URL is 'http://www.jjwxc.net/bookbase.php?fw0=0&fbsj=0&ycx0=0&xx2=2&mainview0=0&sd0=0&lx0=0&fg0=0&sortType=0&isfinish=0&collectiontypes=ors&searchkeywords=&page=1'; the second page has page=2, and so on up to page=50 on the 50th page. Scrape each novel's ID, title, and author, and store the results in the file 晉江排行榜【按收藏?cái)?shù)】.txt.
import requests
from bs4 import BeautifulSoup
import bs4
import re
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import jieba
import seaborn as sns
import xlrd
from xlutils.copy import copy

# Notebook magic commands so matplotlib plots render inside cells instead of opening a new window
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)  # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
%load_ext autoreload
%autoreload 2

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
headers = {"User-Agent": "Mozilla/5.0"} for n in range(1,50): url = 'http://www.jjwxc.net/bookbase.php?fw0=0fbsj=0ycx0=0xx2=2mainview0=0sd0=0lx0=0fg0=0sortType=0isfinish=0collectiontypes=orssearchkeywords=page={}'.format(n) html = requests.get(url,headers=headers) html.encoding = html.apparent_encoding soup = BeautifulSoup(html.text, 'html.parser') for tr in soup.find_all('tbody'): tds=tr('td') a = tr('a') count=0 id=[] for u in tr.find_all('a'): count=count+1 book_url=u.get('href') # 獲取小說主頁的url p = re.compile(r'\d+') book_id = p.findall(book_url)[0] # 獲取小說ID if(count%2==0): id.append(book_id) for n in range(0,100): with open('./data/晉江排行榜【按收藏?cái)?shù)】.txt','a+',encoding='utf-8') as f: print("{0}\t{1}\t{2}".format(id[n],a[n*2+1].string,a[n*2].string),file=f) # 序號 書名 作者
View the IDs and the names of the first 8 novels separately.
# View the IDs of the top 8 novels on the collection ranking
with open('./data/晉江排行榜【按收藏?cái)?shù)】.txt', 'r', encoding='utf-8', errors='ignore') as f:
    book_list = f.readlines()
id_list = [item.split('\t')[0] for item in book_list]
print(id_list[:8])
# View the names of the top 8 novels on the collection ranking
name_list = [item.split('\t')[1] for item in book_list]
print(name_list[:8])
Find the novels' comment sections. For the first novel, 《天官賜?!?, the first page of comments is at 'http://www.jjwxc.net/comment.php?novelid=3200611&huati=1', where 3200611 is the novel ID; later pages add a page parameter, so the second page is 'http://www.jjwxc.net/comment.php?novelid=3200611&huati=1&page=2'. The next novel, 《撒野》, has ID 2956313, so its first comment page is 'http://www.jjwxc.net/comment.php?novelid=2956313&huati=1', and so on. Scrape the comments and scores for all the novels. Since some novels do not have many comments, only 5 pages of comments are scraped per novel.
The scraping approach is much the same as for the novel list; the difference is that the scraped data is stored in an xls file.
headers = {"User-Agent": "Mozilla/5.0"} with open('./data/晉江排行榜【按收藏?cái)?shù)】.txt','r',encoding='utf-8') as f: book_list = f.readlines() id_list = [item.split('\t')[0] for item in book_list] for book_id in id_list: for page in range(1,6): url="http://www.jjwxc.net/comment.php?novelid={}huati=1page={}".format(book_id,page) html = requests.get(url,headers=headers) html.encoding = html.apparent_encoding soup = BeautifulSoup(html.text, 'html.parser') scores=[] comments=[] for item1 in soup.find_all('span',"coltext"): score=item1('span') scores.append(score[2].string) for item2 in soup.find_all('div',"readbody"): comment=item2('span') comments.append(comment[0].string) for i in range(0,len(comments)): excel = xlrd.open_workbook('./data/jjwxc1.xls') wb = copy(excel) w_sheet = wb.get_sheet(0) sheet = excel.sheets()[0] nrow = sheet.nrows # 文件行數(shù) w_sheet.write(nrow, 0, book_id) w_sheet.write(nrow, 1, comments[i]) w_sheet.write(nrow, 2, scores[i]) wb.save('./data/jjwxc1.xls')
Preprocessing includes the following steps.
The pandas module can quickly convert the xls file to .csv.
# Format conversion
ex = pd.read_excel("./data/jjwxc.xls")
ex.to_csv("./data/jjwxc.csv", encoding="gb18030")
# Load the comments
review = pd.read_csv("./data/jjwxc.csv", names=['ID', 'comment', 'score'], encoding='gb18030')
Remove duplicate rows.
# Drop duplicate rows
review = review.drop_duplicates()
Remove rows whose comment text is identical.
# Drop rows with duplicate comment content
review = review.drop_duplicates('comment')
review.shape
Add a sentiment label based on the score. Jinjiang Literature City's rating mechanism allows scores in the interval [-2, 2], and the large majority of raters give a 2, so comments scored 2 are treated as positive reviews (sentiment label 1) and comments scored below 2 as negative reviews (sentiment label 0).
# Add the sentiment label
review['emotion'] = (review.score == 2) * 1
# Shuffle the rows
review = review.sample(frac=1).reset_index(drop=True)
print(review.shape)
Tokenize the comment text and remove stop words.
def review_without_stop(review):
    # Load the stop word file
    with open("./data/emotion_stopwords.txt", "r", encoding="utf-8") as f:
        stop_word = [x.strip() for x in f.readlines()]
    all_stop_words = set(stop_word)  # de-duplicate the stop words
    # Replace non-Chinese characters in the comment with ''
    review = re.sub("[^\u4e00-\u9fa5]", '', review)
    # Remove full-width whitespace
    review = review.replace("\u3000", "")
    # Tokenize
    review = jieba.cut(review)
    # Filter out single-character tokens
    review = filter(lambda x: len(x) > 1, review)
    # Remove stop words
    review = filter(lambda x: x not in all_stop_words, review)
    return ' '.join(review)
# Load a custom user dictionary for jieba
jieba.load_userdict("./data/emotion_userdict.txt")
review['cut_jieba'] = review.comment.apply(review_without_stop)
[Note] You need to supply your own stop word and user dictionary files; a format sketch follows.
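For reference, jieba expects one stop word per line in the stop word file, and user-dictionary lines of the form 'word [frequency] [POS tag]'. A minimal sketch that writes placeholder versions of both files; the word lists are illustrative only and should be replaced with your own:

# illustrative stop words, one per line
stop_words = ['的', '了', '是', '我', '都']
with open('./data/emotion_stopwords.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(stop_words))

# illustrative user-dictionary entries: word [frequency] [POS tag];
# adding 不喜歡 keeps it as one token for the cleanup step later
user_words = ['天官賜福 10 nz', '不喜歡 3']
with open('./data/emotion_userdict.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(user_words))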
# Inspect a few comments
review.head()
# Some positive reviews contain "不想" or "不喜歡"
review[(review['cut_jieba'] == '不想') & (review['emotion'] == 1)]
review[(review['cut_jieba'] == '不喜歡') & (review['emotion'] == 1)]
# Negative-sentiment words appear in positive reviews; blank those words out
def change_negative_like(cut_text):
    word_list = cut_text.split()
    if "不喜歡" in word_list:
        for i in range(len(word_list)):
            if word_list[i] == "不喜歡":
                word_list[i] = ""
        return " ".join(word_list)
    elif "不想" in word_list:
        for i in range(len(word_list)):
            if word_list[i] == "不想":
                word_list[i] = ""
        return " ".join(word_list)
    else:
        return cut_text
review.loc[review['emotion'] == 1, 'cut_jieba'] = review[review['emotion'] == 1].cut_jieba.apply(change_negative_like)
# Some comments are now empty; drop them
review = review[~(review['cut_jieba'] == '')]
review.shape
Visualize all the comments as a word cloud.
from wordcloud import WordCloud
from imageio import imread

mask = imread("./data/cloud.jpg")
font = './data/FZSTK.TTF'
wc = WordCloud(
    font_path=font,
    max_words=2000,      # maximum number of words to display
    max_font_size=250,   # maximum font size
    background_color="white",
    random_state=30,
    mask=mask)
wc.generate(' '.join(review['cut_jieba']))  # generate the word cloud (tokens are space-separated)
plt.imshow(wc)
plt.axis('off')
Visualize the comments with emotion equal to 1.
from wordcloud import WordCloud
from imageio import imread

mask = imread("./data/piggy.jpg")
font = './data/FZSTK.TTF'
wc1 = WordCloud(
    font_path=font,
    max_words=2000,      # maximum number of words to display
    max_font_size=300,   # maximum font size
    background_color="white",
    random_state=30,
    mask=mask)
wc1.generate(' '.join(review['cut_jieba'][review['emotion'] == 1]))
plt.imshow(wc1)
plt.axis('off')
Visualize the comments with score equal to -2.
wc1.generate(' '.join(review['cut_jieba'][review['score'] == -2]))  # generate the word cloud
plt.imshow(wc1)
plt.axis('off')
[Note] Supply your own word cloud mask images and font.
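If you don't have the mask images or the FZSTK.TTF font locally, the cloud can still be drawn without a mask; font_path just needs to point at any font with CJK coverage (the Windows SimHei path below is an assumption, adjust it for your system):

wc_plain = WordCloud(
    font_path='C:/Windows/Fonts/simhei.ttf',  # assumed location of a CJK-capable font
    max_words=2000,
    background_color='white',
    random_state=30)
wc_plain.generate(' '.join(review['cut_jieba']))
plt.imshow(wc_plain)
plt.axis('off')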
With the data prepared for analysis, it now needs to be split into a training set and a test set: 75% of the data for training and 25% for testing.
x, y = review['cut_jieba'], review['emotion']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
Use the TfidfVectorizer class from the sklearn package for feature extraction.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(decode_error='ignore',
                             token_pattern=r"\b[^\d\W]\w+\b",  # keep numbers out of the vocabulary
                             analyzer='word',
                             ngram_range=(2, 4),
                             max_df=0.8,
                             min_df=3)
Xtrain = tfidf_vect.fit_transform(x_train)
Xtest = tfidf_vect.transform(x_test)
print(Xtrain.shape)
print(Xtest.shape)
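To sanity-check the vocabulary, you can inspect a few of the learned n-grams (get_feature_names_out requires scikit-learn >= 1.0; on older versions use get_feature_names instead):

# peek at some of the 2- to 4-gram features kept by the vectorizer
print(tfidf_vect.get_feature_names_out()[:10])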
from sklearn.naive_bayes import MultinomialNB
review_classifier = MultinomialNB()
review_classifier.fit(Xtrain, y_train)
# Predict on the test set
y_pred = review_classifier.predict(Xtest)
metrics.confusion_matrix(y_test, y_pred)  # confusion matrix
# Use the sns module to plot a heatmap of the true vs. predicted labels
colorMetrics = metrics.confusion_matrix(y_test, y_pred)
sns.heatmap(colorMetrics, annot=True, fmt='d')
# Classification report:
# precision, recall and F1 for each class, plus their macro averages
print(metrics.classification_report(y_test, y_pred))
print(metrics.accuracy_score(y_test,y_pred))
from sklearn.model_selection import cross_val_score

score1 = cross_val_score(review_classifier, Xtrain, y_train, cv=10, scoring="accuracy").mean()
print(score1)
from sklearn.linear_model import LogisticRegression

LR_model = LogisticRegression(penalty='l2', max_iter=3000)
LR_model.fit(Xtrain, y_train)
# Predict on the test set
y_pred = LR_model.predict(Xtest)
metrics.confusion_matrix(y_test, y_pred)  # confusion matrix
print(LR_model.score(Xtest,y_test))
# precision, recall and F1 for each class, plus their macro averages
print(metrics.classification_report(y_test, y_pred))
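For a like-for-like comparison with the naive Bayes result above, the same 10-fold cross-validation can be run on the logistic regression model:

from sklearn.model_selection import cross_val_score

score2 = cross_val_score(LR_model, Xtrain, y_train, cv=10, scoring="accuracy").mean()
print(score2)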
(1) Word cloud analysis:
(2) Factors affecting the accuracy of the sentiment analysis:
This concludes this article on scraping Jinjiang Literature City novel comments (sentiment analysis); I hope it is helpful to you.