最近一段时间在学习python爬虫,结合自己学习的内容写一个简单爬虫来获取网易云音乐中指定歌曲的评论信息。第一次发博客并且也是入门爬虫不久,有问题的地方希望大家指出,共同进步~~!
我们的目标是爬取网易云中想要的任何一首歌曲的评论。
观察网易云音乐的网页可以发现两个特点:
所以在本文使用Selenium结合Chrome来爬取内容(PhantomJS已经被建议不使用了,使用Chrome的无头模式)
首先打开网易云音乐
接下来要查找搜索框的元素
看到input的name是‘srch’,同时这部分代码还不在iframe框架内,所以这里可以直接获取该元素。
# Configure a headless Chrome and open the NetEase Cloud Music front page.
# Fixed: the original mixed a local `chrome_options` with `self.chrome_options`
# and referenced `self.url` in plain script code — both NameErrors at runtime.
chrome_options = Options()
chrome_options.add_argument('--headless')     # run Chrome without a visible window
chrome_options.add_argument('--disable-gpu')  # recommended companion flag for headless mode
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get('https://music.163.com/')
time.sleep(0.5)
# Enlarge the window: in headless mode the search box is outside the initial
# viewport and cannot be interacted with until it is rendered on screen.
driver.set_window_size(1280, 800)
put = driver.find_element_by_id("srch")
在这里遇到了一个bug,卡了我好久:就是当设置为无头chrome时打开浏览器的当前界面是没有这个元素的,需要将它显示出来才可以进行下一步操作,而我把窗口放到最大或是移动滚动条都没有效果,后来把屏幕放大到指定数值才可以继续操作。
在操作过程中的把页面截取下来才看出问题。
driver.save_screenshot('D:\\a.png')  # capture the headless page to diagnose which elements are actually visible
接下来是在搜索框中输入搜索内容进入下一个页面
put.send_keys('云烟成雨') # type the song name (a folk song the author particularly likes)
time.sleep(0.5)
put.send_keys(Keys.ENTER) # press Enter to run the search
到了这个界面后因为我遇到有的歌曲搜索后是直接显示在了专辑栏,所以我先点击单曲栏,再选中第一首歌。
需要注意的是这里已经进入了iframe包括的代码内容,所以需要先把iframe框架中的内容加载出来:
# Fixed: the original used curly quotes (‘g_iframe’), a Python syntax error
# introduced by the blog editor; the frame name must be a plain string literal.
driver.switch_to_frame('g_iframe')
这里的’g_iframe’是iframe的框架。
# Wait for the result iframe, switch into it, then open the first song hit.
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.ID, 'g_iframe')))
driver.switch_to_frame('g_iframe')  # the result list lives inside this iframe
time.sleep(1)
put = driver.find_element_by_class_name('fst')  # click the "songs" tab first
put.click()
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'w0')))
music = driver.find_element_by_class_name('w0')
music = music.find_element_by_class_name('text')
music_lyrics = driver.find_element_by_class_name('w1').text
music_name = driver.find_element_by_class_name('w0').text
print('您搜索到的音乐是 '+music_name+' '+music_lyrics)  # show which song was actually found
# Fixed: the title cell may carry extra elements after the song name; a bare
# 's-fc7' lookup crashes when it is absent — fall back to the 'text' element
# (this mirrors the try/except in the class version of this code).
try:
    music = music.find_element_by_class_name('s-fc7')
except Exception:
    pass
music.click()
接下来就是开始获取每一页上的评论信息,从每一条评论中可以得到的信息是评论者的姓名,评论的内容,评论的时间和点赞数,还有是针对另一个评论者的回复。这里建一个people的字典来存储这些信息。
# One list per field; index i across all six lists describes comment i.
people = {'names':[], 'comments':[], 'dates':[], 'votes':[], 'replied_names':[], 'replied_comments':[]}
在获取信息时通过一页页地向后翻来加载评论信息。
def download_next_page(driver):
    """Click the "next page" button and return the page as BeautifulSoup.

    Scrolling to the bottom brings the pager into view so the click behaves
    like a real user interaction; then we wait until at least one comment
    item ('itm') is present before reading the page source.
    """
    time.sleep(0.5)
    next_page = driver.find_element_by_class_name('znxt')
    time.sleep(0.5)
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    next_page.click()
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'itm')))
    content = driver.page_source
    # NOTE(review): the replace target was eaten by the blog's HTML rendering
    # (the original literal spans two lines, a syntax error). '<br>' matches
    # the stated intent — keep on-page line breaks inside comment text so
    # BeautifulSoup's .text preserves them — but confirm against the live page.
    content = content.replace('<br>', '\n')
    html = BeautifulSoup(content, 'lxml')
    return html
也可以向前翻页
def download_previous_page(driver):
    """Click the "previous page" button and return the page as BeautifulSoup.

    No scrolling is needed here: when paging backwards the page has already
    been rendered once, so the pager is reachable directly.
    """
    time.sleep(1)
    previous_page = driver.find_element_by_class_name('zprv')
    previous_page.click()
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'itm')))
    content = driver.page_source
    # NOTE(review): original replace literal was mangled by the blog markup;
    # '<br>' -> '\n' keeps the on-page line breaks — confirm against the site.
    content = content.replace('<br>', '\n')
    html = BeautifulSoup(content, 'lxml')
    return html
在获取评论信息的时候首先还是需要将数据信息规范成统一形式的,研究后使用两个函数来统一时间和点赞数的格式。
def change_vote(vote):
    """Normalize a like-count string such as '(1.2万)' to an int.

    Counts are wrapped in fullwidth parentheses; '万' means "ten thousand".
    Anything unparseable (missing parentheses, non-numeric text, None)
    normalizes to 0.
    """
    try:
        change = vote[vote.index('(') + 1:vote.index(')')]
        if '万' in change:
            change = int(float(change[:change.index('万')]) * 10000)
        else:
            change = int(change)
    # narrowed from a bare `except:` so genuine bugs are not silently hidden;
    # these three cover missing parens / bad numbers / non-string input
    except (ValueError, TypeError, AttributeError):
        change = 0
    return change
def change_time(self, time):
    """Normalize a comment timestamp to '%Y-%m-%d %H:%M' (date-only for old posts).

    `self` is unused — the signature mirrors the method version of this code.
    Handles the site's relative formats ('昨天 HH:MM', 'N分钟前', bare 'HH:MM')
    and absolute ones ('M月D日 HH:MM', 'Y年M月D日'). Returns None (after a
    diagnostic print) for unrecognized formats.
    """
    now = datetime.datetime.now()
    day = now.strftime('%Y-%m-%d')  # fixed: garbled as `的y` in the original
    year = now.strftime('%Y')
    if '昨天' in time:
        # fixed: '昨天' means yesterday, but the original substituted today's date
        yesterday = (now - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        time = time.replace('昨天', yesterday + ' ')
    elif '前' in time:
        # e.g. '5分钟前' -> subtract the minutes from now
        minut = int(time[:time.index('分')])
        time = (now + datetime.timedelta(minutes=-minut)).strftime('%Y-%m-%d %H:%M')
    elif len(time) == 5:
        # bare 'HH:MM' means "today"
        time = day + ' ' + time
    elif '年' in time:
        # full 'Y年M月D日' date
        time = time.replace('年', '-').replace('月', '-').replace('日', '')
    elif '月' in time:
        # 'M月D日 HH:MM' within the current year; fixed: the original tested
        # `time.index('月') == 1` (also a `e elif` typo), which raised
        # ValueError when '月' was absent and rejected two-digit months
        time = year + '-' + time.replace('月', '-').replace('日', '')
    else:
        print('不明时间格式')
        return None
    return time
统一格式后就开始把一页页的评论放入字典people中。
首先定义存放一页评论的函数:
def one_page_comments_download(html):
    """Collect name, content, time, votes and reply info from one comment page.

    Appends each comment's fields into the module-level `people` dict.
    Uses the module-level `change_time`/`change_vote` helpers (`change_time`
    keeps a vestigial `self` slot, so None is passed for it).
    """
    persons = html.find_all(class_='itm')
    for person in persons:
        comment = person.find(class_='cnt').text
        name = comment[:comment.index(':')]  # fixed: `indedx` typo crashed here
        comment = comment[comment.index(':') + 1:]
        date = person.find(class_='time')
        # fixed: the original called self.change_time inside a plain function
        date = change_time(None, date.text)
        vote = person.find(class_='rp')
        # fixed: delegate to change_vote — the original inline int() parse
        # dropped '万' counts such as '(1.2万)' to 0
        vote = change_vote(vote.text) if vote is not None else 0
        try:
            replied_comment = person.find(class_='que').text
            if '删除' in replied_comment:
                # "this comment has been deleted" — keep the notice, no author
                replied_name = None
            else:
                replied_name = replied_comment[:replied_comment.index(':')]
                replied_comment = replied_comment[replied_comment.index(':') + 1:]
        except AttributeError:
            # no 'que' element: this comment is not a reply
            replied_comment = None
            replied_name = None
        people['names'].append(name)
        people['comments'].append(comment)
        people['dates'].append(date)
        people['votes'].append(vote)
        people['replied_names'].append(replied_name)
        people['replied_comments'].append(replied_comment)
接下来就是使用循环获取大量评论。
def collect_comments(n=1, name='云烟成雨'):
    """Scrape `n` pages of comments for song `name`; return the `people` dict.

    Pages forward once and back once before collecting: a freshly opened song
    page shows the 15 hottest comments first, and after the round trip they
    disappear so comments arrive purely in time order.
    """
    driver = search(name)
    if n < 1:
        print('抱歉,您至少得爬一页吧')
        driver.close()
        return None
    try:
        download_next_page(driver)
        one_page_comments_download(download_previous_page(driver))
        print('获取了第1页的评论')
        for i in range(int(n - 1)):
            # fixed off-by-one: the original appended to a list but indexed
            # html[i], re-processing page 1 and never processing the last page
            page = download_next_page(driver)
            one_page_comments_download(page)
            print('获取了第' + str(i + 2) + '页的评论')
            time.sleep(0.5)
    except Exception as e:
        print(e)
    finally:
        driver.close()
    return people
因为光获取数据还是不够的,所以再定义两个函数来存储信息。
def save_mysql(self, people):
    """Insert the scraped comments into MySQL row by row.

    `self` is unused (signature kept from the method version). Host,
    credentials and database are placeholders for the reader to fill in;
    utf8mb4 is required so emoji in comment text survive the round trip.
    A failing row is logged, rolled back and skipped (best-effort semantics).
    """
    db = pymysql.connect(host='localhost', port=3306, user='root',
                         passwd='your password', db='your db', charset='utf8mb4')
    cursor = db.cursor()
    # hoisted out of the loop: selecting the schema once per connection suffices
    cursor.execute('USE yourdatabase')
    sql = ('INSERT INTO wyycomments '
           '(name, own_comment, vote, date, replied_name, replied_comment) '
           'VALUES (%s,%s,%s,%s,%s,%s)')
    for i in range(len(people['names'])):
        try:
            cursor.execute(sql, (people['names'][i], people['comments'][i],
                                 people['votes'][i], people['dates'][i],
                                 people['replied_names'][i],
                                 people['replied_comments'][i]))
            db.commit()
        except Exception as e:
            print(e)
            db.rollback()
    cursor.close()
    db.close()
def save_csv(self, people):
    """Dump the scraped comment dict to D:\\wyy_comments.csv.

    The 'utf_8_sig' encoding writes a BOM so Excel opens the file with the
    correct encoding. `self` is unused; the signature mirrors the method
    version of this code.
    """
    frame = DataFrame(people)
    frame.to_csv('D:\\wyy_comments.csv', encoding='utf_8_sig')
结合上面的过程定义一个类来爬取评论。
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time,datetime
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import pymysql
from pandas import DataFrame
class wyy():
    """Scrape comments for a song on NetEase Cloud Music (music.163.com).

    Drives a headless Chrome via Selenium: searches for the song, opens the
    first hit, pages through the comment list, normalizes each comment and
    accumulates the results in ``self.people`` — optionally persisting them
    to MySQL and/or CSV.
    """

    def __init__(self):
        self.url = 'https://music.163.com/'
        self.chrome_options = Options()
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--disable-gpu')
        # one list per field; index i across all six lists describes comment i
        self.people = {'names': [], 'comments': [], 'dates': [], 'votes': [],
                       'replied_names': [], 'replied_comments': []}

    def search(self, name):
        """Search for `name`, open the first song hit and return the driver."""
        driver = webdriver.Chrome(chrome_options=self.chrome_options)  # headless Chrome
        driver.get(self.url)
        time.sleep(0.5)
        # enlarge the window: in headless mode the 'srch' box is outside the
        # initial viewport and cannot be found until it is rendered on screen
        driver.set_window_size(1280, 800)
        put = driver.find_element_by_id("srch")
        put.send_keys(name)
        time.sleep(0.5)
        put.send_keys(Keys.ENTER)
        time.sleep(1)
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.ID, 'g_iframe')))
        driver.switch_to_frame('g_iframe')  # the result list lives inside this iframe
        time.sleep(1)
        put = driver.find_element_by_class_name('fst')  # click the "songs" tab first
        put.click()
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'w0')))
        music = driver.find_element_by_class_name('w0')
        music = music.find_element_by_class_name('text')
        music_lyrics = driver.find_element_by_class_name('w1').text
        music_name = driver.find_element_by_class_name('w0').text
        print('您搜索到的音乐是 '+music_name+' '+music_lyrics)  # show which song was found
        # the title cell may carry extra elements after the song name; fall
        # back to the 'text' element itself when 's-fc7' is absent
        try:
            music = music.find_element_by_class_name('s-fc7')
        except Exception:
            pass
        music.click()
        time.sleep(1)
        return driver

    def download_next_page(self, driver):
        """Click the "next page" button and return the page as BeautifulSoup."""
        time.sleep(0.5)
        next_page = driver.find_element_by_class_name('znxt')
        time.sleep(0.5)
        # scroll the pager into view so the click behaves like a real user
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        next_page.click()
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'itm')))
        content = driver.page_source
        # NOTE(review): the replace target was eaten by the blog's HTML
        # rendering (the original literal spans two lines, a syntax error);
        # '<br>' matches the stated intent of keeping on-page line breaks in
        # comment text — confirm against the live page.
        content = content.replace('<br>', '\n')
        return BeautifulSoup(content, 'lxml')

    def download_previous_page(self, driver):
        """Click the "previous page" button and return the page as BeautifulSoup.

        No scrolling needed: when paging backwards the page was already
        rendered once, so the pager is directly reachable.
        """
        time.sleep(1)
        previous_page = driver.find_element_by_class_name('zprv')
        previous_page.click()
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'itm')))
        content = driver.page_source
        # NOTE(review): same mangled literal as download_next_page — see there
        content = content.replace('<br>', '\n')
        return BeautifulSoup(content, 'lxml')

    def change_time(self, time):
        """Normalize a comment timestamp to '%Y-%m-%d %H:%M' (date-only for old posts).

        Handles the site's relative formats ('昨天 HH:MM', 'N分钟前', bare
        'HH:MM') and absolute ones ('M月D日 HH:MM', 'Y年M月D日'). Returns
        None (after a diagnostic print) for unrecognized formats.
        """
        now = datetime.datetime.now()
        day = now.strftime('%Y-%m-%d')
        year = now.strftime('%Y')
        if '昨天' in time:
            # fixed: '昨天' means yesterday, but the original used today's date
            yesterday = (now - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
            time = time.replace('昨天', yesterday + ' ')
        elif '前' in time:
            # e.g. '5分钟前' -> subtract the minutes from now
            minut = int(time[:time.index('分')])
            time = (now + datetime.timedelta(minutes=-minut)).strftime('%Y-%m-%d %H:%M')
        elif len(time) == 5:
            # bare 'HH:MM' means "today"
            time = day + ' ' + time
        elif '年' in time:
            # full 'Y年M月D日' date
            time = time.replace('年', '-').replace('月', '-').replace('日', '')
        elif '月' in time:
            # 'M月D日 HH:MM' within the current year; fixed: the original
            # `time.index('月') == 1` raised ValueError for strings without
            # '月' and rejected two-digit months
            time = year + '-' + time.replace('月', '-').replace('日', '')
        else:
            print('不明时间格式')
            return None
        return time

    def change_vote(self, vote):
        """Normalize a like-count string such as '(1.2万)' to an int.

        Counts are wrapped in fullwidth parentheses; '万' means "ten
        thousand". Anything unparseable normalizes to 0.
        """
        try:
            change = vote[vote.index('(') + 1:vote.index(')')]
            if '万' in change:
                change = int(float(change[:change.index('万')]) * 10000)
            else:
                change = int(change)
        # narrowed from a bare `except:` so genuine bugs are not hidden
        except (ValueError, TypeError, AttributeError):
            change = 0
        return change

    def one_page_comments_download(self, html):
        """Collect name, content, time, votes and reply info from one page.

        Appends each comment's fields into ``self.people``.
        """
        persons = html.find_all(class_='itm')
        for person in persons:
            comment = person.find(class_='cnt').text
            name = comment[:comment.index(':')]
            comment = comment[comment.index(':') + 1:]
            date = person.find(class_='time')
            date = self.change_time(date.text)
            vote = person.find(class_='rp')
            # fixed: delegate to change_vote — the original inline int()
            # parse silently dropped '万' counts such as '(1.2万)' to 0
            vote = self.change_vote(vote.text) if vote is not None else 0
            try:
                replied_comment = person.find(class_='que').text
                if '删除' in replied_comment:
                    # "this comment has been deleted" — keep notice, no author
                    replied_name = None
                else:
                    replied_name = replied_comment[:replied_comment.index(':')]
                    replied_comment = replied_comment[replied_comment.index(':') + 1:]
            except AttributeError:
                # no 'que' element: this comment is not a reply
                replied_comment = None
                replied_name = None
            self.people['names'].append(name)
            self.people['comments'].append(comment)
            self.people['dates'].append(date)
            self.people['votes'].append(vote)
            self.people['replied_names'].append(replied_name)
            self.people['replied_comments'].append(replied_comment)

    def save_mysql(self, people):
        """Insert the scraped comments into MySQL row by row.

        Credentials/database are placeholders for the reader to fill in;
        utf8mb4 is required so emoji in comment text survive. A failing
        row is logged, rolled back and skipped (best-effort semantics).
        """
        db = pymysql.connect(host='localhost', port=3306, user='root',
                             passwd='your password', db='your db', charset='utf8mb4')
        cursor = db.cursor()
        # hoisted out of the loop: selecting the schema once suffices
        cursor.execute('USE text')
        sql = ('INSERT INTO wyycomments '
               '(name, own_comment, vote, date, replied_name, replied_comment) '
               'VALUES (%s,%s,%s,%s,%s,%s)')
        for i in range(len(people['names'])):
            try:
                cursor.execute(sql, (people['names'][i], people['comments'][i],
                                     people['votes'][i], people['dates'][i],
                                     people['replied_names'][i],
                                     people['replied_comments'][i]))
                db.commit()
            except Exception as e:
                print(e)
                db.rollback()
        cursor.close()
        db.close()

    def save_csv(self, people):
        """Dump the comment dict to CSV, plus a comments-only txt for word clouds.

        'utf_8_sig' writes a BOM so Excel opens the CSV with correct encoding.
        """
        people = DataFrame(people)
        people.to_csv('D:\\wyy_comments.csv', encoding='utf_8_sig')
        # comments-only text file consumed by the word-cloud step
        people.to_csv('D:\\ciyun\\use.txt', columns=['comments'], index=0, header=0)

    def collect_comments(self, n=1, name='云烟成雨', style=()):
        """Scrape `n` pages of comments for `name`; return ``self.people``.

        `style` may contain 'mysql' and/or 'csv' to select persistence
        (default changed from a mutable `[]` to an equivalent immutable
        `()`). Pages forward then back once before collecting: a fresh song
        page shows the 15 hottest comments first, and after the round trip
        they disappear so comments arrive purely in time order.
        """
        driver = self.search(name)
        if n < 1:
            print('抱歉,您至少得爬一页吧')
            driver.quit()  # quit() also terminates the chromedriver process
            return None
        try:
            self.download_next_page(driver)
            self.one_page_comments_download(self.download_previous_page(driver))
            print('获取了第1页的评论')
            for i in range(int(n - 1)):
                # fixed off-by-one: the original appended to a list but indexed
                # html[i], re-processing page 1 and skipping the final page
                page = self.download_next_page(driver)
                self.one_page_comments_download(page)
                print('获取了第' + str(i + 2) + '页的评论')
                time.sleep(0.5)
            if 'mysql' in style:
                self.save_mysql(self.people)
                print('存储入MySQL')
            if 'csv' in style:
                self.save_csv(self.people)
                print('存储入csv')
        except Exception as e:
            print(e)
        finally:
            driver.quit()
        return self.people
# Script entry point: guard with __main__ so the class can be imported
# without immediately launching a 2000-page scrape.
if __name__ == '__main__':
    text = wyy()
    end = text.collect_comments(2000, '云烟成雨', ['mysql', 'csv'])
以上就完成了整个爬取的过程,接下来可以对获取的信息进一步进行分析。比如根据评论来制作一首歌的词云:
词云的制作可以去点击这里学习。
完整的代码可以在我的github上查看
第一次写博客,有不对的地方还请大家多多指点!