使用selenium 爬取豆瓣《千与千寻》影评

这个是老师要求爬两万条数据的时候,顺便做的,我爬取的是《千与千寻》
直接上代码吧

import csv
import json
import re
import time

import lxml.html
import pymysql
from redis import ConnectionPool, StrictRedis
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Connect to Redis through a connection pool; decode_responses=True makes
# reads come back as str instead of bytes.
pool = ConnectionPool(host='localhost', port=6379, db=0, decode_responses=True)
redis = StrictRedis(connection_pool=pool)

# Browser instance and target page (Douban entry for "Spirited Away").
driver = webdriver.Chrome()
url = 'https://movie.douban.com/subject/1291561/'
start = time.time()
print('开始时间:' + str(start))
m = 'QianyuQ'  # Redis list key the scraped records are pushed onto


def _save_current_page():
    """Scrape reviewer names and comment texts from the page currently
    loaded in ``driver`` and LPUSH each record to Redis as a JSON string
    of the form ``{"author": ..., "comment": ...}``.
    """
    names = driver.find_elements(By.XPATH, '//*[@id="comments"]/div/div[2]/h3/span[2]/a')
    comments = driver.find_elements(By.XPATH, '//*[@id="comments"]/div/div[2]/p/span')
    # zip() stops at the shorter list, so a page where the number of name
    # nodes and comment nodes differs cannot raise IndexError.
    for name, comment in zip(names, comments):
        data = json.dumps({'author': name.text, 'comment': comment.text})
        redis.lpush(m, data)


try:
    driver.get(url)
    # Click the "全部评论" (all comments) link to reach the comment list.
    button_3 = driver.find_element(By.XPATH, '//*[@id="comments-section"]/div[1]/h2/span/a')
    button_3.click()
    _save_current_page()  # first page of comments

    # The paginator's first link carries the query string template,
    # e.g. '?start=20&limit=20&status=P&sort=new_score'.  Reuse it and
    # substitute the start offset for every subsequent page instead of
    # rewriting it with brittle fixed-position string slicing.
    selector = lxml.html.fromstring(driver.page_source)
    g = selector.xpath('//*[@id="paginator"]/a/@href')
    query_template = g[0]
    print(query_template)

    # Each page holds 20 comments; walk offsets 20, 40, ..., 460.
    for pg in range(20, 461, 20):
        page_query = re.sub(r'start=\d+', 'start=' + str(pg), query_template, count=1)
        page_url = url + 'comments' + page_query
        print('网址' + page_url)
        driver.get(page_url)
        _save_current_page()
finally:
    # quit() (unlike close()) shuts down every window and ends the
    # WebDriver session even if scraping failed part-way through.
    driver.quit()
    end = time.time()
    print('结束时间:' + str(end))
    print('花费时长:' + str(end - start))

读取保存在 Redis 数据库中的值:
one = redis.lindex('QianyuQ',0)#取第一条评论
one_data = json.loads(one) #因为是一个json数据,所以需要.loads()
print(one_data['author'])#第一条评论的作者的名字

你可能感兴趣的:(python)