# Scrape user reviews of domestic (mainland China) hotels from Ctrip
#-*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import pymysql
import os
import re
import time
# Browser setup: a Chrome session with a spoofed desktop user agent, routed
# through an HTTP proxy.
options = Options()
# The override has to be passed as the ``--user-agent=`` switch and without
# embedded quotes; the previous ``User-agent="..."`` spelling is not that
# switch and would have carried the quote characters into the value.
options.add_argument(
    '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
)
# Proxy address; proxy IPs are listed at http://www.xicidaili.com/nn/
options.add_argument("--proxy-server=http://121.242.67.236:8118")
# NOTE(review): newer Selenium releases prefer the ``options=`` keyword over
# ``chrome_options=``; kept as-is to match the Selenium version this script
# was written for -- confirm before upgrading.
driver = webdriver.Chrome(chrome_options=options)
# Ticket-search page that is scraped below for the list of city names.
url_main = "http://piao.ctrip.com/piao.html?keyword=中国#ctm_ref=vat_hp_sb_lst"
driver.get(url_main)
# Locate the elements that hold the names of all Chinese cities.  The primary
# locator walks a fixed path to the ``area_box`` container; if any step of it
# fails, fall back to the items of the ``clearfix`` list instead.
try:
    data = driver.find_element_by_xpath("/html/body/div/div/div/div/ul[3]")
    data1 = data.find_element_by_class_name('area_box')
    data2 = data1.find_elements_by_xpath('a')
except Exception:
    # ``Exception`` rather than a bare ``except`` so Ctrl-C / SystemExit
    # still terminate the script.
    data2 = driver.find_elements_by_xpath("//ul[@class='clearfix']/li")
# Build ``str2``, the cleaned city names, one entry per element in ``data2``.
# ``innerText`` is read (instead of ``.text``) so hidden elements are included;
# digits are stripped first, then every remaining non-word character.
str1 = ""
str2 = []
for city_element in data2:
    hidden_text = city_element.get_attribute("innerText")
    str1 = re.sub(r'\d', "", hidden_text)
    str2.append(re.sub(r'(\W)', "", str1))
# Go to the Ctrip home page; its search box is used to look up every city.
url_main = "http://www.ctrip.com/"
driver.get(url_main)

# Locators shared by the helpers below.
_PAGER_LINKS_XPATH = "//div[@class='c_page_list layoutfix']/a"
_HOTEL_LINKS_XPATH = "//div[@class='hotel_new_list J_HotelListBaseCell']/ul/li/h2/a"
_COMMENT_BLOCKS_XPATH = "//div[@class='comment_block J_asyncCmt']"


def _last_page_number(driver):
    """Return the number shown on the last pager link, or 0 if there is no pager.

    Raises ``ValueError`` when a pager link's text is not an integer.
    """
    last = 0
    for link in driver.find_elements_by_xpath(_PAGER_LINKS_XPATH):
        last = int(link.text)
    return last


def _hotel_urls(driver):
    """Return the detail-page URLs of the hotels listed on the current page.

    The URLs are read up front so the loop over hotels never touches list-page
    elements again after other windows have been opened and closed.
    """
    links = driver.find_elements_by_xpath(_HOTEL_LINKS_XPATH)
    hrefs = (link.get_attribute('href') for link in links)
    return [href for href in hrefs if href]  # links without an href cannot be opened


def _print_comments_on_page(driver):
    """Print every comment block currently displayed on the hotel page.

    For each block the text of its ``name`` element, its ``n`` element and its
    ``J_commentDetail`` element is printed; a missing detail prints an empty
    line, while a missing ``name``/``n`` element raises.
    """
    for block in driver.find_elements_by_xpath(_COMMENT_BLOCKS_XPATH):
        print(block.find_element_by_class_name('name').text)
        print(block.find_element_by_class_name('n').text)
        try:
            print(block.find_element_by_class_name('J_commentDetail').text)
        except Exception:
            print("")


def _print_all_comment_pages(driver):
    """Print the comments of every comment page of the open hotel."""
    time.sleep(3)  # delay before reading the comment pager
    page_count = _last_page_number(driver)
    print(page_count)  # number of comment pages reported by the pager
    # No pager means everything fits on one page; it must still be read.
    total_pages = max(page_count, 1)
    current_page = 1
    for _ in range(total_pages):
        # Wait *before* locating the blocks so that, after a page turn, the
        # elements read belong to the newly loaded page.
        time.sleep(3)
        _print_comments_on_page(driver)
        # Move to the next page, retrying up to three times; the page counter
        # only advances once the "next" control was actually activated.
        for _attempt in range(3):
            if current_page >= total_pages:
                break
            try:
                driver.find_element_by_class_name('c_down').send_keys(Keys.ENTER)
                current_page += 1
                break
            except Exception:
                time.sleep(1)


def _print_hotel_comments(driver):
    """Print all comments of the open hotel, degrading to the visible page only."""
    try:
        _print_all_comment_pages(driver)
    except Exception:
        # Paging failed part-way: make one best-effort pass over whatever
        # comments are on screen right now.
        try:
            _print_comments_on_page(driver)
        except Exception:
            print("没有评论")


def _print_hotel_details(driver):
    """Print name, address, overall score and the score breakdown of the open hotel.

    Raises if the hotel-name heading cannot be located; a missing score or
    breakdown is printed as an empty string instead.
    """
    print(driver.find_element_by_xpath("/html/body/form/div/div/div/div/div/h2").text)
    address_parts = driver.find_elements_by_xpath("//div[@class='adress']/span")
    print("".join(part.text for part in address_parts))
    try:
        score = driver.find_element_by_xpath(
            "//div[@class='comment_total_score']/span[@class='score']/span").text
    except Exception:
        score = ""
    print(score)
    try:
        bars = driver.find_elements_by_xpath("//div[@class='bar_score']/p")
        breakdown = "".join(bar.text + " " for bar in bars)
    except Exception:
        breakdown = ""
    print(breakdown)


def _scrape_hotel(driver, list_handle, href):
    """Open *href* in a new window, print the hotel's data, then close the window.

    The window is closed and focus returned to *list_handle* even when scraping
    raises, so a failing hotel never leaves a stray window behind.
    """
    known_handles = set(driver.window_handles)
    # Pass the URL as a script argument instead of splicing it into the JS source.
    driver.execute_script("window.open(arguments[0])", href)
    hotel_handle = None
    for handle in driver.window_handles:
        if handle not in known_handles:
            hotel_handle = handle
    driver.switch_to.window(hotel_handle)
    try:
        _print_hotel_details(driver)
        _print_hotel_comments(driver)
    finally:
        driver.close()
        driver.switch_to.window(list_handle)


for city_name in str2:  # one search per city
    if not city_name:
        continue  # nothing to search for
    # Type the city into the home-page search box and confirm with two ENTER
    # presses; the element is looked up again for every interaction.
    driver.find_element_by_id('HD_CityName').send_keys(city_name)
    time.sleep(2)
    driver.find_element_by_id('HD_CityName').send_keys(Keys.ENTER)
    driver.find_element_by_id('HD_CityName').send_keys(Keys.ENTER)
    url_main1 = driver.current_url
    print(url_main1)
    xiecheng_handle = driver.current_window_handle  # window showing the hotel list
    try:
        list_page_count = _last_page_number(driver)
        print(list_page_count)  # number of hotel-list pages for this city
        # A result list without a pager is a single page that must still be scraped.
        total_list_pages = max(list_page_count, 1)
        current_list_page = 1
        for _ in range(total_list_pages):
            try:
                hotel_urls = _hotel_urls(driver)
            except Exception:
                continue
            print(len(hotel_urls))  # hotels on this page
            for hotel_url in hotel_urls:
                _scrape_hotel(driver, xiecheng_handle, hotel_url)
            driver.switch_to.window(xiecheng_handle)
            # Turn to the next list page, retrying up to three times.
            for _attempt in range(3):
                if current_list_page >= total_list_pages:
                    break
                try:
                    driver.find_element_by_class_name('c_down').click()
                    print("--------------翻页--------------------")
                    print("--------------------------------------")
                    current_list_page += 1
                    break
                except Exception:
                    time.sleep(1)
    except Exception:
        # Something in the paged walk failed: make one best-effort pass over
        # the hotels on the page that is currently displayed.
        try:
            for hotel_url in _hotel_urls(driver):
                _scrape_hotel(driver, xiecheng_handle, hotel_url)
        except Exception:
            print("没有酒店")
    # Always go back to the home page so the next city starts from the search box.
    driver.get(url_main)
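# The different situations the crawler runs into above are told apart with try...except, which is rather cumbersome; once you are comfortable with it, try restructuring the code with loops instead.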