Requirement: grab the information shown on the search-results page for a given product; no deep, per-item crawling.
Something like this (screenshot of the 1688 search-results page omitted):
Why use Selenium instead of requests or urllib? Because the 1688 search results are rendered by JavaScript: a plain HTTP fetch returns the page shell without the product cards, while Selenium drives a real browser that executes those scripts.
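For instance, a quick sanity check (a sketch only: the search URL and parameter are read off the browser's address bar and may have changed, and 1688 can redirect anonymous requests to a login page):

import requests

# Fetch the search results the 'dumb' way and look for the offer-list markup.
html = requests.get('https://s.1688.com/selloffer/offer_search.htm',
                    params={'keywords': '保温杯'}, timeout=10).text
print('sm-offer' in html)  # typically False: the cards are injected by JavaScript

Anyway, on to the script: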
import random
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.common.exceptions
import time
import csv
import re
# Search function: type the keyword, submit, and read the total page count
def seacher(key):
    driver.find_element_by_id("home-header-searchbox").clear()
    driver.find_element_by_id("home-header-searchbox").send_keys(key, Keys.ENTER)
    driver.maximize_window()
    time.sleep(1)
    page = driver.find_element_by_xpath('//div[@class="rootComponent"]//span[@class="fui-paging-total"]').text
    # Pull the maximum page number out of the paging text
    page = re.findall(r'\d+', page)[0]
    print(page)
    return int(page)
# Parse the product cards without ads
def get_pridect():
    try:
        driver.implicitly_wait(1)
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(1)
        divs = driver.find_elements_by_xpath('//div[@class="sm-offer"]//div[@class="card-container"]')
        for div in divs:
            shop = div.find_element_by_xpath('.//div[@class="desc-container"]//a').text  # product name
            price = div.find_element_by_xpath('.//div[@class="desc-container"]//div[@class="price"]').text + '元'  # price
            youhui = div.find_element_by_xpath('.//div[@class="desc-container"]//div[@class="offer-tag-container"]').text  # promotions
            chengjiaoe = div.find_element_by_xpath('.//div[@class="desc-container"]//div[@class="price-container"]').text  # sales figure
            companyname = div.find_element_by_xpath('.//div[@class="desc-container"]//div[@class="company-name"]').text  # company name
            shangjialeibie = div.find_element_by_xpath('.//div[@class="desc-container"]//div[@class="company-tag-container"]/a').text  # seller category
            mingcheng = div.find_element_by_xpath('.//div[@class="desc-container"]//div[@class="common-company-tag"]').text  # seller tag
            # Append one CSV row per card (opening the file per row is wasteful but simple)
            with open('./1688-data/0619.csv', 'a', newline='', encoding='utf-8') as file:
                csvwriter = csv.writer(file, delimiter=',')
                csvwriter.writerow([shop, price, youhui, chengjiaoe, companyname, shangjialeibie, mingcheng])
    except selenium.common.exceptions.TimeoutException:
        print('parse_page: TimeoutException')
        get_pridect()
    except selenium.common.exceptions.StaleElementReferenceException:
        print('parse_page: StaleElementReferenceException')
        driver.refresh()
    except selenium.common.exceptions.NoSuchElementException:
        print('parse_page: NoSuchElementException')
        get_pridect()
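
# Side note: implicitly_wait() sets a session-wide element-lookup timeout; it is
# not a sleep. Waiting on one specific element is sharper with an explicit wait
# (a sketch, assuming the same class names as above):
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.by import By
# WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located(
#     (By.XPATH, '//div[@class="sm-offer"]//div[@class="card-container"]')))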
# Parse the product cards with ads
def get_pridected():
    try:
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        driver.implicitly_wait(2)
        divsl = driver.find_elements_by_xpath('//div[@class="sm-offer"]//div[@class="card-container ad-item"]')
        for div in divsl:
            shop = div.find_element_by_xpath('.//div[@class="desc-container"]//a').text  # product name
            price = div.find_element_by_xpath('.//div[@class="desc-container"]//div[@class="price"]').text + '元'  # price
            youhui = div.find_element_by_xpath('.//div[@class="desc-container"]//div[@class="offer-tag-container"]').text  # promotions
            chengjiaoe = div.find_element_by_xpath('.//div[@class="desc-container"]//div[@class="price-container"]').text  # sales figure
            companyname = div.find_element_by_xpath('.//div[@class="desc-container"]//div[@class="company-name"]').text  # company name
            shangjialeibie = div.find_element_by_xpath('.//div[@class="desc-container"]//div[@class="company-tag-container"]/a').text  # seller category
            mingcheng = div.find_element_by_xpath('.//div[@class="desc-container"]//div[@class="common-company-tag"]').text  # seller tag
            # Some cards don't carry the ad tag. I thought about the check
            # below, but sadly it doesn't work -- if you know why, please share:
            # if div.find_element_by_xpath('.div[@class="price-container"]//div[@class="ad-container"]'):
            #     guanggao = div.find_element_by_xpath('.div[@class="price-container"]//div[@class="ad-container"]').text
            # else:
            #     guanggao = ' '
            # print(shop, price, sep='|')
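            # A likely fix (a sketch, untested against 1688's current markup):
            # the relative XPath needs './/div' rather than '.div', and
            # find_elements (plural) returns an empty list instead of raising
            # NoSuchElementException, so its result works as a truth test:
            # ad_nodes = div.find_elements_by_xpath('.//div[@class="price-container"]//div[@class="ad-container"]')
            # guanggao = ad_nodes[0].text if ad_nodes else ''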
            # Append one CSV row per card
            with open('./1688-data/0619.csv', 'a', newline='', encoding='utf-8') as file:
                csvwriter = csv.writer(file, delimiter=',')
                csvwriter.writerow([shop, price, youhui, chengjiaoe, companyname, shangjialeibie, mingcheng])
    except selenium.common.exceptions.TimeoutException:
        print('parse_page: TimeoutException')
        get_pridected()
    except selenium.common.exceptions.StaleElementReferenceException:
        print('parse_page: refreshing the page')
        driver.refresh()
    except selenium.common.exceptions.NoSuchElementException:
        print('parse_page: NoSuchElementException')
        get_pridected()
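
# Side note (not wired in): retrying by calling the parser again inside its own
# except block has no depth limit, so a page that never yields the element can
# recurse until RecursionError. A bounded retry is one safer pattern (sketch):
# import functools
# def bounded_retry(max_tries=3, exc=Exception):
#     def deco(fn):
#         @functools.wraps(fn)
#         def wrapper(*args, **kwargs):
#             for attempt in range(max_tries):
#                 try:
#                     return fn(*args, **kwargs)
#                 except exc:
#                     if attempt == max_tries - 1:
#                         raise
#         return wrapper
#     return deco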
def main():
    print('Crawling page 1 ...')
    page = seacher(keyword)
    get_pridect()
    get_pridected()
    page_num = 1
    while page_num != page:
        print('*' * 100)
        print('Crawling page {}'.format(page_num + 1))
        print('*' * 100)
        test = driver.find_element_by_xpath('//div[@class="common-pagination"]//a[@class="fui-next"]')
        driver.execute_script("arguments[0].click();", test)
        driver.implicitly_wait(2)
        driver.maximize_window()
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        get_pridect()
        get_pridected()  # parse the ad cards on every page, not just page 1
        page_num += 1
if __name__ == '__main__':
    # keyword = input('Enter the product name to search for: ')
    # 'with' closes the file handle itself; no explicit f.close() needed
    with open('./1688-data/classes.txt', 'r', encoding='utf-8') as f:
        datas = f.read().splitlines()
    for i in datas:
        keyword = i
        print('Crawling ' + i)
        # No-image mode + headless (see the working sketch further down):
        # option = webdriver.ChromeOptions()
        # prefs = {
        #     'profile.default_content_setting_values': {
        #         'images': 2
        #     }
        # }
        # option.add_experimental_option('prefs', prefs)
        # option.add_argument('--headless')
        # Proxy IP:
        # PROXY = "171.35.221.103:9999"
        # option.add_argument('--proxy-server=' + PROXY)
        # driver = webdriver.Chrome(chrome_options=option)
        driver = webdriver.Chrome()
        driver.get('https://www.1688.com/')
        main()
Don't ask why I wrote two page-parsing functions. Don't ask; the answer is that I'm a noob.
The ad and non-ad cards barely differ, and I did try a few ways to handle both in one pass, but no luck. (If any expert knows how, please give me a pointer or two; one idea is sketched below.)
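For what it's worth, a sketch of one way to merge the two parsers: in XPath, contains() matches both class values ("card-container" and "card-container ad-item"), so a single query returns ad and non-ad cards together (untested against 1688's current markup):

divs = driver.find_elements_by_xpath(
    '//div[@class="sm-offer"]//div[contains(@class, "card-container")]')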
Change the input file path to your own. One product per line, newline-separated.
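For example, ./1688-data/classes.txt might look like this (made-up keywords):

保温杯
数据线
蓝牙耳机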
The save path is in the CSV open() calls above; remember to change it to your own.
If the project throws errors, don't blame me; blame the fact that we're all rookies.
I did consider headless mode and disabling image loading (which needs the XPath paths adjusted), but the proxy-IP problem made me give up the struggle!
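For reference, a sketch of headless plus no-image Chrome, fixing the option/driver mixup from the commented block in the script (untested here):

from selenium import webdriver

option = webdriver.ChromeOptions()
option.add_argument('--headless')
prefs = {'profile.default_content_setting_values': {'images': 2}}
option.add_experimental_option('prefs', prefs)
driver = webdriver.Chrome(chrome_options=option)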
About proxy IPs: --proxy-server doesn't support switching on the fly, only one proxy per browser session. Tsk, half-baked!
Alternatively, use the DesiredCapabilities (proxy settings) values to open a new sessionId and hit the URL again with the proxy attached, which amounts to restarting the browser and searching once more:
from selenium import webdriver
from selenium.webdriver.common.proxy import ProxyType

browser = webdriver.PhantomJS()
proxy = webdriver.Proxy()
proxy.proxy_type = ProxyType.MANUAL
proxy.http_proxy = '122.4.46.181:9999'
# Write the proxy into the PhantomJS capabilities, then start a fresh session
proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
browser.start_session(webdriver.DesiredCapabilities.PHANTOMJS)
browser.get('https://www.1688.com/')
print('1: ', browser.session_id)
print('2: ', browser.page_source)
print('3: ', browser.get_cookies())
I also thought of another approach: a function that, whenever the login page or a captcha shows up, reminds me to change the IP so I can type a new one into the console by hand. It feels pretty pointless, though, so I let it go; better to figure out how to switch dynamically.
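A minimal sketch of the blunt dynamic approach: tear the driver down and start a fresh one with the next proxy (the addresses below are the throwaway ones from this post; real use needs live proxies):

from selenium import webdriver

def new_driver(proxy):
    option = webdriver.ChromeOptions()
    option.add_argument('--proxy-server=http://' + proxy)
    return webdriver.Chrome(chrome_options=option)

for proxy in ['171.35.221.103:9999', '122.4.46.181:9999']:
    driver = new_driver(proxy)
    driver.get('https://www.1688.com/')
    # ... crawl under this identity, then rotate ...
    driver.quit()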