Python 爬虫:百度图片(动态加载)

  • 记录一下
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import time
import requests

def getnamepage(name):
    """Open Baidu image search and submit ``name`` as the query.

    Uses the module-level WebDriver ``b``; it must be created before this
    function is called.

    Args:
        name: Search keyword typed into the search box.
    """
    b.get("http://image.baidu.com/")
    # The generic find_element(by, value) form is used because the
    # find_element_by_* helpers were removed in newer Selenium releases;
    # 'id' is the locator-strategy string behind By.ID.
    search_box = b.find_element('id', 'kw')
    search_box.send_keys(name)
    search_box.send_keys(Keys.ENTER)
    # Fixed pause to let the result page load before the caller continues.
    time.sleep(3)

def download(imglist, num, save_dir='./baidu_pic'):
    """Walk through ``num`` images of the current result page and save them.

    Uses the module-level WebDriver ``b``, which must already show a Baidu
    image result page. The function switches the size filter to its third
    entry (the large-size option, per the original author's note), opens the
    first thumbnail, then repeatedly downloads the displayed image and clicks
    the "next" control.

    Args:
        imglist: Unused; kept so existing callers keep working.
        num: Number of images to visit. Images whose download fails are
            skipped, so fewer than ``num`` files may be written.
        save_dir: Directory for the saved files, created if missing. Files
            are named ``1.jpg``, ``2.jpg``, ... in order of successful
            download.
    """
    import os  # local import: the module header does not import os

    # XPath of the "next image" control on the detail page.
    next_xpath = '/html/body/div[1]/div[2]/div/span[2]/span'  # //*[@id="container"]/span[2]/span

    # open() does not create missing directories, so make sure it exists.
    os.makedirs(save_dir, exist_ok=True)

    # Select the large-size filter: hover the filter, hover the option, click.
    size_filter = b.find_element('id', 'sizeFilter')
    ActionChains(b).move_to_element(size_filter).perform()
    time.sleep(3)
    large_option = b.find_element('xpath', '//*[@id="sizeFilter"]/div/ul/li[3]')
    ActionChains(b).move_to_element(large_option).perform()
    time.sleep(3)
    large_option.click()
    time.sleep(3)

    first_thumb = b.find_element('xpath', '/html/body/div[2]/div[2]/div[4]/div/ul/li[1]/div/a/img')  # //*[@id="imgid"]/div/ul/li[1]/div/a/img
    first_thumb.click()
    # The code expects the click to open the detail view as a second window.
    b.switch_to.window(b.window_handles[1])

    saved = 0
    for _ in range(num):
        img = b.find_element('xpath', '//*[@id="currentImg"]').get_attribute('src')
        try:
            # A timeout keeps one stalled server from hanging the whole run.
            r = requests.get(img, timeout=10)
            ok = r.status_code == 200
        except requests.RequestException:
            # Invalid URL or network failure: skip this image.
            ok = False

        if ok:
            saved += 1
            path = os.path.join(save_dir, '%d.jpg' % saved)
            print("正在爬取  " + img)
            with open(path, 'wb') as f:
                f.write(r.content)
            print('爬取成功')

        # Advance to the next image whether or not this one was saved, then
        # pause so the page can swap the image before its src is read again.
        b.find_element('xpath', next_xpath).click()
        time.sleep(1)

if __name__ == "__main__":
    # `b` is the module-level driver that getnamepage() and download() use.
    b = webdriver.Chrome()
    try:
        name = '哈士奇'
        num = 30
        imglist = []
        getnamepage(name)
        download(imglist, num)
    finally:
        # quit() ends the whole browser session; close() would only close the
        # current window and leave the other window and the driver running.
        # The finally clause also cleans up when scraping raises.
        b.quit()

另外,还看到可以用线程池来加快下载速度,有空学习一下。
python爬虫百度图片(动态加载)_第1张图片

参考
https://www.bilibili.com/video/BV1Va4y1Y7fK?from=search&seid=12535724320392963786

https://www.bilibili.com/video/BV127411n7jC?from=search&seid=12535724320392963786

你可能感兴趣的:(数据挖掘)