# 爬取图片 — scrape images from pic.netbian.com

import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
import datetime
import os
import sys
# Python 2 only: restore setdefaultencoding (site.py deletes it at startup)
# and switch the default codec to UTF-8 so writing Chinese picture titles
# does not raise UnicodeEncodeError.  Guarded by a version check so the
# script still imports cleanly on Python 3, where str is already Unicode.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')

starttime = datetime.datetime.now()  # wall-clock start, reported at exit

# Landing page of the picture category being crawled.
all_url = 'http://pic.netbian.com/4kmeinv/'
# Desktop-browser User-Agent; presumably the site rejects the default
# python-requests UA — TODO confirm.
header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'
}
#获取最大页数
def get_page_max():
    """Return the highest listing-page number, as a string of digits.

    Fetches the category landing page (module global ``all_url``) and
    reads its pagination bar.  The last ``<a>`` in the bar is the
    "next page" link, so the second-to-last anchor holds the maximum
    page number; callers convert the returned text with ``int()``.
    """
    # timeout added so a stalled connection cannot hang the crawl forever
    page_url = requests.get(all_url, headers=header, timeout=10)
    # The site serves GBK pages (see the detail-page handling in get_img);
    # set it explicitly so non-ASCII text is not mojibake.
    page_url.encoding = 'gbk'
    page_soup = BeautifulSoup(page_url.text, 'html.parser')
    # [-2]: skip the trailing "next page" anchor.
    return page_soup.find('div', class_='page').find_all('a')[-2].text
#print(get_page_max())
def get_img():
    """Crawl every listing page and save each full-size photo to ``path``.

    Walks pages 1..max (page 1 uses the bare category URL; later pages
    use ``index_<n>.html``), follows each thumbnail link to its detail
    page, and downloads the image named ``<title>.<ext>``.  Files already
    on disk are skipped, so an interrupted crawl can be resumed.
    Relies on module globals ``all_url``, ``same_url``, ``header``,
    ``path`` and on ``get_page_max()``.
    """
    for i in range(1, int(get_page_max()) + 1):
        if i == 1:
            all_html = requests.get(all_url, headers=header, timeout=10)
        else:
            all_html = requests.get(all_url + 'index_' + str(i) + '.html',
                                    headers=header, timeout=10)
        all_soup = BeautifulSoup(all_html.text, 'html.parser')
        all_a = all_soup.find('ul', class_='clearfix').find_all('a')
        for a in all_a:
            # hrefs are site-relative; resolve against the site root.
            url = same_url + a['href']
            html = requests.get(url, headers=header, timeout=10)
            html.encoding = 'gbk'  # detail pages are GBK-encoded
            soup = BeautifulSoup(html.text, 'html.parser')
            image = soup.find('div', class_='photo-pic').find('img')
            text = soup.find('div', class_='photo-hd').text
            # Build "<title>.<original extension>" once instead of
            # recomputing the same expression three times.
            ext = image['src'].rsplit('.', 1)[-1]
            picture_name = path + text + '.' + ext
            # BUG FIX: check for an existing file BEFORE downloading —
            # the original fetched the full image first and only then
            # skipped it, re-downloading everything on every resume.
            if os.path.exists(picture_name):
                print('文件已存在,跳过')
                continue
            pic = requests.get(same_url + image['src'],
                               headers=header, timeout=10)
            with open(picture_name, 'wb') as f:
                print(text + '.' + ext)
                f.write(pic.content)

if __name__ == '__main__':
    if os.name == 'nt':
        print(u'你正在使用win平台')
    else:
        print(u'你正在使用linux平台')

    # Site root, used to resolve the relative hrefs found on listing pages.
    same_url = 'http://pic.netbian.com'

    # Output directory for the downloaded pictures.
    path = os.getcwd() + '/netbian/'
    if not os.path.isdir(path):
        os.mkdir(path)
    else:
        print('文件夹已存在')

    # NOTE(review): the original created a multiprocessing.Pool(4) here,
    # then close()d/join()ed it without ever submitting work — the crawl
    # was single-process all along, so the unused pool has been removed.
    get_img()

    endtime = datetime.datetime.now()
    print((endtime - starttime).seconds)  # total crawl duration in seconds

# 你可能感兴趣的:(爬虫)  — blog-platform footer, not part of the script