Scraping and downloading images in Python with requests

A while back I was looking for an image to use as a wallpaper. A Baidu search turned up plenty of pictures, and following one of the links I landed on a small site. On a whim I decided to scrape some of its images; since it is a small site, it has no anti-scraping measures in place. What I mainly want to share in this post is the method for downloading and naming the images. The code, with comments, is below:

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import os                  # for building the save path
from hashlib import md5    # md5 is used to name the downloaded files

urls = ['http://www.27270.com/ent/meinvtupian/list_11_{}.html'.format(i) for i in range(1, 21)]
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'Cookie': 'Hm_lvt_63a864f136a45557b3e0cbce07b7e572=1533813608,1533820122,1533887189,1535023368; Hm_lpvt_63a864f136a45557b3e0cbce07b7e572=1535027309'
}
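# Note: the Cookie above was copied from my own browser session; since the site
# has no anti-scraping measures, it is probably not strictly required.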

# Collect the detail-page URL of every image set on a list page
def get_detail_url(url):
    try:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        # each image set is an <li> whose <a class="MMPic"> links to its detail page
        links = soup.select('body > div.w1200.yh > div.MeinvTuPianBox > ul > li > a.MMPic')
        for link in links:
            href = link.get('href')
            get_real_href(href)
    except RequestException:
        print("connection error", url)

# Build the URL of each page in an image set (each set spans several pages)
def get_real_href(url):
    url = url[:-5]    # strip the trailing '.html'
    suffixes = ['_{}.html'.format(i) for i in range(1, 9)]
    for suffix in suffixes:
        get_src(url + suffix)
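# For example, with a hypothetical detail URL (for illustration only):
#   'http://www.27270.com/ent/meinvtupian/2018/1234.html'[:-5] gives
#   'http://www.27270.com/ent/meinvtupian/2018/1234', and appending '_2.html'
#   yields the second page of that set.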

# Extract the image src from a page and hand it off for download
def get_src(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    photos = soup.select('#picBody > p > a > img')
    for photo in photos:
        src = photo.get('src')
        download_image(src)

# Download one image
def download_image(url):
    print("downloading", url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
    except RequestException:
        print("failed to download image", url)

# Save an image, naming the file after the md5 hash of its content (saved as .jpg)
def save_image(content):
    file_path = "{0}/{1}.{2}".format(os.getcwd(), md5(content).hexdigest(), 'jpg')

    if not os.path.exists(file_path):  # identical content hashes to the same name, so duplicates are skipped
        with open(file_path, 'wb') as f:
            f.write(content)

def main():
    for url in urls:
        get_detail_url(url)

if __name__=='__main__':
    main()
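Since the naming method is the main point of this post, here it is pulled out on its own: naming the file after the md5 hash of the image bytes gives a stable, content-derived filename, so the same picture downloaded twice maps to the same path and the os.path.exists check skips the duplicate write. A minimal, self-contained sketch of just that idea (the byte strings below are made-up stand-ins for real image content):

import os
from hashlib import md5

def content_filename(content, ext='jpg'):
    # identical bytes -> identical md5 -> identical file name
    return "{0}/{1}.{2}".format(os.getcwd(), md5(content).hexdigest(), ext)

print(content_filename(b'fake image bytes') == content_filename(b'fake image bytes'))  # True
print(content_filename(b'fake image bytes') == content_filename(b'other bytes'))       # False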
