What I have covered so far while learning web crawlers:
1. Using third-party packages and modules, including requests, re, urllib.request, bs4.BeautifulSoup, lxml.etree, os, and so on;
2. How web requests work, including what URL, HTML, and XML are, how tags and attrs fit together, GET requests, and spoofing the User-Agent header;
3. Roughly 20 crawler examples, large and small, with hands-on practice creating folders, downloading images, renaming files, and writing loops and nested loops.
Next, the plan is to work through more crawler examples, mostly by re-implementing the crawler examples found on Baidu, and to record all the code here on CSDN.
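All of the examples below share the same skeleton: request a page with a browser-like User-Agent, parse image URLs out of the response, and write each image into a local folder. A minimal sketch of that pattern (the URL and selector here are placeholders, not a real target):

import os
import requests
from bs4 import BeautifulSoup

folder = 'images'
if not os.path.exists(folder):
    os.makedirs(folder)

# Browser-like User-Agent so the site serves the normal page
header = {'User-Agent': 'Mozilla/5.0'}
html = requests.get('http://example.com/page', headers=header).text  # placeholder URL
soup = BeautifulSoup(html, 'html.parser')
for n, img in enumerate(soup.find_all('img'), start=1):
    data = requests.get(img.get('src'), headers=header).content
    with open(os.path.join(folder, str(n) + '.jpg'), 'wb') as f:
        f.write(data)  # save the image bytes as '<folder>/<n>.jpg'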
# -*- coding: utf-8 -*-
'''
2017-12-16 21:37:54 by Jason Young
Target URL: https://tieba.baidu.com/p/5431979599?pn=1
Tutorial:   https://www.cnblogs.com/abelsu/p/4540711.html
'''
import os
import requests
from bs4 import BeautifulSoup

folder = '每日一练'
if not os.path.exists(folder):
    os.makedirs(folder)

def download(url, n):
    # Fetch the image and save it as '<folder>/<n>.jpg'
    response = requests.get(url)
    #name = url.split('/')[-1]
    f = open(folder + '/' + str(n) + '.jpg', 'wb')
    f.write(response.content)
    f.close()
    return True

n = 1
for i in range(1, 3):
    url_tieba = 'https://tieba.baidu.com/p/5431979599?pn=' + str(i)
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    # header was defined but never passed; send it with the request
    response_tieba = requests.get(url_tieba, headers=header)
    html_tieba = response_tieba.text
    soup_tieba = BeautifulSoup(html_tieba, 'html.parser')
    img_list = soup_tieba.find_all('img', attrs={'class': 'BDE_Image'})
    for img in img_list:
        print(n)
        src = img.get('src')
        print(src)
        download(src, n)
        n += 1
print('OK')
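The commented-out line in download() hints at an alternative naming scheme: take the file name from the image URL's last path segment instead of a running counter, so a re-run overwrites the same files rather than depending on n. A sketch:

# e.g. '.../forum/pic/item/abc123.jpg' -> 'abc123.jpg'
name = url.split('/')[-1]
path = os.path.join(folder, name)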
# -*- coding: utf-8 -*-
'''
2017-12-16 22:22:33 by Jason Young
Target URL: http://jandan.net/ooxx/page-390#comments
Tutorial:   https://www.cnblogs.com/luoqingyu/p/6441270.html
---- Scrape failed: jandan.net has been crawled so heavily that it added many anti-crawler measures ----
'''
import os
import requests
from bs4 import BeautifulSoup

folder = '每日一练'
if not os.path.exists(folder):
    os.makedirs(folder)

# Defined before download() is called; download() reads it as a global
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

def download(url, n):
    # Fetch the image and save it as '<folder>/<n>.jpg'
    response = requests.get(url, headers=header)
    f = open(folder + '/' + str(n) + '.jpg', 'wb')
    f.write(response.content)
    f.close()
    return True

n = 1
for i in range(385, 390):
    url = 'http://jandan.net/ooxx/page-' + str(i) + '#comments'
    response = requests.get(url, headers=header)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    a_list = soup.find_all('a', attrs={'class': 'view_img_link'})  # keyword was misspelled 'atrs'
    for a in a_list:  # renamed from 'soup' to avoid shadowing the parsed page
        print(n)
        href = a.get('href')
        print(href)
        download(href, n)
        n += 1
print('OK')
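Beyond jandan's anti-crawler measures, there is one concrete snag worth noting (an assumption about the 2017 page, not something verified here): the view_img_link hrefs were typically protocol-relative ('//host/path'), and requests raises MissingSchema on URLs without a scheme. A minimal normalization sketch:

def normalize(href):
    # Protocol-relative URLs like '//wx1.sinaimg.cn/large/abc.jpg'
    # lack a scheme; prepend one so requests.get() accepts them.
    if href.startswith('//'):
        return 'http:' + href
    return href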
# -*- coding: utf-8 -*-
'''
2017-12-17 15:57:44 by Jason Young
Target URL: http://huaban.com/favorite/beauty/
Tutorial:   https://www.cnblogs.com/nan86150/p/4272452.html
---- Scrape failed: huaban.com has anti-crawler measures; the real img src never shows up in the fetched HTML ----
'''
import os
import requests
from bs4 import BeautifulSoup

folder = '每日一练'
if not os.path.exists(folder):
    os.makedirs(folder)

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

def download(url):
    response = requests.get(url, headers=header)
    # Name the file after the URL's last path segment
    name = url.split('/')[-1]
    f = open(folder + '/' + name + '.jpg', 'wb')
    f.write(response.content)
    f.close()
    return True

url = 'http://huaban.com/boards/19377361/'
response = requests.get(url, headers=header)
html = response.text
soup = BeautifulSoup(html, 'html.parser')
src_list = soup.find_all('img', attrs={'data-baiduimageplus-ignore': '1'})
n = 1
for src_img in src_list:
    print(n)
    src = src_img.get('src')
    print(src)
    download(src)
    n += 1
print('OK')
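Huaban assembles the page with JavaScript: the pin data arrives as JSON inside a <script> block rather than as ready-made <img> tags, which is why the correct src never appears in the fetched HTML. One possible workaround is to pull the data out of that JSON with a regular expression. In this sketch the "key" field and the img.hb.aicdn.com host are assumptions about Huaban's embedded format, not verified:

import re

# Pull image keys out of the JSON embedded in the page source.
# Field name and CDN host are assumed, not confirmed.
keys = re.findall(r'"key":"(.*?)"', html)
for n, key in enumerate(keys, start=1):
    src = 'http://img.hb.aicdn.com/' + key  # hypothetical CDN URL pattern
    print(n, src)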
# -*- coding: utf-8 -*-
'''
2017-12-18 17:09:49 by Jason Young
Target URL: http://tieba.baidu.com/p/2166231880
Tutorial:   http://blog.csdn.net/u012705410/article/details/47685417
---- Scrape succeeded: Baidu Tieba is very easy to scrape ----
'''
import os
import requests
from lxml import etree

folder = '每日一练'
if not os.path.exists(folder):
    os.makedirs(folder)

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

def download(url, n):
    response = requests.get(url, headers=header)
    f = open(folder + '/' + str(n) + '.jpg', 'wb')
    f.write(response.content)
    f.close()
    return True

url = 'http://tieba.baidu.com/p/2166231880'
response = requests.get(url, headers=header)
html = response.text
xml = etree.HTML(html)
# Tieba wraps each post's images in a custom <cc> tag
src_list = xml.xpath('//cc/div/img/@src')
#src_num = len(src_list)
n = 1
for src in src_list:
    print(n)
    print(src)
    download(src, n)
    n += 1
print('OK')
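For comparison with the earlier examples, the same extraction can be written with BeautifulSoup instead of lxml, assuming the page structure the XPath targets; html.parser treats the custom <cc> tag like any other element:

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')
# Mirrors the XPath '//cc/div/img/@src'
src_list = [img.get('src') for img in soup.select('cc div img')]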
# -*- coding: utf-8 -*-
'''
2017-12-18 20:11:21 by Jason Young
Target URL: http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1513595532629_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%8D%A1%E8%BD%A6
----------------- Baidu Images: scrape failed -------------------
'''
import os
import requests
from lxml import etree

folder = '每日一练'
if not os.path.exists(folder):
    os.makedirs(folder)

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

def download(url, n):
    response = requests.get(url, headers=header)
    f = open(folder + '/' + str(n) + '.jpg', 'wb')
    f.write(response.content)
    f.close()
    return True

url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%E5%8D%A1%E8%BD%A6&ct=201326592&ic=0&lm=-1&width=&height=&v=flip'
response = requests.get(url, headers=header)
html = response.text
xml = etree.HTML(html)
# Finds nothing: the result grid is built by JavaScript,
# so these <li> nodes are absent from the raw HTML
img_list = xml.xpath('//div[@class="imgpage"]/ul/li/@data-objurl')
n = 1
for img in img_list:
    print(n)
    print(img)
    download(img, n)
    n += 1
print('OK')
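Since the flip page renders its grid with JavaScript, the data-objurl attributes never show up in the raw HTML that requests receives, and the XPath returns an empty list. A common workaround at the time was to pull the image URLs straight out of the JSON embedded in the page source with re (already in the toolkit from the summary above). In this sketch the "objURL" field name is an assumption about Baidu's embedded data, taken from tutorials rather than verified here:

import re

# Grab candidate original-image URLs from the embedded JSON.
# "objURL" is the field Baidu's flip page reportedly used.
src_list = re.findall(r'"objURL":"(.*?)"', html)
for n, src in enumerate(src_list, start=1):
    print(n, src)
    download(src, n)  # reuses download() and html from the block above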