Python crawler: scraping a manga site
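A small practice crawler: it fetches a comic's index page, pulls the chapter list with XPath, reads each chapter's page count out of an inline script, then downloads every page image into one folder per chapter, driven by a multiprocessing Pool.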

# -*- coding: UTF-8 -*-

import requests
import re
from bs4 import BeautifulSoup
import lxml
from lxml import etree
from fake_useragent import UserAgent
import time
import os
import random
from multiprocessing import Pool

def get_url(url):  # fetch a page with a random browser User-Agent
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = 'UTF-8'
    return response.text

def list_url(text):  # chapter list
    html = etree.HTML(text)
    chapter_urls = html.xpath('//*[@id="play_0"]/ul/li/a/@href')
    return chapter_urls[::-1]  # reverse so chapters are processed first to last

def parse_page_num(html2, detail_url):  # work out the chapter's page count and build a URL for each page
    try:
        # the total page count sits in an inline script: "... totalpage = N;"
        pattern = re.compile(r'function.*?prepage.*?totalpage = (.*?);', re.S)
        items = re.findall(pattern, html2)
        label_list = [int(i) for i in items]
        x = label_list[0]
        print('This chapter has %s pages' % x)
        total_every_page = []
        for i in range(x + 1):
            if i == 0:
                every_page = detail_url  # page 1 uses the chapter URL as-is
            elif i == 1:
                continue  # page 1 was already added above
            else:
                every_page = detail_url[:-5] + '_' + str(i) + '.html'  # e.g. ..._2.html
            total_every_page.append(every_page)
        return total_every_page
    except ValueError:
        pass  # page count could not be parsed; the caller gets None
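For example (with a made-up chapter href here; real ones come from list_url), a chapter whose HTML reports totalpage = 3 expands like this:

pages = parse_page_num(html2, '/manhua/3537/123456.html')
# -> ['/manhua/3537/123456.html',
#     '/manhua/3537/123456_2.html',
#     '/manhua/3537/123456_3.html']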

def image_url(text):  # extract the image URL from a single page
    html = etree.HTML(text)
    img_url = html.xpath('/html/body/div[2]/div[2]/div[4]/p/a/img/@src')[0]
    return img_url

def down_image(img_url1, num1, filename):  # download one image and save it as <num>.jpg
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    response = requests.get(img_url1, headers=headers, timeout=10)
    path = os.path.join(filename, str(num1) + '.jpg')
    with open(path, 'wb') as f:
        f.write(response.content)  # raw bytes, so no encoding needed

def new_bao_all(text):  # create a folder named after the comic title
    html = etree.HTML(text)
    title_all = html.xpath('//*[@id="intro_l"]/div[2]/h1/text()')[0]
    print(title_all)
    pwd = os.getcwd()
    filename = os.path.join(pwd, title_all)
    if not os.path.exists(filename):
        os.makedirs(filename)
    return filename
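Side note: on Python 3 the exists check and the makedirs call can be collapsed into one line, which the chapter folders below could use as well:

os.makedirs(filename, exist_ok=True)  # no error if the folder already exists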

def new_bao(html2, filename0):  # build the folder path for one chapter
    html = etree.HTML(html2)
    title_url = html.xpath('/html/body/div[2]/div[2]/h1/text()')[0]
    filename = os.path.join(filename0, title_url)
    print(filename)
    return filename

def main():
    url = 'https://www.dagumanhua.com/manhua/3537/'  # comic index page
    text = get_url(url)
    url_list = list_url(text)  # chapter hrefs
    filename0 = new_bao_all(text)  # folder for the whole comic
    for detail_url in url_list:  # complete each relative chapter link
        list_url1 = 'https://www.dagumanhua.com' + str(detail_url)
        print(list_url1)
        html2 = get_url(list_url1)
        filename1 = new_bao(html2, filename0)
        detail_list = parse_page_num(html2, detail_url)
        i = 0
        if not os.path.exists(filename1):
            os.makedirs(filename1)
        try:
            for detail_list_i in detail_list:
                detail_list_i1 = 'https://www.dagumanhua.com' + str(detail_list_i)
                print(detail_list_i1)
                text1 = get_url(detail_list_i1)
                img_url1 = image_url(text1).strip()
                print(img_url1)
                i = i + 1
                down_image(img_url1, i, filename1)
                time.sleep(0.5)  # pause between image requests
        except Exception:
            pass  # skip the rest of this chapter if a page fails

if __name__ == '__main__':
    p = Pool()
    for i in range(4):
        p.apply_async(main)  # non-blocking: run main() in a worker process
    p.close()
    p.join()
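One thing to note about the Pool usage: apply_async(main) is called four times, so four worker processes all crawl the same comic from the start and duplicate the work. A possible variation (just a sketch reusing the functions above; download_chapter is a helper I made up here, not part of the original script) is to split the work per chapter instead:

def download_chapter(detail_url, filename0):  # hypothetical helper: one chapter per worker
    html2 = get_url('https://www.dagumanhua.com' + str(detail_url))
    filename1 = new_bao(html2, filename0)
    os.makedirs(filename1, exist_ok=True)
    for num, page in enumerate(parse_page_num(html2, detail_url), start=1):
        text1 = get_url('https://www.dagumanhua.com' + str(page))
        down_image(image_url(text1).strip(), num, filename1)
        time.sleep(0.5)

if __name__ == '__main__':
    text = get_url('https://www.dagumanhua.com/manhua/3537/')
    filename0 = new_bao_all(text)
    with Pool(4) as p:
        # one task per chapter href instead of four copies of main()
        p.starmap(download_chapter, [(href, filename0) for href in list_url(text)])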

emmm, this is my first personal practice project: scraping a manga site.
