Scraping Pixiv data to filter high-popularity artworks, bypassing the premium (VIP) popularity sort

The full code first:

import requests
from urllib.parse import urlencode   # build the query string
import json       # read and write the saved data
import re         # regular expressions
import time
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)   # silence the verify=False warnings

def data_save(save_folder, book_data=None):   # parameters with defaults must come after positional ones
	if book_data is None:
		# load mode: return the saved data, or an empty dict if the file is unreadable
		try:
			with open('./item_data/' + save_folder + '.json', 'r', encoding='utf-8') as data:
				data_json = json.load(data)
		except (OSError, json.JSONDecodeError):
			return {}
		else:
			return data_json
	else:
		# save mode: dump the data to the file as JSON
		try:
			with open('./item_data/' + save_folder + '.json', 'w', encoding='utf-8') as data:
				json.dump(book_data, data)
		except OSError:
			return

def load_cookies():
	cookie_json = {}
	try:
		with open('cookies.json', 'r', encoding='utf-8') as cookies_file:
			cookie = json.load(cookies_file)
	except (OSError, json.JSONDecodeError):
		print('Failed to load cookies')
	else:
		for item in cookie:
			if item['domain'] == '.pixiv.net':
				cookie_json[item['name']] = item['value']
	return cookie_json   # an empty dict on failure, so get_session() still works

def get_session():
	head = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
	'Accept-Language': 'zh-CN,zh;q=0.9',
	'Accept-Encoding': '',       # disable compression so the raw HTML is easy to inspect
	'Referer': 'https://www.pixiv.net/',
	}

	cookie_json = load_cookies()

	session = requests.Session()
	session.headers.update(head)
	requests.utils.add_dict_to_cookiejar(session.cookies, cookie_json)

	return session

def get_item_id(item_number):
	data = {
	'type': 'illust',
	'sample_illusts': 'auto',
	'num_recommendations': item_number,
	'page': 'discovery',
	'mode': '',     # safe, all, r18
	}
	session = get_session()
	host_url = 'https://www.pixiv.net/rpc/recommender.php?' + urlencode(data)
	try:
		host_response = session.get(host_url, verify=False).json()
	except (requests.RequestException, ValueError):
		return []    # return an empty list so the caller's loop still works
	else:
		return host_response['recommendations']

def popular():
	#save_folder = time.strftime('%Y-%m-%d-%H')
	save_folder = 'data'
	book_data = data_save(save_folder)   # load previously scraped data, if any
	recommendations = get_item_id(1000)
	number = 0
	for item_id in recommendations:
		number += 1
		item_url = 'https://www.pixiv.net/artworks/' + item_id
		try:
			item_response = requests.get(item_url, verify=False)
		except requests.RequestException:
			print('Request failed: ' + item_url)
		else:
			if item_response.status_code == 200:
				item_response.encoding = 'utf-8'
				try:
					book_title = re.findall('#(.*?) - pixiv', item_response.text)[0]
					book_page_count = re.findall(r'"pageCount":(\d+),"bookmarkCount":', item_response.text)[0]
					book_mark_count = re.findall(r'"bookmarkCount":(\d+),', item_response.text)[0]
					book_like_count = re.findall(r'"likeCount":(\d+),', item_response.text)[0]
				except IndexError:
					pass     # page layout changed or work is unavailable; skip it
				else:
					book_data[item_id] = {"book_title": book_title, "book_page_count": book_page_count, "book_mark_count": book_mark_count, "book_like_count": book_like_count}
					time.sleep(3)
					print('Saving: ' + book_title)
					if number % 10 == 0:     # checkpoint every 10 items in case the crawl is interrupted
						data_save(save_folder, book_data)
	data_save(save_folder, book_data)

def main():
	popular()

if __name__ == '__main__':
	main()

load_cookies(), get_session()

These two were covered in detail in the previous post on the Pixiv bookmarks crawler, so I won't repeat them here.
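For reference, load_cookies() expects cookies.json to be a list of cookie objects carrying at least domain, name, and value fields, which is the shape browser cookie-export extensions typically produce. A minimal sketch of a compatible file (the cookie names and values below are placeholders, not real Pixiv credentials):

import json

sample_cookies = [
	{"domain": ".pixiv.net", "name": "PHPSESSID", "value": "<your session id>"},
	{"domain": ".pixiv.net", "name": "device_token", "value": "<your token>"},
]
with open('cookies.json', 'w', encoding='utf-8') as f:
	json.dump(sample_cookies, f)   # load_cookies() keeps only the .pixiv.net entries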

get_item_id(item_number)

from urllib.parse import urlencode
def get_item_id(item_number):
	data = {
	'type': 'illust',
	'sample_illusts': 'auto',
	'num_recommendations': item_number,
	'page': 'discovery',
	'mode': '',    # safe, all, r18
	}
	session = get_session()
	host_url = 'https://www.pixiv.net/rpc/recommender.php?' + urlencode(data)
	try:
		host_response = session.get(host_url, verify=False).json()
	except (requests.RequestException, ValueError):
		return []    # return an empty list so the caller's loop still works
	else:
		return host_response['recommendations']

Clicking "see more" on the Pixiv home page takes you to a discovery page full of recommended artworks.
The artworks there are loaded through asynchronous requests: a single request returns 1000 artwork ids by default, the page then renders 50 of those works at a time, and more are loaded as the scrollbar approaches the bottom.
The item_number argument passed to get_item_id() can be changed: setting it to 2000 returns 2000 ids, but to stay on the safe side I only use 1000.
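A quick sanity check of the call might look like the sketch below. It assumes the endpoint still answers with a JSON object whose recommendations key is a list of ids; if the ids come back as integers rather than strings, normalize them with str() before building URLs:

ids = get_item_id(1000)
print(len(ids))                        # up to 1000 ids
ids = [str(i) for i in ids]            # normalize in case the API returns ints
print('https://www.pixiv.net/artworks/' + ids[0])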

popular()

import re
import time
def popular():
	#save_folder = time.strftime('%Y-%m-%d-%H')
	save_folder = 'data'
	book_data = data_save(save_folder)   # load previously scraped data, if any
	recommendations = get_item_id(1000)
	number = 0
	for item_id in recommendations:
		number += 1
		item_url = 'https://www.pixiv.net/artworks/' + item_id
		try:
			item_response = requests.get(item_url, verify=False)
		except requests.RequestException:
			print('Request failed: ' + item_url)
		else:
			if item_response.status_code == 200:
				item_response.encoding = 'utf-8'
				try:
					book_title = re.findall('#(.*?) - pixiv', item_response.text)[0]
					book_page_count = re.findall(r'"pageCount":(\d+),"bookmarkCount":', item_response.text)[0]
					book_mark_count = re.findall(r'"bookmarkCount":(\d+),', item_response.text)[0]
					book_like_count = re.findall(r'"likeCount":(\d+),', item_response.text)[0]
				except IndexError:
					pass     # page layout changed or work is unavailable; skip it
				else:
					book_data[item_id] = {"book_title": book_title, "book_page_count": book_page_count, "book_mark_count": book_mark_count, "book_like_count": book_like_count}
					time.sleep(3)
					print('Saving: ' + book_title)
					if number % 10 == 0:     # checkpoint every 10 items in case the crawl is interrupted
						data_save(save_folder, book_data)
	data_save(save_folder, book_data)

I use get_item_id() to fetch a large batch of artwork ids and build the full artwork URL from each id.
Since the goal is to filter out high-popularity artworks, we need each work's page count, bookmark count, and like count. You might ask: why not use the session returned by get_session() here, with a logged-in request? Logging in is in fact optional at this step: even when a restricted image cannot be viewed, the page still exposes the bookmark and like counts; only the image itself is hidden. And to keep the account out of trouble, a step that fires this many requests should definitely not put the account at risk.
Finally, data_save() persists the results. In case the crawl is interrupted partway through, I checkpoint the data every 10 items scraped.
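Because book_data is reloaded on startup, an interrupted crawl can also skip works it has already scraped. A small helper along these lines (my addition, not part of the original script) makes restarts cheap:

def resume_filter(recommendations, book_data):
	# Keep only the ids that are not yet in the saved data,
	# so a restarted crawl picks up where it left off.
	return [item_id for item_id in recommendations if item_id not in book_data]

# inside popular(), replace the assignment with:
#   recommendations = resume_filter(get_item_id(1000), book_data)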

data_save()

import json
def data_save(save_folder, book_data=None):
	if book_data is None:
		# load mode: return the saved data, or an empty dict if the file is unreadable
		try:
			with open('./item_data/' + save_folder + '.json', 'r', encoding='utf-8') as data:
				data_json = json.load(data)
		except (OSError, json.JSONDecodeError):
			return {}
		else:
			return data_json
	else:
		# save mode: dump the data to the file as JSON
		try:
			with open('./item_data/' + save_folder + '.json', 'w', encoding='utf-8') as data:
				json.dump(book_data, data)
		except OSError:
			return

If no book_data argument is passed, the default None is used, which gives the function two modes: load and save.
In load mode, an error almost always means the file is missing, corrupted, or empty, so the function returns an empty dict and the crawl starts from scratch.
In save mode, the data is written to the file as JSON. Now that all this data is in hand, I probably don't need to tell you what to do with it.
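For instance, one obvious use is ranking the saved works by bookmark count, which is exactly the popularity sort Pixiv reserves for premium users. A minimal sketch over the fields this script stores (the 1000-bookmark threshold is an arbitrary choice):

import json

def top_works(save_folder='data', min_bookmarks=1000):
	# Load the scraped data and keep works whose bookmark count passes
	# the threshold, sorted from most to least bookmarked.
	with open('./item_data/' + save_folder + '.json', 'r', encoding='utf-8') as f:
		book_data = json.load(f)
	ranked = [pair for pair in book_data.items() if int(pair[1]['book_mark_count']) >= min_bookmarks]
	ranked.sort(key=lambda pair: int(pair[1]['book_mark_count']), reverse=True)
	for item_id, info in ranked[:20]:
		print(info['book_mark_count'], info['book_title'], 'https://www.pixiv.net/artworks/' + item_id)

top_works()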

Date: 2020/3/29
