Code first:
import requests
from urllib.parse import urlencode  # build the request URL
import json  # read and write the data files
import re  # regular expressions
import time
import os
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def data_save(save_folder, book_data=None):  # default parameters must come after positional ones
    #print(save_folder)
    if book_data is None:
        try:
            with open('./item_data/' + save_folder + '.json', 'r', encoding='utf-8') as data:
                data_json = json.load(data)
        except:
            return {}
        else:
            return data_json
    else:
        try:
            with open('./item_data/' + save_folder + '.json', 'w', encoding='utf-8') as data:
                json.dump(book_data, data)
        except:
            return
def load_cookies():
    cookie_json = {}
    try:
        with open('cookies.json', 'r', encoding='utf-8') as cookies_file:
            cookie = json.load(cookies_file)
    except:
        print('failed to read cookies')
    else:
        for i in range(len(cookie)):
            if cookie[i]['domain'] == '.pixiv.net':
                cookie_json[cookie[i]['name']] = cookie[i]['value']
    return cookie_json
def get_session():
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Accept-Encoding': '',
        'Referer': 'https://www.pixiv.net/',
    }
    cookie_json = load_cookies()
    session = requests.session()
    session.headers = head
    requests.utils.add_dict_to_cookiejar(session.cookies, cookie_json)
    return session
def get_item_id(item_number):
    data = {
        'type': 'illust',
        'sample_illusts': 'auto',
        'num_recommendations': item_number,
        'page': 'discovery',
        'mode': '',  # safe, all, r18
    }
    session = get_session()
    host_url = 'https://www.pixiv.net/rpc/recommender.php?' + urlencode(data)
    try:
        host_response = session.get(host_url, verify=False).json()
    except:
        pass
    else:
        return host_response['recommendations']
def popular():
    #save_folder = time.strftime('%Y-%m-%d-%H')
    save_folder = 'data'
    book_data = data_save(save_folder)
    #print(book_data)
    recommendations = get_item_id(1000)
    number = 0
    for item_id in recommendations:
        number += 1
        item_id = str(item_id)  # ids may come back as numbers; normalize for URL building
        item_url = 'https://www.pixiv.net/artworks/' + item_id
        try:
            item_response = requests.get(item_url, verify=False)
        except:
            print('request failed: ' + item_url)
        else:
            if item_response.status_code == 200:
                item_response.encoding = 'utf-8'
                try:
                    book_title = re.findall('#(.*?) - pixiv ', item_response.text)[0]
                    book_page_count = re.findall('"pageCount":(.{1,3}),"bookmarkCount":', item_response.text)[0]
                    book_mark_count = re.findall('"bookmarkCount":(.*?),', item_response.text)[0]
                    book_like_count = re.findall('likeCount":(.*?),', item_response.text)[0]
                except:
                    pass  # page layout changed or the work was deleted; skip this id
                else:
                    book_data[item_id] = {"book_title": book_title, "book_page_count": book_page_count, "book_mark_count": book_mark_count, "book_like_count": book_like_count}
                    #print(item_id)
                    print('saving: ' + book_title)
        time.sleep(3)
        #print(book_data)
        if number % 10 == 0:  # checkpoint in case the crawl gets interrupted
            data_save(save_folder, book_data)
    data_save(save_folder, book_data)
def main():
    popular()

if __name__ == '__main__':
    main()
I won't go over the session and cookie setup again here; for details, see my previous post on the Pixiv bookmarks crawler.
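For reference, load_cookies() only cares about the domain, name, and value fields of each cookie, so cookies.json just needs to be a list of objects carrying those keys, which is the shape most browser cookie-export extensions produce. A hand-made minimal example (the cookie names below are illustrative, not a guaranteed complete set):

import json

# Hypothetical minimal cookies.json in the shape load_cookies() expects.
# Real browser exports include many more fields (path, expiry, ...),
# which load_cookies() simply ignores.
sample_cookies = [
    {"domain": ".pixiv.net", "name": "PHPSESSID", "value": "<your session id>"},
    {"domain": ".pixiv.net", "name": "device_token", "value": "<your token>"},
]

with open('cookies.json', 'w', encoding='utf-8') as f:
    json.dump(sample_cookies, f)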
from urllib.parse import urlencode
def get_item_id(item_number):
    data = {
        'type': 'illust',
        'sample_illusts': 'auto',
        'num_recommendations': item_number,
        'page': 'discovery',
        'mode': '',  # safe, all, r18
    }
    session = get_session()
    host_url = 'https://www.pixiv.net/rpc/recommender.php?' + urlencode(data)
    try:
        host_response = session.get(host_url, verify=False).json()
    except:
        pass
    else:
        return host_response['recommendations']
Clicking "See more" on the Pixiv home page leads to a discovery page filled with recommended artworks.
The artworks there are loaded through asynchronous requests, and a single request to the recommender endpoint returns 1000 artwork ids by default. The page then renders 50 artworks at a time from that list, loading the next batch as the scrollbar approaches the bottom.
The item_number argument of get_item_id(item_number) can be changed: passing 2000 would return 2000 ids, but to stay on the safe side I only use 1000.
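A quick sanity check, assuming cookies.json is already in place: call get_item_id() and peek at what comes back. Note that get_item_id() returns None if the request throws, so guard for that:

ids = get_item_id(1000)
if ids:
    print(len(ids))   # roughly 1000 ids
    print(ids[:5])    # a few sample artwork ids
else:
    print('no recommendations returned; check cookies.json and the network')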
import re
import time
def popular():
    #save_folder = time.strftime('%Y-%m-%d-%H')
    save_folder = 'data'
    book_data = data_save(save_folder)
    #print(book_data)
    recommendations = get_item_id(1000)
    number = 0
    for item_id in recommendations:
        number += 1
        item_id = str(item_id)  # ids may come back as numbers; normalize for URL building
        item_url = 'https://www.pixiv.net/artworks/' + item_id
        try:
            item_response = requests.get(item_url, verify=False)
        except:
            print('request failed: ' + item_url)
        else:
            if item_response.status_code == 200:
                item_response.encoding = 'utf-8'
                try:
                    book_title = re.findall('#(.*?) - pixiv ', item_response.text)[0]
                    book_page_count = re.findall('"pageCount":(.{1,3}),"bookmarkCount":', item_response.text)[0]
                    book_mark_count = re.findall('"bookmarkCount":(.*?),', item_response.text)[0]
                    book_like_count = re.findall('likeCount":(.*?),', item_response.text)[0]
                except:
                    pass  # page layout changed or the work was deleted; skip this id
                else:
                    book_data[item_id] = {"book_title": book_title, "book_page_count": book_page_count, "book_mark_count": book_mark_count, "book_like_count": book_like_count}
                    #print(item_id)
                    print('saving: ' + book_title)
        time.sleep(3)
        #print(book_data)
        if number % 10 == 0:  # checkpoint in case the crawl gets interrupted
            data_save(save_folder, book_data)
    data_save(save_folder, book_data)
I use get_item_id() to grab a large batch of artwork ids and build the full URL for each one.
Since the goal is to filter out the most popular artworks, we obviously need each work's bookmark and like counts (the code also grabs the page count). You might ask: why am I no longer using the session from get_session() here? Don't we need to be logged in? Logging in is actually optional at this step, because even for restricted works whose images can't be viewed, the bookmark and like counts are still present in the page; only the image itself is hidden. And since this step fires off a large number of requests, it's better not to put the account at risk.
Finally, data_save() stores the data. In case the crawl gets interrupted partway, I save a checkpoint every 10 records.
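What the regexes are actually matching is the big preload JSON that Pixiv embeds in each artwork page's HTML. A minimal sketch against a hand-made fragment (illustrative, not a real response):

import re

# Hand-made fragment imitating the preload JSON inside an artwork page.
sample = '..."pageCount":3,"bookmarkCount":1521,"likeCount":1042,...'

page_count = re.findall('"pageCount":(.{1,3}),"bookmarkCount":', sample)[0]
mark_count = re.findall('"bookmarkCount":(.*?),', sample)[0]
like_count = re.findall('likeCount":(.*?),', sample)[0]
print(page_count, mark_count, like_count)  # 3 1521 1042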
import json
def data_save(save_folder, book_data=None):
    #print(save_folder)
    if book_data is None:
        try:
            with open('./item_data/' + save_folder + '.json', 'r', encoding='utf-8') as data:
                data_json = json.load(data)
        except:
            return {}
        else:
            return data_json
    else:
        try:
            with open('./item_data/' + save_folder + '.json', 'w', encoding='utf-8') as data:
                json.dump(book_data, data)
        except:
            return
If no data is passed in, book_data falls back to the default None, which selects between the function's two modes: read and save.
In read mode, an error almost always means the file is corrupt, empty, or missing, so the function returns an empty dict and the crawl starts fresh.
In save mode the data is dumped into the file as JSON. Now that you've got all this data, I hardly need to tell you what to do with it.
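A quick round trip shows both modes (with a dummy id and dummy counts). One pitfall: data_save() opens files under ./item_data/ but never creates the folder, and the bare except swallows the resulting error silently, so make sure the directory exists first:

import os

os.makedirs('./item_data', exist_ok=True)  # data_save() assumes this folder exists

data_save('data', {'12345678': {'book_title': 'demo', 'book_page_count': '1',
                                'book_mark_count': '10', 'book_like_count': '8'}})
print(data_save('data'))  # read mode: returns the dict we just wrote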
Date: 2020/3/29