Python: My Journey Scraping Xiaohongshu Articles

I had never looked at Xiaohongshu before this, so out of habit I searched Baidu first. This is what I found:

[Image 1]

A bit of digging showed that Xiaohongshu has no PC-side API, which means there is no way to pull data directly from the PC site. Fine, give up on that. The PC site is still useful, though:

[Image 2]

Opening the Community Picks page and clicking into a few of the recommended posts, I noticed that every article URL is https://www.xiaohongshu.com/discovery/item/ plus a string of characters bound to that article. That turned out to be the key. Since the PC site was a dead end, the only option was mobile: I installed the Xiaohongshu app, had a look at the Xiaohongshu mini program inside WeChat, and tried capturing traffic with Fiddler. The mini program was much easier to capture, so once again I had to go in through Papa Ma's (WeChat's) door!

With a collection path settled, the first problem was solved, and I started capturing packets with Fiddler.

[Image 3]

Found the JSON response. Looking at the fields inside it, you will notice that every title has an id bound to it at the same level. Looks familiar, doesn't it? A lot like the id in the Community Picks URLs from earlier. So I copied one over to the PC site to test, and sure enough:

[Image 4]
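To double-check the relationship, you can walk the JSON captured in Fiddler and print each id next to the detail URL it should map to. A minimal sketch, assuming the captured response body was saved to a local file (the file name here is made up):

import json

# hypothetical dump of the search response body captured in Fiddler
with open('captured_notes.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

for note in data['data']['notes']:
    # each note carries an id at the same level as its title
    print(note['title'], 'https://www.xiaohongshu.com/discovery/item/' + note['id'])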

Good, second problem solved. Now the question is how to fetch that JSON data and then visit the articles in bulk.

I tried sending just the cookie, UA and Referer. That failed.

Then I threw every parameter from the capture into the request, and that worked.

I did not test them one by one; as long as it works, it works. If you are curious, feel free to test them individually and see which parameters are actually required.

To be upfront: most of the parameters are fixed. Only the keyword (a query parameter) and the X-Sign request header change, and those you have to grab manually. With that, most of the problems were solved.
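If you do want to pin down which parameters are actually required, one quick way is to drop one header at a time and see which removal breaks the request. A rough sketch, assuming headers and params are the full dictionaries used in the code further down (the check for a "notes" field is only a heuristic):

import requests

def find_required_headers(url, headers, params):
    # Retry the search request with each header removed in turn and
    # report the ones whose absence breaks the response.
    required = []
    for name in list(headers):
        trimmed = {k: v for k, v in headers.items() if k != name}
        res = requests.get(url, headers=trimmed, params=params, verify=False)
        if res.status_code != 200 or '"notes"' not in res.text:
            required.append(name)
    return required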

There is still one really annoying problem: Xiaohongshu is extremely sensitive to request frequency. Crawl the normal way and after two or three articles at most it throws a captcha at you and the program dies. I tried rotating random request headers, but that didn't help. There are really only two ways around it. One is proxies, and in practice you need roughly as many proxies as articles you want to crawl; free IPs are unreliable and I had no budget for paid ones, so I skipped that route (try it if you have the means). I went with the second option: selenium plus time.sleep to slow the crawl right down. Speed isn't a requirement here, so just let the program chug along slowly. Cheap and simple, what's not to like? Haha.
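The slow-and-steady part itself is nothing fancy. A minimal sketch of the idea, assuming a hypothetical fetch(url) callable that does the actual selenium work; the delay range is arbitrary:

import random
import time

def crawl_slowly(urls, fetch, low=5.0, high=12.0):
    # Visit each URL with a random pause in between to stay under the rate limit.
    for url in urls:
        fetch(url)  # e.g. load the page with headless Chrome and parse it
        time.sleep(random.uniform(low, high))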

Alright, here's the code.

import time
import pymysql
import requests
import re
import urllib3
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
import json


class Fuck_xhs(object):
    def __init__(self):
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        self.conn = pymysql.connect(xxxxx)  # placeholder: supply your own host/user/password/db
        self.cursor = self.conn.cursor()
        self.url = 'https://www.xiaohongshu.com/fe_api/burdock/weixin/v2/search/notes?'
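        # X-Sign values captured with Fiddler; the dict keys double as the page numbers passed in params below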
        self.key_dict = {
            '1': 'Xd44b23783ad8f18c2e41c045a0cda867',
            '2': 'Xe8b3f71b7585c080e9ca55e7d1b034e0',
            '3': 'X2351ff0514bb05145e8171975fe1d96d',
            '4': 'X2422fd5312cf50b12c722e1d63b2f9aa',
            '5': 'X44d5cf63fb658c609be10404b77291d5',
        }
        with open('小红书url.txt', 'r', encoding='utf-8') as f:  # file of already-collected URLs
            r = f.read().replace('\ufeff', '')
            self.old_list = r.split('\n')
            print(self.old_list)
        options = Options()
        options.add_argument('--headless')
        self.chrome = Chrome(options=options)

    def get_detail_url(self):
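        # Page through the mini program search API, collect detail-page URLs, and scrape the new ones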
        for key, value in self.key_dict.items():
            headers = {
                'Host': 'www.xiaohongshu.com',
                'Connection': 'keep-alive',
                'Authorization': 'wxmp.4aad8f54-3422-4d76-b440-5f4cce8d0907',
                'Device-Fingerprint': 'WHJMrwNw1k/Ff2NfArpikjizTJkAdQe2Y1P0AQTa74gJcSlBSWoMjTXYq+VUDRGsE9VCMBXrfD5W9YT2GqNMbnISuxoWerClbdCW1tldyDzmauSxIJm5Txg==1487582755342',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
                'X-Sign': value,
                'content-type': 'application/json',
                'Referer': 'https://servicewechat.com/wxffc08ac7df482a27/378/page-frame.html',
                'Accept-Encoding': 'gzip, deflate, br',
            }
            params = {
                'keyword': '揭阳周边游',  # the search keyword ("day trips around Jieyang")
                'sortBy': 'general',
                'page': key,
                'pageSize': '20',
                'needGifCover': 'true',
            }
            res = requests.get(self.url, headers=headers, params=params, verify=False).text
            print(res)
            res_dict = json.loads(res)
            notes = res_dict['data']['notes']
            for note in notes:
                id = note['id']
                print(id)
                self.detail_url = 'https://www.xiaohongshu.com/discovery/item/' + id
                print(self.detail_url)
                if self.detail_url in self.old_list:
                    print('Link already exists.')
                    continue
                else:
                    with open('小红书url.txt', 'a', encoding='utf-8') as w:
                        w.write('\n')
                        w.write(self.detail_url)
                    self.get_detail()
                    continue
        self.conn.close()

    def get_detail(self):
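        # Load the detail page in headless Chrome and pull out the fields to be written to MySQL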
        self.chrome.get(self.detail_url)
        time.sleep(1.5)
        try:
            video = self.chrome.find_element_by_xpath('//div[@class="videoframe"]')
            if video:
                return None
        except:
            pass
        # Grab the HTML of the image carousel ("slide") so the background-image
        # styles can be rewritten into plain img tags below
        self.content_pic = str(self.chrome.find_element_by_class_name("slide").get_attribute('innerHTML'))
        print(self.content_pic)
        urls = re.findall(r'style="background-image.*?;"', self.content_pic, re.DOTALL)
        for ur in urls:
            print('ur is %s' % ur)
            u = ''.join(re.findall(r'url\((.*?)\)', ur))
            url = 'http:' + u.replace('"', '').replace('https:', '').replace('http:', '') + '.jpg'
            print(url)
            # Swap the background-image style for a src attribute and the span tags for img tags
            self.content_pic = str(self.content_pic).replace(ur, 'src=' + '"' + url + '"').replace('span', 'img')
        print(self.content_pic)
        self.content = self.chrome.find_element_by_class_name('content').get_attribute('innerHTML')
        try:
            self.author = self.chrome.find_element_by_class_name('name-detail').text
            print(self.author)
        except:
            self.author = ' '
        try:
            self.title = self.chrome.find_element_by_class_name('title').text
            if not self.title:
                self.title = self.chrome.find_element_by_class_name('as-p').text
            print(self.title)
        except:
            self.title = ' '
        try:
            span = self.chrome.find_elements_by_xpath('//div[@class="operation-block"]/span')
            self.like = span[0].find_element_by_xpath('./span').text
            self.comment = span[1].find_element_by_xpath('./span').text
            self.star = span[2].find_element_by_xpath('./span').text
            print(self.like, self.comment, self.star)
        except:
            self.like = ' '
            self.comment = ' '
            self.star = ' '
        try:
            self.b_q = self.chrome.find_elements_by_xpath('//div[@class="keywords"]/a[@class="keyword category"]')
            print(self.b_q)
            a_l = []
            for bq in self.b_q:
                a = bq.text
                a_l.append(a)
            self.a_l = str(a_l).replace('[', '').replace(']', '').replace("'", '').replace(',', ',')
            print(self.a_l)
        except:
            self.a_l = ' '
        try:
            self.pub_time = str(self.chrome.find_element_by_xpath('//div[@class="publish-date"]/span').text).replace(
                '发布于', '')
            print(self.pub_time)
        except:
            self.pub_time = ' '
        try:
            self.author_img = self.chrome.find_element_by_xpath('//div[@class="left-img"]/img').get_attribute('src')
            print(self.author_img)
        except:
            self.author_img = ' '
        time.sleep(5)
        self.create_time = time.strftime("%Y-%m-%d %H:%M:%S")
        print(self.create_time)
        self.is_import = '0'
        time.sleep(3)
        self.deposit_mysql()

    def deposit_mysql(self):
        # Write one article row into MySQL
        sql = "insert into xhs_article(id, author, author_img, title, text_img, content, like_count, review_count, collect_count, org_url, publish_time, keyword_tag, create_time, is_import, import_time) values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,null)"
        self.cursor.execute(sql, (
            str(self.author), str(self.author_img), str(self.title), str(self.content_pic), str(self.content),
            str(self.like), str(self.comment), str(self.star), str(self.detail_url), str(self.pub_time),
            str(self.a_l), str(self.create_time), str(self.is_import)))
        self.conn.commit()
        return None


if __name__ == '__main__':
    xhs = Fuck_xhs()
    xhs.get_detail_url()

Into the database:

[Image 5]

The code was re-optimized on the afternoon of 2020/08/14, tested working, and shared here.

Note added 2020-10-12: if you get errors, first consider that the parameters may have expired; it has been two months, after all. Swap in the latest cookie yourself and try again!
