Test whether your Python environment is set up correctly
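
Before running the test script below, make sure the two third-party libraries it uses (plus the lxml parser that BeautifulSoup is asked for) are importable. A minimal sanity check, assuming they were installed with pip install requests beautifulsoup4 lxml:

# Quick import/version check for the dependencies used below
import requests, bs4
from lxml import etree

print("requests", requests.__version__)
print("beautifulsoup4", bs4.__version__)
print("lxml", etree.__version__)

If all three imports succeed, the full test script is: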

# Import the required libraries
import requests
from bs4 import BeautifulSoup

# Target page URL
url = 'https://quotes.toscrape.com/' # a site built specifically for practicing web scraping

# Set request headers to mimic a browser visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

try:
    # Send the HTTP request
    response = requests.get(url, headers=headers)
    response.raise_for_status() # raises an exception if the request failed (e.g. 404, 500)

    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'lxml')

    # Find all the div tags that contain a quote
    quotes = soup.find_all('div', class_='quote')

    print(f"成功找到 {len(quotes)} 条名言:\n")

    # Loop over the quotes, printing each one's text and author
    for quote in quotes:
        text = quote.find('span', class_='text').get_text()
        author = quote.find('small', class_='author').get_text()
        print(f"名言: {text}")
        print(f"作者: {author}\n")

except requests.exceptions.RequestException as e:
    print(f"请求失败: {e}")

Scraping the comments on the NetEase Cloud Music song 《晴天》

import requests
import json
import base64
import random
import string
import time
from Crypto.Cipher import AES  # provided by the pycryptodome package

# ==========================================================================================
#  Encryption helpers (no changes needed)
# ==========================================================================================
MODULUS = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
PUBKEY = "010001"
NONCE = b'0CoJUm6Qyw8W8jud'

def get_random_key(size=16, chars=string.ascii_letters + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))

def aes_encrypt(text, key):
    # AES-128-CBC with a fixed IV and manual PKCS#7 padding; returns base64-encoded bytes
    iv = b'0102030405060708'
    pad = 16 - len(text) % 16
    text = text + bytes([pad]) * pad
    encryptor = AES.new(key, AES.MODE_CBC, iv)
    result = encryptor.encrypt(text)
    return base64.b64encode(result)

def rsa_encrypt(text, pub_key, modulus):
    # Textbook RSA without padding: reverse the key bytes, treat them as an integer,
    # compute m^e mod N, and zero-pad the hex result to 256 characters
    text = text[::-1]
    rsa_val = pow(int(text.hex(), 16), int(pub_key, 16), int(modulus, 16))
    return format(rsa_val, 'x').zfill(256)

def get_final_params(text_dict):
    # Build the weapi request body: AES-encrypt the JSON twice (fixed NONCE, then a random key)
    # and RSA-encrypt that random key into encSecKey
    text_bytes = json.dumps(text_dict).encode('utf-8')
    secret_key = get_random_key(16).encode('utf-8')
    params = aes_encrypt(text_bytes, NONCE)
    params = aes_encrypt(params, secret_key)
    enc_sec_key = rsa_encrypt(secret_key, PUBKEY, MODULUS)
    return {'params': params.decode('utf-8'), 'encSecKey': enc_sec_key}
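
The helpers above combine into the request body that NetEase's weapi endpoints expect: params is the JSON payload AES-encrypted twice (first with the fixed NONCE, then with a freshly generated 16-character key), and encSecKey is that random key encrypted with the site's RSA public key. A throwaway sketch of the shapes involved, to run on its own after the helpers are defined (assumes pycryptodome is installed):

# What get_final_params hands back for a minimal payload
demo = get_final_params({'csrf_token': ''})
print(demo['params'][:40], '...')      # double-AES-encrypted JSON, base64 text
print(demo['encSecKey'][:40], '...')   # RSA-encrypted random AES key, hex text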

# ==========================================================================================
#  Scraper main body (*** final revision ***)
# ==========================================================================================
class NeteaseMusicScraper:
    def __init__(self):
        self.session = requests.Session()
        self.base_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
            'Referer': 'https://music.163.com/',
            'Content-Type': 'application/x-www-form-urlencoded'
        }
        self.session.headers.update(self.base_headers)
        # Automatically fetch an anonymous token when the scraper is created
        self._get_anonymous_token()

    def _get_anonymous_token(self):
        """第一步:获取匿名令牌并存入 session 的 Cookie 中"""
        print("正在获取匿名访问令牌...")
        token_url = "https://music.163.com/weapi/register/anonimous"
        # Build the request payload needed to obtain the token
        token_req_data = {
            'username': ''.join(random.choices(string.ascii_letters + string.digits, k=10))
        }
        encrypted_data = get_final_params(token_req_data)
        try:
            response = self.session.post(token_url, data=encrypted_data)
            if response.json().get('code') == 200:
                print("Token obtained successfully!")
            else:
                print("Failed to obtain the token; later requests may be affected.")
        except Exception as e:
            print(f"Error while fetching the anonymous token: {e}")


    def get_hot_comments(self, song_id):
        """第二步:携带令牌 Cookie 请求评论数据"""
        print(f"正在尝试爬取歌曲 ID 为 [{song_id}] 的热门评论...")
        comments_url = f"https://music.163.com/weapi/v1/resource/comments/R_SO_4_{song_id}"
        req_data = {"csrf_token": ""}
        encrypted_data = get_final_params(req_data)

        try:
            time.sleep(1)
            # Send the request through self.session so the stored token cookie is attached automatically
            response = self.session.post(comments_url, data=encrypted_data)
            response.raise_for_status()
            result = response.json()

            if result.get('code') == 200:
                if "hotComments" in result and result["hotComments"]:
                    print("\n" + "="*20 + " 热门评论 " + "="*20)
                    for i, comment in enumerate(result["hotComments"]):
                        print(f"\n{i+1}. 用户: {comment['user']['nickname']}")
                        print(f"   评论: {comment['content']}")
                        print(f"   点赞: {comment['likedCount']}")
                    print("\n" + "="*52)
                else:
                    print("这首歌可能没有热门评论。")
            else:
                print(f"服务器返回错误码: {result.get('code')}, 信息: {result.get('message')}")

        except json.JSONDecodeError:
            print("请求失败: 服务器返回的不是有效的JSON。被反爬虫策略拦截。")
            print("服务器返回内容:", response.text)
        except Exception as e:
            print(f"处理数据时发生错误: {e}")

# ==========================================================================================
#  Main entry point
# ==========================================================================================
if __name__ == "__main__":
    SONG_ID = "186016"
    scraper = NeteaseMusicScraper()
    scraper.get_hot_comments(SONG_ID)
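
To scrape a different song, replace SONG_ID with the number after id= in the song's NetEase share link (e.g. https://music.163.com/#/song?id=186016). A hypothetical helper, not part of the original script, that pulls the id out of such a link:

# Hypothetical helper: extract the song id from a NetEase share URL
from urllib.parse import urlparse, parse_qs

def song_id_from_url(share_url):
    # The query string sits behind the '#' in NetEase links, so drop the hash routing first
    query = urlparse(share_url.replace('/#/', '/')).query
    return parse_qs(query).get('id', [None])[0]

print(song_id_from_url('https://music.163.com/#/song?id=186016'))  # -> '186016'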
