网易云音乐爬取小实战

from selenium import webdriver  # 驱动浏览器
from selenium.webdriver.common.by import By  # 选择器
import time,os,re
from requests_html import HTMLSession #和requests模块的使用差不太多
# Build the module-level HTTP session; Spider uses it for every request that
# does not go through the Selenium browser.
session = HTMLSession()
class Spider:
    """Download the tracks listed on a NetEase Cloud Music page as MP3 files.

    A Selenium-driven Chrome window opens the site home page and reads the
    link located under the first ``dl`` of the ``#top-flag`` element.  That
    page is then fetched with the module-level ``session`` and every track
    found in its hidden ``ul.f-hide`` list is downloaded into ``os_path``.
    """

    # Output directory; it is created when the class body is executed.
    os_path = os.getcwd()+'/深夜网易云/'
    os.makedirs(os_path, exist_ok=True)

    # Characters replaced by '-' in file names: path separators plus the
    # other characters Windows forbids in a file name.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')

    # Seconds find_element keeps polling before raising NoSuchElementException.
    IMPLICIT_WAIT_SECONDS = 10
    # Seconds an HTTP request may take before it is aborted.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Step 1: prepare the start URL and launch the Chrome window."""
        self.url = 'https://music.163.com/'
        # Drop the 'enable-automation' switch; the intent is to make the
        # automated browser harder to detect.
        self.opt = webdriver.ChromeOptions()
        self.opt.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.browser = webdriver.Chrome(options=self.opt)
        self.browser.maximize_window()
        # Poll for elements instead of relying on a fixed sleep after load.
        self.browser.implicitly_wait(self.IMPLICIT_WAIT_SECONDS)

    def close(self):
        """Quit the browser and end the WebDriver session."""
        self.browser.quit()

    def start_url(self):
        """Step 2: open the home page, fetch the linked page, download tracks.

        Raises:
            selenium.common.exceptions.NoSuchElementException: if the content
                frame or the link is not found within the implicit wait.
        """
        self.browser.get(self.url)
        # The page content lives inside an iframe, so switch into it first.
        iframe = self.browser.find_element(By.NAME, 'contentFrame')
        self.browser.switch_to.frame(iframe)
        link = self.browser.find_element(By.XPATH, '//*[@id="top-flag"]/dl[1]/dd/div/a')
        page_url = link.get_attribute('href')

        # NOTE(review): the cookie looks like a value copied from a browser
        # session and presumably expires — confirm/refresh before relying on it.
        headers = {
            'cookie':'NMTID=00OReIe5LbjUauGR0BCscJMqTECeaQAAAGNvK3XVg; JSESSIONID-WYYY=guJquI7U6hAEKTkyyym5ppz2vdDq3%5Cp%2BdZVygmYOZ8YPAXsabSI7VSC53Cm2RA5fce89lFgJYPKJW%2BoZ%5CcZYz3lsfD88xE4BY1W5Kot9%2FqyCRpK%5CTHA%2F1gxf05zIn5c3cia%2BQQz%2By8UVb6eBso7cao%2FrTNXqtmlmIoT%2F%5CfoBfIljoVyz%3A1708269327449; _iuqxldmzr_=32; _ntes_nnid=9e5ee18c19e28f36ccb58dfd8ab6762b,1708267527482; _ntes_nuid=9e5ee18c19e28f36ccb58dfd8ab6762b; WEVNSM=1.0.0; WNMCID=kwzsgs.1708267527681.01.0; ntes_utid=tid._.PIG1WIXSIOBEEgRERULA44u1jfOyvTQI._.0; sDeviceId=YD-I0tfPeHFkd1BAxRUQBKQs9%2F0yOej%2BCbK; WM_NI=ZQYDmzoD8uTPA4VtS6OOFQRPV5%2FhEYavb282%2BKCTwX0o1NAPX%2FMNr6WzdL8TIeC4XKftxnHe%2F6mNRXN%2Bz4p8G%2BUuP%2BJVnyV6iDB1HSIdp5oqsRX%2F2WL1IEPDEqZOw9TcVHk%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6ee8ecb6fb1ecfdaff97af5928bb6d44f938f8a87c834f8e9a88fe86ead9ff9a6e42af0fea7c3b92a9a8f97a5f072ad9cf896d75c83ec9a95d23e8d918ab2d15c96ebe189c853a3aaaed3bb3ae9958db5f654f4b1ffafc45c9cf183aff13e95a6acadbb608bbb8dd5db41a6acb88fcb6fedeaa3afd1548ef5b6d5f33485f59b96d3218a8aa29ad53eab9c8ed2cb598bbcc0d6ee7bb18e98d0b15d90ef87a6d35087af96b0f63398bd9aa9ea37e2a3; WM_TID=kbi865lee79ARAQUBFeA55uwzab8UvEP',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
        }
        # Fetch the linked page outside the browser; the session API mirrors
        # requests, and .html exposes the parsed document.
        response = session.get(page_url, headers=headers, timeout=self.REQUEST_TIMEOUT).html
        music_id = response.xpath('//ul[@class="f-hide"]/li/a/@href')
        music_name = response.xpath('//ul[@class="f-hide"]/li/a/text()')
        self.parse_data(music_id, music_name)

    def parse_data(self, music_id, music_name):
        """Step 3: build each track's download URL, fetch it and save it.

        Args:
            music_id: href strings of the track links; the part after ``?``
                is appended to the download URL.
            music_name: track titles, paired with ``music_id`` by position.
        """
        for href, title in zip(music_id, music_name):
            # Keep only the query string rather than slicing off a
            # fixed-length prefix.
            query = href.partition('?')[2]
            if not query:
                print(f"歌曲{title}链接无效，已跳过")
                continue
            url = 'http://music.163.com/song/media/outer/url?'+query
            print(url)
            print(title)
            print('=======')
            # Request the raw bytes, then save them.
            resp = session.get(url, timeout=self.REQUEST_TIMEOUT)
            content_type = resp.headers.get('Content-Type', '')
            # Do not store an error status or an HTML page as an .mp3 file.
            if not resp.ok or content_type.startswith('text/html'):
                print(f"歌曲{title}下载失败，已跳过")
                continue
            self.save_data(resp.content, title)

    def save_data(self, data, mp3_name):
        """Write ``data`` to ``<os_path><sanitized name>.mp3``.

        Args:
            data: bytes to write.
            mp3_name: track title; characters that are unsafe in a file name
                are replaced by ``-``.
        """
        song_name = self._UNSAFE_FILENAME_CHARS.sub('-', mp3_name)
        # The directory may have been removed since the class was defined.
        os.makedirs(self.os_path, exist_ok=True)
        with open(self.os_path+song_name+'.mp3', 'wb') as f:
            f.write(data)
        print(f"歌曲{mp3_name}保存成功")
if __name__ == '__main__':
    s = Spider()
    try:
        s.start_url()
    finally:
        # Always shut the browser down, even when the crawl fails, so the
        # Chrome/driver processes are not left running.
        s.browser.quit()

你可能感兴趣的:(爬虫,python)