python抓取音频文件方法_Python简易爬虫——爬取某马拉雅音频

#!/usr/bin/env python

# -*- coding: utf-8 -*-

"""

爬取X马拉雅音频

"""

import requests

import os

from bs4 import BeautifulSoup

from multiprocessing import Pool

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0"}

def get_audio_list(page_num):
    """Collect the track IDs linked from album pages 1 through ``page_num``.

    Every page from 1 up to and including ``page_num`` is fetched, and the
    last path segment of each link inside the sound-list container is taken
    as a track ID.

    Args:
        page_num: Number of the last album page to scan (pages start at 1).

    Returns:
        A list of non-empty ID strings, in the order they appear on the pages.

    Raises:
        requests.RequestException: If a page cannot be fetched, times out,
            or comes back with an HTTP error status.
    """
    url = 'https://www.ximalaya.com/youshengshu/4256765/p%s/'
    id_list = []
    for num in range(1, page_num + 1):
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(url=url % num, headers=headers, timeout=30)
        response.raise_for_status()
        bs = BeautifulSoup(response.text, 'lxml')
        container = bs.find('div', attrs={'class': 'sound-list rC5T'})
        if container is None:
            # The page has no sound list, so it contributes no IDs; calling
            # .find_all on None would raise an unrelated AttributeError.
            continue
        # href=True skips anchors that carry no href attribute at all.
        for tag in container.find_all('a', href=True):
            track_id = tag['href'].split('/')[-1]
            # Links ending in "/" produce an empty last segment; drop those.
            if track_id:
                id_list.append(track_id)
    return id_list

def get_audio(url, file_name):
    """Download the audio at ``url`` and save it inside the ``audio`` directory.

    Args:
        url: Direct URL of the audio file.
        file_name: Name to save the file under. Path separators and other
            characters that are illegal in file names are replaced with
            ``_`` so the file always lands inside ``audio``.

    Raises:
        requests.RequestException: If the download fails, times out, or
            comes back with an HTTP error status.
        OSError: If the target file cannot be written (for example because
            the ``audio`` directory does not exist).
    """
    # Callers may pass names built from remote data (e.g. track titles), so
    # neutralise anything that would break or escape the target path.
    unsafe_chars = '\\/:*?"<>|'
    safe_name = file_name.translate(
        str.maketrans({char: '_' for char in unsafe_chars})
    )

    response = requests.get(url, headers=headers, timeout=60)
    # Without this check an HTTP error page would be written out as audio.
    response.raise_for_status()

    audio_file = os.path.join('audio', safe_name)
    with open(audio_file, 'wb') as f:
        f.write(response.content)

def get_source(source_url):
    """Fetch a track's JSON metadata and download the audio it points to.

    The JSON document at ``source_url`` is read for its ``play_path`` (the
    audio URL) and ``title``; the audio is then saved as ``<title>.mp3`` via
    ``get_audio``.

    Args:
        source_url: URL of the track's JSON description.

    Raises:
        requests.RequestException: If the metadata request fails, times out,
            or comes back with an HTTP error status.
        ValueError: If the response carries no ``play_path`` (and, depending
            on the installed requests version, if the body is not valid JSON).
    """
    response = requests.get(source_url, headers=headers, timeout=30)
    response.raise_for_status()
    json_dict = response.json()

    audio_url = json_dict.get('play_path')
    if not audio_url:
        # Passing None on to requests.get would fail with an unrelated
        # "invalid URL" error; name the real problem instead.
        raise ValueError(f'no play_path in metadata from {source_url}')

    # Fall back to the URL's file stem so that untitled tracks do not all
    # collapse into a single "None.mp3".
    fallback_title = os.path.splitext(source_url.rsplit('/', 1)[-1])[0]
    audio_title = json_dict.get('title') or fallback_title
    audio_name = f'{audio_title}.mp3'
    get_audio(audio_url, audio_name)

if __name__ == '__main__':
    # Number of the last album page to scan.
    total_pages = 17

    # exist_ok lets the script be re-run without crashing on the directory
    # left behind by a previous run.
    os.makedirs('audio', exist_ok=True)

    # get_audio_list(n) already walks pages 1..n, so a single call covers the
    # whole album. Calling it once per page re-fetched every earlier page and
    # queued the same tracks again and again.
    audio_list = get_audio_list(total_pages)

    pool = Pool(4)
    pending = []
    seen = set()
    for file_id in audio_list:
        # Several links may resolve to the same ID; download each track once.
        if file_id in seen:
            continue
        seen.add(file_id)
        json_url = f'http://m.ximalaya.com/tracks/{file_id}.json'
        result = pool.apply_async(func=get_source, args=(json_url,))
        pending.append((file_id, result))

    pool.close()
    pool.join()

    # apply_async keeps worker exceptions inside the result object; surface
    # them here so failed downloads are not silently lost.
    for file_id, result in pending:
        try:
            result.get()
        except Exception as exc:  # top-level boundary: report and carry on
            print(f'下载失败 {file_id}: {exc}')

    print('资源下载完成')

你可能感兴趣的:(python抓取音频文件方法)