A Python crawler for NetEase Cloud Music (网易云音乐)

Crawling has been giving me a headache lately. Mostly because my own skills are still lacking, I could not crack NetEase Cloud Music on my own: part of its content is served through an internal API rather than plain HTML. With the guidance of a more experienced developer it finally came together, so without further ado, here is the code.

#python3
# -*- coding: utf-8 -*-
# File  : spider1.py
# Author: Wang Chao
# Date  : 2018/11/5
import requests
import json
from pprint import pprint


class NEM_spider(object):


    def __init__(self):
        self.headers = {
            'host': 'music.163.com',
            'Referer': 'http://music.163.com/search/',
            'User-Agent':
                ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                 ' (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36')
        }
        self.cookies = {'appver': '1.5.2'}


    # Get the full detail of a playlist (歌单)
    def get_playlist_detail(self, playlist_id):
        url = 'http://music.163.com/api/playlist/detail'
        payload = {'id': playlist_id}

        r = requests.get(url, params=payload, headers=self.headers,
                         cookies=self.cookies)
        # The response body is JSON; it can be dumped to a file or simply
        # used directly later on.
        playlist_detail = r.json()
        # The playlist's track list is indexed as ['result']['tracks'].
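        # Optional, minimal sketch for keeping a local copy of the raw
        # response for inspection (the file name is just an example):
        #
        #     with open('playlist_{}.json'.format(playlist_id), 'w',
        #               encoding='utf-8') as f:
        #         json.dump(playlist_detail, f, ensure_ascii=False, indent=2)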

        return playlist_detail

    # Get the song list (ids, names, artists) from a playlist
    def from_playlist_get_song_list(self, playlist_id):
        playlist_detail = self.get_playlist_detail(playlist_id)
        songlist = []
        for song_detail in playlist_detail['result']['tracks']:
            song = {}
            song['id'] = song_detail['id']
            song['name'] = song_detail['name']
            #song['time'] = song_detail['hMusic']['playTime']
            artists_detail = []
            for artist in song_detail['artists']:
                artist_detail = {}
                artist_detail['name'] = artist['name']
                artist_detail['id'] = artist['id']
                artists_detail.append(artist_detail)
            song['artists'] = artists_detail
            songlist.append(song)

        return songlist
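
    # Each entry in the returned songlist has the shape (placeholder values):
    #     {'id': 123456, 'name': 'Song title',
    #      'artists': [{'name': 'Artist name', 'id': 654321}]}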

    
    # Get an artist's hot songs (id and name) from the artist API
    def get_artists_songlist(self, artist_id):
        url = 'http://music.163.com/api/artist/{}'.format(artist_id)

        r = requests.get(url, headers=self.headers, cookies=self.cookies)
        hotSongs = r.json()['hotSongs']

        songlist = []
        for hotSong in hotSongs:
            song = {}
            song['id'] = hotSong['id']
            song['name'] = hotSong['name']
            songlist.append(song)

        return songlist


    # Get the lyric of a song; returns None when no lyric is available
    def get_song_lyric(self, song_id):
        url = 'http://music.163.com/api/song/lyric'
        payload = {
            'os': 'pc',  # client platform; 'osx' also appears to be accepted
            'id': song_id,
            'lv': -1,
            'kv': -1,
            'tv': -1
        }

        r = requests.get(url, params=payload, headers=self.headers,
            cookies=self.cookies)

        result = r.json()
        # print(result)
        if ('nolyric' in result) or ('uncollected' in result):
            return None
        elif ('lrc' not in result) or ('lyric' not in result['lrc']):
            return None
        else:
            return result['lrc']['lyric']
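
    # The lyric is returned in LRC format, i.e. each line is prefixed with a
    # [mm:ss.xx] timestamp. A minimal sketch for stripping the timestamps,
    # assuming `import re` is added at the top of the file:
    #
    #     plain_text = re.sub(r'\[.*?\]', '', lyric).strip()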


    # Get one page of comments for a song (the comment resource id is
    # 'R_SO_4_' followed by the song id)
    def get_song_comments(self, song_id, offset=0, total='false', limit=100):
        url = ('http://music.163.com/api/v1/resource/comments/R_SO_4_{}/'
            ''.format(song_id))
        payload = {
            'rid': 'R_SO_4_{}'.format(song_id),
            'offset': offset,
            'total': total,
            'limit': limit
        }

        r = requests.get(url, params=payload, headers=self.headers,
            cookies=self.cookies)


        return r.json()


    # Page through all comments of a song, 100 at a time, collecting the user
    # name, user id, content and time of each comment
    def get_total_comments(self, song_id):
        comments = self.get_song_comments(song_id)['comments']
        comments_list = []
        offset = 0
        while comments:
            for comment in comments:
                comment_detail = {}
                comment_detail['user_name'] = comment['user']['nickname']
                comment_detail['user_id'] = comment['user']['userId']
                comment_detail['content'] = comment['content']
                comment_detail['time'] = comment['time']
                comments_list.append(comment_detail)

            offset = offset + 100
            comments = self.get_song_comments(song_id,
                offset=offset)['comments']

        return comments_list
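
    # The 'time' field collected above appears to be a Unix timestamp in
    # milliseconds; if so, it can be converted like this (sketch, assuming
    # `from datetime import datetime` is added at the top of the file):
    #
    #     posted_at = datetime.fromtimestamp(comment_detail['time'] / 1000)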


    # Collect the de-duplicated artist ids of every song in the given
    # playlists
    def from_playlist_get_artist_id(self, *playlists):
        artist_id_list = []
        for playlist_id in playlists:
            song_list = self.from_playlist_get_song_list(playlist_id)
            for song in song_list:
                for artist in song['artists']:
                    print("Got {}'s id ==> {}".format(artist['name'],
                        artist['id']))
                    # artist_id_dict = {}
                    # artist_id_dict['name'] = artist['name']
                    # artist_id_dict['id'] = artist['id']
                    # artist_id_list.append(artist_id_dict)
                    artist_id_list.append(artist['id'])

        artist_id_list = list(set(artist_id_list))  # de-duplicate
        return artist_id_list


    # For every artist appearing in the given playlists, fetch the lyrics of
    # that artist's hot songs and return them as a list of strings
    def from_playlist_get_full_lyric_text(self, *playlists):
        artist_id_list = self.from_playlist_get_artist_id(*playlists)
        song_id_list = []
        lyric_list = []

        for artist_id in artist_id_list:

            print('Processing the work of the artist with id: {}'
                ''.format(artist_id))

            songlist = self.get_artists_songlist(artist_id)
            artist_song_id_list = [song['id'] for song in songlist]
            song_id_list.extend(artist_song_id_list)

        song_id_list = list(set(song_id_list))

        for song_id in song_id_list:

            print('Processing the lyric of the song with id: {}'
                ''.format(song_id))

            lyric = self.get_song_lyric(song_id)
            # print(lyric)
            if lyric is not None:
                # pprint(lyric)
                lyric_list.append(lyric)

        return lyric_list




if __name__ == '__main__':
    spider = NEM_spider()
    import pandas as pd
    # Read the CSV file of playlist ids (the ids are in the first column)
    playlist_id = pd.read_csv(r'C:\Users\playlist_id.csv', engine='python')
    # The following block (kept disabled inside a triple-quoted string) walks
    # each playlist and writes its song list to songlist.csv:
    '''
    # Get the song info of each playlist
    data_list =[]
    for j in playlist_id.iloc[:,0]:
        song_time = []
        song_id = []
        song_name = []
        song_artists = []
        #playlist_detail = spider.get_playlist_detail(2037239103)
        songlist = spider.from_playlist_get_song_list(int(j))
        for i in songlist:
            song_id.append(i['id'])
            song_name.append(i['name'])
            song_artists.append(i['artists'][0]['name'])
            #song_time.append(i['time'])
        x = pd.DataFrame({'song_id':song_id,
                          'song_artist':song_artists,
                          'song_name':song_name,
                          #'song_time':song_time
                          })
        x['playlist_id'] = j
        data_list.append(x)
        print('finish'+str(j))
        #with open('test_playlist.json', 'w') as f:
        #   pprint(playlist, f)
    data = pd.concat(data_list,axis=0)
    data.to_csv('C:/Users/Administrator/Desktop/课程文件/商务大数据/songlist.csv',index = False)'''
    # Get info about the creator of each playlist
    data_list = []
    for i in playlist_id.iloc[:,0]:
        creador = spider.get_playlist_detail(i)['result']['creator']
        creador_id = creador['userId']
        creador_gender = creador['gender']
        creador_province = creador['province']
        creador_signature = creador['signature']
        x = pd.DataFrame({'creador_id':creador_id,
                          'creador_gender':creador_gender,
                          'creador_province':creador_province,
                          'creador_signature':creador_signature,
                          'songlist_id':str(i)},index = [0])
        data_list.append(x)
        print('finish'+str(i))
    data = pd.concat(data_list,axis = 0)
    data.to_csv('C:/Users/Administrator/Desktop/课程文件/商务大数据/creador0.csv',index = False)
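
The main block above only exercises the playlist helpers. For completeness, here is a minimal usage sketch for the lyric and comment helpers; the song id is just a placeholder, not one taken from the data above:

spider = NEM_spider()
song_id = 123456  # placeholder id, replace with a real NetEase song id
lyric = spider.get_song_lyric(song_id)
if lyric:
    print(lyric[:200])
comments = spider.get_total_comments(song_id)
print('collected {} comments'.format(len(comments)))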


 
