Python: scraping Douyu live streams with a crawler

# Use selenium to scrape each Douyu live room's title, category, streamer, and cover image link (slower than using requests)

import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

class Douyu(object):
    def __init__(self):
        self.url = 'https://www.douyu.com/directory/all'  # "all rooms" directory page
        self.driver = webdriver.Chrome()  # launches a local Chrome browser window
        self.file = open('douyu.json', 'w')

    def parse_data(self):
        # Locate every room card on the current page
        # (the XPath selectors match Douyu's layout at the time of writing and may need updating)
        node_list = self.driver.find_elements(By.XPATH, '//*[@id="live-list-contentbox"]/li/a')
        detail_list = []
        for node in node_list:
            temp = {}
            temp['title'] = node.find_element(By.XPATH, './div/div/h3').text
            temp['owner'] = node.find_element(By.XPATH, './div/p/span[1]').text
            temp['type'] = node.find_element(By.XPATH, './div/div/span').text
            temp['num'] = node.find_element(By.XPATH, './div/p/span[2]').text
            temp['cover'] = node.find_element(By.XPATH, './span/img').get_attribute('src')
            print(temp)
            detail_list.append(temp)
        return detail_list

    def save_data(self, detail_list):
        # Write each record as one JSON object per line
        for data in detail_list:
            json_data = json.dumps(data, ensure_ascii=False) + ',\n'
            self.file.write(json_data)

    def __del__(self):
        self.file.close()
        self.driver.close()  # closes the current window; use quit() instead if new windows were opened

    def run(self):
        # Request the directory page (the url and browser driver were created in __init__)
        self.driver.get(self.url)
        # Parse each page, save the results, then click through to the next page
        while True:
            detail_list = self.parse_data()
            # Save the parsed records
            self.save_data(detail_list)
            try:
                # Click the "下一页" (next page) link if it is still present
                next_btn = self.driver.find_element(By.XPATH, '//a[contains(text(),"下一页")]')
                next_btn.click()
                time.sleep(3)  # give the next page time to render
            except NoSuchElementException:
                # No "next page" link left, so this was the last page
                break

if __name__ == '__main__':
    douyu = Douyu()
    douyu.run()
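
The fixed time.sleep(3) after every click is the slowest part of the loop, and the visible browser window adds some overhead. As an optional tweak, here is a minimal sketch (assuming Selenium 4+ and Chrome) that runs the browser headless and waits explicitly for the room list to appear instead of sleeping a fixed amount of time; the XPath reused below is the same one as in parse_data:

# Sketch only: headless Chrome plus an explicit wait (assumes Selenium 4+)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run Chrome without opening a window
driver = webdriver.Chrome(options=options)
driver.get('https://www.douyu.com/directory/all')

# Block for up to 10 seconds until at least one room card is present,
# so parsing can start right away instead of sleeping a fixed 3 seconds.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="live-list-contentbox"]/li/a'))
)
driver.quit()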

 
