【爬虫专栏12】xpath,正则,json爬取斗鱼直播

xpath和正则

import requests
import re
from requests.exceptions import  RequestException
import csv
from lxml import etree
import pandas as pd 


for i in range(1):
    url = 'https://www.douyu.com/directory/all'

    def get_one_page(url):
        try:
            headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
            response = requests.get(url, headers = headers)
            if response.status_code==200:
                return response.text
            return None
        except RequestException:
            return None

    def parse_one_page(html):
        #csv的a+性质表示追加,这个和pandas的to_csv的mode='a'是一样的道理
        csv_file = open(r'E:\vscode_code\练习\斗鱼\douyu_data.csv', 'w', newline='', encoding='utf-8-sig')  # 解决中文乱码问题
        writer = csv.writer(csv_file)
        writer.writerow(['名称', '类型', '主播', '热度'])
        
        pattern1 = re.compile(r'

',re.S)##正则 pattern2 = re.compile(r'(.*?)',re.S)##正则 pattern3 = re.compile(r'(.*)',re.S)##正则 pattern4 = re.compile(r'(.*)',re.S)##正则 names = re.findall(pattern1,html) nums = re.findall(pattern2,html) s = etree.HTML(html) #网上代码剪贴过来的中英符号要注意,尤其是引号冒号这种不然会报错Invalid expression这种 file=s.xpath('//*[@id="listAll"]/section[2]/div[2]/ul/li/div/a[1]/div[2]/div[2]/h2/text()') file2=s.xpath('//*[@id="listAll"]/section[2]/div[2]/ul/li/div/a[1]/div[2]/div[2]/span/text()') ''' data = {'names':names, 'nums':nums, 'people':file, 'num':file2} basic_data = pd.DataFrame.from_dict(data = data) ''' basic_data.to_csv(r'E:\vscode_code\爬虫测试\B站\Bzhan2.csv', index=False, header=False) print(basic_data) csv_file.close() def main(): html = get_one_page(url) print('打印第',(i+1),'页') parse_one_page(html) if __name__=='__main__': main()

json

import requests
import re
from requests.exceptions import  RequestException
import csv
from lxml import etree
import pandas as pd 


for i in range(1):
    url = 'https://www.douyu.com/directory/all'

    def get_one_page(url):
        try:
            headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
            response = requests.get(url, headers = headers)
            if response.status_code==200:
                return response.text
            return None
        except RequestException:
            return None

    def parse_one_page(html):
        #csv的a+性质表示追加,这个和pandas的to_csv的mode='a'是一样的道理
        csv_file = open(r'E:\vscode_code\练习\斗鱼\douyu_data.csv', 'w', newline='', encoding='utf-8-sig')  # 解决中文乱码问题
        writer = csv.writer(csv_file)
        writer.writerow(['名称', '类型', '主播', '热度'])
        
        pattern1 = re.compile(r'

',re.S)##正则 pattern2 = re.compile(r'(.*?)',re.S)##正则 pattern3 = re.compile(r'(.*)',re.S)##正则 pattern4 = re.compile(r'(.*)',re.S)##正则 names = re.findall(pattern1,html) nums = re.findall(pattern2,html) s = etree.HTML(html) #网上代码剪贴过来的中英符号要注意,尤其是引号冒号这种不然会报错Invalid expression这种 file=s.xpath('//*[@id="listAll"]/section[2]/div[2]/ul/li/div/a[1]/div[2]/div[2]/h2/text()') file2=s.xpath('//*[@id="listAll"]/section[2]/div[2]/ul/li/div/a[1]/div[2]/div[2]/span/text()') ''' data = {'names':names, 'nums':nums, 'people':file, 'num':file2} basic_data = pd.DataFrame.from_dict(data = data) ''' basic_data.to_csv(r'E:\vscode_code\爬虫测试\B站\Bzhan2.csv', index=False, header=False) print(basic_data) csv_file.close() def main(): html = get_one_page(url) print('打印第',(i+1),'页') parse_one_page(html) if __name__=='__main__': main()

你可能感兴趣的:(爬虫和数据分析)