Python from Beginner to Mastery (18): A Collection of Python Web Scraping Practice Examples

Python web scraping exercises

  • 1. Scraping historical weather data for Beijing from the weather site
    • 1.1 Version one: a spider written in object-oriented (OOP) style
    • 1.2 Version two: a spider written in procedural style

1. Scraping historical weather data for Beijing from the weather site
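
The history pages follow a simple URL scheme: one page per month at http://lishi.tianqi.com/beijing/YYYYMM.html (for example, 202301.html for January 2023). Both versions below build these monthly URLs, locate the ul element with class "thrui" on each page using BeautifulSoup, pull the individual fields out with regular expressions, and write the result to an .xls file with xlwt.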

1.1 Version one: a spider written in object-oriented (OOP) style

import re
import requests
from bs4 import BeautifulSoup
import xlwt


class Spider(object):
    """Weather-data spider for lishi.tianqi.com."""

    # Class-level compiled regular expressions. On the history pages, each
    # day's date sits in a <div class="th200"> and each of the four weather
    # fields (max temp, min temp, weather, wind) in a <div class="th140">.
    datatime_pattern = re.compile(r'<div class="th200">(.*?)</div>')
    wendu_pattern = re.compile(r'<div class="th140">(.*?)</div>')

    def __init__(self, url, headers, filepath):
        """
        Initializer.
        :param url: base URL template
        :param headers: HTTP request headers
        :param filepath: output file path
        """
        self.url = url
        self.headers = headers
        self.datalist = []  # dates
        self.mwen = []      # daily maximum temperatures
        self.iwen = []      # daily minimum temperatures
        self.tq = []        # weather conditions
        self.fx = []        # wind directions
        self.filepath = filepath

    def download_page(self, url):
        """
        Download a page and return its content.
        :param url: URL of the page to download
        :return: page text, or None if the download failed
        """
        try:
            response = requests.get(url, headers=self.headers)
            response.raise_for_status()  # raise HTTPError on a bad status code
            return response.text
        except requests.RequestException as e:
            print(f"Error downloading page: {e}")
            return None

    def parse_page(self, html):
        """
        Parse a page and collect the date and weather data.
        :param html: page content
        """
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('ul', class_='thrui'):
            item_str = str(item)
            # Extract the dates.
            dates = re.findall(self.datatime_pattern, item_str)
            self.datalist.extend(dates)
            # Extract the weather fields; each day contributes four values:
            # max temperature, min temperature, weather, wind direction.
            temperatures = re.findall(self.wendu_pattern, item_str)
            for i in range(0, len(temperatures), 4):
                self.mwen.append(temperatures[i])
                self.iwen.append(temperatures[i + 1])
                self.tq.append(temperatures[i + 2])
                self.fx.append(temperatures[i + 3])

    def download_and_parse_all_pages(self):
        """Download and parse all pages."""
        for year in range(23, 24):     # only 2023
            for month in range(1, 2):  # only January 2023
                # Zero-pad the year and month to two digits each.
                page_url = f"{self.url}20{year:02d}{month:02d}.html"
                print(page_url)
                html = self.download_page(page_url)
                if html:
                    self.parse_page(html)

    def save_to_excel(self):
        """Save the scraped data to an Excel file."""
        workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
        worksheet = workbook.add_sheet('北京历史天气数据', cell_overwrite_ok=True)
        # Write the header row.
        columns = ("日期", "最高温度", "最低温度", "天气", "风向")
        for i, col in enumerate(columns):
            worksheet.write(0, i, col)
        # Write one row per day.
        for i in range(len(self.datalist)):
            worksheet.write(i + 1, 0, self.datalist[i])
            worksheet.write(i + 1, 1, self.mwen[i])
            worksheet.write(i + 1, 2, self.iwen[i])
            worksheet.write(i + 1, 3, self.tq[i])
            worksheet.write(i + 1, 4, self.fx[i])
        workbook.save(self.filepath)
        print(f"Data saved to {self.filepath}")

    def run(self):
        self.download_and_parse_all_pages()
        self.save_to_excel()


if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
    }
    url_template = "http://lishi.tianqi.com/beijing/"
    filepath = "beijing_weather_data.xls"
    spider = Spider(url_template, headers, filepath)
    spider.run()
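
The two class-level regular expressions encode an assumption about the page markup: each day's date sits in a div with class "th200" and the four weather fields in divs with class "th140", all inside the ul.thrui list. Below is a quick self-contained check of the patterns against a hand-written fragment that mimics that markup (the fragment is illustrative, not fetched from the site):

import re

# Hand-written fragment mimicking the assumed lishi.tianqi.com markup.
sample = '''
<ul class="thrui">
  <li>
    <div class="th200">2023-01-01 星期日</div>
    <div class="th140">2℃</div>
    <div class="th140">-7℃</div>
    <div class="th140">晴</div>
    <div class="th140">西北风 2级</div>
  </li>
</ul>
'''

datatime_pattern = re.compile(r'<div class="th200">(.*?)</div>')
wendu_pattern = re.compile(r'<div class="th140">(.*?)</div>')

print(datatime_pattern.findall(sample))  # ['2023-01-01 星期日']
print(wendu_pattern.findall(sample))     # ['2℃', '-7℃', '晴', '西北风 2级']

If the site ever changes these class names, the patterns (and the parse_page logic that assumes four fields per day) must be updated together.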

1.2 Version two: a spider written in procedural style

import requests
from bs4 import BeautifulSoup
import re
import xlwt

# Regular expressions for the date (div.th200) and the four weather
# fields (div.th140) on each history page.
datatime = re.compile('<div class="th200">(.*?)</div>')
wendu = re.compile('<div class="th140">(.*?)</div>')


def down_page(url):
    """Download one page and return its text."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
    }
    r = requests.get(url, headers=headers)
    return r.text


def down_allpage(url):
    """Download and parse every monthly page, returning five parallel lists."""
    datalist = []
    mwen = []
    iwen = []
    tq = []
    fx = []
    for i in range(23, 24):     # only 2023
        for j in range(1, 2):   # only January 2023
            baseurl = url + '20{}{:0>2d}.html'.format(i, j)  # zero-pad the month
            html = down_page(baseurl)
            soup = BeautifulSoup(html, 'html.parser')
            for item in soup.find_all('ul', class_='thrui'):
                item = str(item)
                riqi = re.findall(datatime, item)
                for item1 in riqi:
                    datalist.append(item1)
                zb_all = re.findall(wendu, item)
                # Each day contributes four values: max temperature,
                # min temperature, weather, wind direction. Using the actual
                # length instead of a hard-coded 31 handles shorter months
                # (and avoids shadowing the outer loop variable i).
                for k in range(len(zb_all) // 4):
                    mwen.append(zb_all[k * 4 + 0])
                    iwen.append(zb_all[k * 4 + 1])
                    tq.append(zb_all[k * 4 + 2])
                    fx.append(zb_all[k * 4 + 3])
    return datalist, mwen, iwen, tq, fx


def save_xls(datalist, mwen, iwen, tq, fx):
    """Write the five lists to an .xls workbook."""
    wb = xlwt.Workbook(encoding='utf-8', style_compression=0)
    ws = wb.add_sheet('天气数据', cell_overwrite_ok=True)
    col = ("日期", "最高温度", "最低温度", "天气", "风向")
    for i in range(len(col)):
        ws.write(0, i, col[i])
    # One row per day, one column per field.
    for i in range(len(datalist)):
        ws.write(i + 1, 0, datalist[i])
        ws.write(i + 1, 1, mwen[i])
        ws.write(i + 1, 2, iwen[i])
        ws.write(i + 1, 3, tq[i])
        ws.write(i + 1, 4, fx[i])
    wb.save(r'D:\天气数据.xls')


if __name__ == '__main__':
    url = 'http://lishi.tianqi.com/beijing/'
    datalist, mwen, iwen, tq, fx = down_allpage(url)
    print(datalist)
    save_xls(datalist, mwen, iwen, tq, fx)
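
Unlike the OOP version, down_page here has no error handling: a network failure raises an uncaught exception, and a 404 page would be parsed as if it were valid. A hardened variant, as a minimal sketch (the name down_page_safe, the retry count, the timeout, and the one-second delay are illustrative choices, not part of the original):

import time
import requests

def down_page_safe(url, retries=3, delay=1.0):
    """Fetch a page, retrying on failure, with a pause between attempts."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
    }
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers=headers, timeout=10)
            r.raise_for_status()  # treat 4xx/5xx responses as failures
            return r.text
        except requests.RequestException as e:
            print(f"Attempt {attempt} failed for {url}: {e}")
            time.sleep(delay)  # pause before retrying
    return None

Swapping it in only requires replacing the down_page call inside down_allpage and skipping any month whose download returned None.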
