Fetching a Novel with a Web Crawler (Practice Exercise)

1. Fetch the page content:

  • Use the requests library to send an HTTP request and retrieve the page HTML.

2. Parse the book information:

  • Use BeautifulSoup to parse the page and extract the book title.

3. Fetch the chapter content:

  • Support multi-page chapters by automatically following the "下一页" (next page) link until the chapter ends.

4. Save the chapter content:

  • Use the book title as the folder name and save each chapter as a text file named after its chapter title (see the layout sketch after this list).

5. Automatically crawl the next chapter:

  • Check for a "下一章" (next chapter) link and keep crawling chapter by chapter until there is no next chapter.

This program is intended for practice only.
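For reference, the output ends up laid out roughly as sketched below: one folder named after the book title parsed from the page's <h1>, containing one text file per chapter, named with the 第 N 章 numbering that main() generates:

    <book title>/
        第 1 章.txt
        第 2 章.txt
        第 3 章.txt
        ...

The complete script follows.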
    import os
    import requests
    from bs4 import BeautifulSoup
    
    # Target page URL (the chapter page where crawling starts)
    url = "http://www.wxkushu.net/files/article/xiaoshuo/328/328229/84971906.html"
    
    # Fetch the HTML of a page
    def get_page_content(url):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            raise Exception(f"Request failed with status code: {response.status_code}")
        # Many Chinese novel sites serve GBK-encoded pages; let requests
        # re-detect the encoding so the extracted text is not garbled.
        response.encoding = response.apparent_encoding
        return response.text
    
    # Parse the book info
    def parse_book_info(html):
        soup = BeautifulSoup(html, "html.parser")
        # Extract the book title from the page's <h1>
        book_title = soup.find("h1").text.strip()
        return book_title
    
    # Fetch the chapter content (supports multi-page chapters)
    def get_chapter_content(chapter_url):
        content = ""
        current_url = chapter_url
        while True:
            html = get_page_content(current_url)
            soup = BeautifulSoup(html, "html.parser")
            content_div = soup.find("div", id="content")
            if content_div:
                content += content_div.text.strip() + "\n"
            # Look for a "下一页" (next page) link within the chapter
            next_page = soup.find("a", string="下一页")
            if next_page and "href" in next_page.attrs:
                current_url = "http://www.wxkushu.net" + next_page["href"]
            else:
                break
        return content.strip()
    
    # Save the chapter content to disk
    def save_chapter(book_title, chapter_title, content):
        # Create a folder named after the book if it does not exist
        if not os.path.exists(book_title):
            os.makedirs(book_title)
        # Write the chapter text to a file named after the chapter title
        file_path = os.path.join(book_title, f"{chapter_title}.txt")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(content)
        print(f"章节已保存: {file_path}")
    
    # Main entry point
    def main():
        try:
            # Fetch the starting page
            html = get_page_content(url)
            # Parse the book information
            book_title = parse_book_info(html)
            print(f"Book title: {book_title}")
            # Start from the initial chapter URL
            current_url = url
            chapter_number = 1
            while current_url:
                print(f"正在下载第 {chapter_number} 章")
                chapter_title = f"第 {chapter_number} 章"
                content = get_chapter_content(current_url)
                save_chapter(book_title, chapter_title, content)
                # Check for a "下一章" (next chapter) link; this re-fetches
                # the current chapter page to read its navigation links
                html = get_page_content(current_url)
                soup = BeautifulSoup(html, "html.parser")
                next_chapter = soup.find("a", string="下一章")
                if next_chapter and "href" in next_chapter.attrs:
                    current_url = "http://www.wxkushu.net" + next_chapter["href"]
                    chapter_number += 1
                else:
                    break
            print("所有章节下载完成!")
        except Exception as e:
            print(f"发生错误: {str(e)}")
    
    # Run the program
    if __name__ == "__main__":
        main()
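
When running this against a live site, it is polite to pause between requests and to tolerate the occasional failed fetch. One simple option, not part of the original script, is to wrap get_page_content() in a helper with a short delay and a few retries; the helper name, the 1-second delay, and the 3-attempt limit below are arbitrary choices for illustration:

    import time

    def get_page_content_politely(url, retries=3, delay=1.0):
        # Fetch a page with a pause before each request and a few retries on failure.
        for attempt in range(retries):
            try:
                time.sleep(delay)  # avoid hammering the server
                return get_page_content(url)
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {e}")
        raise Exception(f"Giving up on {url} after {retries} attempts")

Swapping this helper in for the direct get_page_content() calls in get_chapter_content() and main() makes long downloads gentler on the site and less likely to abort on a transient network error.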
    
