A Simple Topic Crawler in Python

  This post implements a simple topic crawler in Python: given a topic keyword and a seed website, it crawls the site to a fixed depth and collects the title and URL of every page related to the topic. It is intended for learning and reference only.
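The crawler decides relevance with plain keyword counting: a page is kept only if the topic string appears at least once in its visible body text. Sketched in isolation below (a minimal sketch; the helper name is_relevant is illustrative and not part of the script):

from bs4 import BeautifulSoup

# Hypothetical helper: True when `topic` occurs in the page's visible text
def is_relevant(html, topic):
    soup = BeautifulSoup(html, "html.parser")
    return soup.body is not None and soup.body.get_text().count(topic) > 0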

Crawl results:

[Figure 1: screenshot of the crawl output]

Source code:

import csv
import urllib.request
from io import BytesIO
from bs4 import BeautifulSoup
import gzip

# 1. Initialization
# Read the topic keyword from the user
topic = input("Enter the topic to crawl for: ")
# topic = '运维'
# Crawl depth and the seed URL
deep = 2
firstUrl = 'https://www.csdn.net/nav/ops'

# HTTP request headers used to disguise the crawler as a browser
headersParameters = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'User-Agent': 'Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
}


# Crawl a page, check whether it is related to the topic, and recurse into its links
def page_spider(url, deep_num, existing_list):
    if deep_num < 0:
        return None, None

    return_urls = list()
    return_titles = []

    try:
        # Send the request with the browser-like headers
        req = urllib.request.Request(url=url, headers=headersParameters)
        html = urllib.request.urlopen(req, timeout=10).read()
        # The headers ask for gzip, so decompress if the payload is gzip-encoded
        if html[:2] == b'\x1f\x8b':
            html = gzip.GzipFile(fileobj=BytesIO(html)).read()
        html = html.decode("UTF-8", errors="ignore")  # decode according to the page charset
        soup = BeautifulSoup(html, "html.parser")
        # Relevance test: the topic keyword must appear in the page body
        count = soup.body.get_text().count(topic)
        if count == 0:
            print("Page is unrelated to the topic:", url)
            return None, None
        print("Crawling page:", url)
        return_urls.append(url)
        if soup.title and soup.title.string:
            return_titles.append(soup.title.text)
        else:
            return_titles.append("no title")
        # Follow outgoing links, skipping non-HTTP hrefs and already-visited pages
        tags = soup('a')
        for tag in tags:
            href = tag.get('href', "")
            if href.startswith('http') and href not in existing_list and href not in return_urls:
                urls, titles = page_spider(href, deep_num - 1, existing_list + return_urls)
                if urls is not None:
                    return_urls.extend(urls)
                    return_titles.extend(titles)
    except Exception as e:
        print("Invalid URL:", url, e)
        return None, None

    return return_urls, return_titles


if __name__ == "__main__":
    # Start crawling from the seed page
    result_urls, result_titles = page_spider(firstUrl, deep, list())

    if result_urls is None:
        print('The seed page is unrelated to the given topic')
    else:
        # Save the results to a CSV file
        with open('./result.csv', 'w', newline='', encoding='utf-8') as result_file:
            writer = csv.writer(result_file)
            writer.writerow(('title', 'url'))
            for title, url in zip(result_titles, result_urls):
                writer.writerow([title, url])
                print(title, url)
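With deep = 2 the crawler visits the seed page, the pages it links to, and their links, since page_spider decrements deep_num on each recursive call and returns once it drops below zero; a breadth-first queue would visit the same pages without the recursion. Once a run finishes, result.csv holds one (title, url) row per relevant page, which can be checked with a quick read-back (a minimal sketch; the path matches the script above):

import csv

# Read back the crawl results written by the spider
with open('./result.csv', newline='', encoding='utf-8') as f:
    for title, url in csv.reader(f):
        print(title, url)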
