# Simple web page link crawler (简单的网页链接爬虫)

from urllib.request import urlopen
from urllib.parse import urljoin
from html.parser import HTMLParser

# 自定义 HTML 解析器
class LinkParser(HTMLParser):
    """HTML parser that collects the ``href`` targets of ``<a>`` tags.

    Every ``href`` value found on an ``<a>`` start tag is resolved against
    ``base_url`` with ``urljoin`` and appended to ``links`` in the order
    the parser encounters it.
    """

    def __init__(self, base_url):
        """Set up an empty link list and remember the base URL.

        Args:
            base_url: URL that ``href`` values are resolved against
                (used to complete relative paths).
        """
        super().__init__()
        # Links gathered so far, already joined with the base URL.
        self.links = []
        # Base URL used to complete relative paths.
        self.base_url = base_url

    def handle_starttag(self, tag, attrs):
        """Record the resolved ``href`` values of an ``<a>`` start tag.

        Args:
            tag: Tag name supplied by ``HTMLParser``.
            attrs: Sequence of ``(name, value)`` attribute pairs of the tag.
        """
        # Only anchor tags carry the links we are interested in.
        if tag != "a":
            return
        # The generator keeps the original append-one-at-a-time semantics.
        self.links.extend(
            urljoin(self.base_url, value)
            for name, value in attrs
            if name == "href"
        )

# 抓取网页链接
def fetch_links(url):
    try:
        # 发送 HTTP 请求
        response = urlopen(url)
        html_content = response.read().decode("utf-8")

        # 解析 HTML
        parser = LinkParser(url)
        parser.feed(html_content)

        # 返回抓取的链接
     

# You may also be interested in (你可能感兴趣的): crawler, python, simple (爬虫,python,简单)