豆瓣数据爬取

完成了!

import requests
from bs4 import BeautifulSoup

# HTTP request headers: a desktop-browser User-Agent string, presumably so
# Douban serves the normal page instead of rejecting the default
# python-requests client — TODO confirm Douban's bot-filtering behavior.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36 Edg/137.0.0.0"
}

def scrape_movie_info(url):
    """Scrape one Douban movie subject page and return its key details.

    Parameters
    ----------
    url : str
        URL of a Douban movie subject page
        (e.g. "https://movie.douban.com/subject/1297311/").

    Returns
    -------
    dict
        Keys: "Movie Name", "Director", "Actors", "Genre", "Rating",
        "Release Date".  Multi-valued fields are joined with "; ";
        fields not found on the page are empty strings.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status (e.g. blocked or 404).
    """
    response = requests.get(url, headers=headers)
    # Fail fast instead of silently parsing an error/captcha page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Collect the text of every tag matching (tag_name, attrs).
    def _texts(tag_name, attrs):
        return [tag.get_text() for tag in soup.find_all(tag_name, attrs=attrs)]

    # Single-valued fields: keep the last occurrence, matching the original
    # loop-overwrite logic; empty string when the tag is absent.
    names = _texts("span", {"property": "v:itemreviewed"})
    ratings = _texts("strong", {"property": "v:average"})

    return {
        "Movie Name": names[-1] if names else "",
        "Director": "; ".join(_texts("a", {"rel": "v:directedBy"})),
        "Actors": "; ".join(_texts("a", {"rel": "v:starring"})),
        "Genre": "; ".join(_texts("span", {"property": "v:genre"})),
        "Rating": ratings[-1] if ratings else "",
        # Bug fix: the original initialized the release date as a list but
        # overwrote it with a single string inside the loop, so only the last
        # date survived — and a literal "[]" was emitted when none was found.
        # Join all release dates with "; " like the other multi-valued fields.
        "Release Date": "; ".join(
            _texts("span", {"property": "v:initialReleaseDate"})
        ),
    }

# URLs of the Douban movie subject pages to scrape (one page per movie).
urls = [
"https://movie.douban.com/subject/1297311/",
"https://movie.douban.com/subject/1295815/",
"https://movie.douban.com/subject/1292233/",
"https://movie.douban.com/subject/1291568/",
"https://movie.douban.com/subject/1297188/",
"https://movie.douban.com/subject/1294421/",
"https://movie.douban.com/subject/3690289/",
"https://movie.douban.com/subject/1361276/",
"https://movie.douban.com/subject/1298871/",
"https://movie.douban.com/subject/1299103/",
"https://movie.douban.com/subject/1689787/",
"https://movie.douban.com/subject/1858233/",
"https://movie.douban.com/subject/1293071/",
"https://movie.douban.com/subject/1301279/",
"https://movie.douban.com/subject/1291559/",
]

# Append each movie's details to test.txt, one "Label: value" line per
# field, with a blank line between movies.  Append mode means re-running
# the script adds to the existing file rather than overwriting it.
with open("test.txt", "a", encoding="utf-8") as f:
    for url in urls:
        info = scrape_movie_info(url)
        # The dict keys double as the output labels, so one loop replaces
        # six hand-written f.write calls; dict insertion order is preserved,
        # so the field order in the file is unchanged.
        for label, value in info.items():
            f.write(f"{label}: {value}\n")
        f.write("\n")  # blank separator line between movies

 

import requests
from bs4 import BeautifulSoup

# Rebind the request headers for the second script with a different
# desktop-browser User-Agent — presumably again to look like a normal
# browser to Douban; TODO confirm which UA strings Douban accepts.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
}

# Collect the link of every movie on the user's "collect" (watched) list
# and save them, one double-quoted URL per line, to moviehref.txt.
#
# Douban paginates the list via the `start` query parameter; here pages are
# fetched 15 entries at a time for starts 0, 15, ..., 135 (10 pages).
#
# Bug fix: the original used bare open()/close(), which leaks the file
# handle if any request or parse raises; a `with` block guarantees the
# file is closed on every exit path.
with open("moviehref.txt", "w", encoding="utf-8") as f:
    for start_num in range(0, 150, 15):
        url = f"https://movie.douban.com/people/282795808/collect?start={start_num}"
        response = requests.get(url, headers=headers)
        # Fail fast instead of silently parsing an error/captcha page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Each movie entry sits in a <div class="info">; its first <a>
        # carries the href to the movie's subject page.
        for info_div in soup.find_all("div", class_="info"):
            a_tag = info_div.find("a")
            if a_tag:
                href = a_tag.get("href")
                print(href)
                f.write(f'"{href}"\n')

我的代码可以根据你的豆瓣主页网址,爬取你看过的电影的详细信息。

你可能感兴趣的:(python,开发语言)