This section covers data acquisition for knowledge graphs (collecting multi-source, heterogeneous data), using a movie knowledge graph as the running case. It walks through the key steps: choosing data sources, calling APIs, scraping web pages, and integrating the data.
Define the data requirements checklist
Entity type | Core fields | Optional fields | Data format |
---|---|---|---|
Movie | title, ID, release year, runtime | synopsis, poster URL, box office | structured data |
Actor | name, ID, date of birth, nationality | notable works, social media accounts | semi-structured data |
Relationship | director→movie, actor→role→movie | collaboration count, award records | requires joining multiple sources |
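To make the checklist easy to enforce later, it can be encoded directly in code. A minimal sketch, with illustrative field names that simply mirror the table above (not a fixed schema):
# Requirements checklist as plain dicts; the field names are illustrative only.
REQUIRED_FIELDS = {
    "movie": ["title", "movie_id", "year", "runtime"],
    "actor": ["name", "actor_id", "birth_date", "nationality"],
}
OPTIONAL_FIELDS = {
    "movie": ["overview", "poster_url", "revenue"],
    "actor": ["known_for", "social_accounts"],
}

def missing_required(record, entity_type):
    """Return the required fields that are absent or empty in a record."""
    return [f for f in REQUIRED_FIELDS[entity_type] if not record.get(f)]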
Assess the feasibility of each data source
Define the data acquisition strategy
Steps
Register for an API key
Example call flow
import requests
import json
from tqdm import tqdm

API_KEY = "your_api_key"
BASE_URL = "https://api.themoviedb.org/3"

def fetch_movies(page=1):
    """Fetch one page of movies, sorted by popularity."""
    url = f"{BASE_URL}/discover/movie?api_key={API_KEY}&sort_by=popularity.desc&page={page}"
    response = requests.get(url)
    return response.json()

def fetch_movie_details(movie_id):
    """Fetch the details of a single movie (including cast and crew)."""
    url = f"{BASE_URL}/movie/{movie_id}?api_key={API_KEY}&append_to_response=credits"
    response = requests.get(url)
    return response.json()

# Fetch movie data in bulk (example: first 10 pages)
all_movies = []
for page in tqdm(range(1, 11)):
    page_data = fetch_movies(page)
    for movie in page_data["results"]:
        details = fetch_movie_details(movie["id"])
        all_movies.append(details)

# Save the data
with open("tmdb_movies.json", "w", encoding="utf-8") as f:
    json.dump(all_movies, f, ensure_ascii=False, indent=2)
Data cleaning and mapping
def process_tmdb_data(movie_data):
    processed = {
        "movie_id": movie_data["id"],
        "title": movie_data["title"],
        "year": int(movie_data["release_date"].split("-")[0]) if movie_data.get("release_date") else None,
        "rating": movie_data["vote_average"],
        "overview": movie_data["overview"],
        "genres": [genre["name"] for genre in movie_data["genres"]],
        "cast": [
            {
                "actor_id": actor["id"],
                "name": actor["name"],
                "character": actor["character"]
            }
            for actor in movie_data["credits"]["cast"][:10]  # keep the top 10 billed actors
        ],
        "director": next(
            (crew["name"] for crew in movie_data["credits"]["crew"] if crew["job"] == "Director"),
            None
        )
    }
    return processed
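A short usage sketch that maps every fetched movie and saves the result (it assumes the all_movies list built above):
# Apply the mapping to all fetched movies and persist the result.
processed_movies = [process_tmdb_data(m) for m in all_movies]
with open("processed_tmdb_movies.json", "w", encoding="utf-8") as f:
    json.dump(processed_movies, f, ensure_ascii=False, indent=2)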
Steps
Download the datasets
- title.basics.tsv.gz (basic title information)
- title.ratings.tsv.gz (ratings)
- name.basics.tsv.gz (people)
- title.principals.tsv.gz (links between people and titles)
Parse and integrate the data
import pandas as pd

# Read basic title information
titles_df = pd.read_csv("title.basics.tsv.gz", sep="\t", na_values=["\\N"])
titles_df = titles_df[titles_df["titleType"] == "movie"]  # keep movies only

# Read the ratings and merge them in
ratings_df = pd.read_csv("title.ratings.tsv.gz", sep="\t")
movies_df = pd.merge(titles_df, ratings_df, on="tconst")

# Read the people information
names_df = pd.read_csv("name.basics.tsv.gz", sep="\t", na_values=["\\N"])

# Extract movie-person links (directors and actors only)
principals_df = pd.read_csv("title.principals.tsv.gz", sep="\t")
directors_df = principals_df[principals_df["category"] == "director"]
actors_df = principals_df[principals_df["category"].isin(["actor", "actress"])]
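To turn these ID-level links into readable relationships, the principals table can be joined with the names table. A minimal sketch, assuming the standard IMDb column names (nconst, primaryName, tconst, primaryTitle, startYear):
# Attach director names to each movie via nconst -> primaryName (sketch).
directors_named = directors_df.merge(
    names_df[["nconst", "primaryName"]], on="nconst", how="left"
)
movie_directors = directors_named.merge(
    movies_df[["tconst", "primaryTitle", "startYear"]], on="tconst", how="inner"
)
print(movie_directors[["primaryTitle", "primaryName"]].head())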
Steps
Analyze the page structure
https://movie.douban.com/subject/{douban_id}/
Implement the crawler (example)
import requests
from bs4 import BeautifulSoup
import re

def get_douban_info(douban_id):
    url = f"https://movie.douban.com/subject/{douban_id}/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    try:
        # Extract the rating
        rating = float(soup.select_one(".rating_num").text)
        # Extract the number of ratings (the page text is Chinese: "...人评价")
        rating_count = int(re.search(r"(\d+)人评价", soup.select_one(".rating_people").text).group(1))
        # Extract the first 5 short comments
        comments = [
            comment.select_one(".short").text
            for comment in soup.select(".comment-item")[:5]
        ]
        return {
            "douban_rating": rating,
            "douban_votes": rating_count,
            "comments": comments
        }
    except Exception as e:
        print(f"Error fetching {douban_id}: {e}")
        return None
Anti-scraping countermeasures
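The original notes no concrete measures here, so the following is only a minimal sketch of common, polite techniques: random delays, a small User-Agent pool, and simple retries. The delay range, pool contents, and retry count are illustrative values.
import random
import time
import requests

# Illustrative User-Agent pool (values are examples only).
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
]

def polite_get(url, max_retries=3):
    """GET a URL with a random delay, a rotating User-Agent, and basic retries (sketch)."""
    for attempt in range(max_retries):
        time.sleep(random.uniform(1.0, 3.0))  # throttle requests
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            pass  # fall through to the next retry
    return None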
Steps
Call the Wikipedia API
import re
import wikipedia

def get_wikipedia_info(person_name):
    try:
        # Search for and fetch the page
        page = wikipedia.page(person_name, auto_suggest=False)
        # Extract basic information
        summary = page.summary
        birth_date = re.search(r"born ([\w\s,]+)", summary)
        birth_date = birth_date.group(1) if birth_date else None
        # Extract relationships (e.g., frequent collaborators, typical roles)
        relationships = []
        for sentence in summary.split(". "):
            if "collaborated with" in sentence:
                relationships.append(sentence)
        return {
            "summary": summary,
            "birth_date": birth_date,
            "relationships": relationships
        }
    except Exception as e:
        print(f"Error fetching {person_name}: {e}")
        return None
Entity recognition and relation extraction
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_relations(text):
    doc = nlp(text)
    relations = []
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            # Extract relations centered on the person entity
            for token in ent:
                if token.dep_ == "nsubj":
                    for child in token.head.children:
                        if child.dep_ == "dobj":
                            relations.append({
                                "subject": ent.text,
                                "relation": token.head.text,
                                "object": child.text
                            })
    return relations
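A short usage sketch on an illustrative sentence; the exact output depends on the spaCy model and is not guaranteed to match.
# Example: a subject-verb-object pattern around a PERSON entity (illustrative).
sample = "Christopher Nolan directed Inception."
print(extract_relations(sample))
# Expected shape: [{"subject": "Christopher Nolan", "relation": "directed", "object": "Inception"}]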
Steps
Build an ID mapping table
from fuzzywuzzy import fuzz  # fuzz.ratio; the rapidfuzz package offers the same API

def align_entities(tmdb_data, douban_data):
    id_mapping = {}
    for tmdb_movie in tmdb_data:
        tmdb_title = tmdb_movie["title"]
        tmdb_year = tmdb_movie["year"]
        if tmdb_year is None:
            continue  # cannot match reliably without a release year
        # Look for a matching record in the Douban data
        for douban_movie in douban_data:
            # Fuzzy-match the titles (to tolerate Chinese/English differences)
            if fuzz.ratio(tmdb_title, douban_movie["title"]) > 80 and \
               abs(tmdb_year - douban_movie["year"]) <= 1:
                id_mapping[tmdb_movie["movie_id"]] = douban_movie["douban_id"]
                break
    return id_mapping
Data fusion
def merge_data(tmdb_data, douban_data, id_mapping):
    merged = []
    for tmdb_movie in tmdb_data:
        tmdb_id = tmdb_movie["movie_id"]
        douban_id = id_mapping.get(tmdb_id)
        merged_movie = {
            "id": tmdb_id,
            "title": tmdb_movie["title"],
            "year": tmdb_movie["year"],
            "tmdb_rating": tmdb_movie["rating"],
            "genres": tmdb_movie["genres"],
            "cast": tmdb_movie["cast"]
        }
        # Add the Douban data if it exists
        if douban_id:
            douban_info = next(
                (d for d in douban_data if d["douban_id"] == douban_id),
                None
            )
            if douban_info:
                merged_movie["douban_rating"] = douban_info["rating"]
                merged_movie["douban_comments"] = douban_info["comments"]
        merged.append(merged_movie)
    return merged
def check_data_integrity(data):
    missing_fields = {}
    required_fields = {
        "movie": ["title", "year", "rating"],
        "actor": ["name", "actor_id"],
        "director": ["name", "director_id"]
    }
    for entity_type in ["movie", "actor", "director"]:
        # Only check records of the matching entity type
        items = [item for item in data if item.get("type") == entity_type]
        for field in required_fields[entity_type]:
            missing_count = sum(1 for item in items if field not in item or not item[field])
            if missing_count > 0:
                missing_fields[f"{entity_type}.{field}"] = missing_count
    return missing_fields

def validate_consistency(data):
    issues = []
    # Check the consistency of movie-actor relationships
    for movie in data:
        if "cast" in movie:
            for actor in movie["cast"]:
                # Check whether the referenced actor ID exists among the actor entities
                actor_exists = any(
                    a["actor_id"] == actor["actor_id"]
                    for a in data if a.get("type") == "actor"
                )
                if not actor_exists:
                    issues.append(f"Movie {movie['title']} references a missing actor ID: {actor['actor_id']}")
    return issues
data/
├── raw/                     # raw data
│   ├── tmdb/                # TMDB API data
│   ├── imdb/                # IMDb datasets
│   └── douban/              # scraped Douban data
├── processed/               # processed data
│   ├── movies.json          # merged movie data
│   ├── actors.json          # actor data
│   └── relationships.csv    # relationship data
└── metadata/                # metadata
    ├── schema.json          # current schema version
    └── data_mapping.csv     # data source mapping table
# Track data changes with Git
git add data/raw/tmdb_movies_202305.json
git commit -m "Update TMDB data (2023-05)"
git tag v1.0.0  # tag the data version
The complete data-cleaning workflow for knowledge graph construction and its key considerations, explained in detail with the movie knowledge graph case:
Steps
Load the data
import pandas as pd
from collections import Counter

# Load data from multiple sources
tmdb_data = pd.read_json("tmdb_movies.json")
imdb_data = pd.read_csv("imdb_movies.csv")
douban_data = pd.read_json("douban_movies.json")
Basic statistical profiling
def data_summary(df):
    print(f"Rows: {len(df)}")
    print(f"Columns: {len(df.columns)}")
    print("\nMissing values per column:")
    print(df.isnull().sum())
    print("\nData types:")
    print(df.dtypes)
    print("\nUnique value counts:")
    for col in df.columns:
        print(f"{col}: {df[col].nunique()} unique values")

data_summary(tmdb_data)
Identify quality problems
Problem type | Detection method |
---|---|
Missing values | df.isnull().sum() to compute the missing rate per field |
Outliers | Numeric fields: Z-score (absolute Z greater than 3 treated as an outlier); date fields: check the plausible range (e.g., a year later than the current year) |
Duplicate data | df.duplicated().sum() for fully duplicated rows; df[['title', 'year']].duplicated().sum() for duplicates on a key combination |
Inconsistent formats | Regular-expression validation (e.g., are dates in YYYY-MM-DD format) |
Out-of-range values | For enumerated fields (e.g., genre), check for illegal values (e.g., "unknown genre") |
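The checks in the table above can be bundled into one profiling helper. A minimal sketch; the column names, thresholds, and date pattern are illustrative:
import re

def quality_report(df):
    """Run the basic checks from the table above and return a summary dict (sketch)."""
    report = {
        "missing_rate_pct": (df.isnull().mean() * 100).round(2).to_dict(),
        "duplicate_rows": int(df.duplicated().sum()),
    }
    if "rating" in df.columns:
        z = (df["rating"] - df["rating"].mean()) / df["rating"].std()
        report["rating_outliers"] = int((z.abs() > 3).sum())
    if "release_date" in df.columns:
        pattern = re.compile(r"^\d{4}-\d{2}-\d{2}$")
        bad = df["release_date"].dropna().astype(str).apply(lambda s: not pattern.match(s))
        report["bad_date_format"] = int(bad.sum())
    return report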
Choosing a strategy
Deletion
# Drop records whose "title" field is missing
tmdb_data = tmdb_data.dropna(subset=["title"])
Imputation
# Numeric fields: fill missing ratings with the median
tmdb_data["rating"] = tmdb_data["rating"].fillna(tmdb_data["rating"].median())
# Categorical fields: fill missing genres with the most frequent genre
most_common_genre = Counter([g for genres in tmdb_data["genres"] for g in genres]).most_common(1)[0][0]
tmdb_data["genres"] = tmdb_data["genres"].apply(lambda x: x if x else [most_common_genre])
Model-based imputation
from sklearn.ensemble import RandomForestRegressor

# Predict the missing "runtime" values from other fields
rf = RandomForestRegressor()
train_data = tmdb_data[tmdb_data["runtime"].notnull()]
test_data = tmdb_data[tmdb_data["runtime"].isnull()]
X_train = train_data[["rating", "year"]]  # features
y_train = train_data["runtime"]
X_test = test_data[["rating", "year"]]
rf.fit(X_train, y_train)
tmdb_data.loc[tmdb_data["runtime"].isnull(), "runtime"] = rf.predict(X_test)
Worked examples
Numeric outliers
# Filter box-office outliers using the Z-score
mean_revenue = tmdb_data["revenue"].mean()
std_revenue = tmdb_data["revenue"].std()
tmdb_data = tmdb_data[(tmdb_data["revenue"] - mean_revenue).abs() / std_revenue <= 3]
Date outliers
# Filter out future release years
from datetime import datetime
current_year = datetime.now().year
tmdb_data = tmdb_data[tmdb_data["year"] <= current_year]
Categorical outliers
# Define the set of valid genres
valid_genres = {"Action", "Comedy", "Drama", "Sci-Fi", "Thriller", "Romance", "Animation"}
# Replace any other genre with "Other"
tmdb_data["genres"] = tmdb_data["genres"].apply(
    lambda genres: [g if g in valid_genres else "Other" for g in genres]
)
Worked examples
Normalize date formats
# Normalize dates in different formats to YYYY-MM-DD
def normalize_date(date_str):
    if not date_str:
        return None
    try:
        # Handle the "2023-05-20" format
        return datetime.strptime(date_str, "%Y-%m-%d").strftime("%Y-%m-%d")
    except ValueError:
        try:
            # Handle the "2023/05/20" format
            return datetime.strptime(date_str, "%Y/%m/%d").strftime("%Y-%m-%d")
        except ValueError:
            try:
                # Handle the "May 20, 2023" format
                return datetime.strptime(date_str, "%B %d, %Y").strftime("%Y-%m-%d")
            except ValueError:
                return None  # unknown format: leave the value missing instead of raising

tmdb_data["release_date"] = tmdb_data["release_date"].apply(normalize_date)
Text normalization
import re

# Strip special characters and redundant whitespace from titles
def clean_title(title):
    title = re.sub(r"[^\w\s]", "", title)  # remove special characters
    title = re.sub(r"\s+", " ", title).strip()  # collapse consecutive spaces
    return title

tmdb_data["title"] = tmdb_data["title"].apply(clean_title)
Unify numeric units
# Convert box-office strings to numbers (handles formats such as "$100万" and "100,000";
# "万" is the Chinese unit for ten thousand)
def parse_revenue(revenue_str):
    if not revenue_str:
        return 0
    revenue_str = revenue_str.replace("$", "").replace(",", "")
    if "万" in revenue_str:
        return float(revenue_str.replace("万", "")) * 10000
    return float(revenue_str)

tmdb_data["revenue"] = tmdb_data["revenue"].apply(parse_revenue)
Worked examples
Unify entity names
# Normalize actor names (case, middle-name initials, etc.)
def normalize_name(name):
    if not name:
        return None
    name_parts = name.strip().split()
    normalized_parts = []
    for part in name_parts:
        if len(part) > 1:
            normalized_parts.append(part.capitalize())
        else:
            normalized_parts.append(part.upper())  # handle single-letter middle-name initials
    return " ".join(normalized_parts)

# Apply to the cast lists
tmdb_data["cast"] = tmdb_data["cast"].apply(
    lambda cast: [{**actor, "name": normalize_name(actor["name"])} for actor in cast]
)
Cross-source data alignment
# Build a director alias table (Chinese aliases map to the canonical English name)
director_aliases = {
    "Christopher Nolan": ["克里斯托弗·诺兰", "诺兰"],
    "Quentin Tarantino": ["昆汀·塔伦蒂诺", "昆汀"]
}
# Invert it into an alias -> canonical-name dictionary
alias_to_canonical = {
    alias: canonical
    for canonical, aliases in director_aliases.items()
    for alias in aliases
}
# Unify director names
def unify_director_name(name):
    if not name:
        return None
    return alias_to_canonical.get(name, name)

tmdb_data["director"] = tmdb_data["director"].apply(unify_director_name)
douban_data["director"] = douban_data["director"].apply(unify_director_name)
Worked examples
Exact duplicates
# Drop fully duplicated movie records
tmdb_data = tmdb_data.drop_duplicates()
Merging near-duplicate records
from difflib import SequenceMatcher

# Similarity between two movie records
def movie_similarity(movie1, movie2):
    # Title similarity (weight 0.6)
    title_sim = SequenceMatcher(None, movie1["title"], movie2["title"]).ratio()
    # Year similarity (weight 0.3)
    year_sim = 1.0 if movie1["year"] == movie2["year"] else 0.0
    # Director similarity (weight 0.1)
    director_sim = 1.0 if movie1["director"] == movie2["director"] else 0.0
    return 0.6 * title_sim + 0.3 * year_sim + 0.1 * director_sim

# Merge movie records whose similarity exceeds 0.8
def merge_similar_movies(movies):
    merged = []
    processed_indices = set()
    for i in range(len(movies)):
        if i in processed_indices:
            continue
        # Find similar movies
        similar_indices = [j for j in range(i + 1, len(movies))
                           if movie_similarity(movies[i], movies[j]) > 0.8]
        # Merge the attributes of the similar movies
        merged_movie = movies[i].copy()
        for j in similar_indices:
            processed_indices.add(j)
            # Merge ratings (take the average)
            if "rating" in movies[j] and pd.notna(movies[j]["rating"]):
                if "rating" in merged_movie and pd.notna(merged_movie["rating"]):
                    merged_movie["rating"] = (merged_movie["rating"] + movies[j]["rating"]) / 2
                else:
                    merged_movie["rating"] = movies[j]["rating"]
            # Merge genres (deduplicated)
            if "genres" in movies[j]:
                merged_movie["genres"] = list(set(merged_movie.get("genres", []) + movies[j]["genres"]))
        merged.append(merged_movie)
    return merged

# Apply to the movie list
tmdb_data = pd.DataFrame(merge_similar_movies(tmdb_data.to_dict("records")))
# Record the cleaning steps with Git
git add data/cleaned_tmdb_movies.csv
git commit -m "Clean TMDB data: handle missing values and normalize titles"
git tag v1.1  # tag the data version
The cleaning rules can also be captured in a configuration file, for example:
missing_values:
  rating: median          # fill with the median
  genres: most_frequent   # fill with the most frequent value
outliers:
  revenue:
    method: z_score
    threshold: 3
normalization:
  title:
    remove_special_chars: true
    capitalize: true
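How such a config is applied is not shown in the original; the following is a minimal sketch, assuming the YAML above is saved as cleaning_config.yaml and PyYAML is installed:
import yaml

def apply_cleaning_config(df, config_path="cleaning_config.yaml"):
    """Apply the missing-value and outlier rules from a config file (sketch)."""
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    for field, method in config.get("missing_values", {}).items():
        if method == "median":
            df[field] = df[field].fillna(df[field].median())
        # other strategies (most_frequent, ...) would be handled similarly
    for field, params in config.get("outliers", {}).items():
        if params.get("method") == "z_score":
            z = (df[field] - df[field].mean()) / df[field].std()
            df = df[z.abs() <= params.get("threshold", 3)]
    return df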
import logging

# Configure logging
logging.basicConfig(
    filename='data_cleaning.log',
    level=logging.ERROR,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Log exceptions raised inside the cleaning function
def clean_revenue(revenue_str):
    try:
        return parse_revenue(revenue_str)
    except Exception as e:
        logging.error(f"Failed to parse revenue: {revenue_str}, error: {e}")
        return 0

tmdb_data["revenue"] = tmdb_data["revenue"].apply(clean_revenue)
Challenge | Solution |
---|---|
Very large data volumes | 1. Process in batches (e.g., 100,000 records at a time; see the sketch after this table) 2. Use a distributed computing framework such as Spark |
Complex dependencies between fields | 1. Clean independent fields first, then dependent ones 2. Use a graph database to detect dependency cycles in the data |
Conflicting cleaning rules | 1. Define rule priorities (e.g., integrity rules take precedence over consistency rules) 2. Manually review conflicting cases |
Fluctuating data quality | 1. Set up quality-monitoring metrics (e.g., a daily missing-rate threshold) 2. Automatically trigger re-cleaning when anomalies occur |
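A minimal sketch of the batch-processing idea from the first row, using pandas' chunked CSV reading; the file name, chunk size, and example rules are illustrative:
def clean_in_batches(path="imdb_movies.csv", chunksize=100_000):
    """Clean a large CSV in chunks instead of loading it all at once (sketch)."""
    cleaned_chunks = []
    for chunk in pd.read_csv(path, chunksize=chunksize):
        chunk = chunk.dropna(subset=["title"])           # example rule: title is required
        chunk = chunk.drop_duplicates(subset=["title"])  # example rule: deduplicate by title
        cleaned_chunks.append(chunk)
    return pd.concat(cleaned_chunks, ignore_index=True)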
Metric | Before cleaning | After cleaning | Improvement |
---|---|---|---|
Missing rate | rating field: 23% | rating field: 0% | down 23 percentage points |
Outlier share | revenue field: 15% | revenue field: 2% | down 13 percentage points |
Duplicate records | 1,200 | 0 | all removed |
Field consistency | director names: 58 variants | director names: 42 variants | 16 fewer variants |
import matplotlib.pyplot as plt
import seaborn as sns

# Compare the rating distribution before and after cleaning
# (assumes snapshots were kept, e.g. tmdb_data_before_cleaning = tmdb_data.copy() taken before cleaning)
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.histplot(tmdb_data_before_cleaning["rating"], kde=True)
plt.title("Before Cleaning")
plt.subplot(1, 2, 2)
sns.histplot(tmdb_data_after_cleaning["rating"], kde=True)
plt.title("After Cleaning")
plt.tight_layout()
plt.show()
class DataCleaner:
    def __init__(self, data):
        self.data = data
        self.report = {"initial_size": len(data)}

    def handle_missing_values(self, config):
        for field, method in config.items():
            if method == "drop":
                self.data = self.data.dropna(subset=[field])
            elif method == "mean":
                self.data[field] = self.data[field].fillna(self.data[field].mean())
            # other imputation strategies ...
        self.report["missing_values"] = self.data.isnull().sum().to_dict()
        return self

    def remove_outliers(self, config):
        for field, params in config.items():
            if params["method"] == "z_score":
                mean = self.data[field].mean()
                std = self.data[field].std()
                self.data = self.data[(self.data[field] - mean).abs() / std <= params["threshold"]]
        self.report["outlier_count"] = self.report["initial_size"] - len(self.data)
        return self

    def export(self, path):
        self.data.to_csv(path, index=False)
        return self.report

# Usage example
cleaner = DataCleaner(tmdb_data)
report = cleaner.handle_missing_values({
    "title": "drop",
    "rating": "mean"
}).remove_outliers({
    "revenue": {"method": "z_score", "threshold": 3}
}).export("cleaned_tmdb_data.csv")
print(report)  # print the cleaning report
A systematic data-cleaning workflow markedly improves the quality of the knowledge graph and gives the subsequent knowledge extraction and applications a reliable foundation.
In knowledge graph construction, data acquisition involves collecting, processing, and integrating multi-source heterogeneous data. Below are commonly used tools grouped by function, with applicable scenarios, worked examples, and tool comparisons:
Used for acquiring structured data; supports REST, SOAP, and similar interface protocols.
Example request endpoint: https://api.themoviedb.org/3/movie/550?api_key={YOUR_KEY}
Request headers as required by the API (e.g., Content-Type: application/json).
import requests

API_KEY = "your_api_key"
url = f"https://api.themoviedb.org/3/movie/550?api_key={API_KEY}"
response = requests.get(url)
data = response.json()
print(data["title"])  # Output: Fight Club
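The two helper libraries noted below (requests-cache and ratelimit) can wrap calls like the one above. A minimal sketch; the cache name, expiry, and rate values are chosen purely for illustration:
import requests
import requests_cache
from ratelimit import limits, sleep_and_retry

# Cache responses so repeated requests do not hit the API again (cache name is illustrative).
requests_cache.install_cache("tmdb_cache", expire_after=3600)

@sleep_and_retry
@limits(calls=40, period=10)  # at most 40 calls per 10 seconds (illustrative limit)
def cached_get(url):
    return requests.get(url, timeout=10)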
requests-cache: automatically caches API responses to avoid repeated requests.
ratelimit: throttles the request frequency so that the API's terms of use are respected.
Used for collecting semi-structured data (HTML/XML); supports dynamic page rendering and anti-scraping handling.
import scrapy

class DoubanSpider(scrapy.Spider):
    name = "douban"
    start_urls = ["https://movie.douban.com/top250"]

    def parse(self, response):
        for item in response.css(".item"):
            yield {
                "title": item.css(".title::text").get(),
                "rating": item.css(".rating_num::text").get(),
                "quote": item.css(".inq::text").get(),
            }
        # Follow the next page
        next_page = response.css("span.next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)
BeautifulSoup locates DOM elements through tag selectors such as find() and find_all().
from bs4 import BeautifulSoup
import requests

url = "https://www.imdb.com/chart/top"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
for movie in soup.select(".titleColumn a"):
    print(movie.text)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

service = Service("path/to/chromedriver")
driver = webdriver.Chrome(service=service)
driver.get("https://movie.douban.com/subject/1292052/comments")

# Click the "load more" button
more_button = driver.find_element(By.CSS_SELECTOR, ".more-btn")
more_button.click()

# Extract the comments
comments = driver.find_elements(By.CSS_SELECTOR, ".comment-content")
for comment in comments:
    print(comment.text)
Handles format unification, missing-value imputation, and quality validation across multiple sources.
Pandas provides missing-value handling methods such as dropna() and fillna().
import pandas as pd

# Read the data
df = pd.read_csv("movies.csv")
# Handle missing values
df["rating"] = df["rating"].fillna(df["rating"].mean())
# Type conversion
df["year"] = pd.to_datetime(df["year"], format="%Y")
# Export the cleaned data
df.to_csv("cleaned_movies.csv", index=False)
Resolves inconsistent entity references across sources (e.g., the same person appearing under different names in different data sources).
<LIMES>
  <sourceEndpoint>file:tmdb_movies.csv</sourceEndpoint>
  <targetEndpoint>file:imdb_movies.csv</targetEndpoint>
  <sourceVar>?tmdb</sourceVar>
  <targetVar>?imdb</targetVar>
  <metric>levenshtein(?tmdb:title, ?imdb:title) &lt; 0.2</metric>
  <threshold>0.8</threshold>
</LIMES>
import deepmatcher as dm

# Load the training data
train, validation, test = dm.data.process(
    path='data/movies',
    train='train.csv',
    validation='validation.csv',
    test='test.csv'
)
# Create the model
model = dm.MatchingModel(attr_summarizer='hybrid')
model.run_train(train, validation, best_save_path='best_model.pth')  # run_train also needs the validation set
# Predict the alignment results
predictions = model.run_prediction(test)
Manages the raw data that has been acquired and the intermediate results produced from it.
Use .gitignore to exclude large files (e.g., GB-scale datasets).
# Initialize the repository
git init data_repo
cd data_repo
# Add a data file
git add movies_sample.csv
# Commit the change
git commit -m "Add initial movie data sample"
# Create a branch for developing a new data source
git branch imdb_data
git checkout imdb_data
# Initialize DVC
dvc init
# Track a large data file with DVC
dvc add data/movies_full.csv
# Push to remote storage
dvc push
# Pull the data for a specific version
git checkout v1.0
dvc pull
For large-scale data acquisition and processing, a combined tool stack is recommended:
Data collection layer:
Data processing layer:
Data storage layer:
Monitoring and scheduling layer:
API calls:
Web scraping: respect the site's robots.txt rules.
Data security:
By choosing and combining tools sensibly, multi-source heterogeneous data can be acquired, cleaned, and integrated efficiently, laying a solid foundation for knowledge graph construction.
Challenge | Solution |
---|---|
API rate limits | 1. Rotate API keys 2. Use a request queue with throttling (e.g., 100 requests/minute) 3. Cache data that has already been fetched |
Anti-scraping mechanisms | 1. Random User-Agent pool 2. Rotating proxy IPs (e.g., Luminati, ScrapingBee) 3. Simulate human browsing behavior |
Missing or noisy data | 1. Complement one source with another (e.g., fill gaps in TMDB with IMDb) 2. Data-cleaning rules (e.g., filter outliers) |
Ambiguous entity alignment | 1. Multi-feature matching (name + date of birth + nationality), as sketched below 2. Manual review of key entities (e.g., well-known directors) 3. Use knowledge graph embeddings (graph embedding) to compute similarity |
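A minimal sketch of the multi-feature matching idea from the last row; the weights, the fuzz-based name scoring, and the threshold are illustrative choices, not a prescribed method:
from fuzzywuzzy import fuzz  # the rapidfuzz package exposes the same ratio API

def person_match_score(a, b):
    """Score how likely two person records refer to the same entity (sketch)."""
    name_sim = fuzz.ratio(a.get("name", ""), b.get("name", "")) / 100.0
    birth_sim = 1.0 if a.get("birth_date") and a.get("birth_date") == b.get("birth_date") else 0.0
    nat_sim = 1.0 if a.get("nationality") and a.get("nationality") == b.get("nationality") else 0.0
    return 0.6 * name_sim + 0.3 * birth_sim + 0.1 * nat_sim

def is_same_person(a, b, threshold=0.85):
    """Treat pairs above an illustrative threshold as the same person."""
    return person_match_score(a, b) >= threshold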
Record the source of every data item (e.g., source: "TMDB API v3") and the time it was fetched, so that problems can be traced back quickly. With a systematic data acquisition workflow, you can build a high-quality foundation for the knowledge graph and give the subsequent knowledge extraction and applications reliable support.