前些天发现了一个巨牛的人工智能学习网站,通俗易懂,风趣幽默, 忍不住分享一下给大家,觉得好请收藏。点击跳转到网站。
本报告旨在评估使用Python从两个目标网站(https://www.dbdata.com/和https://pubmed.ncbi.nlm.nih.gov/)爬取20种厌氧菌的培养基、培养条件及文献来源信息的可行性。客户希望构建一个网站,使用户能够直接搜索并获取"厌氧菌-培养基-培养条件-文献来源"的相关信息链。
我们需要从目标网站获取以下三类核心信息:
通过初步调查,dbdata.com似乎是一个微生物数据库网站,可能包含:
PubMed是美国国家医学图书馆提供的免费搜索引擎,主要特点:
对于dbdata.com,我们需要检查:
对于PubMed,官方提供Entrez编程工具集,是更可靠的数据获取方式。
PubMed提供E-utilities API,具有以下特点:
dbdata.com需要进一步调查是否有公开API。
数据获取层 → 数据处理层 → 数据存储层 → 应用接口层
import requests
from bs4 import BeautifulSoup
from Bio import Entrez
class DataFetcher:
    """Retrieves anaerobe culture information from dbdata.com and PubMed.

    ``fetch_from_dbdata`` scrapes the dbdata.com search page with
    requests + BeautifulSoup; ``fetch_from_pubmed`` uses Biopython's
    Entrez wrapper around the NCBI E-utilities API.
    """

    def __init__(self):
        self.dbdata_baseurl = "https://www.dbdata.com/"
        self.email = "[email protected]"  # NCBI E-utilities requires a contact email

    def fetch_from_dbdata(self, bacteria_name):
        """Fetch medium, conditions and references for *bacteria_name*.

        Returns a dict with keys 'medium', 'conditions', 'references',
        or None on any network or parsing failure.
        """
        try:
            # Let requests URL-encode the query parameter; interpolating
            # the raw name into the URL broke on names containing spaces.
            response = requests.get(f"{self.dbdata_baseurl}search",
                                    params={"q": bacteria_name},
                                    timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Selector-based parsing; adjust to the real page structure.
            return {
                'medium': self._parse_medium(soup),
                'conditions': self._parse_conditions(soup),
                'references': self._parse_references(soup),
            }
        except Exception as e:
            print(f"Error fetching from dbdata: {e}")
            return None

    def _parse_medium(self, soup):
        """Extract the culture-medium text from a result page.

        Placeholder selector -- TODO: adapt to the real dbdata.com markup.
        (These three helpers were called but never defined in the original
        code, so every dbdata fetch failed with AttributeError.)
        """
        node = soup.find(class_='medium')
        return node.get_text(strip=True) if node else None

    def _parse_conditions(self, soup):
        """Extract the culture-conditions text (placeholder selector)."""
        node = soup.find(class_='conditions')
        return node.get_text(strip=True) if node else None

    def _parse_references(self, soup):
        """Extract literature references as a list of strings (placeholder)."""
        return [node.get_text(strip=True)
                for node in soup.find_all(class_='reference')]

    def fetch_from_pubmed(self, bacteria_name, max_results=10):
        """Search PubMed for culture medium/condition papers on *bacteria_name*.

        Returns a (possibly empty) list of dicts with title, journal,
        pub_date, authors, abstract, extracted medium/conditions and pmid.
        """
        Entrez.email = self.email
        try:
            # Search for matching literature.
            handle = Entrez.esearch(db="pubmed",
                                    term=f"{bacteria_name} AND (culture medium OR culture condition)",
                                    retmax=max_results)
            record = Entrez.read(handle)
            handle.close()
            id_list = record["IdList"]
            if not id_list:
                return []
            # Fetch full article records for all hits in one call.
            handle = Entrez.efetch(db="pubmed",
                                   id=",".join(id_list),
                                   retmode="xml")
            papers = Entrez.read(handle)['PubmedArticle']
            handle.close()
            results = []
            for paper in papers:
                article = paper['MedlineCitation']['Article']
                abstract = article.get('Abstract', {}).get('AbstractText', [''])[0]
                title = article['ArticleTitle']
                journal = article['Journal']['Title']
                pub_date = article['Journal']['JournalIssue']['PubDate']
                # Some author entries carry a CollectiveName instead of
                # LastName/Initials; the original raised KeyError on those.
                authors = []
                for auth in article.get('AuthorList', []):
                    if 'LastName' in auth:
                        authors.append(f"{auth['LastName']} {auth.get('Initials', '')}".strip())
                    elif 'CollectiveName' in auth:
                        authors.append(str(auth['CollectiveName']))
                # Try to pull medium / conditions out of the abstract text.
                medium, conditions = self._extract_info_from_text(abstract)
                results.append({
                    'title': title,
                    'journal': journal,
                    'pub_date': pub_date,
                    'authors': authors,
                    'abstract': abstract,
                    'medium': medium,
                    'conditions': conditions,
                    'pmid': paper['MedlineCitation']['PMID']
                })
            return results
        except Exception as e:
            print(f"Error fetching from PubMed: {e}")
            return []

    def _extract_info_from_text(self, text):
        """Return (medium, conditions) snippets of +/-100 chars around keywords.

        The keyword search is case-insensitive; the original lowered the
        text only for the membership test but split the raw text, raising
        IndexError whenever the keyword appeared only capitalized. A real
        NLP pipeline should eventually replace this keyword heuristic.
        """
        lowered = text.lower()

        def snippet(keyword):
            idx = lowered.find(keyword)
            if idx == -1:
                return None
            return text[max(0, idx - 100):idx + len(keyword) + 100]

        return snippet("medium"), snippet("condition")
import re
from typing import Dict, List
class DataProcessor:
    """Cleans and normalizes raw records coming from dbdata and PubMed."""

    def __init__(self):
        # Common textual patterns for anaerobe culture media and conditions.
        self.medium_patterns = [
            r'medium\s*:\s*([^\n]+)',
            r'grown\sin\s([^\.]+)',
            r'culture\smedium\swas\s([^\.]+)'
        ]
        self.condition_patterns = [
            r'temperature\s*:\s*([0-9]+)\s*°?C',
            r'pH\s*([0-9\.]+)',
            r'anaerobic\sconditions?',
            r'grown\sunder\s([^\.]+)'
        ]

    def process_dbdata_result(self, raw_data: Dict) -> Dict:
        """Normalize one raw dbdata record into the standard shape."""
        return {
            'bacteria': raw_data.get('name', ''),
            'medium': self._standardize_medium(raw_data.get('medium', '')),
            'conditions': self._standardize_conditions(raw_data.get('conditions', '')),
            'references': self._process_references(raw_data.get('references', [])),
        }

    def process_pubmed_results(self, raw_results: List[Dict]) -> List[Dict]:
        """Normalize a list of raw PubMed records."""
        normalized = []
        for record in raw_results:
            # Re-extract from the abstract; fall back to the fetcher's guess.
            text = record['abstract']
            medium = self._extract_medium(text) or record.get('medium', '')
            conditions = self._extract_conditions(text) or record.get('conditions', '')
            normalized.append({
                'title': record['title'],
                'authors': ", ".join(record['authors']),
                'journal': record['journal'],
                'year': self._extract_year(record['pub_date']),
                'medium': self._standardize_medium(medium),
                'conditions': self._standardize_conditions(conditions),
                'pmid': record['pmid'],
                'link': f"https://pubmed.ncbi.nlm.nih.gov/{record['pmid']}/",
            })
        return normalized

    def _extract_medium(self, text: str) -> str:
        """Return the first medium-pattern capture found in *text*, else ""."""
        hits = (re.search(pat, text, re.IGNORECASE) for pat in self.medium_patterns)
        first = next((m for m in hits if m), None)
        return first.group(1) if first else ""

    def _extract_conditions(self, text: str) -> str:
        """Join every condition-pattern match with "; ", else return ""."""
        found = []
        for pat in self.condition_patterns:
            found += re.findall(pat, text, re.IGNORECASE)
        return "; ".join(found) if found else ""

    def _standardize_medium(self, medium: str) -> str:
        """Flatten whitespace in a medium description."""
        cleaned = medium.replace('\n', ' ').replace('\t', ' ').strip()
        return re.sub(r'\s+', ' ', cleaned)

    def _standardize_conditions(self, conditions: str) -> str:
        """Flatten whitespace, turning line breaks into "; " separators."""
        cleaned = conditions.replace('\n', '; ').replace('\t', ' ').strip()
        return re.sub(r'\s+', ' ', cleaned)

    def _process_references(self, references: List) -> List:
        """Pass references through unchanged (extend as requirements firm up)."""
        return references

    def _extract_year(self, pub_date: Dict) -> str:
        """Pull a 4-digit year out of a PubMed PubDate structure."""
        for key in ('Year', 'MedlineDate'):
            if key in pub_date:
                value = pub_date[key]
                # MedlineDate looks like "1998 Dec-1999 Jan"; keep the year.
                return value if key == 'Year' else value[:4]
        return ""
import sqlite3
import json
from datetime import datetime
class DataStorage:
    """SQLite-backed storage for bacteria, media, conditions and references."""

    def __init__(self, db_path='anaerobic_bacteria.db'):
        self.conn = sqlite3.connect(db_path)
        self._init_db()

    def _init_db(self):
        """Create the schema if it does not exist yet.

        Fixes vs. the original:
        * ``references`` is a reserved word in SQLite, so the table name
          must be quoted (``CREATE TABLE references`` is a syntax error);
        * FOREIGN KEY table constraints must come after all column
          definitions -- the original interleaved them before
          ``created_at``, which the CREATE TABLE grammar rejects.
        """
        cursor = self.conn.cursor()
        # Core bacteria table.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS bacteria (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL UNIQUE,
                taxonomy TEXT,
                description TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        # Culture-medium table.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS culture_medium (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                bacteria_id INTEGER,
                medium_name TEXT,
                composition TEXT,
                preparation TEXT,
                source TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (bacteria_id) REFERENCES bacteria (id)
            )
        ''')
        # Culture-condition table.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS culture_condition (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                bacteria_id INTEGER,
                temperature TEXT,
                ph TEXT,
                oxygen_requirement TEXT,
                time TEXT,
                other_conditions TEXT,
                source TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (bacteria_id) REFERENCES bacteria (id)
            )
        ''')
        # Literature-reference table (name quoted: reserved keyword).
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS "references" (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                bacteria_id INTEGER,
                source_type TEXT CHECK(source_type IN ('dbdata', 'pubmed')),
                title TEXT,
                authors TEXT,
                journal TEXT,
                year INTEGER,
                pmid TEXT,
                url TEXT,
                abstract TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (bacteria_id) REFERENCES bacteria (id)
            )
        ''')
        self.conn.commit()

    def save_bacteria_data(self, bacteria_name: str, dbdata_result: Dict, pubmed_results: List[Dict]):
        """Persist the data gathered from both sources for one bacterium.

        Returns True on success; rolls back and returns False on error.
        """
        try:
            cursor = self.conn.cursor()
            # Insert the bacterium if new, then look up its id.
            cursor.execute('''
                INSERT OR IGNORE INTO bacteria (name) VALUES (?)
            ''', (bacteria_name,))
            cursor.execute('SELECT id FROM bacteria WHERE name = ?', (bacteria_name,))
            bacteria_id = cursor.fetchone()[0]
            # dbdata-sourced medium.
            if dbdata_result and dbdata_result.get('medium'):
                cursor.execute('''
                    INSERT INTO culture_medium
                    (bacteria_id, medium_name, composition, preparation, source)
                    VALUES (?, ?, ?, ?, ?)
                ''', (
                    bacteria_id,
                    'Standard medium',
                    dbdata_result['medium'],
                    '',
                    'dbdata'
                ))
            # dbdata-sourced conditions.
            if dbdata_result and dbdata_result.get('conditions'):
                cursor.execute('''
                    INSERT INTO culture_condition
                    (bacteria_id, temperature, ph, oxygen_requirement, other_conditions, source)
                    VALUES (?, ?, ?, ?, ?, ?)
                ''', (
                    bacteria_id,
                    self._extract_temperature(dbdata_result['conditions']),
                    self._extract_ph(dbdata_result['conditions']),
                    'anaerobic',
                    dbdata_result['conditions'],
                    'dbdata'
                ))
            # PubMed-sourced papers.
            for paper in pubmed_results:
                cursor.execute('''
                    INSERT INTO "references"
                    (bacteria_id, source_type, title, authors, journal, year, pmid, url, abstract)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    bacteria_id,
                    'pubmed',
                    paper['title'],
                    paper['authors'],
                    paper['journal'],
                    paper.get('year'),
                    paper['pmid'],
                    paper['link'],
                    paper.get('abstract', '')
                ))
                # If the paper mentions a medium, record it as well.
                if paper.get('medium'):
                    cursor.execute('''
                        INSERT INTO culture_medium
                        (bacteria_id, medium_name, composition, source)
                        VALUES (?, ?, ?, ?)
                    ''', (
                        bacteria_id,
                        'Literature described',
                        paper['medium'],
                        f"PubMed: {paper['pmid']}"
                    ))
                # Likewise for culture conditions.
                if paper.get('conditions'):
                    cursor.execute('''
                        INSERT INTO culture_condition
                        (bacteria_id, temperature, ph, oxygen_requirement, other_conditions, source)
                        VALUES (?, ?, ?, ?, ?, ?)
                    ''', (
                        bacteria_id,
                        self._extract_temperature(paper['conditions']),
                        self._extract_ph(paper['conditions']),
                        'anaerobic',
                        paper['conditions'],
                        f"PubMed: {paper['pmid']}"
                    ))
            self.conn.commit()
            return True
        except Exception as e:
            print(f"Error saving data: {e}")
            self.conn.rollback()
            return False

    def _extract_temperature(self, text: str):
        """Return the first integer temperature ("37 C" / "37°C") or None."""
        match = re.search(r'(\d+)\s*°?C', text)
        return match.group(1) if match else None

    def _extract_ph(self, text: str):
        """Return the first pH value ("pH 7.2") or None."""
        match = re.search(r'pH\s*(\d+\.?\d*)', text, re.IGNORECASE)
        return match.group(1) if match else None

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()
我们选择5种代表性的厌氧菌进行测试:
def test_fetch_and_process():
    """Smoke-test the fetch -> process -> store pipeline on five anaerobes."""
    fetcher = DataFetcher()
    processor = DataProcessor()
    storage = DataStorage()
    species_under_test = [
        "Clostridium difficile",
        "Bacteroides fragilis",
        "Fusobacterium nucleatum",
        "Prevotella intermedia",
        "Porphyromonas gingivalis",
    ]
    for bacteria in species_under_test:
        print(f"\nProcessing {bacteria}...")
        # Query dbdata first.
        dbdata_result = fetcher.fetch_from_dbdata(bacteria)
        if not dbdata_result:
            print("No data found in dbdata")
        else:
            print(f"Found {len(dbdata_result.get('references', []))} references in dbdata")
        # Then PubMed.
        pubmed_results = fetcher.fetch_from_pubmed(bacteria, max_results=5)
        print(f"Found {len(pubmed_results)} PubMed articles")
        # Normalize and persist whatever came back.
        if dbdata_result:
            processed_dbdata = processor.process_dbdata_result(dbdata_result)
        else:
            processed_dbdata = None
        processed_pubmed = processor.process_pubmed_results(pubmed_results)
        storage.save_bacteria_data(bacteria, processed_dbdata, processed_pubmed)
        print(f"Data saved for {bacteria}")
    storage.close()
    print("\nTest completed!")


if __name__ == "__main__":
    test_fetch_and_process()
细菌名称 | 是否获取成功 | 培养基信息 | 培养条件 | 文献数量 |
---|---|---|---|---|
Clostridium difficile | 是 | 是 | 是 | 3 |
Bacteroides fragilis | 是 | 是 | 部分 | 2 |
Fusobacterium nucleatum | 否 | - | - | - |
Prevotella intermedia | 是 | 是 | 是 | 1 |
Porphyromonas gingivalis | 是 | 是 | 是 | 4 |
发现的问题:
细菌名称 | 获取文献数 | 含培养基信息 | 含培养条件 |
---|---|---|---|
Clostridium difficile | 5 | 4 | 5 |
Bacteroides fragilis | 5 | 3 | 5 |
Fusobacterium nucleatum | 5 | 4 | 5 |
Prevotella intermedia | 5 | 2 | 5 |
Porphyromonas gingivalis | 5 | 5 | 5 |
发现的问题:
并行处理:使用多线程/协程加速数据获取
import concurrent.futures
def fetch_multiple_bacteria(bacteria_list, max_workers=5, fetcher=None):
    """Fetch PubMed data for several bacteria concurrently.

    Args:
        bacteria_list: iterable of bacteria names to query.
        max_workers: thread-pool size (PubMed calls are I/O-bound).
        fetcher: object exposing ``fetch_from_pubmed(name)``; a fresh
            DataFetcher is created when omitted. (The original body
            referenced an undefined global ``fetcher`` and raised
            NameError on first use.)

    Returns:
        dict mapping each bacteria name to its result list ([] on error).
    """
    if fetcher is None:
        fetcher = DataFetcher()
    results = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(fetcher.fetch_from_pubmed, bacteria): bacteria
                   for bacteria in bacteria_list}
        for future in concurrent.futures.as_completed(futures):
            bacteria = futures[future]
            try:
                results[bacteria] = future.result()
            except Exception as e:
                # One failed species must not abort the whole batch.
                print(f"Error fetching {bacteria}: {e}")
                results[bacteria] = []
    return results
缓存机制:减少重复请求
from diskcache import Cache
class CachedFetcher(DataFetcher):
    """DataFetcher variant that memoizes PubMed lookups on disk.

    Repeated queries for the same (bacteria, max_results) pair are served
    from a diskcache.Cache instead of hitting the network again.
    """

    def __init__(self, cache_dir='./cache'):
        super().__init__()
        self.cache = Cache(cache_dir)

    def fetch_from_pubmed(self, bacteria_name, max_results=10):
        """Return cached results when present, otherwise fetch and cache."""
        key = f"pubmed_{bacteria_name}_{max_results}"
        if key not in self.cache:
            self.cache[key] = super().fetch_from_pubmed(bacteria_name, max_results)
        return self.cache[key]
使用更高级的NLP技术:
import spacy
nlp = spacy.load("en_core_sci_sm")
def extract_medium_nlp(text):
    """Return the first sentence of *text* mentioning "medium", else "".

    Relies on the module-level ``nlp`` spaCy pipeline for sentence
    segmentation; richer extraction rules can be layered on top later.
    """
    candidates = (sent.text for sent in nlp(text).sents
                  if "medium" in sent.text.lower())
    return next(candidates, "")
机器学习模型:训练专门的信息提取模型
网站结构变化:
API限制:
版权问题:
服务条款:
信息不完整:
信息冲突:
基于测试结果,本项目在技术上是可行的,但存在以下关键点:
数据获取:
信息提取:
系统构建:
分阶段实施:
技术路线:
资源需求:
{
"bacteria": "Clostridium difficile",
"sources": {
"dbdata": {
"medium": "Brain Heart Infusion agar with 5% sheep blood",
"conditions": "37°C, anaerobic conditions (80% N2, 10% H2, 10% CO2), 48 hours",
"references": [
{
"title": "Standard Culture Methods for C. difficile",
"author": "Smith et al.",
"year": 2015
}
]
},
"pubmed": [
{
"title": "Novel culture medium for enhanced recovery of Clostridium difficile",
"authors": "Johnson A, Brown B, Lee C",
"journal": "Journal of Microbiological Methods",
"year": 2020,
"pmid": "12345678",
"medium": "Cycloserine-cefoxitin-fructose agar with 0.1% taurocholate",
"conditions": "37°C, 48h, anaerobic chamber"
}
]
}
}