最近发现浏览器的书签越来越乱了,主要是因为自己太懒,其次之前建的分类太多又乱,重新手动整理确实比较烦。因此有了这个小项目。借助智谱AI的力量对书签进行重新分类。
本工具用于自动整理浏览器书签,通过AI智能分类技术,将杂乱的书签按照主题自动归类,并且重新生成结构化的书签文件。
1. 设置 `CONFIG` 配置中的 `INPUT_FILE` 和 `OUTPUT_FILE`
2. 设置 `CONFIG` 配置中的 `API_KEY`
3. 运行 `python 浏览器书签文件重新分类.py`
4. 输出文件: `bookmarks.html`
在脚本开头的 `CONFIG` 字典中可配置以下参数:
- `API_KEY`: 智谱AI API密钥(必填)
- `INPUT_FILE`: 输入的书签HTML文件路径
- `EXTRACTED_FILE`: 提取的临时文件路径
- `CLASSIFIED_FILE`: 分类结果Markdown文件路径
- `OUTPUT_FILE`: 最终输出的HTML书签文件路径
- `DEFAULT_CATEGORIES`: 默认分类列表

# 主要功能模块
1. 配置参数 (CONFIG)
2. AI客户端初始化 (zhipu_client)
3. 书签提取 (extract_bookmark_info)
4. 书签分类 (classify_bookmark)
5. 分类创建 (create_new_category)
6. 格式转换 (md_to_netscape)
7. 文件清理 (cleanup_temp_files)
import os  # imported here because this excerpt precedes the script's import block

# Script configuration.
# NOTE(review): a literal API key used to be embedded in this excerpt; any key
# that has been published this way should be revoked and replaced.
CONFIG = {
    # ZhipuAI API key (required). Read from the ZHIPUAI_API_KEY environment
    # variable so that a real key never has to be written into the source;
    # the placeholder is only used when the variable is not set.
    'API_KEY': os.environ.get('ZHIPUAI_API_KEY', 'my-secret-key'),
    'INPUT_FILE': 'bookmarks_2025_1_3.html',       # input bookmark export (required)
    'EXTRACTED_FILE': 'extracted_bookmarks.txt',   # temporary file with extracted bookmarks
    'CLASSIFIED_FILE': 'classified_bookmarks.md',  # classification result (Markdown)
    'OUTPUT_FILE': 'bookmarks.html',               # final output file
    'DEFAULT_CATEGORIES': [                        # default categories
        '编程语言',
        '人工智能',
        '数据科学',
    ],
}
# Create the shared ZhipuAI client once, using the key stored in CONFIG.
zhipu_client = ZhipuAI(api_key=CONFIG['API_KEY'])


def get_llm_response(sysPrompt, questionPrompt):
    """Run one three-turn conversation against the model and return its reply.

    The instructions in ``sysPrompt`` are sent as the first *user* turn,
    followed by a fixed assistant acknowledgement, and ``questionPrompt`` is
    sent as the second user turn.

    Args:
        sysPrompt: Instruction text describing the task.
        questionPrompt: The concrete question (bookmark name and URL).

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    conversation = [
        {"role": "user", "content": sysPrompt},
        {"role": "assistant", "content": "你好,请告诉我你需要分类的书签?"},
        {"role": "user", "content": questionPrompt},
    ]
    completion = zhipu_client.chat.completions.create(
        model="GLM-4-Plus",  # code of the model to call
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
定义 `get_llm_response` 函数,用于向AI模型发送系统提示和用户问题,并返回模型的响应内容。
with open(CONFIG['INPUT_FILE'], 'r', encoding='utf-8') as file:
html_content = file.read()
soup = BeautifulSoup(html_content, 'html.parser')
bookmarks = soup.find_all('a')
bookmark_info = []
for bookmark in bookmarks:
href = bookmark.get('href')
add_date = bookmark.get('add_date')
icon = bookmark.get('icon')
text = bookmark.text.strip().replace('\n', '').replace('\t', '').replace('\r', '').replace(' ', '')
bookmark_info.append({
'URL': href,
'Text': text
})
with open(CONFIG['EXTRACTED_FILE'], 'w', encoding='utf-8') as f:
for info in bookmark_info:
f.write(f"Name: {info['Text']}, URL: {info['URL']}\n")
- 使用BeautifulSoup解析HTML,查找所有 `<a>` 标签作为书签。
- 将每个书签的名称和URL存入 `bookmark_info` 列表中。
- 将提取的书签信息保存到 `EXTRACTED_FILE` 中。
def extract_bookmark_info(line):
name_match = re.search(r'Name: ([^,]+)', line)
url_match = re.search(r'URL: (https?://[^\s]+)', line)
if not name_match or not url_match:
return None, None
return name_match.group(1), url_match.group(1)
def classify_bookmark(bookmark_name, bookmark_url, categories):
    """Ask the model to pick the best existing category for one bookmark.

    Args:
        bookmark_name: Title of the bookmark.
        bookmark_url: URL of the bookmark.
        categories: Iterable of category names the model may choose from.

    Returns:
        The model's reply with surrounding whitespace removed; ``"其他"`` when
        the request raised an exception; ``None`` when the reply is empty or
        contains ``无法分类``, which tells the caller to create a new category.
    """
    system_prompt = (
        "你是一个智能助手,任务是根据提供的书签名称和URL从给定的分类列表中选择最合适的分类返回给用户。仅返回分类名称。\n"
        "请根据书签的内容和性质选择最适合的分类。如果没有合适的分类,请回答'无法分类'。\n"
        "给定分类: {categories_str}\n"
        "请选择最合适的分类。\n"
        "示例:"
        "user: 书签名称:Python官方文档\n"
        "书签URL:https://docs.python.org/3/\n"
        "assistant: 编程语言\n"
    ).format(categories_str=", ".join(categories))
    user_prompt = (
        "书签名称: {bookmark_name}\n"
        "书签URL: {bookmark_url}\n"
    ).format(bookmark_name=bookmark_name, bookmark_url=bookmark_url)
    try:
        response = get_llm_response(system_prompt, user_prompt)
    except Exception as e:
        # Best effort: one failed request must not abort the whole run, but
        # the failure is reported instead of being swallowed silently.
        print(f"分类请求失败,归入'其他': {e}")
        return "其他"
    # The reply is free text: tolerate surrounding whitespace, quotes or
    # punctuation around the "cannot classify" marker.
    response = (response or "").strip()
    if not response or "无法分类" in response:
        return None
    return response
def create_new_category(name, url):
    """Ask the model to propose a new category for one bookmark.

    Args:
        name: Title of the bookmark.
        url: URL of the bookmark.

    Returns:
        The proposed category name with surrounding whitespace removed, or
        ``"其他"`` when the request raised an exception or the reply was empty
        (the same fallback category that ``classify_bookmark`` uses).
    """
    system_prompt = (
        "你是一个智能助手,任务是根据提供的书签名称和URL给定一个合适的书签分类。仅返回分类名称。\n"
        "请根据书签的内容和性质给定分类。\n"
        "示例:"
        "user: 书签名称:Python官方文档\n"
        "书签URL:https://docs.python.org/3/\n"
        "assistant: 编程语言\n"
    )
    user_prompt = (
        "书签名称: {name}\n"
        "书签URL: {url}\n"
    ).format(name=name, url=url)
    try:
        response = get_llm_response(system_prompt, user_prompt)
    except Exception as e:
        # Keep going on a failed request so earlier results are not lost.
        print(f"创建分类请求失败,归入'其他': {e}")
        return "其他"
    # An empty name would produce an empty heading in the result file.
    return (response or "").strip() or "其他"
- `extract_bookmark_info`: 从临时文件的每一行中提取书签名称和URL。
- `classify_bookmark`: 根据书签名称和URL,使用AI模型从预定义分类中选择最合适的分类。如果无法分类,则返回 `None`。
- `create_new_category`: 如果书签无法归入现有分类,则创建新分类。

categories = CONFIG['DEFAULT_CATEGORIES']
with open(CONFIG['EXTRACTED_FILE'], 'r', encoding='utf-8') as f:
bookmarks = f.readlines()
classified = defaultdict(list)
for line in bookmarks:
if not line.strip():
continue
name, url = extract_bookmark_info(line)
if not name or not url:
continue
category = classify_bookmark(name, url, categories)
if not category:
category = create_new_category(name, url)
categories.append(category)
print(f'分类书签: {name} -> {category}')
classified[category].append(line)
with open(CONFIG['CLASSIFIED_FILE'], 'w', encoding='utf-8') as f:
f.write('# 书签\n\n')
for category, items in classified.items():
f.write(f'## {category}\n')
for item in items:
name, url = extract_bookmark_info(item)
if name and url:
f.write(f'- [{name}]({url})\n')
f.write('\n')
将分类结果保存到 `CLASSIFIED_FILE` 中。
def md_to_netscape(md_content):
html = '''
Bookmarks
Bookmarks
'''
lines = md_content.split('\n')
stack = []
current_level = 0
for line in lines:
if line.startswith('#'):
level = line.count('#')
title = line.lstrip('#').strip()
while stack and stack[-1] >= level:
html += '\n'
stack.pop()
html += f' {int(datetime.now().timestamp())}
" LAST_MODIFIED="{int(datetime.now().timestamp())}">{title}\n'
html += ' \n'
stack.append(level)
current_level = level
elif line.startswith('-'):
match = re.match(r'-\s*\[(.*?)\]\((.*?)\)', line)
if match:
name, url = match.groups()
html += f' {url} " ADD_DATE="{int(datetime.now().timestamp())}">{name}\n'
while stack:
html += '\n'
stack.pop()
return html
# Load the classified Markdown, convert it, and write the final bookmark file.
with open(CONFIG['CLASSIFIED_FILE'], 'r', encoding='utf-8') as md_file:
    md_content = md_file.read()

html_content = md_to_netscape(md_content)

with open(CONFIG['OUTPUT_FILE'], 'w', encoding='utf-8') as html_file:
    html_file.write(html_content)
- 定义 `md_to_netscape` 函数,将Markdown格式的分类结果转换为Netscape Bookmark HTML格式。
- 读取 `CLASSIFIED_FILE`,将其转换为HTML格式并保存到 `OUTPUT_FILE`。
def cleanup_temp_files():
temp_files = [
CONFIG['EXTRACTED_FILE'],
CONFIG['CLASSIFIED_FILE']
]
for file in temp_files:
if os.path.exists(file):
os.remove(file)
print(f"已删除临时文件: {file}")
# Script entry point: only runs when the file is executed directly.
if __name__ == '__main__':
    try:
        # (main logic elided in this excerpt)
        # Remove the intermediate files once the run has finished.
        cleanup_temp_files()
        print("脚本执行完成,临时文件已清理")
    except Exception as error:
        # Report the failure instead of letting the traceback propagate.
        print(f"脚本执行出错: {error}")
定义 `cleanup_temp_files` 函数,清理脚本运行过程中产生的临时文件。

该脚本通过以下步骤实现了对浏览器书签文件的重新分类:
1. 从浏览器导出的HTML文件中提取书签信息;
2. 使用AI模型对书签进行分类,必要时创建新分类;
3. 将分类结果转换为HTML书签文件;
4. 清理临时文件。

这个过程不仅简化了书签管理,还利用AI技术提高了分类的准确性和效率。
# 浏览器书签文件重新分类.py
from zhipuai import ZhipuAI
from bs4 import BeautifulSoup
import re
import os
from collections import defaultdict
from datetime import datetime
# Script configuration.
CONFIG = {
    # ZhipuAI API key (required). Read from the ZHIPUAI_API_KEY environment
    # variable so that a real key never has to be written into the source;
    # the placeholder is only used when the variable is not set.
    'API_KEY': os.environ.get('ZHIPUAI_API_KEY', 'my-secret-key'),
    'INPUT_FILE': 'bookmarks_2025_1_3.html',       # input bookmark export (required)
    'EXTRACTED_FILE': 'extracted_bookmarks.txt',   # temporary file with extracted bookmarks
    'CLASSIFIED_FILE': 'classified_bookmarks.md',  # classification result (Markdown)
    'OUTPUT_FILE': 'bookmarks.html',               # final output file
    'DEFAULT_CATEGORIES': [                        # default categories
        '编程语言',  # used as the example answer in the AI prompts; do not remove
        '人工智能',
        '数据科学',
    ],
}
# Create the shared ZhipuAI client once, using the key stored in CONFIG.
zhipu_client = ZhipuAI(api_key=CONFIG['API_KEY'])


def get_llm_response(sysPrompt, questionPrompt):
    """Run one three-turn conversation against the model and return its reply.

    The instructions in ``sysPrompt`` are sent as the first *user* turn,
    followed by a fixed assistant acknowledgement, and ``questionPrompt`` is
    sent as the second user turn.

    Args:
        sysPrompt: Instruction text describing the task.
        questionPrompt: The concrete question (bookmark name and URL).

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    conversation = [
        {"role": "user", "content": sysPrompt},
        {"role": "assistant", "content": "你好,请告诉我你需要分类的书签?"},
        {"role": "user", "content": questionPrompt},
    ]
    completion = zhipu_client.chat.completions.create(
        model="GLM-4-Plus",  # code of the model to call
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
# Read the exported bookmark HTML file.
with open(CONFIG['INPUT_FILE'], 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML and collect every anchor tag; each <a> is treated as one bookmark.
soup = BeautifulSoup(html_content, 'html.parser')
bookmarks = soup.find_all('a')

# Characters removed from titles so that each bookmark fits on one line of
# the temporary file.
_TITLE_NOISE = str.maketrans('', '', '\n\t\r ')

# Collect the URL and cleaned-up title of every bookmark.
bookmark_info = []
for bookmark in bookmarks:
    href = bookmark.get('href')
    if not href:
        # An anchor without a target is not a usable bookmark; previously it
        # was written out as "URL: None".
        continue
    text = bookmark.text.strip().translate(_TITLE_NOISE)
    bookmark_info.append({
        'URL': href,
        # Fall back to the URL so that untitled bookmarks are not dropped
        # when the line is parsed again later.
        'Text': text or href,
    })

# Print the extracted bookmark information.
for info in bookmark_info:
    print(info)

# Save the extracted bookmarks, one "Name: ..., URL: ..." line per bookmark.
with open(CONFIG['EXTRACTED_FILE'], 'w', encoding='utf-8') as f:
    for info in bookmark_info:
        f.write(f"Name: {info['Text']}, URL: {info['URL']}\n")
# One line of the temporary file: "Name: <title>, URL: <http(s) url>".
# The title is matched lazily up to the ", URL: " separator so that titles
# containing commas are kept whole instead of being cut at the first comma.
_BOOKMARK_LINE_RE = re.compile(r'Name: (.+?), URL: (https?://\S+)')


def extract_bookmark_info(line):
    """Extract the bookmark name and URL from one line of the temporary file.

    Args:
        line: A line in the form ``Name: <title>, URL: <url>``.

    Returns:
        A ``(name, url)`` tuple, or ``(None, None)`` when the line has no
        non-empty name followed by an ``http://`` or ``https://`` URL.
    """
    match = _BOOKMARK_LINE_RE.search(line)
    if match is None:
        return None, None
    return match.group(1), match.group(2)
def classify_bookmark(bookmark_name, bookmark_url, categories):
    """Ask the model to pick the best existing category for one bookmark.

    Args:
        bookmark_name: Title of the bookmark.
        bookmark_url: URL of the bookmark.
        categories: Iterable of category names the model may choose from.

    Returns:
        The model's reply with surrounding whitespace removed; ``"其他"`` when
        the request raised an exception; ``None`` when the reply is empty or
        contains ``无法分类``, which tells the caller to create a new category.
    """
    system_prompt = (
        "你是一个智能助手,任务是根据提供的书签名称和URL从给定的分类列表中选择最合适的分类返回给用户。仅返回分类名称。\n"
        "请根据书签的内容和性质选择最适合的分类。如果没有合适的分类,请回答'无法分类'。\n"
        "给定分类: {categories_str}\n"
        "请选择最合适的分类。\n"
        "示例:"
        "user: 书签名称:Python官方文档\n"
        "书签URL:https://docs.python.org/3/\n"
        "assistant: 编程语言\n"
    ).format(categories_str=", ".join(categories))
    user_prompt = (
        "书签名称: {bookmark_name}\n"
        "书签URL: {bookmark_url}\n"
    ).format(bookmark_name=bookmark_name, bookmark_url=bookmark_url)
    try:
        response = get_llm_response(system_prompt, user_prompt)
    except Exception as e:
        # Best effort: one failed request must not abort the whole run, but
        # the failure is reported instead of being swallowed silently.
        print(f"分类请求失败,归入'其他': {e}")
        return "其他"
    # The reply is free text: tolerate surrounding whitespace, quotes or
    # punctuation around the "cannot classify" marker.
    response = (response or "").strip()
    if not response or "无法分类" in response:
        return None
    return response
def create_new_category(name, url):
    """Ask the model to propose a new category for one bookmark.

    Args:
        name: Title of the bookmark.
        url: URL of the bookmark.

    Returns:
        The proposed category name with surrounding whitespace removed, or
        ``"其他"`` when the request raised an exception or the reply was empty
        (the same fallback category that ``classify_bookmark`` uses).
    """
    system_prompt = (
        "你是一个智能助手,任务是根据提供的书签名称和URL给定一个合适的书签分类。仅返回分类名称。\n"
        "请根据书签的内容和性质给定分类。\n"
        "示例:"
        "user: 书签名称:Python官方文档\n"
        "书签URL:https://docs.python.org/3/\n"
        "assistant: 编程语言\n"
    )
    user_prompt = (
        "书签名称: {name}\n"
        "书签URL: {url}\n"
    ).format(name=name, url=url)
    try:
        response = get_llm_response(system_prompt, user_prompt)
    except Exception as e:
        # Keep going on a failed request so earlier results are not lost.
        print(f"创建分类请求失败,归入'其他': {e}")
        return "其他"
    # An empty name would produce an empty heading in the result file.
    return (response or "").strip() or "其他"
# Start from the configured categories. Work on a copy so that categories
# created during the run do not mutate CONFIG['DEFAULT_CATEGORIES'].
categories = list(CONFIG['DEFAULT_CATEGORIES'])

with open(CONFIG['EXTRACTED_FILE'], 'r', encoding='utf-8') as f:
    bookmarks = f.readlines()

# Maps category name -> list of raw "Name: ..., URL: ..." lines.
classified = defaultdict(list)
for line in bookmarks:
    if not line.strip():
        continue
    name, url = extract_bookmark_info(line)
    if not name or not url:
        # Lines that cannot be parsed are skipped.
        continue
    # Try the existing categories first.
    category = classify_bookmark(name, url, categories)
    if not category:
        # No existing category fits: ask for a new one. Fall back to "其他"
        # if the answer is empty, and register each category only once.
        category = create_new_category(name, url) or '其他'
        if category not in categories:
            categories.append(category)
    print(f'分类书签: {name} -> {category}')
    classified[category].append(line)

# Save the classification result as Markdown: one "##" heading per category
# followed by one "- [name](url)" item per bookmark.
with open(CONFIG['CLASSIFIED_FILE'], 'w', encoding='utf-8') as f:
    f.write('# 书签\n\n')
    for category, items in classified.items():
        f.write(f'## {category}\n')
        for item in items:
            name, url = extract_bookmark_info(item)
            if name and url:
                f.write(f'- [{name}]({url})\n')
        f.write('\n')
from datetime import datetime
# Convert the classified Markdown back into an HTML bookmark file.
def md_to_netscape(md_content):
    """Convert the classified Markdown into Netscape Bookmark HTML.

    Every Markdown heading becomes a folder (``<DT><H3>`` followed by a nested
    ``<DL><p>`` list); the number of leading ``#`` characters decides the
    nesting depth. Every ``- [name](url)`` item becomes a ``<DT><A>`` link in
    the innermost open folder. All other lines are ignored.

    Args:
        md_content: Markdown text with ``#`` headings and ``- [name](url)``
            list items.

    Returns:
        The complete bookmark file as a string.
    """
    # Local import: the top-of-file import block is outside this function.
    from html import escape

    # One timestamp for the whole document instead of one clock read per line.
    timestamp = int(datetime.now().timestamp())

    # Document header followed by the root list.
    parts = [
        '<!DOCTYPE NETSCAPE-Bookmark-file-1>\n'
        '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">\n'
        '<TITLE>Bookmarks</TITLE>\n'
        '<H1>Bookmarks</H1>\n'
        '<DL><p>\n'
    ]

    # Heading levels of the folders that are currently open, outermost first.
    open_levels = []

    for line in md_content.split('\n'):
        if line.startswith('#'):
            title = line.lstrip('#').strip()
            # Count only the leading '#' so titles such as "C#" keep their level.
            level = len(line) - len(line.lstrip('#'))
            # Close folders at the same or a deeper level before opening this one.
            while open_levels and open_levels[-1] >= level:
                open_levels.pop()
                pad = '    ' * (len(open_levels) + 1)
                parts.append(f'{pad}</DL><p>\n')
            pad = '    ' * (len(open_levels) + 1)
            parts.append(
                f'{pad}<DT><H3 ADD_DATE="{timestamp}" '
                f'LAST_MODIFIED="{timestamp}">{escape(title)}</H3>\n'
            )
            parts.append(f'{pad}<DL><p>\n')
            open_levels.append(level)
        elif line.startswith('-'):
            # The URL group is greedy and anchored at the end of the line so
            # that URLs containing ')' are not cut short.
            match = re.match(r'-\s*\[(.*?)\]\((.*)\)\s*$', line)
            if match:
                name, url = match.groups()
                pad = '    ' * (len(open_levels) + 1)
                # Escape both values so '&', '<' or '"' cannot break the markup.
                parts.append(
                    f'{pad}<DT><A HREF="{escape(url)}" '
                    f'ADD_DATE="{timestamp}">{escape(name)}</A>\n'
                )

    # Close every folder that is still open, then the root list.
    while open_levels:
        open_levels.pop()
        pad = '    ' * (len(open_levels) + 1)
        parts.append(f'{pad}</DL><p>\n')
    parts.append('</DL><p>\n')
    return ''.join(parts)
# Load the classified Markdown, convert it, and write the final bookmark file.
with open(CONFIG['CLASSIFIED_FILE'], 'r', encoding='utf-8') as md_file:
    md_content = md_file.read()

html_content = md_to_netscape(md_content)

with open(CONFIG['OUTPUT_FILE'], 'w', encoding='utf-8') as html_file:
    html_file.write(html_content)
# Temporary-file cleanup.
def cleanup_temp_files():
    """Delete the intermediate files written while the script ran.

    The extracted-bookmarks file and the classified Markdown file named in
    CONFIG are removed if they exist; a message is printed for each file
    that was deleted.
    """
    for path in (CONFIG['EXTRACTED_FILE'], CONFIG['CLASSIFIED_FILE']):
        if not os.path.exists(path):
            continue
        os.remove(path)
        print(f"已删除临时文件: {path}")
# Script entry point: only runs when the file is executed directly.
if __name__ == '__main__':
    try:
        # (original main logic elided here in the source)
        # Remove the intermediate files once the run has finished.
        cleanup_temp_files()
        print("脚本执行完成,临时文件已清理")
    except Exception as error:
        # Report the failure instead of letting the traceback propagate.
        print(f"脚本执行出错: {error}")