aHR0cDovL2RyYWdvbmZvcnh4YnAzYXdjN216czVka3N3cnVhM3pucXl4NXJvZWZtaTRzbWpyc2RpMjJ4d3FkLm9uaW9uL3d3dy5jaXR5b2Zncm92ZW9rLmdvdg==
Base64-decode the string above to obtain the target address.
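For reference, the decoding step is a single call in Python (shown here with a placeholder string; substitute the Base64 value from the line above):

import base64

# Placeholder value for illustration; the real string is the one quoted above
encoded = "aHR0cDovL2V4YW1wbGUub25pb24v"
decoded = base64.b64decode(encoded).decode("utf-8")
print(decoded)  # -> http://example.onion/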
1. The site has a token validation and refresh mechanism: the token is refreshed every hour, after which the previous links become invalid.
2. The token is injected dynamically via JavaScript rather than being present in the initially loaded HTML, so it has to be obtained by manually refreshing the root directory page; plain static crawling does not execute JS and therefore cannot obtain the token.
3. Fully automated crawling would require Selenium + a Tor Browser driver to emulate complete browser behavior. (I did not use this approach.)
4. Only the token changes; the path structure before it stays the same, so after updating the token it can simply be appended to the link's query parameters, as in the URL below.
http://fsguestuctexqqaoxuahuydfa6ovxuhtng66pgyr5gqcrsi7qgchpkad.onion/?path=Incode&token=
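Since only the token value changes, a small helper can rewrite the token parameter of an otherwise fixed URL. This is a minimal sketch with a placeholder host and token (not the full scripts below):

from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode

# Placeholder base URL for illustration; the path structure stays unchanged
BASE_URL = "http://exampleexampleexample.onion/?path=Incode&token=OLD"

def with_token(url: str, token: str) -> str:
    """Return the same URL with its token query parameter replaced."""
    parts = urlsplit(url)
    query = parse_qs(parts.query)
    query["token"] = [token]
    return urlunsplit(parts._replace(query=urlencode(query, doseq=True)))

print(with_token(BASE_URL, "NEW_TOKEN_FROM_REFRESHED_PAGE"))
# -> http://exampleexampleexample.onion/?path=Incode&token=NEW_TOKEN_FROM_REFRESHED_PAGE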
First crawl the download directories, then fetch the download links (version with a graphical interface):
# Import required modules
import os
import time
import sys
import json
import threading
import tkinter as tk
from tkinter import messagebox
from urllib.parse import urljoin, urlparse, unquote, parse_qs
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
# ------------------------ Global configuration ------------------------
# Root directory for downloads
download_root = r"F:\workfile\20250612"
# File extensions that will be recognized and downloaded
file_extensions = [
    '.csv', '.pdf', '.zip', '.7z', '.doc', '.docx', '.xls', '.xlsx',
    '.rar', '.db', '.bak', '.tar', '.gz', '.json', '.txt', '.adb', '.jpg', '.jpeg', '.png'
]
# Tor proxy settings (Tor Browser exposes a SOCKS proxy on port 9150 by default)
proxies = {
    'http': 'socks5h://127.0.0.1:9150',
    'https': 'socks5h://127.0.0.1:9150'
}
# Skip-list file for directories / record file for downloaded files
visited_file = 'visited_dirs.txt'
download_record_file = 'downloaded_files.json'
downloaded_files = set()  # in-memory set of already-downloaded files
# Token and control-state variables (manipulated from the GUI)
current_token = ''
paused = False  # whether downloading is paused
crawl_finished = False  # whether the crawl has finished
task_lock = threading.Lock()  # lock for cross-thread synchronization
# ------------------------ Initialization helpers ------------------------
# Read the directory keywords that should be skipped
def load_skip_keywords():
    if not os.path.exists(visited_file):
        return set()
    with open(visited_file, encoding="utf-8") as f:
        return set(line.strip() for line in f if line.strip())
# Read the record of already-downloaded files
def load_downloaded_files():
    if os.path.exists(download_record_file):
        with open(download_record_file, encoding="utf-8") as f:
            return set(json.load(f))
    return set()
# Append a download record and persist it to the JSON file
# (guarded by task_lock because worker threads call this concurrently)
def save_downloaded_file(path):
    with task_lock:
        downloaded_files.add(path)
        with open(download_record_file, 'w', encoding='utf-8') as f:
            json.dump(list(downloaded_files), f, ensure_ascii=False, indent=2)
# Load the configuration
skip_keywords = load_skip_keywords()
downloaded_files = load_downloaded_files()
# ------------------------ Utility functions ------------------------
# Strip illegal characters from file names
def safe_filename(filename):
    return "".join(c for c in filename if c.isalnum() or c in (' ', '.', '_', '-')).rstrip()
# Check whether the href is a direct file link
def is_direct_file_url(href):
    href = href.lower()
    return any(href.endswith(ext) for ext in file_extensions)
# Check whether the href is a "/download?" style download link
def is_download_link(href):
    return href.startswith("/download?")
# Check whether the href is a directory link that can be recursed into (contains path and token parameters)
def is_directory_link(href):
    return "path=" in href and "token=" in href
# Resolve the local target directory from the URL
def get_local_dir_from_url(url):
    parsed = urlparse(url)
    query_params = parse_qs(parsed.query)
    if 'path' in query_params:
        dir_path = query_params['path'][0].replace('/', os.sep)
    else:
        dir_path = parsed.path.strip('/').replace('/', os.sep)
    return os.path.join(download_root, dir_path)
# Decide whether this directory should be skipped
def should_skip_dir(url):
    decoded_url = unquote(url)
    for keyword in skip_keywords:
        if keyword.lower() in decoded_url.lower():
            return True
    return False
# ------------------------ Core download logic ------------------------
# Download and save a single file
def save_file(file_url, local_dir, filename, indent):
    filename = unquote(filename)
    filename = safe_filename(filename)
    local_path = os.path.join(local_dir, filename)
    file_key = os.path.relpath(local_path, download_root)
    # Skip if already recorded
    if file_key in downloaded_files:
        print(f"{indent}✅ Already recorded as downloaded: {file_key}")
        return
    # If the file already exists and is non-empty, record it as downloaded as well
    if os.path.exists(local_path) and os.path.getsize(local_path) > 0:
        print(f"{indent}⏩ File already exists, skipping: {filename}")
        save_downloaded_file(file_key)
        return
    os.makedirs(local_dir, exist_ok=True)
    try:
        with requests.get(file_url, proxies=proxies, timeout=20, stream=True) as r:
            if r.status_code == 403:
                print(f"{indent}❌ Token expired (403 returned), terminating the program.")
                os._exit(1)
            elif r.status_code == 200:
                total = int(r.headers.get('content-length', 0))
                if total == 0:
                    print(f"{indent}⚠️ File size is 0, skipping: {filename}")
                    return
                # Show download progress with tqdm
                with open(local_path, 'wb') as f, tqdm(
                    desc=f"{indent}⬇️ Downloading: {filename}",
                    total=total,
                    unit='B',
                    unit_scale=True,
                    unit_divisor=1024,
                    leave=False
                ) as bar:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            bar.update(len(chunk))
                save_downloaded_file(file_key)
            else:
                print(f"{indent}⚠️ Download failed: status code {r.status_code} -> {file_url}")
    except Exception as e:
        print(f"{indent}❌ Download error: {e}")
# Recursively crawl the links and files on a page
def crawl(url, depth=0, executor=None, futures=None):
    global paused
    indent = ' ' * depth
    while paused:
        print(f"{indent}⏸️ Paused...")
        time.sleep(1)
    if should_skip_dir(url):
        print(f"{indent}⏭️ Directory skipped: {url}")
        return
    print(f"{indent} Visiting directory: {url}")
    try:
        r = requests.get(url, proxies=proxies, timeout=15)
    except Exception as e:
        print(f"{indent}❌ Request failed: {url} -> {e}")
        return
    if r.status_code == 403:
        print(f"{indent} Token expired (403), terminating the script")
        os._exit(1)
    if r.status_code != 200:
        print(f"{indent}⚠️ Unexpected status: {r.status_code} -> {url}")
        return
    # Parse the HTML
    soup = BeautifulSoup(r.text, 'html.parser')
    links = soup.find_all('a')
    # If the current page contains no files, do not recurse further
    has_files = any(
        is_direct_file_url(a.get('href', '')) or is_download_link(a.get('href', ''))
        for a in links
    )
    if not has_files:
        print(f"{indent} No files, skipping recursion: {url}")
        return
    local_dir = get_local_dir_from_url(url)
    os.makedirs(local_dir, exist_ok=True)
    # Walk through every link
    for a in links:
        href = a.get('href', '')
        if not href:
            continue
        full_url = urljoin(url, href)
        # Submit file downloads to the thread pool
        if is_direct_file_url(href):
            filename = os.path.basename(unquote(urlparse(href).path))
            futures.append(executor.submit(save_file, full_url, local_dir, filename, indent))
        elif is_download_link(href):
            params = parse_qs(urlparse(href).query)
            path_param = params.get('path', ['unknown'])[0]
            filename = os.path.basename(unquote(path_param))
            futures.append(executor.submit(save_file, full_url, local_dir, filename, indent))
        elif is_directory_link(href):
            time.sleep(0.2)  # throttle the request rate
            crawl(full_url, depth + 1, executor, futures)
# ------------------------ Graphical interface (Tkinter) ------------------------
# Start the main crawler logic
def start_crawler():
    global current_token, paused
    paused = False
    start_url = f"http://fsguestuctexqqaoxuahuydfa6ovxuhtng66pgyr5gqcrsi7qgchpkad.onion/?path=Incode&token={current_token.strip()}"
    def run():
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = []
            crawl(start_url, executor=executor, futures=futures)
            for _ in as_completed(futures):
                pass
        messagebox.showinfo("Done", "All files have been downloaded!")
    threading.Thread(target=run, daemon=True).start()
# Pause the task
def pause_crawler():
    global paused
    paused = True
    messagebox.showinfo("Paused", "Downloading is paused; update the token before resuming.")
# Resume the task
def resume_crawler():
    global paused
    paused = False
    messagebox.showinfo("Resuming", "The download task has been resumed.")
# Launch the GUI
def launch_gui():
    global current_token
    root = tk.Tk()
    root.title("Onion file crawler")
    # Token input
    tk.Label(root, text="Token:").grid(row=0, column=0, padx=5, pady=5)
    token_entry = tk.Entry(root, width=80)
    token_entry.grid(row=0, column=1, padx=5, pady=5)
    # "Update token" button
    def update_token():
        global current_token
        current_token = token_entry.get().strip()
        messagebox.showinfo("Token updated", "The token has been updated")
    tk.Button(root, text="Update token", command=update_token).grid(row=0, column=2, padx=5)
    # Control buttons
    tk.Button(root, text="Start crawling", command=start_crawler).grid(row=1, column=0, padx=5, pady=10)
    tk.Button(root, text="Pause download", command=pause_crawler).grid(row=1, column=1, padx=5, pady=10)
    tk.Button(root, text="Resume download", command=resume_crawler).grid(row=1, column=2, padx=5, pady=10)
    root.mainloop()
# ------------------------ Entry point ------------------------
if __name__ == "__main__":
    launch_gui()
Crawl and download at the same time (version without a graphical interface):
import os
import time
import requests
from urllib.parse import urljoin, urlparse, unquote, parse_qs
from bs4 import BeautifulSoup
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import sys
# Tor proxy configuration, used to reach .onion sites
proxies = {
    'http': 'socks5h://127.0.0.1:9150',
    'https': 'socks5h://127.0.0.1:9150'
}
# Root directory where downloaded files are saved
download_root = r"F:\workfile\20250612"
# Common downloadable file extensions
file_extensions = ['.csv', '.pdf', '.zip', '.7z', '.doc', '.docx', '.xls', '.xlsx',
                   '.rar', '.db', '.bak', '.tar', '.gz', '.json', '.txt', '.adb', '.jpg', '.jpeg', '.png']
# Read the directory keywords that should be skipped
def load_skip_keywords():
    path = "visited_dirs.txt"
    if not os.path.exists(path):
        return set()
    with open(path, encoding="utf-8") as f:
        return set(line.strip() for line in f if line.strip())
skip_keywords = load_skip_keywords()
# Check whether the href is a direct file link
def is_direct_file_url(href):
    href = href.lower()
    return any(href.endswith(ext) for ext in file_extensions)
# Check whether the href is a "/download?" style download link
def is_download_link(href):
    return href.startswith("/download?")
# Check whether the href is a directory link (used for recursion)
def is_directory_link(href):
    return "path=" in href and "token=" in href
# Sanitize the file name by removing illegal characters
def safe_filename(filename):
    return "".join(c for c in filename if c.isalnum() or c in (' ', '.', '_', '-')).rstrip()
# File download function
def save_file(file_url, local_dir, filename, indent):
    filename = unquote(filename)
    filename = safe_filename(filename)
    local_path = os.path.join(local_dir, filename)
    # If the file already exists and is non-empty, skip the download
    if os.path.exists(local_path) and os.path.getsize(local_path) > 0:
        print(f"{indent}⏩ File already exists, skipping: {filename}")
        return
    os.makedirs(local_dir, exist_ok=True)
    try:
        with requests.get(file_url, proxies=proxies, timeout=20, stream=True) as r:
            if r.status_code == 403:
                print(f"{indent}❌ Token expired (403 returned), terminating the program.")
                os._exit(1)  # exit the whole process when the token expires (sys.exit would only end this worker thread)
            elif r.status_code == 200:
                total = int(r.headers.get('content-length', 0))
                if total == 0:
                    print(f"{indent}⚠️ File size is 0, skipping: {filename}")
                    return
                # Show download progress with tqdm
                with open(local_path, 'wb') as f, tqdm(
                    desc=f"{indent}⬇️ Downloading: {filename}",
                    total=total,
                    unit='B',
                    unit_scale=True,
                    unit_divisor=1024,
                    leave=False
                ) as bar:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            bar.update(len(chunk))
            else:
                print(f"{indent}⚠️ Download failed: status code {r.status_code} -> {file_url}")
    except Exception as e:
        print(f"{indent}❌ Download error: {e}")
# Build the local save path from the path parameter in the URL
def get_local_dir_from_url(url):
    parsed = urlparse(url)
    query_params = parse_qs(parsed.query)
    if 'path' in query_params:
        dir_path = query_params['path'][0].replace('/', os.sep)
    else:
        dir_path = parsed.path.strip('/').replace('/', os.sep)
    return os.path.join(download_root, dir_path)
# Decide whether this directory should be skipped (based on keywords)
def should_skip_dir(url):
    decoded_url = unquote(url)
    for keyword in skip_keywords:
        if keyword.lower() in decoded_url.lower():
            return True
    return False
# Main crawler function, with recursion and multithreaded downloads
def crawl(url, depth=0, executor=None, futures=None):
    indent = ' ' * depth
    if should_skip_dir(url):
        print(f"{indent}⏭️ Directory skipped (whole level, no recursion): {url}")
        return
    print(f"{indent} Visiting directory: {url}")
    try:
        r = requests.get(url, proxies=proxies, timeout=15)
    except Exception as e:
        print(f"{indent}❌ Request failed: {url} -> {e}")
        return
    if r.status_code == 403:
        print(f"{indent} Token expired (403), terminating the script")
        os._exit(1)
    if r.status_code != 200:
        print(f"{indent}⚠️ Unexpected status: {r.status_code} -> {url}")
        return
    soup = BeautifulSoup(r.text, 'html.parser')
    links = soup.find_all('a')
    # Check whether this page contains any downloadable files
    has_files = any(
        is_direct_file_url(a.get('href', '')) or is_download_link(a.get('href', ''))
        for a in links
    )
    if not has_files:
        print(f"{indent} No files, skipping recursion: {url}")
        return  # do not create a local directory and do not recurse
    local_dir = get_local_dir_from_url(url)
    os.makedirs(local_dir, exist_ok=True)
    for a in links:
        href = a.get('href', '')
        if not href:
            continue
        full_url = urljoin(url, href)
        if is_direct_file_url(href):
            filename = os.path.basename(unquote(urlparse(href).path))
            print(f"{indent} File download: {filename} -> {full_url}")
            futures.append(executor.submit(save_file, full_url, local_dir, filename, indent))
        elif is_download_link(href):
            params = parse_qs(urlparse(href).query)
            path_param = params.get('path', ['unknown'])[0]
            filename = os.path.basename(unquote(path_param))
            print(f"{indent} Download link: {filename} -> {full_url}")
            futures.append(executor.submit(save_file, full_url, local_dir, filename, indent))
        elif is_directory_link(href):
            time.sleep(1)  # avoid hammering the server and getting blocked
            crawl(full_url, depth + 1, executor, futures)
# Program entry point
if __name__ == "__main__":
    start_time = time.time()
    # TODO: replace with the entry URL you want to crawl
    start_url = "http://xxxxxxx.onion/?path=...&token=..."
    max_workers = 8  # number of concurrent download threads
    skip_keywords = load_skip_keywords()  # load the skip list
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        crawl(start_url, executor=executor, futures=futures)
        for _ in as_completed(futures):
            pass
    elapsed = time.time() - start_time
    print(f"\n✅ All downloads finished! Total time: {elapsed:.2f} s")
This is a complete Tor .onion site crawler with resumable downloads, automatic token handling, directory-structure mirroring, and GUI control. It is built for directory sites that require authorized access and use tokens with a limited lifetime.
The design points of the Tor crawler are summarized below, so they can serve as help and reference when writing crawlers for other projects later.
1. The ?path=xxx parameter is mapped to a local path automatically: urlparse + parse_qs extract the directory hierarchy, and os.path.join() keeps the result cross-platform.
2. visited_dirs.txt holds skip keywords and prevents re-traversing directories.
3. downloaded_files.json stores relative paths and prevents duplicate downloads.
4. File links are recognized by extension (.pdf, .xlsx, .bak, .7z, etc.) or by the /download? pattern.
5. concurrent.futures.ThreadPoolExecutor controls the number of download threads (max_workers=10 recommended).
6. tqdm shows per-file download progress.
7. threading.Lock() and a shared task list keep the state consistent across threads.
8. tkinter provides the token input, token update, and control buttons.
9. A short delay between directory requests (time.sleep(0.2)) avoids being rate-limited by the server.
10. All traffic goes through the local Tor SOCKS5 proxy on port 127.0.0.1:9150.
11. Dependencies: requests, bs4, tqdm, tkinter (built in); the proxy is configured as proxies={'http': 'socks5h://...', 'https': 'socks5h://...'} (a minimal connectivity check is sketched after the table below).

Scenario | Notes
Directory site (with token authorization) | Fully supported: deep directory traversal and token control
File-hosting site downloads | Efficient multithreaded downloads with skip logic and record keeping
Resumable downloads | Complete state records; recoverable after an abnormal interruption
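As a minimal sketch of the proxy point above: before starting a crawl it can be worth confirming that requests is really routed through Tor. The snippet below is an illustration only (it assumes Tor Browser's default SOCKS port 9150 and uses the public check.torproject.org API), not part of the crawler itself.

# Minimal Tor connectivity check (sketch; assumes Tor Browser's SOCKS proxy on 127.0.0.1:9150)
import requests

proxies = {
    'http': 'socks5h://127.0.0.1:9150',   # socks5h: DNS resolution also goes through Tor
    'https': 'socks5h://127.0.0.1:9150',
}

def tor_is_up(timeout: int = 30) -> bool:
    """Return True if traffic is routed through Tor, False otherwise."""
    try:
        r = requests.get("https://check.torproject.org/api/ip", proxies=proxies, timeout=timeout)
        return r.status_code == 200 and r.json().get("IsTor", False)
    except requests.RequestException:
        return False

if __name__ == "__main__":
    print("Tor OK" if tor_is_up() else "Tor proxy not reachable; is Tor Browser running?")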
The post "Design and Reflections on a Tor Crawler" published on this blog is intended solely for technical research and educational exchange; using any of its content for illegal purposes is strictly prohibited. The author does not encourage, support, or participate in any form of network intrusion, unauthorized data access, privacy violation, or other conduct that breaches national laws and regulations.
Readers must comply with the laws of their own country or region when using the information and techniques provided here. Any consequences arising from using these techniques for illegal activities are borne solely by the user, and the author assumes no legal responsibility.
If you have any questions about the content of this blog, or find anything that violates the rules, please contact the author promptly.