flyfish
一个基于 Wan2.1 文本到视频模型的自动化视频生成系统。
script.py
├── 读取 → config.json
│ ├── 模型配置 → 加载AI模型
│ ├── 生成参数 → 控制生成质量
│ └── 文件路径 → 定位其他文件
│
├── 读取 → prompt.json → 正向提示词
│
├── 读取 → negative_prompt_*.txt → 负向提示词
│
└── 输出 → 视频文件(如video_1.mp4)
config.json
(配置文件){
"model": {
"id": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
"torch_dtype": "bfloat16"
},
"generation": {
"height": 480,
"width": 832,
"num_inference_steps": 30
}
}
prompt.json
(正向提示词库)[
{ "prompt": "一只猫在草地上行走,写实风格" },
{ "prompt": "一个人在海边跑步,日落场景" }
]
negative_prompt_cn.txt
(中文负向提示词)明亮色调,过曝,静态,细节模糊,低质量
negative_prompt_en.txt
(英文负向提示词)Bright tones, overexposed, static, blurred details, low quality
import torch
import json
import os
import time
from diffusers import AutoencoderKLWan, WanPipeline
from diffusers.utils import export_to_video
# ----------------------
# Load the configuration file
# ----------------------
# config.json is opened with a relative path, i.e. resolved against the
# current working directory. Every failure path prints a user-facing message
# and terminates with exit status 1.
# SystemExit is raised directly because the exit() helper is injected by the
# site module for interactive use and is not guaranteed to exist in programs.
try:
    with open("config.json", "r", encoding="utf-8") as f:
        config = json.load(f)
except FileNotFoundError:
    print("错误: 未找到config.json文件,请确保该文件与脚本在同一目录下")
    raise SystemExit(1)
except json.JSONDecodeError:
    print("错误: config.json文件格式不正确,请检查JSON语法")
    raise SystemExit(1)
except Exception as e:
    print(f"错误: 读取配置文件时发生异常: {e}")
    raise SystemExit(1)

# Everything downstream calls config.get(...), so a valid JSON document whose
# top level is not an object (e.g. a list) must be rejected here.
if not isinstance(config, dict):
    print("错误: config.json的顶层结构必须是JSON对象")
    raise SystemExit(1)
print("已加载配置文件")
# ----------------------
# Unpack configuration values
# ----------------------
# Each setting falls back to the default given here when config.json omits it.

# Positive-prompt source
prompt_config = config.get("prompts", {})
prompt_key = prompt_config.get("key", "prompt")  # field holding the prompt text in each JSON entry
prompt_file = prompt_config.get("file", "prompt.json")

# Negative-prompt source: one text file per language code
negative_config = config.get("negative_prompt", {})
negative_file_map = {
    lang: negative_config.get(f"{lang}_file", f"negative_prompt_{lang}.txt")
    for lang in ("cn", "en")
}
default_negative_lang = negative_config.get("default_lang", "cn")  # "cn" or "en"

# Model settings
model_config = config.get("model", {})
device = model_config.get("device", "cuda")  # "cuda" or "cpu"
torch_dtype = model_config.get("torch_dtype", "bfloat16")
vae_subfolder = model_config.get("vae_subfolder", "vae")
model_id = model_config.get("id", "Wan-AI/Wan2.1-T2V-1.3B-Diffusers")

# Generation settings
generation_config = config.get("generation", {})
num_inference_steps = generation_config.get("num_inference_steps", 50)  # sampling steps
output_prefix = generation_config.get("output_prefix", "output_")
fps = generation_config.get("fps", 15)
guidance_scale = generation_config.get("guidance_scale", 5.0)
num_frames = generation_config.get("num_frames", 81)
width = generation_config.get("width", 832)
height = generation_config.get("height", 480)
# ----------------------
# Load the negative prompt
# ----------------------
def read_negative_prompt(lang):
    """Return the stripped contents of the negative-prompt file for *lang*.

    The file name is looked up in the module-level ``negative_file_map``.
    This is a script-level helper: on failure it prints an error message and
    terminates the process with exit status 1 rather than propagating the
    underlying exception.

    Args:
        lang: Key of ``negative_file_map`` selecting which prompt file to read.

    Returns:
        The file's text with leading and trailing whitespace removed.

    Raises:
        SystemExit: If *lang* is not a configured language, the file does not
            exist, or reading it fails for any other reason.
    """
    # Report an unknown language explicitly instead of leaking a bare KeyError.
    if lang not in negative_file_map:
        supported = ", ".join(negative_file_map)
        print(f"错误: 不支持的负向提示词语言 {lang!r},可选: {supported}")
        raise SystemExit(1)

    filename = negative_file_map[lang]
    try:
        with open(filename, "r", encoding="utf-8") as f:
            return f.read().strip()
    except FileNotFoundError:
        print(f"错误: 未找到负向提示词文件 {filename}")
        raise SystemExit(1)
    except Exception as e:
        print(f"错误: 读取负向提示词文件时发生异常: {e}")
        raise SystemExit(1)
# Load the negative prompt for the configured default language.
# Only ordinary exceptions are handled here: a bare ``except:`` would also
# intercept the SystemExit raised by read_negative_prompt (printing a second,
# generic error) as well as KeyboardInterrupt.
try:
    negative_prompt = read_negative_prompt(default_negative_lang)
    print(f"已加载{default_negative_lang.upper()}负向提示词")
except Exception as e:
    # Include the cause so an unexpected failure is not silently flattened
    # into a generic message.
    print(f"错误: 负向提示词加载失败: {e!r}")
    raise SystemExit(1)
# ----------------------
# Load the positive prompts
# ----------------------
# prompt_file must contain a JSON array; each element is consumed later by the
# generation loop. Failures print a message and exit with status 1.
try:
    with open(prompt_file, "r", encoding="utf-8") as f:
        prompts_data = json.load(f)
except FileNotFoundError:
    print(f"错误: 未找到正向提示词文件 {prompt_file}")
    raise SystemExit(1)
except json.JSONDecodeError:
    print(f"错误: {prompt_file}文件格式不正确,请确保是有效的JSON数组")
    raise SystemExit(1)

# Syntactically valid JSON is not enough: an object, number or string at the
# top level would either crash on len() or be iterated element-by-element
# with every entry failing. Enforce the array shape the message promises.
if not isinstance(prompts_data, list):
    print(f"错误: {prompt_file}文件格式不正确,请确保是有效的JSON数组")
    raise SystemExit(1)
print(f"已加载 {len(prompts_data)} 个正向提示词")
# ----------------------
# Model initialisation
# ----------------------
# start_time brackets dtype resolution, weight loading and the device
# transfer; the elapsed time is reported as the model load time.
start_time = time.time()

# Resolve the configured dtype name (e.g. "bfloat16") to a torch.dtype.
# getattr alone is not a sufficient check: names such as "nn" or "cuda" exist
# on the torch module but are not dtypes, and a non-string name would raise
# TypeError instead of AttributeError.
dtype = getattr(torch, torch_dtype, None) if isinstance(torch_dtype, str) else None
if not isinstance(dtype, torch.dtype):
    print(f"错误: 不支持的torch dtype: {torch_dtype}")
    raise SystemExit(1)

# Load the VAE from its subfolder of the model repository/directory.
# NOTE(review): the upstream diffusers Wan examples appear to load the VAE in
# float32 while running the rest of the pipeline in bfloat16; here the VAE
# shares the configured dtype — confirm this is intentional.
vae = AutoencoderKLWan.from_pretrained(
    model_id,
    subfolder=vae_subfolder,
    torch_dtype=dtype
)

# Build the text-to-video pipeline around the pre-loaded VAE and move it to
# the configured device.
pipe = WanPipeline.from_pretrained(
    model_id,
    vae=vae,
    torch_dtype=dtype
)
pipe.to(device)

model_load_time = time.time() - start_time
print(f"模型加载完成,耗时: {model_load_time:.2f} 秒")
# ----------------------
# Batch video generation
# ----------------------
# One video is generated per entry of prompts_data. Output files are numbered
# by the entry's 1-based position, so skipped or failed entries leave gaps in
# the numbering. A failure on one entry is reported and does not stop the run.
total_generation_time = 0
success_count = 0
for i, item in enumerate(prompts_data, 1):
    try:
        # Entries must be JSON objects; skip anything else with a clear
        # warning instead of failing on the .get() call below.
        if not isinstance(item, dict):
            print(f"警告: 第 {i} 个条目不是JSON对象,跳过")
            continue

        prompt = item.get(prompt_key, "")
        if not prompt:
            print(f"警告: 第 {i} 个提示词字段为空,跳过")
            continue

        print(f"\n---- 生成第 {i} 个视频 ----")
        print(f"正向提示词: {prompt[:50]}...")
        print(f"负向提示词: {negative_prompt[:50]}...")
        print(f"采样步数: {num_inference_steps}")

        # Time only the pipeline call, not the export to disk.
        gen_start = time.time()
        output = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps
        ).frames[0]
        gen_time = time.time() - gen_start

        # Write the video file.
        output_path = f"{output_prefix}{i}.mp4"
        export_to_video(output, output_path, fps=fps)

        # Update the statistics only once the file has been written, so a
        # failed export is not reported as a successful generation.
        total_generation_time += gen_time
        success_count += 1

        print(f"✅ 视频保存至: {output_path}")
        print(f"⏱️ 生成耗时: {gen_time:.2f} 秒")
    except Exception as e:
        # Best-effort batch: report a truncated error and move on.
        print(f"❌ 生成失败: {str(e)[:100]}...")
# ----------------------
# Final report
# ----------------------
# Collect every report line first and emit them with a single print; the
# resulting output is the same as printing each line separately.
report_lines = [
    "\n==================== 生成完成 ====================",
    f"模型信息: {model_id}",
    f"设备: {device}",
    f"生成参数: {height}x{width}, {num_frames}帧, 引导尺度{guidance_scale}, 采样步数{num_inference_steps}",
]

if success_count <= 0:
    # Nothing was generated, so there are no timings to average.
    report_lines.append("\n⚠️ 没有成功生成任何视频")
else:
    avg_time = total_generation_time / success_count
    report_lines += [
        "\n 统计结果:",
        f" - 模型加载时间: {model_load_time:.2f} 秒",
        f" - 成功生成: {success_count}/{len(prompts_data)}",
        f" - 总生成时间: {total_generation_time:.2f} 秒",
        f" - 平均耗时: {avg_time:.2f} 秒/视频",
    ]

print("\n".join(report_lines))
{
"model": {
"id": "/media/models/Wan-AI/Wan2___1-T2V-14B-Diffusers/",
"vae_subfolder": "vae",
"torch_dtype": "bfloat16",
"device": "cuda"
},
"generation": {
"height": 640,
"width": 480,
"num_frames": 81,
"guidance_scale": 5.0,
"fps": 15,
"output_prefix": "video_",
"num_inference_steps": 150
},
"negative_prompt": {
"default_lang": "cn",
"cn_file": "negative_prompt_cn.txt",
"en_file": "negative_prompt_en.txt"
},
"prompts": {
"file": "prompt.json",
"key": "prompt"
}
}
明亮色调,过曝,静态,细节模糊,字幕,风格,作品,绘画,图像,静态,整体灰暗,最差质量,低质量,JPEG压缩残留,丑陋,不完整,多余手指,绘制不佳的手,绘制不佳的脸,变形,毁容,畸形肢体,融合手指,静态图片,杂乱背景,三条腿,背景中有很多人,倒退行走
Vivid tones, overexposed, static, blurry details, subtitles, style, work, painting, image, still, overall grayish, worst quality, low quality, JPEG compression artifacts, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, morphologically deformed limbs, fused fingers, still image, cluttered background, three legs, many people in the background, walking backwards