Option 1: Use Python with pydub / Audacity plus a phoneme-recognition step to extract the phonemes from the speech and export them as JSON for 3ds Max.
Option 2: Export a .pgo file from Papagayo, parse it with a Python script, and convert it to JSON.
Option 2 is described below.
1. First, the Papagayo application is needed; it breaks the audio down into phonemes and saves the result as a .pgo file, illustrated by the fragment below.
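The header of a .pgo export varies between Papagayo versions, so the fragment below is only an illustration (made-up frame numbers and phonemes) of the part the parser relies on: plain-text lines with a frame number followed by a phoneme name.

        37 E
        45 AI
        52 L
        60 rest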
import json
import re

def parse_ng_pgo_file(path, fps=24):
    """Collect the '<frame> <phoneme>' lines of a Papagayo .pgo export."""
    phonemes = []
    with open(path, 'r') as file:
        lines = file.readlines()
    for line in lines:
        line = line.strip()
        # Phoneme lines look like "37 E"; allow lowercase names such as "rest" / "etc"
        match = re.match(r'^(\d+)\s+([A-Za-z]+)$', line)
        if match:
            frame = int(match.group(1))
            phoneme = match.group(2)
            time = frame / fps  # fps must match the frame rate set in the Papagayo project
            phonemes.append({"time": round(time, 3), "phoneme": phoneme})
    return phonemes

# Usage
pgo_path = "Papagayo.pgo"
phoneme_data = parse_ng_pgo_file(pgo_path)

# Save as JSON (the 3ds Max script below expects this file name)
with open("phonemes.json", "w") as out:
    json.dump(phoneme_data, out, indent=2)
Save the script above as pgo2json.py; running it converts the .pgo file into phonemes.json, a list of time-in-seconds / phoneme pairs:
[
  {
    "time": 0.333,
    "phoneme": "E"
  },
  {
    "time": 0.542,
    "phoneme": "AI"
  }
]
2. On the 3ds Max side, Newtonsoft.Json.dll is needed as the JSON parser; the assembly must be loaded before its classes can be used from MaxScript.
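A minimal sketch of loading the DLL and confirming it works; the path here is hypothetical (the full script below derives it from the script's own folder instead):

dllPath = @"C:\scripts\Newtonsoft.Json.dll"  -- hypothetical location of the DLL
dotNet.loadAssembly dllPath
-- Parse a tiny JSON array to confirm the Newtonsoft classes are reachable
testArr = (dotNetClass "Newtonsoft.Json.Linq.JArray").Parse "[{\"time\":0.333,\"phoneme\":\"E\"}]"
format "entries: %\n" testArr.Count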
3. Define the bone pose library: the position offset each phoneme should apply to the jaw and lip controllers. In this article the table is written directly in MaxScript (the same array reappears inside the full script), and a sketch of an equivalent phonemesPose.json follows it for reference.
-- 3. Phoneme poses (each phoneme maps to a position offset per controller)
#(
    #("E", #(
        #( "jaw",      [0.886, -10, 15.13] ),
        #( "lipUpper", [-14.243, 3.0, -0.604] ),
        #( "lipLower", [16.014, -2.0, 0.001] )
    )),
    #("AI", #(
        #( "jaw",      [0, 0, 0] ),
        #( "lipUpper", [0, 0, 0] ),
        #( "lipLower", [0, 0, 0] )
    )),
    #("L", #(
        #( "jaw",      [0, -6, 0] ),
        #( "lipUpper", [0, 1, 0] ),
        #( "lipLower", [0, -1, 0] )
    )),
    #("FV", #(
        #( "jaw",      [0, -3, 0] ),
        #( "lipUpper", [0, 2, 0] ),
        #( "lipLower", [0, -2, 0] )
    ))
)
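If you would rather keep this table in an actual phonemesPose.json file instead of hard-coding it, one possible layout (same values) is sketched below; the script in this article does not load such a file, so treat it purely as a reference format:

{
  "E":  { "jaw": [0.886, -10, 15.13], "lipUpper": [-14.243, 3.0, -0.604], "lipLower": [16.014, -2.0, 0.001] },
  "AI": { "jaw": [0, 0, 0],           "lipUpper": [0, 0, 0],              "lipLower": [0, 0, 0] },
  "L":  { "jaw": [0, -6, 0],          "lipUpper": [0, 1, 0],              "lipLower": [0, -1, 0] },
  "FV": { "jaw": [0, -3, 0],          "lipUpper": [0, 2, 0],              "lipLower": [0, -2, 0] }
}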
Then the MaxScript below reads phonemes.json and, using the pose table above, keys the animation:
-- Step 1: reference Newtonsoft.Json.dll (expected next to this script, as is phonemes.json)
scriptDir = getFilenamePath (getSourceFileName())
JsonDllPath = scriptDir + @"Newtonsoft.Json.dll"
JsonFilePath = scriptDir + @"phonemes.json"
dotNet.loadAssembly JsonDllPath  -- without this the Newtonsoft classes below are not available
-- 1. Read and parse the JSON file in one go
fn ReadandProcess JsonFilePath =
(
    -- UTF-8 encoding object
    local encoding = dotNetObject "System.Text.UTF8Encoding"
    -- Read all bytes of the file and decode them to a string
    local fileBytes = (dotNetClass "System.IO.File").ReadAllBytes JsonFilePath
    local jsonText = encoding.GetString fileBytes
    -- Decide whether the JSON root is an array or an object
    local jsonType
    if jsonText[1] == "[" then
        jsonType = "Newtonsoft.Json.Linq.JArray"
    else
        jsonType = "Newtonsoft.Json.Linq.JObject"
    -- Parse the JSON text
    local jsonStruct = (dotNetClass jsonType).Parse jsonText
    return jsonStruct
)
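-- Optional sanity check (commented out): parse the file once and report how many
-- phoneme entries it contains before running the full setup below.
-- testData = ReadandProcess JsonFilePath
-- format "parsed % phoneme entries\n" testData.Count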
resetMaxFile #noPrompt
-- Path of the audio file to load into ProSound
audioFilePath = @"E:\捕鱼\--捕鱼3D资源--\25.4月工作\根据音频做表情\2Papagayo提取和输出音素\recently_short.wav"
prosound.append audioFilePath
trackviews.open "Track View - Curve Editor"
-- Create the control dummies (stand-ins for the jaw / lip bones of a real rig)
dummy name:"CTRL_Jaw" pos:[0, 0, 0]
dummy name:"CTRL_Lip_Upper" pos:[0, 0, 10]
dummy name:"CTRL_Lip_Lower" pos:[0, 0, 20]
-- Refresh the viewports
redrawViews()
-- 2. Grab the controller dummies
global ctrl_jaw = $CTRL_Jaw
global ctrl_lipUpper = $CTRL_Lip_Upper
global ctrl_lipLower = $CTRL_Lip_Lower
-- 3. Phoneme poses (each phoneme maps to a position offset per controller)
global phonemePoses = #(
    #("E", #(
        #( "jaw",      [0.886, -10, 15.13] ),
        #( "lipUpper", [-14.243, 3.0, -0.604] ),
        #( "lipLower", [16.014, -2.0, 0.001] )
    )),
    #("AI", #(
        #( "jaw",      [0, 0, 0] ),
        #( "lipUpper", [0, 0, 0] ),
        #( "lipLower", [0, 0, 0] )
    )),
    #("L", #(
        #( "jaw",      [0, -6, 0] ),
        #( "lipUpper", [0, 1, 0] ),
        #( "lipLower", [0, -1, 0] )
    )),
    #("FV", #(
        #( "jaw",      [0, -3, 0] ),
        #( "lipUpper", [0, 2, 0] ),
        #( "lipLower", [0, -2, 0] )
    ))
)
-- 4. Apply the poses and set keyframes
fn applyPhonemePoses phonemeData =
(
    if phonemeData == undefined then
    (
        format "No valid phoneme data provided.\n"
        return false
    )
    for i = 0 to phonemeData.Count - 1 do
    (
        local element  = phonemeData.Item[i]
        local timeSec  = element.Item["time"].Value as float
        local phoneme  = element.Item["phoneme"].Value as string
        local frameNum = timeSec * frameRate
        -- Find the pose that matches this phoneme
        for p in phonemePoses where p[1] == phoneme do
        (
            local pose = p[2]
            animate on
            (
                at time frameNum
                (
                    for subPose in pose do
                    (
                        if subPose[1] == "jaw" and isValidNode ctrl_jaw do
                        (
                            ctrl_jaw.position = subPose[2]
                            addNewKey ctrl_jaw.position.controller frameNum
                        )
                        if subPose[1] == "lipUpper" and isValidNode ctrl_lipUpper do
                        (
                            ctrl_lipUpper.position = subPose[2]
                            addNewKey ctrl_lipUpper.position.controller frameNum
                        )
                        if subPose[1] == "lipLower" and isValidNode ctrl_lipLower do
                        (
                            ctrl_lipLower.position = subPose[2]
                            addNewKey ctrl_lipLower.position.controller frameNum
                        )
                    )
                )
            )
        )
    )
)
-- 5. Entry point: parse the JSON and key the poses
phonemeData = ReadandProcess JsonFilePath
applyPhonemePoses phonemeData
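After the script runs, each entry in phonemes.json becomes a position key on the three dummies at time * frameRate. Between keys, 3ds Max interpolates with the dummies' default position controllers, so the mouth eases from one phoneme pose to the next; scrub the timeline against the audio loaded through ProSound to check the sync. Note that the pose table above only covers E, AI, L and FV, so any other phoneme names that appear in phonemes.json are silently skipped until they are added to phonemePoses.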