CPU计算瓶颈:万级以上粒子时,逐粒子计算导致主线程阻塞
DrawCall开销:每个粒子单独提交渲染指令,引发性能悬崖
内存带宽限制:CPU与GPU间频繁传输粒子数据
指标 | CPU方案(10k粒子) | GPU方案(100k粒子) |
---|---|---|
计算耗时 | 15ms | 0.3ms |
DrawCall数量 | 10k | 1 |
内存带宽占用 | 60MB/s | <1MB/s |
```mermaid
graph LR
    A[CPU] -->|初始化| B[ComputeBuffer]
    B --> C[ComputeShader]
    C -->|更新| D[GPU显存]
    D --> E[渲染管线]
```
Compute Shader:负责粒子位置/速度/生命周期计算
Graphics Shader:负责粒子渲染(Billboard/Mesh)
C#脚本:资源管理、参数传递、调度控制
/// <summary>
/// CPU-side layout of one particle as stored in the GPU buffer.
/// Field order matters: it is mirrored by the HLSL <c>Particle</c> struct.
/// </summary>
struct Particle
{
    public Vector3 position;
    public Vector3 velocity;
    public float lifetime;
    public float size;
    public Color color;

    /// <summary>
    /// Stride of one particle in bytes: 3 + 3 + 1 + 1 + 4 floats = 48.
    /// Declared as a constant so the stride cannot be reassigned at runtime.
    /// </summary>
    public const int Size = sizeof(float) * (3 + 3 + 1 + 1 + 4);
}
public class GPUParticleSystem : MonoBehaviour { public ComputeShader computeShader; public Material particleMaterial; public Mesh particleMesh; private ComputeBuffer particleBuffer; private ComputeBuffer argsBuffer; private uint[] args = new uint[5] { 0, 0, 0, 0, 0 }; void Start() { int particleCount = 100000; // 创建粒子缓冲区 particleBuffer = new ComputeBuffer(particleCount, Particle.Size); // 初始化粒子数据 Particle[] initParticles = new Particle[particleCount]; for(int i=0; i
四、Compute Shader实现
1. 粒子更新核心逻辑
#pragma kernel CSMain

// Per-particle state. Field order mirrors the C# Particle struct (48-byte stride).
struct Particle
{
    float3 position;
    float3 velocity;
    float lifetime;
    float size;
    float4 color;
};

RWStructuredBuffer<Particle> particles;
float deltaTime;
float3 externalForce;

// Prototypes: HLSL requires a function to be declared before its first use.
// rand() is defined in the random-number section that follows.
float rand(uint seed);
void ResetParticle(inout Particle p, uint seed);

// Advances one particle per thread: respawns dead particles, otherwise
// integrates velocity/position, ages the particle and updates its color.
[numthreads(256, 1, 1)]
void CSMain(uint3 id : SV_DispatchThreadID)
{
    uint idx = id.x;

    // The dispatch is rounded up to whole groups of 256 threads, so the last
    // group can contain threads with no particle; skip them.
    uint particleCount;
    uint stride;
    particles.GetDimensions(particleCount, stride);
    if (idx >= particleCount)
    {
        return;
    }

    Particle p = particles[idx];

    if (p.lifetime <= 0)
    {
        // Seed from the particle index so neighbouring particles diverge, and
        // mix in the bits of deltaTime so successive respawns of the same
        // particle are unlikely to repeat the same velocity.
        ResetParticle(p, (idx * 3u) ^ asuint(deltaTime));
    }
    else
    {
        // Physics integration.
        p.velocity += externalForce * deltaTime;
        p.position += p.velocity * deltaTime;
        p.lifetime -= deltaTime;

        // Color gradient: blends from red to half-transparent yellow as the
        // remaining lifetime drops from 1 to 0.
        p.color = lerp(float4(1, 0, 0, 1), float4(1, 1, 0, 0.5), saturate(1 - p.lifetime));
    }

    particles[idx] = p;
}

// Respawns a particle at the origin with a pseudo-random velocity
// (x and z in [-1, 1], y in [0, 5]) and a lifetime of 5.
// Three consecutive seeds are used so the components are independent.
void ResetParticle(inout Particle p, uint seed)
{
    p.position = float3(0, 0, 0);
    p.velocity = float3(
        rand(seed) * 2 - 1,
        rand(seed + 1u) * 5,
        rand(seed + 2u) * 2 - 1
    );
    p.lifetime = 5.0;
}

// Single-argument form kept for existing callers; uses a fixed seed.
void ResetParticle(inout Particle p)
{
    ResetParticle(p, 0u);
}

2. 随机数生成函数
// Fast pseudo-random generator: scrambles `seed` with Wang's 32-bit integer
// hash and scales the result by 2^-32. The value lies in [0, 1]; the largest
// hashes can round up to exactly 1.0 when converted to float.
float rand(uint seed)
{
    uint h = seed;
    h = (h ^ 61) ^ (h >> 16);
    h *= 9;
    h ^= (h >> 4);
    h *= 0x27d4eb2d;
    h ^= (h >> 15);
    return float(h) * (1.0 / 4294967296.0);
}
五、渲染系统实现
1. 间接绘制调用(C#)
/// <summary>
/// Per-frame driver: uploads simulation parameters, dispatches the compute
/// kernel over every particle, then draws all particles with one indirect
/// instanced draw call.
/// </summary>
void Update()
{
    // Must match [numthreads(256,1,1)] in the compute shader.
    const int ThreadGroupSize = 256;

    // Update compute shader parameters.
    computeShader.SetBuffer(0, "particles", particleBuffer);
    computeShader.SetFloat("deltaTime", Time.deltaTime);
    computeShader.SetVector("externalForce", Physics.gravity);

    // Dispatch the simulation. The particle count is read from the buffer
    // itself, so it always agrees with the allocation made in Start().
    int threadGroups = (particleBuffer.count + ThreadGroupSize - 1) / ThreadGroupSize;
    computeShader.Dispatch(0, threadGroups, 1, 1);

    // Render the particles.
    particleMaterial.SetBuffer("_Particles", particleBuffer);
    Graphics.DrawMeshInstancedIndirect(
        particleMesh,
        0,
        particleMaterial,
        new Bounds(transform.position, Vector3.one * 50f),
        argsBuffer
    );
}

2. 粒子渲染Shader(HLSL)
StructuredBuffer<Particle> _Particles;

// Vertex stage: expands each instance into a camera-facing quad.
// instanceID selects the particle; vertexID selects the quad corner.
// NOTE(review): the corner mapping assumes the mesh's four vertices are
// ordered (-1,-1), (1,-1), (1,1), (-1,1) — confirm against particleMesh.
v2f vert(uint vertexID : SV_VertexID, uint instanceID : SV_InstanceID)
{
    Particle p = _Particles[instanceID];

    // Billboard: work in view space so the quad always faces the camera.
    float3 viewPos = mul(UNITY_MATRIX_V, float4(p.position, 1)).xyz;

    // Corner offset in the range [-1, 1] on each axis.
    float2 quadPos = float2(
        (vertexID == 0 || vertexID == 3) ? -1 : 1,
        (vertexID == 0 || vertexID == 1) ? -1 : 1
    );

    // Offset in view-space units. The projection below applies the
    // perspective scale exactly once; pre-multiplying by the projection
    // diagonal here would apply it twice and distort the quad's size/aspect.
    viewPos.xy += quadPos * p.size;

    // Transform to clip space.
    float4 clipPos = mul(UNITY_MATRIX_P, float4(viewPos, 1));

    v2f o;
    o.pos = clipPos;
    o.color = p.color;
    return o;
}

// Fragment stage: outputs the per-particle color unchanged.
fixed4 frag(v2f i) : SV_Target
{
    return i.color;
}
六、高级功能扩展
1. 碰撞检测优化
// Sphere collision: if the particle is inside a sphere of radius 5 centred at
// (0, -5, 0), push it back onto the surface and reflect its velocity about the
// surface normal, keeping 80% of the speed.
void HandleCollision(inout Particle p)
{
    const float3 sphereCenter = float3(0, -5, 0);
    const float sphereRadius = 5.0;

    float3 offset = p.position - sphereCenter;
    float dist = length(offset);

    // Nothing to do unless the particle is strictly inside the sphere.
    if (!(dist < sphereRadius))
    {
        return;
    }

    float3 surfaceNormal = normalize(offset);
    p.position = sphereCenter + surfaceNormal * sphereRadius;
    p.velocity = reflect(p.velocity, surfaceNormal) * 0.8;
}

2. 动态批次管理
// Particle buffer pool: recycled buffers wait in inactiveBuffers and are
// handed out again before any new buffer is allocated.
// activeBuffers is declared here but not touched by the two methods below.
List<ComputeBuffer> activeBuffers = new List<ComputeBuffer>();
List<ComputeBuffer> inactiveBuffers = new List<ComputeBuffer>();

/// <summary>
/// Returns a pooled buffer when one is available; otherwise allocates a new
/// buffer of <c>batchSize</c> particles.
/// </summary>
ComputeBuffer GetParticleBuffer()
{
    int last = inactiveBuffers.Count - 1;
    if (last >= 0)
    {
        // Take from the end: removing the tail of a List is O(1), whereas
        // removing index 0 shifts every remaining element. Pooled buffers are
        // all cleared and equally sized, so which one is returned is immaterial.
        ComputeBuffer buf = inactiveBuffers[last];
        inactiveBuffers.RemoveAt(last);
        return buf;
    }

    return new ComputeBuffer(batchSize, Particle.Size);
}

/// <summary>
/// Clears the buffer to default-initialised particles and returns it to the pool.
/// </summary>
void RecycleBuffer(ComputeBuffer buffer)
{
    buffer.SetData(new Particle[batchSize]);
    inactiveBuffers.Add(buffer);
}
七、性能优化策略
1. 内存访问优化
| 策略 | 实现方法 | 性能提升 |
|---|---|---|
| 结构体对齐 | 使用float4代替float3 | 15% |
| 缓存友好访问 | 按生命周期分组粒子数据 | 30% |
| 异步传输 | 使用AsyncGPUReadback回读数据 | 20% |

2. 计算优化技巧
// Branch-free expiry handling: clamp the remaining lifetime at zero, derive a
// 0/1 flag from it, and use that flag to blend the position to the origin.
float remainingLife = max(p.lifetime - deltaTime, 0);
p.lifetime = remainingLife;

// step(x, 0) yields 1 when x <= 0 and 0 otherwise.
float reset = step(remainingLife, 0);

// reset == 1 snaps the particle to the origin; reset == 0 leaves it in place.
p.position = lerp(p.position, 0, reset);
八、调试与可视化
1. 调试工具集成
/// <summary>
/// Debug visualisation: reads back up to the first 100 particles from the GPU
/// buffer and draws each as a colored gizmo sphere.
/// </summary>
void OnDrawGizmos()
{
    if (particleBuffer == null || particleBuffer.count <= 0)
    {
        return;
    }

    // Never request more elements than the buffer holds; GetData fails when
    // the requested range extends past the end of the buffer.
    int sampleCount = Mathf.Min(100, particleBuffer.count);
    Particle[] debugParticles = new Particle[sampleCount];
    particleBuffer.GetData(debugParticles, 0, 0, sampleCount);

    foreach (var p in debugParticles)
    {
        Gizmos.color = p.color;
        Gizmos.DrawSphere(p.position, p.size * 0.5f);
    }
}

2. 性能统计面板
/// <summary>
/// Draws a three-row statistics overlay in the top-left corner:
/// particle count, frames per second, and GPU time.
/// </summary>
void OnGUI()
{
    string[] rows =
    {
        $"Particles: {particleCount}",
        $"FPS: {1/Time.deltaTime}",
        $"GPU Time: {gpuTime}ms",
    };

    // Rows start at y = 10 and are spaced 20 pixels apart.
    for (int row = 0; row < rows.Length; row++)
    {
        GUI.Label(new Rect(10, 10 + 20 * row, 200, 30), rows[row]);
    }
}
九、完整项目参考
通过本方案可实现百万级粒子的实时模拟,关键点在于:
完全GPU驱动:避免CPU-GPU数据传输瓶颈
间接绘制:单DrawCall渲染全部粒子
计算着色器优化:最大化GPU并行计算能力
建议在移动端使用时:
将粒子数量控制在1万以内
禁用复杂碰撞检测
使用半精度浮点数(需设备支持)