This part extends simple games to a sequential setting with multiple states. A Markov game can be viewed as a Markov decision process shared by multiple agents, each with its own reward function.
The structure of a Markov game is as follows:
struct MG
    γ  # discount factor
    ℐ  # agents
    𝒮  # state space
    𝒜  # joint action space
    T  # transition function
    R  # joint reward function
end
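To make the listings below concrete, here is a tiny two-agent, two-state Markov game (my own toy example, not one of the book's problems). Matching actions drive the game to state 2, where matching is also rewarded.
# toy Markov game: two agents, two states, two actions each
ℐ = [1, 2]                  # agent indices
𝒮 = [1, 2]                  # states
𝒜 = [[1, 2], [1, 2]]        # per-agent action spaces
T(s, a, s′) = (a[1] == a[2]) == (s′ == 2) ? 1.0 : 0.0        # deterministic transition
R(s, a) = s == 2 && a[1] == a[2] ? [1.0, 1.0] : [0.0, 0.0]   # joint reward
𝒫 = MG(0.9, ℐ, 𝒮, 𝒜, T, R)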
A Markov game policy is a mapping from states to simple game policies.
struct MGPolicy
    p # dictionary mapping states to simple game policies
    MGPolicy(p::Base.Generator) = new(Dict(p))
end
(πi::MGPolicy)(s, ai) = πi.p[s](ai)
(πi::SimpleGamePolicy)(s, ai) = πi(ai) # a stateless simple-game policy ignores the state
probability(𝒫::MG, s, π, a) = prod(πj(s, aj) for (πj, aj) in zip(π, a))
reward(𝒫::MG, s, π, i) =
    sum(𝒫.R(s,a)[i]*probability(𝒫,s,π,a) for a in joint(𝒫.𝒜))
transition(𝒫::MG, s, π, s′) =
    sum(𝒫.T(s,a,s′)*probability(𝒫,s,π,a) for a in joint(𝒫.𝒜))
function policy_evaluation(𝒫::MG, π, i)
    𝒮, 𝒜, R, T, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.T, 𝒫.γ
    p(s,a) = prod(πj(s, aj) for (πj, aj) in zip(π, a))
    R′ = [sum(R(s,a)[i]*p(s,a) for a in joint(𝒜)) for s in 𝒮]
    T′ = [sum(T(s,a,s′)*p(s,a) for a in joint(𝒜)) for s in 𝒮, s′ in 𝒮]
    return (I - γ*T′)\R′   # I is the identity from LinearAlgebra
end
function best_response(𝒫::MG, π, i)
    𝒮, 𝒜, R, T, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.T, 𝒫.γ
    T′(s,ai,s′) = transition(𝒫, s, joint(π, SimpleGamePolicy(ai), i), s′)
    R′(s,ai) = reward(𝒫, s, joint(π, SimpleGamePolicy(ai), i), i)
    πi = solve(MDP(γ, 𝒮, 𝒜[i], T′, R′))
    return MGPolicy(s => SimpleGamePolicy(πi(s)) for s in 𝒮)
end
function softmax_response(𝒫::MG, π, i, λ)
    𝒮, 𝒜, R, T, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.T, 𝒫.γ
    T′(s,ai,s′) = transition(𝒫, s, joint(π, SimpleGamePolicy(ai), i), s′)
    R′(s,ai) = reward(𝒫, s, joint(π, SimpleGamePolicy(ai), i), i)
    mdp = MDP(γ, 𝒮, joint(𝒜), T′, R′)
    πi = solve(mdp)
    Q(s,a) = lookahead(mdp, πi.U, s, a)
    p(s) = SimpleGamePolicy(a => exp(λ*Q(s,a)) for a in 𝒜[i])
    return MGPolicy(s => p(s) for s in 𝒮)
end
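A quick usage sketch on the toy game above. This assumes the SimpleGamePolicy, MGPolicy, and joint definitions from the previous post, plus an MDP type and solve/lookahead routine from the earlier MDP chapters (any planner that returns a value-function policy will do); it is a sketch, not the book's example.
using LinearAlgebra   # for the identity I in policy_evaluation

# both agents play uniformly at random in every state
π = [MGPolicy(s => SimpleGamePolicy(ai => 1.0 for ai in 𝒫.𝒜[i]) for s in 𝒫.𝒮)
     for i in 𝒫.ℐ]

policy_evaluation(𝒫, π, 1)               # agent 1's value in each state under π
π1 = best_response(𝒫, π, 1)              # agent 1's best response as an MGPolicy
π1soft = softmax_response(𝒫, π, 1, 5.0)  # softened response with precision λ = 5
π1soft(1, 1)                             # probability that agent 1 plays action 1 in state 1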
A Nash equilibrium has the same form as before and can be computed with nonlinear programming:
function tensorform(𝒫::MG)
    ℐ, 𝒮, 𝒜, R, T = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.R, 𝒫.T
    ℐ′ = eachindex(ℐ)
    𝒮′ = eachindex(𝒮)
    𝒜′ = [eachindex(𝒜[i]) for i in ℐ]
    R′ = [R(s,a) for s in 𝒮, a in joint(𝒜)]
    T′ = [T(s,a,s′) for s in 𝒮, a in joint(𝒜), s′ in 𝒮]
    return ℐ′, 𝒮′, 𝒜′, R′, T′
end
function solve(M::NashEquilibrium, 𝒫::MG)
    ℐ, 𝒮, 𝒜, R, T = tensorform(𝒫)
    𝒮′, 𝒜′, γ = 𝒫.𝒮, 𝒫.𝒜, 𝒫.γ
    model = Model(Ipopt.Optimizer)
    @variable(model, U[ℐ, 𝒮])
    @variable(model, π[i=ℐ, 𝒮, ai=𝒜[i]] ≥ 0)
    @NLobjective(model, Min,
        sum(U[i,s] - sum(prod(π[j,s,a[j]] for j in ℐ)
            * (R[s,y][i] + γ*sum(T[s,y,s′]*U[i,s′] for s′ in 𝒮))
            for (y,a) in enumerate(joint(𝒜))) for i in ℐ, s in 𝒮))
    @NLconstraint(model, [i=ℐ, s=𝒮, ai=𝒜[i]],
        U[i,s] ≥ sum(
            prod(j==i ? (a[j]==ai ? 1.0 : 0.0) : π[j,s,a[j]] for j in ℐ)
            * (R[s,y][i] + γ*sum(T[s,y,s′]*U[i,s′] for s′ in 𝒮))
            for (y,a) in enumerate(joint(𝒜))))
    @constraint(model, [i=ℐ, s=𝒮], sum(π[i,s,ai] for ai in 𝒜[i]) == 1)
    optimize!(model)
    π′ = value.(π)
    πi′(i,s) = SimpleGamePolicy(𝒜′[i][ai] => π′[i,s,ai] for ai in 𝒜[i])
    πi′(i) = MGPolicy(𝒮′[s] => πi′(i,s) for s in 𝒮)
    return [πi′(i) for i in ℐ]
end
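A usage sketch for the equilibrium solver on the toy game: it assumes the JuMP.jl and Ipopt.jl packages are installed and that the NashEquilibrium type from the simple-game post is in scope.
using JuMP, Ipopt

π = solve(NashEquilibrium(), 𝒫)   # a vector with one MGPolicy per agent
π[1](1, 1)                        # agent 1's equilibrium probability of action 1 in state 1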
Fictitious play carries over to Markov games: each agent keeps per-state counts of the other agents' actions, treats the resulting empirical policies as fixed, and acts greedily with respect to its own value estimates.
mutable struct MGFictitiousPlay
    𝒫  # Markov game
    i  # agent index
    Qi # state-action value estimates
    Ni # state-action counts
end
function MGFictitiousPlay(𝒫::MG, i)
    ℐ, 𝒮, 𝒜, R = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.R
    Qi = Dict((s, a) => R(s, a)[i] for s in 𝒮 for a in joint(𝒜))
    Ni = Dict((j, s, aj) => 1.0 for j in ℐ for s in 𝒮 for aj in 𝒜[j])
    return MGFictitiousPlay(𝒫, i, Qi, Ni)
end
function (πi::MGFictitiousPlay)(s)
    𝒫, i, Qi = πi.𝒫, πi.i, πi.Qi
    ℐ, 𝒮, 𝒜, T, R, γ = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.T, 𝒫.R, 𝒫.γ
    # empirical Markov game policies built from the action counts
    πi′(i,s) = SimpleGamePolicy(ai => πi.Ni[i,s,ai] for ai in 𝒜[i])
    πi′(i) = MGPolicy(s => πi′(i,s) for s in 𝒮)
    π = [πi′(i) for i in ℐ]
    U(s,π) = sum(πi.Qi[s,a]*probability(𝒫,s,π,a) for a in joint(𝒜))
    Q(s,π) = reward(𝒫,s,π,i) + γ*sum(transition(𝒫,s,π,s′)*U(s′,π) for s′ in 𝒮)
    Q(ai) = Q(s, joint(π, SimpleGamePolicy(ai), i))
    # act greedily against the opponents' empirical policies
    ai = argmax(Q, 𝒫.𝒜[πi.i])
    return SimpleGamePolicy(ai)
end
function update!(πi::MGFictitiousPlay, s, a, s′)
    𝒫, i, Qi = πi.𝒫, πi.i, πi.Qi
    ℐ, 𝒮, 𝒜, T, R, γ = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.T, 𝒫.R, 𝒫.γ
    # record the observed joint action
    for (j,aj) in enumerate(a)
        πi.Ni[j,s,aj] += 1
    end
    πi′(i,s) = SimpleGamePolicy(ai => πi.Ni[i,s,ai] for ai in 𝒜[i])
    πi′(i) = MGPolicy(s => πi′(i,s) for s in 𝒮)
    π = [πi′(i) for i in ℐ]
    U(π,s) = sum(πi.Qi[s,a]*probability(𝒫,s,π,a) for a in joint(𝒜))
    Q(s,a) = R(s,a)[i] + γ*sum(T(s,a,s′)*U(π,s′) for s′ in 𝒮)
    # refresh value estimates for all joint actions in the visited state
    for a in joint(𝒜)
        πi.Qi[s,a] = Q(s,a)
    end
end
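A simulation sketch for learning in the toy game. This is my own loop, not the book's simulate routine; rand_action and rand_step are hypothetical helpers, and rand_action assumes SimpleGamePolicy stores its action-to-probability dictionary in a field p, as in the previous post.
# sample an action from a SimpleGamePolicy by walking its probability dictionary
function rand_action(πi)
    r = rand()
    for (ai, p) in πi.p
        r -= p
        r ≤ 0 && return ai
    end
    return first(keys(πi.p))
end

# sample a successor state s′ ∼ T(s, a, ⋅)
function rand_step(𝒫, s, a)
    r = rand()
    for s′ in 𝒫.𝒮
        r -= 𝒫.T(s, a, s′)
        r ≤ 0 && return s′
    end
    return last(𝒫.𝒮)
end

# let every agent act, observe the joint action, and update
function run_learning!(agents, 𝒫; k_max=1000)
    s = rand(𝒫.𝒮)
    for k in 1:k_max
        a = Tuple(rand_action(agents[i](s)) for i in 𝒫.ℐ)
        s′ = rand_step(𝒫, s, a)
        for agent in agents
            update!(agent, s, a, s′)
        end
        s = s′
    end
    return agents
end

run_learning!([MGFictitiousPlay(𝒫, i) for i in 𝒫.ℐ], 𝒫)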
Gradient ascent maintains, for each state, a policy that is nudged in the direction of the estimated action values and then projected back onto the probability simplex; an ϵ-greedy-style perturbation handles exploration.
mutable struct MGGradientAscent
    𝒫  # Markov game
    i  # agent index
    t  # time step
    Qi # state-action value estimates
    πi # current policy
end
function MGGradientAscent(𝒫::MG, i)
    ℐ, 𝒮, 𝒜 = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜
    Qi = Dict((s, a) => 0.0 for s in 𝒮, a in joint(𝒜))
    uniform() = Dict(s => SimpleGamePolicy(ai => 1.0 for ai in 𝒫.𝒜[i]) for s in 𝒮)
    return MGGradientAscent(𝒫, i, 1, Qi, uniform())
end
function (πi::MGGradientAscent)(s)
    𝒜i, t = πi.𝒫.𝒜[πi.i], πi.t
    ϵ = 1 / sqrt(t)
    πi′(ai) = ϵ/length(𝒜i) + (1-ϵ)*πi.πi[s](ai)
    return SimpleGamePolicy(ai => πi′(ai) for ai in 𝒜i)
end
function update!(πi::MGGradientAscent, s, a, s′)
    𝒫, i, t, Qi = πi.𝒫, πi.i, πi.t, πi.Qi
    ℐ, 𝒮, 𝒜i, R, γ = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜[πi.i], 𝒫.R, 𝒫.γ
    jointπ(ai) = Tuple(j == i ? ai : a[j] for j in ℐ)
    # Q-learning update toward the best value achievable in s′
    α = 1 / sqrt(t)
    Qmax = maximum(Qi[s′, jointπ(ai)] for ai in 𝒜i)
    πi.Qi[s, a] += α * (R(s, a)[i] + γ * Qmax - Qi[s, a])
    # gradient step on the policy at s, then projection back onto the simplex
    u = [Qi[s, jointπ(ai)] for ai in 𝒜i]
    π′ = [πi.πi[s](ai) for ai in 𝒜i]
    π = project_to_simplex(π′ + u / sqrt(t))
    πi.t = t + 1
    πi.πi[s] = SimpleGamePolicy(ai => p for (ai, p) in zip(𝒜i, π))
end
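project_to_simplex is used above but not listed in this note. A standard L2 projection onto the probability simplex (my sketch of what the update expects: the closest valid distribution to the input vector) is:
# project a real vector y onto the probability simplex (nonnegative, sums to 1)
function project_to_simplex(y)
    u = sort(copy(y), rev=true)
    i = maximum([j for j in eachindex(u) if u[j] + (1 - sum(u[1:j])) / j > 0])
    δ = (1 - sum(u[1:i])) / i
    return [max(y[j] + δ, 0.0) for j in eachindex(y)]
end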
Nash Q-learning learns joint action values for all agents and, at every step, plays an exploration-smoothed Nash equilibrium of the simple game induced by the current value estimates at the current state.
mutable struct NashQLearning
    𝒫 # Markov game
    i # agent index
    Q # state-action value estimates
    N # history of actions performed
end
function NashQLearning(𝒫::MG, i)
    ℐ, 𝒮, 𝒜 = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜
    Q = Dict((j, s, a) => 0.0 for j in ℐ, s in 𝒮, a in joint(𝒜))
    N = Dict((s, a) => 1.0 for s in 𝒮, a in joint(𝒜))
    return NashQLearning(𝒫, i, Q, N)
end
function (πi::NashQLearning)(s)
    𝒫, i, Q, N = πi.𝒫, πi.i, πi.Q, πi.N
    ℐ, 𝒮, 𝒜, 𝒜i, γ = 𝒫.ℐ, 𝒫.𝒮, 𝒫.𝒜, 𝒫.𝒜[πi.i], 𝒫.γ
    # solve the simple game induced by the current value estimates at s
    M = NashEquilibrium()
    𝒢 = SimpleGame(γ, ℐ, 𝒜, a -> [Q[j, s, a] for j in ℐ])
    π = solve(M, 𝒢)
    ϵ = 1 / sum(N[s, a] for a in joint(𝒜))
    πi′(ai) = ϵ/length(𝒜i) + (1-ϵ)*π[i](ai)
    return SimpleGamePolicy(ai => πi′(ai) for ai in 𝒜i)
end
function update!(πi::NashQLearning, s, a, s′)
    𝒫, ℐ, 𝒮, 𝒜, R, γ = πi.𝒫, πi.𝒫.ℐ, πi.𝒫.𝒮, πi.𝒫.𝒜, πi.𝒫.R, πi.𝒫.γ
    i, Q, N = πi.i, πi.Q, πi.N
    # the equilibrium of the game at s′ provides the bootstrap value utility(𝒢, π, j)
    M = NashEquilibrium()
    𝒢 = SimpleGame(γ, ℐ, 𝒜, a′ -> [Q[j, s′, a′] for j in ℐ])
    π = solve(M, 𝒢)
    πi.N[s, a] += 1
    α = 1 / sqrt(N[s, a])
    for j in ℐ
        πi.Q[j,s,a] += α*(R(s,a)[j] + γ*utility(𝒢,π,j) - Q[j,s,a])
    end
end
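Finally, a usage sketch: Nash Q-learning agents can be run with the run_learning! helper from the fictitious-play example above. Each step solves a simple-game Nash equilibrium, so this also needs the simple-game solver from the previous post (and its JuMP/Ipopt dependencies) and is comparatively slow.
agents = run_learning!([NashQLearning(𝒫, i) for i in 𝒫.ℐ], 𝒫; k_max=100)
agents[1].Q[1, 2, (1, 1)]   # agent 1's estimate of agent 1's value for joint action (1,1) in state 2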