TODO：结合 RL 原理、图示和代码，对 PPO 的计算流程加以说明

$$
\phi = \argmin_{\phi} \mathbb{E}_{s \sim p(s;\phi)} \frac{1}{2} \max \left( \underline{ \left\lVert r_t + \gamma V^{\pi}(s_{t+1}) - \tilde{V}^{\pi}(s_t) \right\rVert_2^2 }, \underline{ \left\lVert r_t + \gamma V^{\pi}(s_{t+1}) - \text{clip}\left( \tilde{V}^{\pi}(s_t), V^{\pi}_{\min}, V^{\pi}_{\max} \right) \right\rVert_2^2 } \right)
$$

$$
\tilde{\theta} = \argmax_{\tilde{\theta}} \mathbb{E}_{s \sim p(s;\theta),\, a \sim \pi(a|s;\theta)} \min \left( \underline{ \frac{\pi(a|s;\tilde{\theta})}{\pi(a|s;\theta)} A(s, a; \theta) }, \underline{ \text{clip}\left( \frac{\pi(a|s;\tilde{\theta})}{\pi(a|s;\theta)}, 1 - \epsilon, 1 + \epsilon \right) A(s, a; \theta) } \right)
$$