2d:[[["$","$L30","0",{"heading":"Abstract","index":0,"length":15,"content":[["$","$La","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Group Relative Policy Optimization (GRPO) was introduced in ["}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:0:paragraphs:0:1"}]}],["$","$1","2",{"children":", "}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:0:paragraphs:0:3"}]}],["$","$1","4",{"children":"] and used successfully to train DeepSeek-R1 ["}],["$","$1","5",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:0:paragraphs:0:5"}]}],["$","$1","6",{"children":", "}],["$","$1","7",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:0:paragraphs:0:7"}]}],["$","$1","8",{"children":"] models for promoting reasoning capabilities of LLMs using verifiable or binary rewards. We show in this paper that GRPO with verifiable rewards can be written as a Kullback–Leibler ("}],["$","$1","9",{"children":"KL"}],["$","$1","10",{"children":") regularized contrastive loss, where the contrastive samples are synthetic data sampled from the old policy. 
The optimal GRPO policy "}],["$","$1","11",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:0:paragraphs:0:11"}]]}]}],["$","$1","12",{"children":"expressed explicitly in terms of the binary reward, as well as the first- and second-order statistics of the old policy ("}],["$","$1","13",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:0:paragraphs:0:13"}]]}]}],["$","$1","14",{"children":") and the reference policy "}],["$","$1","15",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:0:paragraphs:0:15"}]]}]}],["$","$1","16",{"children":". 
Iterating this scheme, we obtain a sequence of policies "}],["$","$1","17",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:0:paragraphs:0:17"}]]}]}],["$","$1","18",{"children":"for which we can quantify the probability of success "}],["$","$1","19",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:0:paragraphs:0:19"}]]}]}],["$","$1","20",{"children":". We show that the probability of success of the policy satisfies a recurrence that converges to a fixed point of a function that depends on the initial probability of success "}],["$","$1","21",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:0:paragraphs:0:21"}]]}]}],["$","$1","22",{"children":"and the regularization parameter "}],["$","$1","23",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:0:paragraphs:0:23"}]]}]}],["$","$1","24",{"children":"regularizer. 
We show that the fixed point "}],["$","$1","25",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:0:paragraphs:0:25"}]]}]}],["$","$1","26",{"children":"is guaranteed to be larger than "}],["$","$1","27",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:0:paragraphs:0:27"}]]}]}],["$","$1","28",{"children":", thereby demonstrating that GRPO effectively amplifies the probability of success of the policy."}]]}]]}],["$","$L30","1",{"heading":"1. Introduction","index":1,"length":15,"content":[["$","$La","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"In Reinforcement Learning (RL), a policy is learned by maximizing a reward that encodes constraints or an objective we want the policy to conform to or achieve. Policy gradient methods and actor-critic methods ["}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:1"}]}],["$","$1","2",{"children":", "}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:3"}]}],["$","$1","4",{"children":"], enable RL-based training of parametric policies, including Large Language Models (LLMs), particularly when dealing with non-differentiable rewards. 
Unlike supervised learning or preference optimization, which require labeled training data, reinforcement learning generates "}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:5:style","children":"synthetic data "}]}],["$","$1","6",{"children":"sampled online from the learned policy as training progresses."}]]}],["$","$La","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Proximal Policy Optimization (PPO), introduced in ["}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:1:1"}]}],["$","$1","2",{"children":", "}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:1:3"}]}],["$","$1","4",{"children":"], is a widely used algorithm that facilitates such training. PPO relies on importance sampling from the model’s previous (“old”) policy while ensuring that updates remain within a certain proximity to the old policy. Policy gradient methods are known for their high variance, and PPO mitigates this by learning a critic that reduces the variance of gradient estimates. 
The critic normalizes the reward, and PPO’s advantage function—defined as the difference between the reward and the critic’s evaluation—drives the optimization process."}]]}],["$","$La","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Group Relative Policy Optimization (GRPO) was recently introduced in DeepSeekMath ["}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:1"}]}],["$","$1","2",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:2"}]}],["$","$1","3",{"children":", "}],["$","$1","4",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:4"}]}],["$","$1","5",{"children":"]. GRPO closely follows PPO’s optimization framework but differs in how the advantage is computed. Specifically, GRPO estimates the advantage using Monte Carlo rollouts rather than a learned critic. Additionally, GRPO applies whitening to the advantage function, meaning it standardizes the reward’s mean and variance. These statistics are estimated from a “group” of Monte Carlo rollouts corresponding to samples from the LLM policy conditioned on a single input or query to the policy. 
Whitening the advantage function has been recognized in many PPO implementations as an important ingredient for training stability ["}],["$","$1","6",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:6"}]}],["$","$1","7",{"children":", "}],["$","$1","8",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:8"}]}],["$","$1","9",{"children":", "}],["$","$1","10",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:10"}]}],["$","$1","11",{"children":", "}],["$","$1","12",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:12"}]}],["$","$1","13",{"children":"]."}]]}],["$","$La","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"GRPO therefore eliminates the need for training a separate critic network alongside the LLM policy, instead leveraging efficient sampling from the LLM’s policy. This is made feasible by optimized model serving through VLLM ["}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:3:1"}]}],["$","$1","2",{"children":", "}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:3:3"}]}],["$","$1","4",{"children":"]. 
GRPO has been employed in the DeepSeek model series, including DeepSeek-v3 ["}],["$","$1","5",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:3:5"}]}],["$","$1","6",{"children":", "}],["$","$1","7",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:3:7"}]}],["$","$1","8",{"children":"] and DeepSeek-R1 ["}],["$","$1","9",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:3:9"}]}],["$","$1","10",{"children":", "}],["$","$1","11",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:3:11"}]}],["$","$1","12",{"children":"]. DeepSeek-R1 unlocked reasoning capabilities in open-source models, and its success can be attributed to several factors and innovations, among them: (1) A strong pre-trained model (DeepSeek-v3), (2) The reasoning chain of thoughts "}],["$","$1","13",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:3:13:style","children":"... ... 
"}]}],["$","$1","14",{"children":"and (3) The use of verifiable binary rewards with GRPO to fine-tune the models on reasoning and math tasks."}]]}],["$","$La","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We focus in this paper on Reinforcement Learning with Verifiable Rewards (RLVR) using GRPO, as recently termed by ["}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:4:1"}]}],["$","$1","2",{"children":", "}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:4:3"}]}],["$","$1","4",{"children":"]. Following ["}],["$","$1","5",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:4:5"}]}],["$","$1","6",{"children":", "}],["$","$1","7",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:4:7"}]}],["$","$1","8",{"children":"], we distinguish three types of verifiable rewards in the context of LLM training:"}]]}],["$","$La","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"(1) "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:5:1:style","children":"Correctness Verification. 
"}]}],["$","$1","2",{"children":"This corresponds to a binary reward that can be obtained via string matching between the generated response and a gold-standard answer—if such an answer exists—for example, in math problems with known solutions. This type of reward has been used in ["}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:5:3"}]}],["$","$1","4",{"children":", "}],["$","$1","5",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:5:5"}]}],["$","$1","6",{"children":"] and subsequently in open-source implementations such as Open-R1 ["}],["$","$1","7",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:5:7"}]}],["$","$1","8",{"children":", "}],["$","$1","9",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:5:9"}]}],["$","$1","10",{"children":"] and DeepScaleR ["}],["$","$1","11",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:5:11"}]}],["$","$1","12",{"children":", "}],["$","$1","13",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:5:13"}]}],["$","$1","14",{"children":"]. 
When a gold-standard answer does not exist, one can resort to an LLM as a judge to assess the correctness of the response within the training loop, as done in deliberative alignment ["}],["$","$1","15",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:5:15"}]}],["$","$1","16",{"children":", "}],["$","$1","17",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:5:17"}]}],["$","$1","18",{"children":"]."}]]}],["$","$La","6",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"(2) "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:6:1:style","children":"Verification via Execution. "}]}],["$","$1","2",{"children":"In code generation, a code interpreter is used to execute the generated code, producing a 0/1 reward for fail/pass. A battery of unit tests can also be executed to verify the correctness of the code, resulting in a binary reward. 
Open-R1 ["}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:6:3"}]}],["$","$1","4",{"children":", "}],["$","$1","5",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:6:5"}]}],["$","$1","6",{"children":"] recently open-sourced this type of reward evaluation."}]]}],["$","$La","7",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"(3) "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:7:1:style","children":"Verifiable Constraints. "}]}],["$","$1","2",{"children":"Finally, formatting constraints on the output or refusals to answer can be enforced using simple binary rewards to guide RL training for LLMs ["}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:7:3"}]}],["$","$1","4",{"children":", "}],["$","$1","5",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:7:5"}]}],["$","$1","6",{"children":"] ["}],["$","$1","7",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:7:7"}]}],["$","$1","8",{"children":", 
"}],["$","$1","9",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:7:9"}]}],["$","$1","10",{"children":"]."}]]}],["$","$La","8",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Verifiable rewards balance simplicity and bias and are thought to be less prone to reward hacking than reward models learned from preference data. Reward hacking is a common issue in reinforcement learning where the policy learns to over-optimize a reward, leading to a lower quality of the model ["}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:8:1"}]}],["$","$1","2",{"children":", "}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:8:3"}]}],["$","$1","4",{"children":"]. While verifiable rewards are more resilient to reward hacking, "}],["$","$1","5",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:8:5"}]}],["$","$1","6",{"children":"["}],["$","$1","7",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:8:7"}]}],["$","$1","8",{"children":"] showed that for low regularization of the "}],["$","$1","9",{"children":"KL "}],["$","$1","10",{"children":"constraint to the reference model, reward hacking occurs when using verifiable constraints. Hence we study in this paper "}],["$","$1","11",{"children":"KL"}],["$","$1","12",{"children":"-regularized Reinforcement Learning with Verifiable Rewards using GRPO. 
We note that a recent paper ["}],["$","$1","13",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:8:13"}]}],["$","$1","14",{"children":", "}],["$","$1","15",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:8:15"}]}],["$","$1","16",{"children":"] studies GRPO with a focus on the policy obtained using an approximation of the "}],["$","$1","17",{"children":"KL "}],["$","$1","18",{"children":"divergence used in practical implementations."}]]}],["$","$La","9",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Our main contributions are:"}]]}],["$","$La","10",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"(1) We show in Section "}],["$","$1","1",{"children":"2 "}],["$","$1","2",{"children":"that GRPO with verifiable rewards can be cast as an adaptive weighted contrastive loss between samples from the old policy with 0/1 rewards."}]]}],["$","$La","11",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"(2) Armed with this contrastive loss interpretation, we show in Section "}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:11:1"}]}],["$","$1","2",{"children":"that the GRPO dynamics, as the old policy is replaced with the current optimal policy, result in a closed-form recursion for 
the optimal policy, where the optimal policy "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:11:3"}]]}]}],["$","$1","4",{"children":"can be expressed in terms of the reference policy "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:11:5"}]]}]}],["$","$1","6",{"children":", the old policy "}],["$","$1","7",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:11:7"}]]}]}],["$","$1","8",{"children":"and the probability of success of the old policy "}],["$","$1","9",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:11:9"}]]}]}],["$","$1","10",{"children":"(the frequency of reward “1” of generated responses for a given prompt)."}]]}],["$","$La","12",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"(3) Computing the probability of success under 
"}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:12:1"}]]}]}],["$","$1","2",{"children":", we show in Section "}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:12:3"}]}],["$","$1","4",{"children":"that it satisfies a recursion in time, leading to a fixed point equation. We show in Section "}],["$","$1","5",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:12:5"}]}],["$","$1","6",{"children":"that, under mild assumptions, GRPO’s probability of success converges asymptotically to this fixed point solution."}]]}],["$","$La","13",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"(4) We show in Section "}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:13:1"}]}],["$","$1","2",{"children":"that this fixed point probability of success is guaranteed to be larger than the probability of success of the reference model, thereby proving that GRPO indeed amplifies the probability of success as observed experimentally."}]]}],["$","$La","14",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"(5) Finally, we show in Section 
"}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:14:1"}]}],["$","$1","2",{"children":"that for approximate policies obtained for instance by gradient descent, the probability of success remains close to the fixed point probability of success as long as the approximation, statistical and optimization errors remain small."}]]}]]}],["$","$L30","2",{"heading":"2. GRPO With verifiable Rewards as an Adaptive Weighted Contrastive Loss","index":2,"length":15,"content":[["$","$La","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Let "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:0:1"}]]}]}],["$","$1","2",{"children":"be a distribution of prompts or questions, and let "}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:0:3:style","children":"r "}]}],["$","$1","4",{"children":"be a reward function that evaluates the output "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:0:5"}]]}]}],["$","$1","6",{"children":"of a policy. 
As discussed in the introduction, we restrict our analysis to verifiable rewards, meaning binary rewards, "}],["$","$1","7",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:0:7"}]]}]}],["$","$1","8",{"children":". Given a prompt "}],["$","$1","9",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:0:9"}]]}]}],["$","$1","10",{"children":", let "}],["$","$1","11",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:0:11"}]]}]}],["$","$1","12",{"children":"be the policy of an LLM, where "}],["$","$1","13",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:0:13:style","children":"o "}]}],["$","$1","14",{"children":"represents the sequence outcome and "}],["$","$1","15",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:0:15"}]]}]}],["$","$1","16",{"children":"the parameters of the model. 
"}],["$","$1","17",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:0:17"}]]}]}],["$","$1","18",{"children":"denotes the “old” policy or the policy from a previous iteration. "}],["$","$1","19",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:0:19"}]]}]}],["$","$1","20",{"children":"corresponds to the reference policy, and "}],["$","$1","21",{"children":"KL "}],["$","$1","22",{"children":"is the Kullback–Leibler divergence :"}]]}],["$","$La","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/2-8.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:0"}]]}]}]]}],["$","$La","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"For a regularization parameter 
"}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:1"}]]}]}],["$","$1","2",{"children":", we start by recalling GRPO’s optimization problem ["}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:3"}]}],["$","$1","4",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:4"}]}],["$","$1","5",{"children":", "}],["$","$1","6",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:6"}]}],["$","$1","7",{"children":"] :"}]]}],["$","$La","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/2-10.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:3:0"}]]}]}]]}],["$","$La","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"where the advantage for an outcome 
"}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:1:style","children":"o"}]}],["$","$1","2",{"children":", "}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:3:style","children":"A"}]}],["$","$1","4",{"children":"("}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:5:style","children":"q, o"}]}],["$","$1","6",{"children":") "}],["$","$1","7",{"children":"is given by:"}]]}],["$","$La","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/2-11.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:5:0"}]]}]}]]}],["$","$La","6",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We see that GRPO optimizes the whitened reward (the advantage 
"}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:6:1:style","children":"A"}]}],["$","$1","2",{"children":"("}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:6:3:style","children":"q, o"}]}],["$","$1","4",{"children":")"}],["$","$1","5",{"children":") using importance sampling from the “old” policy while maintaining the optimized policy close to "}],["$","$1","6",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:6:6"}]]}]}],["$","$1","7",{"children":"as measured by the "}],["$","$1","8",{"children":"KL "}],["$","$1","9",{"children":"divergence. If furthermore the clipping is used as in ("}],["$","$1","10",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:6:10"}]}],["$","$1","11",{"children":"), the likelihood ratio between the policy and the old policy is maintained within a range "}],["$","$1","12",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:6:12"}]]}]}],["$","$1","13",{"children":"."}]]}],["$","$La","7",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"2.1. 
"}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:7:1:style","children":"GRPO with Clipping. "}]}],["$","$1","2",{"children":"Note that in our context "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:7:3"}]]}]}],["$","$1","4",{"children":"and the advantage "}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:7:5:style","children":"A"}]}],["$","$1","6",{"children":"("}],["$","$1","7",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:7:7:style","children":"q, o"}]}],["$","$1","8",{"children":") "}],["$","$1","9",{"children":"can be positive or negative and hence if "}],["$","$1","10",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:7:10:style","children":"A"}]}],["$","$1","11",{"children":"("}],["$","$1","12",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:7:12:style","children":"q, o"}]}],["$","$1","13",{"children":") "}],["$","$1","14",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:7:14:style","children":"> "}]}],["$","$1","15",{"children":"0 "}],["$","$1","16",{"children":"we have 
:"}]]}],["$","$La","8",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/2-15.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:8:0"}]]}]}]]}],["$","$La","9",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"and if "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:9:1:style","children":"A"}]}],["$","$1","2",{"children":"("}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:9:3:style","children":"q, o"}]}],["$","$1","4",{"children":") "}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:9:5:style","children":"< "}]}],["$","$1","6",{"children":"0"}]]}],["$","$La","10",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/2-16.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:10:0"}]]}]}]]}],["$","$La","11",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/3-0.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:11:0"}]]}]}]]}],["$","$La","12",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Figure 1. 
"}],["$","$1","1",{"children":"Weighting of GRPO with the probability of success of the old policy."}]]}],["$","$La","13",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Recall that our reward "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:13:1:style","children":"r "}]}],["$","$1","2",{"children":"is a verifiable reward that evaluates correctness of a reasoning or the execution of the code meaning that "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:13:3"}]]}]}],["$","$1","4",{"children":". 
We note the probability of success of the old policy "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:13:5"}]]}]}],["$","$1","6",{"children":":"}]]}],["$","$La","14",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/3-3.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:14:0"}]]}]}]]}],["$","$La","15",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Hence we have for mean and variance of a Bernoulli random variable :"}]]}],["$","$La","16",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/3-4.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:16:0"}]]}]}]]}],["$","$La","17",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Assuming "}],["$","$1","1",{"children":"0 "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:17:2:style","children":"< p < "}]}],["$","$1","3",{"children":"1 "}],["$","$1","4",{"children":"and replacing mean and variance in the advantage function ("}],["$","$1","5",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:17:5"}]}],["$","$1","6",{"children":") we obtain :"}]]}],["$","$La","18",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/3-5.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:18:0"}]]}]}]]}],["$","$La","19",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Hence we have conditioning on "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:19:1:style","children":"q "}]}],["$","$1","2",{"children":"and using advantage expressions and "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:19:3"}]]}]}],["$","$1","4",{"children":"for positive and negative advantage we obtain:"}]]}],["$","$La","20",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/3-7.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:20:0"}]]}]}]]}],["$","$La","21",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"and hence the overall cost is obtained by taking expectation over "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:21:1:style","children":"q"}]}],["$","$1","2",{"children":", note that "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:21:3"}]]}]}],["$","$1","4",{"children":":"}]]}],["$","$La","22",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/3-9.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:22:0"}]]}]}]]}],["$","$La","23",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We see that GRPO is effectively a weighted contrastive loss that is weighted by a ratio depending on the probability of success of "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:23:1"}]]}]}],["$","$1","2",{"children":". 
We see from the weight plots that :"}]]}],["$","$La","24",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:24:0:style","children":"• "}]}],["$","$1","1",{"children":"if the success probability of the old policy is high ("}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:24:2:style","children":"p > "}]}],["$","$1","3",{"children":"0"}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:24:4:style","children":"."}]}],["$","$1","5",{"children":"5"}],["$","$1","6",{"children":"), the weighting for points with success is low since the old policy is already good, and for failing points the weight is high and hence they are more penalized."}]]}],["$","$La","25",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:25:0:style","children":"• "}]}],["$","$1","1",{"children":"if the success probability of the old policy is low ("}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:25:2:style","children":"p < 
"}]}],["$","$1","3",{"children":"0"}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:25:4:style","children":"."}]}],["$","$1","5",{"children":"5"}],["$","$1","6",{"children":"), the weighting for points with success is high since we want to reinforce those successes, and for failing points these are still penalized but with a small weight."}]]}],["$","$La","26",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"In summary, the standardized reward or the advantage function used in GRPO results in an interesting adaptive weighted contrastive loss : if the probability of success of the old policy is high, the wrong answers are more penalized than the correct ones are reinforced. If the probability of success of old policy is low, the correct answers are more reinforced than the wrong answers are penalized."}]]}],["$","$La","27",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"2.2. "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:27:1:style","children":"Stabilized GRPO with Clipping. 
"}]}],["$","$1","2",{"children":"Note that in the previous section we assumed that "}],["$","$1","3",{"children":"0 "}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:27:4:style","children":"< p < "}]}],["$","$1","5",{"children":"1"}],["$","$1","6",{"children":", we alleviate this in the following by adding a smoothing factor "}],["$","$1","7",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:27:7"}]]}]}],["$","$1","8",{"children":"in the advantage as follows:"}]]}],["$","$La","28",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/4-1.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:28:0"}]]}]}]]}],["$","$La","29",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"This results with the following 
advantage:"}]]}],["$","$La","30",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/4-2.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:30:0"}]]}]}]]}],["$","$La","31",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Replacing the stabilized advantage in Equation ("}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:31:1"}]}],["$","$1","2",{"children":"), conditionally on a prompt "}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:31:3:style","children":"q "}]}],["$","$1","4",{"children":"we obtain the following contrastive loss:"}]]}],["$","$La","32",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/4-3.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:32:0"}]]}]}]]}],["$","$La","33",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"which results in the following contrastive optimization :"}]]}],["$","$La","34",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/4-4.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:34:0"}]]}]}]]}],["$","$La","35",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"2.3. "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:35:1:style","children":"Stabilized GRPO with No Clipping. 
"}]}],["$","$1","2",{"children":"Taking the clipping parameter "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:35:3"}]]}]}],["$","$1","4",{"children":"we obtain GRPO with no clipping equivalent contrastive optimization as follows:"}]]}],["$","$La","36",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/4-6.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:36:0"}]]}]}]]}],["$","$La","37",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"which is equivalent to the following problem:"}]]}],["$","$La","38",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/4-7.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:38:0"}]]}]}]]}],["$","$La","39",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We will focus in what follows on this non-clipped version."}]]}],["$","$La","40",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/5-0.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:40:0"}]]}]}]]}],["$","$La","41",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"2.4. "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:41:1:style","children":"GRPO Iterations. 
"}]}],["$","$1","2",{"children":"Algorithm "}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:41:3"}]}],["$","$1","4",{"children":"summarizes GRPO iterations (Stabilized and no clipping). We see that GRPO iterations can be written as a sequence optimization we denote by "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:41:5"}]]}]}],["$","$1","6",{"children":", the policy at iteration "}],["$","$1","7",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:41:7:style","children":"n"}]}],["$","$1","8",{"children":". We see that GRPO iterations can be written for "}],["$","$1","9",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:41:9"}]]}]}],["$","$1","10",{"children":":"}]]}],["$","$La","42",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/5-3.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:42:0"}]]}]}]]}],["$","$La","43",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Note that in Algorithm "}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:43:1"}]}],["$","$1","2",{"children":", expectations are estimated using importance sampling from "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:43:3"}]]}]}],["$","$1","4",{"children":", and each maximization problem is solved via gradient for "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:43:5"}]]}]}],["$","$1","6",{"children":"steps."}]]}],["$","$La","44",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"In the following we will replace the maximization on the parameter space of the policy 
by maximizing over the space of policies (i.e., optimization on the probability space) in order to analyze the dynamics of GRPO iterations as follows, for "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:44:1"}]]}]}],["$","$1","2",{"children":":"}]]}],["$","$La","45",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:45:0"}]]}]}],["$","$1","1",{"children":"(GRPO Iterations)"}]]}],["$","$La","46",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-21","style":"$undefined","children":"where "}]}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:46:1"}]]}]}],["$","$1","2",{"children":"is the probability of success of the policy 
"}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:46:3"}]]}]}],["$","$1","4",{"children":":"}]]}],["$","$La","47",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/5-10.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:47:0"}]]}]}]]}],["$","$La","48",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"and the weights "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:48:1"}]]}]}],["$","$1","2",{"children":"and "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:48:3"}]]}]}],["$","$1","4",{"children":"are given in Equation ("}],["$","$1","5",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:48:5"}]}],["$","$1","6",{"children":"). We assume all throughout the paper that "}],["$","$1","7",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:48:7"}]]}]}],["$","$1","8",{"children":"Note that moving the optimization from a parametric space to the probability space can be seen as assuming that the hypothesis class of the parametric policies is large enough to represent all policies. 
Note that in GRPO iterations the policy at iteration "}],["$","$1","9",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:48:9:style","children":"n "}]}],["$","$1","10",{"children":"depends upon the policy "}],["$","$1","11",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:48:11"}]]}]}],["$","$1","12",{"children":"via the probability of success "}],["$","$1","13",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:48:13"}]]}]}],["$","$1","14",{"children":", as well as on the reference policy via the "}],["$","$1","15",{"children":"KL "}],["$","$1","16",{"children":"regularizer."}]]}]]}],["$","$L30","3",{"heading":"3. 
GRPO Dynamics: Fixed Point iteration for Probability of Success","index":3,"length":15,"content":[["$","$La","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-14","style":"$undefined","children":"Our goal in this Section is to analyze the dynamics of the GRPO iterations given in Equation "}]}],["$","$1","1",{"children":"("}],["$","$1","2",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:0:2"}]}],["$","$1","3",{"children":")."}]]}],["$","$La","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-30","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:1:0:style","children":"Theorem 1 "}]}],["$","$1","1",{"children":"(GRPO Policy Dynamic)"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:1:2:style","children":". 
"}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:1:3:style","children":"Optimal GRPO iterations policies solving Equation "}]}],["$","$1","4",{"children":"("}],["$","$1","5",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:1:5"}]}],["$","$1","6",{"children":") "}],["$","$1","7",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:1:7:style","children":"satisfy the following recursion, for "}]}],["$","$1","8",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:1:8"}]]}]}],["$","$1","9",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:1:9:style","children":":"}]}]]}],["$","$La","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/5-17.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:2:0"}]]}]}]]}],["$","$La","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:3:0:style","children":"where"}]}]]}],["$","$La","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/5-18.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:4:0"}]]}]}]]}],["$","$La","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:5:0:style","children":"where the weights 
"}]}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:5:1"}]]}]}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:5:2:style","children":"are given in Equation "}]}],["$","$1","3",{"children":"("}],["$","$1","4",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:5:4"}]}],["$","$1","5",{"children":")"}],["$","$1","6",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:5:6:style","children":", the probability of success "}]}],["$","$1","7",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:5:7"}]]}]}],["$","$1","8",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:5:8"}]]}]}],["$","$1","9",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:5:9:style","children":"is given in Equation 
"}]}],["$","$1","10",{"children":"("}],["$","$1","11",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:5:11"}]}],["$","$1","12",{"children":")"}],["$","$1","13",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:5:13:style","children":", and "}]}],["$","$1","14",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:5:14"}]]}]}],["$","$1","15",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:5:15:style","children":"is the probability of success of the reference policy "}]}],["$","$1","16",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:5:16"}]]}]}],["$","$1","17",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:5:17:style","children":"."}]}]]}],["$","$La","6",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We turn now to the recursion satisfied by the probability of success "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:6:1"}]]}]}],["$","$1","2",{"children":"of the policy "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:6:3"}]]}]}],["$","$1","4",{"children":"have the following theorem that shows that this success probability satisfies a fixed point iteration:"}]]}],["$","$La","7",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:7:0:style","children":"Theorem 2 "}]}],["$","$1","1",{"children":"(GRPO’s Probability of Success Fixed Point Iteration)"}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:7:2"}]]}]}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:7:3"}]]}]}]]}],["$","$La","8",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/6-9.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:8:0"}]]}]}]]}],["$","$La","9",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:9:0:style","children":"The probability of success along GRPO’s iteration satisfies the following fixed point iteration, i.e., we have almost surely for all "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:9:1:style","children":"q "}]}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:9:2:style","children":"for "}]}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:9:3"}]]}]}]]}],["$","$La","10",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/6-11.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:10:0"}]]}]}]]}],["$","$La","11",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:11:0"}]]}]}]]}],["$","$La","12",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:12:0:style","children":"Remark 1 "}]}],["$","$1","1",{"children":"(Importance of 
"}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:12:2"}]]}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:12:3:style","children":"is no longer continuous at "}]}],["$","$1","4",{"children":"0 "}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:12:5:style","children":"and "}]}],["$","$1","6",{"children":"1 "}],["$","$1","7",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:12:7:style","children":"and we "}]}],["$","$1","8",{"children":["$","span",null,{"tabIndex":-1,"id":"id-25","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:12:8:style","children":"can no longer guarantee existence of fixed points on "}]}],["$","$1","9",{"children":"[0"}],["$","$1","10",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:12:10:style","children":", 
"}]}],["$","$1","11",{"children":"1]"}],["$","$1","12",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:12:12:style","children":"."}]}]]}],["$","$La","13",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/6-14.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:13:0"}]]}]}]]}],["$","$La","14",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Figure 2. 
"}],["$","$1","1",{"children":"Fixed points as function of "}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:14:2"}]]}]}],["$","$1","3",{"children":"and "}],["$","$1","4",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:14:4"}]]}]}],["$","$1","5",{"children":"for "}],["$","$1","6",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:14:6"}]]}]}],["$","$1","7",{"children":"."}]]}],["$","$La","15",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We study in the following proposition properties of the function "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:15:1"}]]}]}],["$","$1","2",{"children":":"}]]}],["$","$La","16",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-24","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:16:0:style","children":"Proposition 1 "}]}],["$","$1","1",{"children":"(Properties of "}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:16:2"}]]}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:16:3:style","children":"satisfies the following properties:"}]}]]}],["$","$La","17",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/6-20.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:17:0"}]]}]}]]}],["$","$La","18",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We drop in the sequel "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:18:1:style","children":"q"}]}],["$","$1","2",{"children":", when referring to the sequence "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:18:3"}]]}]}],["$","$1","4",{"children":", and write for short "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:18:5"}]]}]}],["$","$1","6",{"children":"(the reader is referred to Remark "}],["$","$1","7",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:18:7"}]}],["$","$1","8",{"children":"for a discussion). 
If the sequence defined in GRPO’s probability of success iteration ("}],["$","$1","9",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:18:9"}]}],["$","$1","10",{"children":") converges we have therefore by continuity of "}],["$","$1","11",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:18:11"}]]}]}],["$","$1","12",{"children":":"}]]}],["$","$La","19",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/6-24.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:19:0"}]]}]}]]}],["$","$La","20",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"and hence "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:20:1"}]]}]}],["$","$1","2",{"children":", and the limit 
point probability of success of GRPO "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:20:3"}]]}]}],["$","$1","4",{"children":"point of "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:20:5"}]]}]}],["$","$1","6",{"children":"(fixed points exist by virtue of proposition "}],["$","$1","7",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:20:7"}]}],["$","$1","8",{"children":"). Note that the fixed point "}],["$","$1","9",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:20:9"}]]}]}],["$","$1","10",{"children":"is indeed a function of "}],["$","$1","11",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:20:11:style","children":"q"}]}],["$","$1","12",{"children":", and this dependency in "}],["$","$1","13",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:20:13"}]]}]}],["$","$1","14",{"children":"is via "}],["$","$1","15",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:20:15"}]]}]}],["$","$1","16",{"children":"."}]]}],["$","$La","21",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We see in Figure "}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:21:1"}]}],["$","$1","2",{"children":"various plots of the function "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:21:3"}]]}]}],["$","$1","4",{"children":"for different values of "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:21:5"}]]}]}],["$","$1","6",{"children":"and initialization "}],["$","$1","7",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:21:7"}]]}]}],["$","$1","8",{"children":", as well as the plot of the function "}],["$","$1","9",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:21:9:style","children":"y "}]}],["$","$1","10",{"children":"= "}],["$","$1","11",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:21:11:style","children":"p"}]}],["$","$1","12",{"children":". Fixed points correspond to the intersections of this line with the curve of "}],["$","$1","13",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:21:13"}]]}]}],["$","$1","14",{"children":". We see that the fixed points are not unique in general, and "}],["$","$1","15",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:21:15"}]]}]}],["$","$1","16",{"children":"is almost always a fixed point."}]]}]]}],["$","$L30","4",{"heading":"4. 
GRPO: Fixed Point Iteration Convergence and Success Amplification","index":4,"length":15,"content":[["$","$La","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-15","style":"$undefined","children":"In this Section we answer the following two questions:"}]}]]}],["$","$La","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"(1) Do fixed points of GRPO iterations, "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:1:1"}]]}]}],["$","$1","2",{"children":", lead to a probability of success "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:1:3"}]]}]}],["$","$1","4",{"children":"that is higher than the reference initialization "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:1:5"}]]}]}],["$","$1","6",{"children":"?"}]]}],["$","$La","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"(2) Under which conditions do we have local convergence of the GRPO’s probability of success sequence given in ("}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:2:1"}]}],["$","$1","2",{"children":") to a fixed point "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:2:3"}]]}]}],["$","$1","4",{"children":"of "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:2:5"}]]}]}],["$","$1","6",{"children":"?"}]]}],["$","$La","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-26","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:3:0:style","children":"Theorem 3 
"}]}],["$","$1","1",{"children":"(GRPO amplifies the probability of success)"}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:3:2"}]]}]}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:3:3"}]]}]}]]}],["$","$La","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We see from Theorem "}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:4:1"}]}],["$","$1","2",{"children":"that the fixed point "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:4:3"}]]}]}],["$","$1","4",{"children":"of the GRPO iteration leads to an amplification of the probability of success of the reference model "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:4:5"}]]}]}],["$","$1","6",{"children":", the iteration will lead to "}],["$","$1","7",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:4:7"}]]}]}],["$","$1","8",{"children":"and "}],["$","$1","9",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:4:9"}]]}]}],["$","$1","10",{"children":"respectively."}]]}],["$","$La","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We now turn to the second question regarding the convergence of the GRPO sequence given in ("}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:5:1"}]}],["$","$1","2",{"children":") to a fixed point "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:5:3"}]]}]}],["$","$1","4",{"children":". 
Given the properties of "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:5:5"}]]}]}],["$","$1","6",{"children":", we can characterize the limit point of the GRPO iteration as "}],["$","$1","7",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:5:7"}]]}]}],["$","$1","8",{"children":"as follows, as a consequence of the local Banach fixed-point theorem:"}]]}],["$","$La","6",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-42","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:0:style","children":"Theorem 4 "}]}],["$","$1","1",{"children":"(Local Fixed Point Convergence)"}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:2"}]]}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:3:style","children":"be a fixed point of 
"}]}],["$","$1","4",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:4"}]]}]}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:5:style","children":"and assume that we have "}]}],["$","$1","6",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:6"}]]}]}],["$","$1","7",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:7:style","children":". Given that "}]}],["$","$1","8",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:8"}]]}]}],["$","$1","9",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:9:style","children":"are continuous in "}]}],["$","$1","10",{"children":"[0"}],["$","$1","11",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:11:style","children":", 
"}]}],["$","$1","12",{"children":"1]"}],["$","$1","13",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:13:style","children":", then there exists "}]}],["$","$1","14",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:14"}]]}]}],["$","$1","15",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:15:style","children":"iteration "}]}],["$","$1","16",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:16"}]]}]}],["$","$1","17",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:17:style","children":"converges to "}]}],["$","$1","18",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:18"}]]}]}],["$","$1","19",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:19:style","children":". 
In other words under this condition we have:"}]}]]}],["$","$La","7",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/7-32.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:7:0"}]]}]}]]}],["$","$La","8",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:8:0:style","children":"Lemma 1. 
"}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:8:1:style","children":"Let "}]}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:8:2"}]]}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:8:3:style","children":"be a fixed point: "}]}],["$","$1","4",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:8:4"}]]}]}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:8:5:style","children":", then we have:"}]}]]}],["$","$La","9",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/7-35.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:9:0"}]]}]}]]}],["$","$La","10",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"One condition for local convergence is therefore to have: "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:10:1"}]]}]}]]}],["$","$La","11",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"1 "}],["$","$1","1",{"children":"which is satisfied if : "}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:11:2"}]]}]}]]}],["$","$La","12",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We see from Figure 
"}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:12:1"}]}],["$","$1","2",{"children":"the lower bound on "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:12:3"}]]}]}],["$","$1","4",{"children":"required to ensure local convergence of GRPO iterations to a fixed point "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:12:5"}]]}]}],["$","$1","6",{"children":"shows iteration ("}],["$","$1","7",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:12:7"}]}],["$","$1","8",{"children":") as a function of "}],["$","$1","9",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:12:9:style","children":"n "}]}],["$","$1","10",{"children":"for different values of "}],["$","$1","11",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:12:11"}]]}]}],["$","$1","12",{"children":"and "}],["$","$1","13",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:12:13"}]]}]}],["$","$1","14",{"children":". We see that in most cases, there is a sharp transition where we observe fast convergence to "}],["$","$1","15",{"children":"1 "}],["$","$1","16",{"children":"or to a fixed point "}],["$","$1","17",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:12:17"}]]}]}],["$","$1","18",{"children":". For "}],["$","$1","19",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:12:19"}]]}]}],["$","$1","20",{"children":"and "}],["$","$1","21",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:12:21"}]]}]}],["$","$1","22",{"children":", we see a divergent behavior."}]]}],["$","$La","13",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-22","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:13:0:style","children":"Remark 2. 
"}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:13:1:style","children":"Note that the condition on "}]}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:13:2"}]]}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:13:3:style","children":"is stated conditionally on a prompt "}]}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:13:4:style","children":"q"}]}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:13:5:style","children":", to obtain results uniformly on "}]}],["$","$1","6",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:13:6:style","children":"q "}]}],["$","$1","7",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:13:7:style","children":"we need to take "}]}],["$","$1","8",{"children":"sup "}],["$","$1","9",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:13:9:style","children":"on 
"}]}],["$","$1","10",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:13:10:style","children":"q "}]}],["$","$1","11",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:13:11:style","children":"in all lower bounds."}]}]]}],["$","$La","14",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/8-0.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:14:0"}]]}]}]]}],["$","$La","15",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Figure 3. 
"}],["$","$1","1",{"children":"Lower bound on "}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:15:2"}]]}]}],["$","$1","3",{"children":"to ensure local convergence of GRPO fixed point iteration."}]]}],["$","$La","16",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/8-2.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:16:0"}]]}]}]]}],["$","$La","17",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Figure 4. 
"}],["$","$1","1",{"children":"GRPO Recursion and convergence to fixed points of "}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:17:2"}]]}]}],["$","$1","3",{"children":", for "}],["$","$1","4",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:17:4"}]]}]}]]}]]}],["$","$L30","5",{"heading":"5. Back to Parametric GRPO Iterations","index":5,"length":15,"content":[["$","$La","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-16","style":"$undefined","children":"Let "}]}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:0:1"}]]}]}],["$","$1","2",{"children":", the sequence of parametric policies solutions of problem ("}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:0:3"}]}],["$","$1","4",{"children":") produced by gradient descent for example as in Algorithm 
"}],["$","$1","5",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:0:5"}]}],["$","$1","6",{"children":". We make the following assumption on the total variation distance "}],["$","$1","7",{"children":"TV "}],["$","$1","8",{"children":"between these parametric policies and the non-parametric GRPO policies "}],["$","$1","9",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:0:9"}]]}]}],["$","$1","10",{"children":"given in Theorem "}],["$","$1","11",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:0:11"}]}],["$","$1","12",{"children":". We show in this Section if we have approximate policies we can have still asymptotic convergence."}]]}],["$","$La","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:1:0"}]]}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:1:1:style","children":"and assume for all "}]}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:1:2"}]]}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:1:3:style","children":", there exists "}]}],["$","$1","4",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:1:4"}]]}]}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:1:5:style","children":"such that:"}]}]]}],["$","$La","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/9-5.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:2:0"}]]}]}]]}],["$","$La","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We have the following 
theorem:"}]]}],["$","$La","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-32","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:4:0:style","children":"Theorem 5. "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:4:1:style","children":"Under Assumption "}]}],["$","$1","2",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:4:2"}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:4:3:style","children":"and assuming that "}]}],["$","$1","4",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:4:4"}]]}]}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:4:5:style","children":"converges to "}]}],["$","$1","6",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:4:6"}]]}]}],["$","$1","7",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:4:7:style","children":"the fixed point of "}]}],["$","$1","8",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:4:8"}]]}]}],["$","$1","9",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:4:9:style","children":". Let "}]}],["$","$1","10",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:4:10"}]]}]}],["$","$1","11",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:4:11:style","children":"the probability of success of the policy "}]}],["$","$1","12",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:4:12"}]]}]}],["$","$1","13",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:4:13:style","children":"we have:"}]}]]}],["$","$La","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/9-11.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:5:0"}]]}]}]]}],["$","$La","6",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"In Assumption "}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:6:1"}]}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:6:2"}]]}]}],["$","$1","3",{"children":"represent statistical, approximation and optimization errors. 
We see from Theorem "}],["$","$1","4",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:6:4"}]}],["$","$1","5",{"children":", that as long as these errors remain small, the probability of success of the GRPO parametric policy (estimated from samples and optimized for instance with gradient descent) remains close to the fixed-point probability of success "}],["$","$1","6",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:5:paragraphs:6:6"}]]}]}],["$","$1","7",{"children":"."}]]}]]}],["$","$L30","6",{"heading":"6. Experimental Validation","index":6,"length":15,"content":[["$","$La","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:0:style","children":"Setup. 
"}]}],["$","$1","1",{"children":"We use the "}],["$","$1","2",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:2"}]}],["$","$1","3",{"children":"dataset from "}],["$","$1","4",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:4"}]}],["$","$1","5",{"children":"["}],["$","$1","6",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:6"}]}],["$","$1","7",{"children":"] (MIT license), and "}],["$","$1","8",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:8:style","children":"Qwen2.5-0.5B-Instruct "}]}],["$","$1","9",{"children":"(Apache 2.0 license) by "}],["$","$1","10",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:10"}]}],["$","$1","11",{"children":"["}],["$","$1","12",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:12"}]}],["$","$1","13",{"children":"] as the reference policy. We use the GRPO implementation in TRL ["}],["$","$1","14",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:14"}]}],["$","$1","15",{"children":", "}],["$","$1","16",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:16"}]}],["$","$1","17",{"children":"], and train on the training split of "}],["$","$1","18",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:18:style","children":"GSM8K "}]}],["$","$1","19",{"children":"on a node with 8 GPUs (GPU0 for the vLLM server and 7 other GPUs for distributed training). 
We use a learning "}],["$","$1","20",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:20"}]]}]}],["$","$1","21",{"children":"the KL regularizer "}],["$","$1","22",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:22"}]]}]}],["$","$1","23",{"children":"in Algorithm "}],["$","$1","24",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:24"}]}],["$","$1","25",{"children":"is set to "}],["$","$1","26",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:26"}]]}]}],["$","$1","27",{"children":". Other hyperparameters are given in Appendix "}],["$","$1","28",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:28"}]}],["$","$1","29",{"children":". 
We use the correctness of the LLM output as a reward."}]]}],["$","$La","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:1:0:style","children":"Success Rate Amplification. "}]}],["$","$1","1",{"children":"The success rate of the policy is then evaluated on the test set consisting of 1319 math questions, where for each question the success rate is evaluated using "}],["$","$1","2",{"children":"50 "}],["$","$1","3",{"children":"samples. We see a success rate amplification from "}],["$","$1","4",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:1:4"}]]}]}],["$","$1","5",{"children":"originally (averaged on all prompts) at "}],["$","$1","6",{"children":"21% "}],["$","$1","7",{"children":"to "}],["$","$1","8",{"children":"37"}],["$","$1","9",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:1:9:style","children":"."}]}],["$","$1","10",{"children":"5% "}],["$","$1","11",{"children":"at the end of the GRPO 
epoch."}]]}],["$","$La","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:2:0:style","children":"Trajectory of Success rates Along GRPO Iterations. "}]}],["$","$1","1",{"children":"We randomly select a few prompts from the GSM8K test set and plot in Figure "}],["$","$1","2",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:2:2"}]}],["$","$1","3",{"children":"the trajectory of the success rate of the model along the GRPO iterations (estimated from 50 samples from the model for each prompt). The success rate is computed from checkpoints of the model along the GRPO training. We see that the trajectory of the success rate "}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:2:4:style","children":"p"}]}],["$","$1","5",{"children":"("}],["$","$1","6",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:2:6:style","children":"q"}]}],["$","$1","7",{"children":") "}],["$","$1","8",{"children":"resembles the trajectory of a fixed point algorithm (see Figure "}],["$","$1","9",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:2:9"}]}],["$","$1","10",{"children":"in Appendix "}],["$","$1","11",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:2:11:style","children":"?? "}]}],["$","$1","12",{"children":"). 
For some points the convergence is fast to the limit point "}],["$","$1","13",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:2:13"}]]}]}],["$","$1","14",{"children":", for others we see an oscillatory behavior (similar to the one in the last row of Figure "}],["$","$1","15",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:2:15"}]}],["$","$1","16",{"children":"). Interestingly when "}],["$","$1","17",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:2:17"}]]}]}],["$","$1","18",{"children":", the probability of success does not move much along GRPO iterations as predicted by our theory."}]]}],["$","$La","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:3:0"}]]}]}],["$","$1","1",{"children":"We compute the success rate "}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:3:2"}]]}]}],["$","$1","3",{"children":"for all prompts "}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:3:4:style","children":"q "}]}],["$","$1","5",{"children":"in the test set of GSM8K, and compute the fixed point of "}],["$","$1","6",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:3:6"}]]}]}],["$","$1","7",{"children":"using a fixed point algorithm. This gives us "}],["$","$1","8",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:3:8"}]]}]}],["$","$1","9",{"children":"for all points in the test set. 
We compute the "}],["$","$1","10",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:3:10"}]]}]}],["$","$1","11",{"children":"error between the success rate along the GRPO training "}],["$","$1","12",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:3:12"}]]}]}],["$","$1","13",{"children":"for a given prompt and the corresponding fixed point and plot the expected error on the prompts "}],["$","$1","14",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:3:14"}]]}]}],["$","$1","15",{"children":"in Figure "}],["$","$1","16",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:3:16"}]}],["$","$1","17",{"children":". 
We see that we have improvement"}]]}],["$","$La","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/10-0.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:4:0"}]]}]}]]}],["$","$La","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Figure 5. "}],["$","$1","1",{"children":"Fixed-point-like behavior of success rate along GRPO iterations."}]]}],["$","$La","6",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"in the convergence to fixed points up to a barrier due to computational and statistical errors as discussed in Section "}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:6:1"}]}],["$","$1","2",{"children":". (Code to reproduce plots is in Supplementary material)"}]]}]]}],["$","$L30","7",{"heading":"7. 
Conclusion","index":7,"length":15,"content":[["$","$La","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"In conclusion, we have shown that GRPO with verifiable rewards can be viewed as an adaptive weighted contrastive loss (Section "}],["$","$1","1",{"children":"2"}],["$","$1","2",{"children":"). We derived a closed-form recursion for the optimal policy, expressed in terms of the reference and old policies, and the probability of success (Section "}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:7:paragraphs:0:3"}]}],["$","$1","4",{"children":"). This leads to a fixed-point equation, with GRPO’s probability of success converging to a fixed-point solution under mild assumptions (Section "}],["$","$1","5",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:7:paragraphs:0:5"}]}],["$","$1","6",{"children":"). Moreover, we proved that GRPO amplifies the probability of success compared to the reference model (Section "}],["$","$1","7",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:7:paragraphs:0:7"}]}],["$","$1","8",{"children":"). 
Finally, we showed that for approximate policies, the probability of success remains close to the fixed-point value as long as the approximation, statistical, and optimization errors are small (Section "}],["$","$1","9",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:7:paragraphs:0:9"}]}],["$","$1","10",{"children":")."}]]}]]}],["$","$L30","8",{"heading":"Acknowledgement","index":8,"length":15,"content":[["$","$La","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We thank Omar El Mansouri for pointing out that no condition on "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:0:1"}]]}]}],["$","$1","2",{"children":"is needed in Theorem "}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:0:3"}]}],["$","$1","4",{"children":"."}]]}]]}],["$","$L30","9",{"heading":"References","index":9,"length":15,"content":[["$","$La","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-33","style":"$undefined","children":"K. Cobbe, V. Kosaraju, M. Bavarian, M. Chen, H. Jun, L. Kaiser, M. Plappert, J. Tworek, J. Hilton, "}]}],["$","$1","1",{"children":"R. Nakano, C. Hesse, and J. Schulman. Training verifiers to solve math word problems. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:0:2:style","children":"arXiv preprint arXiv:2110.14168"}]}],["$","$1","3",{"children":", 2021."}]]}],["$","$La","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-4","style":"$undefined","children":"L. Engstrom, A. Ilyas, S. Santurkar, D. Tsipras, F. Janoos, L. Rudolph, and A. Madry. Implementation "}]}],["$","$1","1",{"children":"matters in deep rl: A case study on ppo and trpo. In "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:1:2:style","children":"International Conference on Learning Representations"}]}],["$","$1","3",{"children":", 2020. URL "}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:1:4:style","children":"https://openreview"}]}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:1:5:style","children":"."}]}],["$","$1","6",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:1:6"}]}],["$","$1","7",{"children":"."}]]}],["$","$La","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-12","style":"$undefined","children":"L. Gao, J. 
Schulman, and J. Hilton. Scaling laws for reward model overoptimization. In "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:2:1:style","children":"International Conference on Machine Learning"}]}],["$","$1","2",{"children":", pages 10835–10866. PMLR, 2023."}]]}],["$","$La","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-11","style":"$undefined","children":"M. Y. Guan, M. Joglekar, E. Wallace, S. Jain, B. Barak, A. Helyar, R. Dias, A. Vallone, H. Ren, "}]}],["$","$1","1",{"children":"J. Wei, H. W. Chung, S. Toyer, J. Heidecke, A. Beutel, and A. Glaese. Deliberative alignment: Reasoning enables safer language models, 2025. URL "}],["$","$1","2",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:3:2"}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:3:3:style","children":"."}]}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:3:4:style","children":"org/abs/2412"}]}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:3:5:style","children":"."}]}],["$","$1","6",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:3:6:style","children":"16339"}]}],["$","$1","7",{"children":"."}]]}],["$","$La","4",{"component":"p","variant":"paperBody
1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-1","style":"$undefined","children":"D. Guo, D. Yang, H. Zhang, J. Song, R. Zhang, R. Xu, Q. Zhu, S. Ma, P. Wang, X. Bi, et al. "}]}],["$","$1","1",{"children":"Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:4:2:style","children":"arXiv preprint arXiv:2501.12948"}]}],["$","$1","3",{"children":", 2025."}]]}],["$","$La","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-5","style":"$undefined","children":"S. Huang, M. Noukhovitch, A. Hosseini, K. Rasul, W. Wang, and L. Tunstall. The n+ implementation "}]}],["$","$1","1",{"children":"details of RLHF with PPO: A case study on TL;DR summarization. In "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:5:2:style","children":"First Conference on Language Modeling"}]}],["$","$1","3",{"children":", 2024. 
URL "}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:5:4:style","children":"https://openreview"}]}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:5:5:style","children":"."}]}],["$","$1","6",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:5:6"}]}],["$","$1","7",{"children":"."}]]}],["$","$La","6",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-9","style":"$undefined","children":"Hugging Face. Open-R1. "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:6:1:style","children":"https://github"}]}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:6:2:style","children":"."}]}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:6:3"}]}],["$","$1","4",{"children":", 2024."}]]}],["$","$La","7",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-6","style":"$undefined","children":"W. Kwon, Z. Li, S. Zhuang, Y. Sheng, L. Zheng, C. H. Yu, J. E. Gonzalez, H. Zhang, and I. Stoica. 
"}]}],["$","$1","1",{"children":"Efficient memory management for large language model serving with pagedattention. In "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:7:2:style","children":"Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles"}]}],["$","$1","3",{"children":", 2023."}]]}],["$","$La","8",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-8","style":"$undefined","children":"N. Lambert, J. Morrison, V. Pyatkin, S. Huang, H. Ivison, F. Brahman, L. J. V. Miranda, A. Liu, "}]}],["$","$1","1",{"children":"N. Dziri, S. Lyu, et al. Tülu 3: Pushing frontiers in open language model post-training. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:8:2:style","children":"arXiv preprint arXiv:2411.15124"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$La","9",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-7","style":"$undefined","children":"A. Liu, B. Feng, B. Xue, B. Wang, B. Wu, C. Lu, C. Zhao, C. Deng, C. Zhang, C. Ruan, et al. "}]}],["$","$1","1",{"children":"Deepseek-v3 technical report. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:9:2:style","children":"arXiv preprint arXiv:2412.19437"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$La","10",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-10","style":"$undefined","children":"M. Luo, S. Tan, J. Wong, X. Shi, W. Y. Tang, M. Roongta, C. Cai, J. Luo, T. Zhang, L. E. Li, "}]}],["$","$1","1",{"children":"R. A. Popa, and I. Stoica. Deepscaler: Surpassing o1-preview with a 1.5b model by scaling rl. "}],["$","$1","2",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:10:2"}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:10:3:style","children":"."}]}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:10:4:style","children":"com/5e9rs33z"}]}],["$","$1","5",{"children":", 2025. Notion Blog."}]]}],["$","$La","11",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-41","style":"$undefined","children":"Y. Mroueh. Information theoretic guarantees for policy alignment in large language models, 2024. 
"}]}],["$","$1","1",{"children":"URL "}],["$","$1","2",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:11:2"}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:11:3:style","children":"."}]}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:11:4:style","children":"org/abs/2406"}]}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:11:5:style","children":"."}]}],["$","$1","6",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:11:6:style","children":"05883"}]}],["$","$1","7",{"children":"."}]]}],["$","$La","12",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-38","style":"$undefined","children":"A. Paszke, S. Gross, F. Massa, A. Lerer, J. Bradbury, G. Chanan, T. Killeen, Z. Lin, N. Gimelshein, "}]}],["$","$1","1",{"children":"L. Antiga, A. Desmaison, A. Köpf, E. Yang, Z. DeVito, M. Raison, A. Tejani, S. Chilamkurthy, B. Steiner, L. Fang, J. Bai, and S. Chintala. PyTorch: An Imperative Style, High-Performance Deep Learning Library, Dec. 
2019."}]]}],["$","$La","13",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-3","style":"$undefined","children":"J. Schulman, F. Wolski, P. Dhariwal, A. Radford, and O. Klimov. Proximal policy optimization "}]}],["$","$1","1",{"children":"algorithms. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:13:2:style","children":"arXiv preprint arXiv:1707.06347"}]}],["$","$1","3",{"children":", 2017."}]]}],["$","$La","14",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-0","style":"$undefined","children":"Z. Shao, P. Wang, Q. Zhu, R. Xu, J. Song, X. Bi, H. Zhang, M. Zhang, Y. Li, Y. Wu, et al. "}]}],["$","$1","1",{"children":"Deepseekmath: Pushing the limits of mathematical reasoning in open language models. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:14:2:style","children":"arXiv preprint arXiv:2402.03300"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$La","15",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-2","style":"$undefined","children":"R. S. Sutton and A. G. Barto. 
"}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:15:1:style","children":"Reinforcement Learning: An Introduction"}]}],["$","$1","2",{"children":". MIT Press, 1998. URL "}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:15:3:style","children":"http://www"}]}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:15:4:style","children":"."}]}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:15:5:style","children":"cs"}]}],["$","$1","6",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:15:6:style","children":"."}]}],["$","$1","7",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:15:7:style","children":"ualberta"}]}],["$","$1","8",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:15:8:style","children":"."}]}],["$","$1","9",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:15:9"}]}],["$","$1","10",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:15:10:style","children":"."}]}],["$","$1","11",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:15:11:style","children":"html"}]}],["$","$1","12",{"c
hildren":"."}]]}],["$","$La","16",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-13","style":"$undefined","children":"M. Vojnovic and S.-Y. Yun. What is the alignment objective of grpo?, 2025. URL "}]}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:16:1"}]}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:16:2:style","children":"arxiv"}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:16:3:style","children":"."}]}],["$","$1","4",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:16:4"}]}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:16:5:style","children":"."}]}],["$","$1","6",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:16:6:style","children":"18548"}]}],["$","$1","7",{"children":"."}]]}],["$","$La","17",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-40","style":"$undefined","children":"L. von Werra, Y. Belkada, L. Tunstall, E. Beeching, T. Thrush, N. Lambert, S. Huang, K. 
Rasul, and "}]}],["$","$1","1",{"children":"Q. Gallouédec. Trl: Transformer reinforcement learning. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:17:2:style","children":"https://github"}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:17:3:style","children":"."}]}],["$","$1","4",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:17:4"}]}],["$","$1","5",{"children":", 2020a."}]]}],["$","$La","18",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-35","style":"$undefined","children":"L. von Werra, Y. Belkada, L. Tunstall, E. Beeching, T. Thrush, N. Lambert, S. Huang, K. Rasul, and "}]}],["$","$1","1",{"children":"Q. Gallouédec. Trl: Transformer reinforcement learning. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:18:2:style","children":"https://github"}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:18:3:style","children":"."}]}],["$","$1","4",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:18:4"}]}],["$","$1","5",{"children":", 2020b."}]]}],["$","$La","19",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-39","style":"$undefined","children":"T. Wolf, L. Debut, V. Sanh, J. Chaumond, C. Delangue, A. Moi, P. Cistac, T. Rault, R. Louf, "}]}],["$","$1","1",{"children":"M. Funtowicz, J. Davison, S. Shleifer, P. von Platen, C. Ma, Y. Jernite, J. Plu, C. Xu, T. L. Scao, S. Gugger, M. Drame, Q. Lhoest, and A. M. Rush. HuggingFace’s Transformers: State-of-the-art Natural Language Processing, July 2020."}]]}],["$","$La","20",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-34","style":"$undefined","children":"A. Yang, B. Yang, B. Hui, B. Zheng, B. Yu, C. Zhou, C. Li, C. Li, D. Liu, F. Huang, G. Dong, "}]}],["$","$1","1",{"children":"H. Wei, H. Lin, J. Tang, J. Wang, J. Yang, J. Tu, J. Zhang, J. Ma, J. Yang, J. Xu, J. Zhou, J. Bai, J. He, J. Lin, K. Dang, K. Lu, K. Chen, K. Yang, M. Li, M. Xue, N. Ni, P. Zhang, P. Wang, R. Peng, R. Men, R. Gao, R. Lin, S. Wang, S. Bai, S. 
Tan, T. Zhu, T. Li, T. Liu, W. Ge, X. Deng, X. Zhou, X. Ren, X. Zhang, X. Wei, X. Ren, X. Liu, Y. Fan, Y. Yao, Y. Zhang, Y. Wan, Y. Chu, Y. Liu, Z. Cui, Z. Zhang, Z. Guo, and Z. Fan. Qwen2 Technical Report, Sept. 2024."}]]}]]}],["$","$L30","10",{"heading":"Appendix A. Broader Impact and Limitations","index":10,"length":15,"content":[["$","$La","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Our paper shows the GRPO advantage while simple enjoys a lot of good theoretical properties that ensure success rate amplification when using verifiable rewards. The main limitation in our work is that we study the non-clipped objective. This simplification makes the analysis possible and was used recently in ["}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:10:paragraphs:0:1"}]}],["$","$1","2",{"children":", "}],["$","$1","3",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:10:paragraphs:0:3"}]}],["$","$1","4",{"children":"]. Extending the results to clipped objectives or to proximal objectives with "}],["$","$1","5",{"children":"Kl "}],["$","$1","6",{"children":"regularization with respect to previous policy remains challenging technically and we hope the techniques introduced in this work will help overcoming those technical challenges. The core contribution being theoretical we don’t see negative societal impact of our work."}]]}]]}],["$","$L30","11",{"heading":"Appendix B. 
Assets","index":11,"length":15,"content":[["$","$La","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-36","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:11:paragraphs:0:0:style","children":"Hardware setup. "}]}],["$","$1","1",{"children":"All our experiments were run on one compute node with Dual 48-core Intel Xeon 8468, 2TB of RAM, 8 NVIDIA HGX H100 80GB SMX5, 8x 3.4TB Enterprise NVMe U.2 Gen4, and 10x NVIDIA Mellanox Infiniband Single port NDR adapters, running RedHat Enterprise Linux 9.5."}]]}],["$","$La","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:11:paragraphs:1:0:style","children":"GRPO Config Setup. "}]}],["$","$1","1",{"children":"We use the group size "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:11:paragraphs:1:2:style","children":"G "}]}],["$","$1","3",{"children":"= 16 "}],["$","$1","4",{"children":"and per-device batch size "}],["$","$1","5",{"children":"16 "}],["$","$1","6",{"children":"meaning each on each GPU a single prompt "}],["$","$1","7",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:11:paragraphs:1:7:style","children":"x "}]}],["$","$1","8",{"children":"with "}],["$","$1","9",{"children":"16 "}],["$","$1","10",{"children":"corresponding responses is processed. 
To increase the overall batchsize we use gradient accumulation of "}],["$","$1","11",{"children":"4"}],["$","$1","12",{"children":", ending with an effective batch size of prompts of "}],["$","$1","13",{"children":"28"}],["$","$1","14",{"children":". The context length used for this experiment is "}],["$","$1","15",{"children":"200"}],["$","$1","16",{"children":", and the sampling temperature is set to "}],["$","$1","17",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:11:paragraphs:1:17"}]]}]}]]}],["$","$La","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:11:paragraphs:2:0:style","children":"Libraries. 
"}]}],["$","$1","1",{"children":"Our experiments rely on the open-source libraries "}],["$","$1","2",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:11:paragraphs:2:2"}]}],["$","$1","3",{"children":"["}],["$","$1","4",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:11:paragraphs:2:4"}]}],["$","$1","5",{"children":", "}],["$","$1","6",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:11:paragraphs:2:6"}]}],["$","$1","7",{"children":"] (license: BSD), "}],["$","$1","8",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:11:paragraphs:2:8"}]}],["$","$1","9",{"children":"["}],["$","$1","10",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:11:paragraphs:2:10"}]}],["$","$1","11",{"children":", "}],["$","$1","12",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:11:paragraphs:2:12"}]}],["$","$1","13",{"children":"] (Apache 2.0 license), and "}],["$","$1","14",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:11:paragraphs:2:14"}]}],["$","$1","15",{"children":"["}],["$","$1","16",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:11:paragraphs:2:16"}]}],["$","$1","17",{"children":", "}],["$","$1","18",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:11:paragraphs:2:18"}]}],["$","$1","19",{"children":"] (Apache 2.0 license)."}]]}]]}],["$","$L30","12",{"heading":"Appendix C. 
Proofs of Section 3","index":12,"length":15,"content":[["$","$La","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:0:0:style","children":"Proof of Theorem "}]}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:0:1"}]}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:0:2:style","children":". "}]}],["$","$1","3",{"children":"The objective in Equation ("}],["$","$1","4",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:0:4"}]}],["$","$1","5",{"children":") is concave and hence setting the first order optimality conditions (See for example ["}],["$","$1","6",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:0:6"}]}],["$","$1","7",{"children":", "}],["$","$1","8",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:0:8"}]}],["$","$1","9",{"children":"] ) we obtain:"}]]}],["$","$La","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/12-1.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:1:0"}]]}]}]]}],["$","$La","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"where"}]]}],["$","$La","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/12-2.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:3:0"}]]}]}]]}],["$","$La","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:4:0:style","children":"Proof of Theorem 
"}]}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:4:1"}]}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:4:2:style","children":". "}]}],["$","$1","3",{"children":"Replacing "}],["$","$1","4",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:4:4"}]]}]}],["$","$1","5",{"children":"by its expression from Theorem "}],["$","$1","6",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:4:6"}]}],["$","$1","7",{"children":"we have:"}]]}],["$","$La","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/13-1.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:5:0"}]]}]}]]}],["$","$La","6",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Replacing the weights expressions from Equations 
("}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:6:1"}]}],["$","$1","2",{"children":") we obtain:"}]]}],["$","$La","7",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/13-2.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:7:0"}]]}]}]]}],["$","$La","8",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Define"}]]}],["$","$La","9",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/13-3.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:9:0"}]]}]}]]}],["$","$La","10",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We see therefore that GRPO’s probability of success satisfies the following iteration :"}]]}],["$","$La","11",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/13-4.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:11:0"}]]}]}]]}],["$","$La","12",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We assume here that "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:12:1"}]]}]}],["$","$1","2",{"children":".We can simplify "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:12:3"}]]}]}],["$","$1","4",{"children":"as follows:"}]]}],["$","$La","13",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/13-7.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:13:0"}]]}]}]]}],["$","$La","14",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:14:0:style","children":"Proof of Proposition 
"}]}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:14:1"}]}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:14:2:style","children":". "}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:14:3:style","children":"Existence of fixed points "}]}],["$","$1","4",{"children":"For "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:14:5"}]]}]}],["$","$1","6",{"children":"is continuous function from "}],["$","$1","7",{"children":"[0"}],["$","$1","8",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:14:8:style","children":", "}]}],["$","$1","9",{"children":"1] "}],["$","$1","10",{"children":"to "}],["$","$1","11",{"children":"[0"}],["$","$1","12",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:14:12:style","children":", "}]}],["$","$1","13",{"children":"1] "}],["$","$1","14",{"children":"and hence by Brouwer’s Fixed Point Theorem at least a fixed point "}],["$","$1","15",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:14:15"}]]}]}],["$","$1","16",{"children":"exists in "}],["$","$1","17",{"children":"[0"}],["$","$1","18",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:14:18:style","children":", "}]}],["$","$1","19",{"children":"1]"}],["$","$1","20",{"children":", i.e "}],["$","$1","21",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:14:21"}]]}]}],["$","$1","22",{"children":"such that "}],["$","$1","23",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:14:23"}]]}]}],["$","$1","24",{"children":"."}]]}],["$","$La","15",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/13-12.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:15:0"}]]}]}]]}],["$","$La","16",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"we have"}]]}],["$","$La","17",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/14-0.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:17:0"}]]}]}]]}],["$","$La","18",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Let us compute the derivative :"}]]}],["$","$La","19",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/14-1.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:19:0"}]]}]}]]}]]}],["$","$L30","13",{"heading":"Appendix D. Proofs of Section 4","index":13,"length":15,"content":[["$","$La","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:0:0:style","children":"Proof of Theorem "}]}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:0:1"}]}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:0:2:style","children":". 
"}]}],["$","$1","3",{"children":"We claim that any fixed point "}],["$","$1","4",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:0:4"}]]}]}],["$","$1","5",{"children":"of "}],["$","$1","6",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:0:6"}]]}]}],["$","$1","7",{"children":"satisfies"}]]}],["$","$La","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/14-4.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:1:0"}]]}]}]]}],["$","$La","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Hence for any fixed point we have "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:2:1"}]]}]}],["$","$1","2",{"children":"and we have "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:2:3"}]]}]}]]}],["$","$La","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:3:0:style","children":"Proof of Theorem "}]}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:3:1"}]}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:3:2:style","children":". 
"}]}],["$","$1","3",{"children":"This is a direct application of local Banach fixed point theorem:"}]]}],["$","$La","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:4:0:style","children":"Theorem 6 "}]}],["$","$1","1",{"children":"(Local Contraction Mapping for One-Dimensional Functions)"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:4:2:style","children":". "}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:4:3:style","children":"Let "}]}],["$","$1","4",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:4:4"}]]}]}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:4:5:style","children":"be continuously differentiable, and suppose that "}]}],["$","$1","6",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:4:6"}]]}]}],["$","$1","7",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:4:7:style","children":"is a fixed point of "}]}],["$","$1","8",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:4:8"}]]}]}],["$","$1","9",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:4:9:style","children":"that "}]}],["$","$1","10",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:4:10"}]]}]}],["$","$1","11",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:4:11:style","children":"is continuous and that"}]}]]}],["$","$La","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/14-11.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:5:0"}]]}]}]]}],["$","$La","6",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:6:0:style","children":"Then, by the continuity of "}]}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:6:1"}]]}]}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:6:2:style","children":", there exists a radius "}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:6:3:style","children":"r > "}]}],["$","$1","4",{"children":"0 "}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:6:5:style","children":"and a constant "}]}],["$","$1","6",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:6:6:style","children":"k 
"}]}],["$","$1","7",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:6:7:style","children":"with"}]}]]}],["$","$La","7",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/14-13.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:7:0"}]]}]}]]}],["$","$La","8",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:8:0:style","children":"Consequently, "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:8:1:style","children":"f "}]}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:8:2:style","children":"is a contraction on the interval "}]}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:8:3"}]]}]}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:8:4:style","children":", and for any initial guess "}]}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:8:5"}]]}]}],["$","$1","6",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:8:6:style","children":"the iteration defined by"}]}]]}],["$","$La","9",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/14-16.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:9:0"}]]}]}]]}],["$","$La","10",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:10:0:style","children":"converges to the unique fixed point "}]}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:10:1"}]]}]}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:10:2:style","children":"in "}]}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:10:3:style","children":"I"}]}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:13:paragraphs:10:4:style","children":"."}]}]]}]]}],["$","$L30","14",{"heading":"Appendix E. 
Proofs of Section 5","index":14,"length":15,"content":[["$","$La","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:14:paragraphs:0:0:style","children":"Proof of Theorem "}]}],["$","$1","1",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:14:paragraphs:0:1"}]}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$c:props:children:props:children:3:1:props:paperJSON:sections:14:paragraphs:0:2:style","children":". "}]}],["$","$1","3",{"children":"Note that"}]]}],["$","$La","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/15-0.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:14:paragraphs:1:0"}]]}]}]]}],["$","$La","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We 
have:"}]]}],["$","$La","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/15-1.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:14:paragraphs:3:0"}]]}]}]]}],["$","$La","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:14:paragraphs:4:0"}]]}]}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:14:paragraphs:4:1"}]]}]}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& 
img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:14:paragraphs:4:2"}]]}]}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:14:paragraphs:4:3"}]]}]}],["$","$1","4",{"children":"Assume the sequence "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:14:paragraphs:4:5"}]]}]}],["$","$1","6",{"children":"converges to "}],["$","$1","7",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:14:paragraphs:4:7"}]]}]}],["$","$1","8",{"children":"the fixed point of "}],["$","$1","9",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:14:paragraphs:4:9"}]]}]}],["$","$1","10",{"children":". 
Under Assumption "}],["$","$1","11",{"children":["$","$L31",null,{"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:14:paragraphs:4:11"}]}],["$","$1","12",{"children":"we have :"}]]}],["$","$La","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L33",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2503.06639/images/15-9.png","alt":"$undefined"}],["$","$L32",null,{"imgScale":4,"avgLineHeight":12.96,"fragment":"$c:props:children:props:children:3:1:props:paperJSON:sections:14:paragraphs:5:0"}]]}]}]]}]]}]],["$","$L34",null,{"paper":"$c:props:children:props:children:0:props:product"}]]