1b:["$","$L29",null,{"isWhiteLabelled":false,"children":["$","$Lb",null,{"pt":{"compact":0,"expanded":3},"children":[["$","$L2a",null,{"noStar":true,"publisher":true,"task":true,"params":true,"size":"xl","product":{"id":"eyJwYXBlcklEIjoiMjUwNS4xMzQzOCIsInB1Ymxpc2hlciI6ImFyeGl2In0=","publisher":"arxiv","updated":"2025-06-05T00:00:00.000Z","paperID":"2505.13438","published":"2025-05-19T00:00:00.000Z","authors":"[\"Qi Penghui\",\"Liu Zichen\",\"Pang Tianyu\",\"Du Chao\",\"Lee Wee Sun\",\"Lin Min\"]","title":"Optimizing Anytime Reasoning via Budget Relative Policy Optimization","scoreTrending":null,"summary":"$2b","lastCheckedForCode":"2025-06-06T04:19:30.547Z","links":[{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9kYXRhc2V0L21hdGgifQ==","type":"dataset","url":"https://paperswithcode.com/dataset/math","data":"{\"name\":\"MATH\"}"},{"id":"eyJ1cmwiOiJodHRwczovL3BhcGVyc3dpdGhjb2RlLmNvbS9wYXBlci9vcHRpbWl6aW5nLWFueXRpbWUtcmVhc29uaW5nLXZpYS1idWRnZXQifQ==","type":"pwc","url":"https://paperswithcode.com/paper/optimizing-anytime-reasoning-via-budget","data":"{\"date\":\"2025-06-08T04:04:13.718Z\"}"}],"reposConnection":{"edges":[{"official":null,"node":{"id":"eyJyZXBvSUQiOiI5ODYxNzI4NjEiLCJzb3VyY2UiOiJnaXRodWIifQ==","source":"github","repoID":"986172861","url":"https://github.com/sail-sg/AnytimeReasoner","title":"AnytimeReasoner","language":"python","stars":36,"forks":2,"framework":null,"scoreTrending":null,"updated":null,"created":null,"downloads":null,"likes":null,"owner":[{"username":"sail-sg","avatar":"https://avatars.githubusercontent.com/u/85740051?v=4"}]}}]},"models":[],"tags":[{"id":"eyJuYW1lIjoibWF0aGVtYXRpY2FsIHJlYXNvbmluZyIsInR5cGUiOiJ0YXNrIn0=","name":"mathematical reasoning","description":"In mathematical reasoning, the input is a mathematical problem or equation, and the output is the solution or proof. 
This task is used in real-world scenarios such as solving complex mathematical problems, optimizing systems, and developing algorithms.","scoreTrending":0.151597119158488,"count":{"stars":1577,"papers":629,"models":1480},"__typename":"Tag"},{"id":"eyJuYW1lIjoicmVpbmZvcmNlbWVudCBsZWFybmluZyIsInR5cGUiOiJ0YXNrIn0=","name":"reinforcement learning","description":"In reinforcement learning, an agent learns to make decisions by taking actions in an environment to achieve maximum reward. It's used in real-world scenarios like game playing, robotics, resource management, where the goal is to improve performance based on feedback.","scoreTrending":0.18903618593024993,"count":{"stars":15619,"papers":8586,"models":4659},"__typename":"Tag"}],"summaries":[],"emailsConnection":{"edges":[]},"__typename":"paper","authorArray":["Qi Penghui","Liu Zichen","Pang Tianyu","Du Chao","Lee Wee Sun","Lin Min"]}}],["$","$L18",null,{"container":true,"columns":100,"spacing":{"compact":0,"expanded":2,"large":3},"children":[["$","$L18",null,{"size":{"compact":100,"expanded":100,"large":68},"children":[["$","$7",null,{"children":["$","$L2c",null,{"publisher":"arxiv","paperID":"2505.13438","product":{"paper":"$1b:props:children:props:children:0:props:product","models":"$1b:props:children:props:children:0:props:product:models"},"isWhiteLabelled":false}]}],["$","$7",null,{"children":["$","$L2d",null,{"article":"$L2e","model":"$undefined"}]}]]}],["$","$L18",null,{"size":"grow","children":["$","$L2f",null,{}]}]]}],["$","$7",null,{"children":null}],[["$","audio",null,{"id":"tts"}],["$","$L30",null,{"paperID":"2505.13438","publisher":"arxiv","paperJSON":{"title":"Optimizing Anytime Reasoning via Budget Relative Policy Optimization","paperID":"2505.13438","avgLineHeight":10.64,"imgScale":4,"sections":[{"heading":"Abstract","paragraphs":[[{"id":"id-13","text":"Scaling test-time compute is crucial for enhancing the reasoning capabilities of ","element":"span"},{"text":"large language models (LLMs). 
Existing approaches typically employ reinforcement learning (RL) to maximize a verifiable reward obtained at the end of reasoning traces. However, such methods optimize only the final performance under a large and fixed token budget, which hinders efficiency in both training and deployment. In this work, we present a novel framework, ","element":"span"},{"style":{"fontWeight":"bold"},"text":"AnytimeReasoner","element":"span"},{"text":", to optimize ","element":"span"},{"style":{"fontWeight":"bold"},"text":"anytime reasoning performance","element":"span"},{"text":", which aims to improve token efficiency and the flexibility of reasoning under varying thinking budget constraints. To achieve this, we truncate the complete thinking process to fit within sampled token budgets from a prior distribution, compelling the model to summarize the optimal answer for each truncated thinking for verification. This introduces ","element":"span"},{"style":{"fontWeight":"bold"},"text":"verifiable dense rewards ","element":"span"},{"text":"into the reasoning process, facilitating more effective credit assignment in RL optimization. We then optimize the thinking and summary policies in a decoupled manner to maximize the cumulative reward. Additionally, we introduce a novel variance reduction technique, ","element":"span"},{"style":{"fontWeight":"bold"},"text":"B","element":"span"},{"text":"udget ","element":"span"},{"style":{"fontWeight":"bold"},"text":"R","element":"span"},{"text":"elative ","element":"span"},{"style":{"fontWeight":"bold"},"text":"P","element":"span"},{"text":"olicy ","element":"span"},{"style":{"fontWeight":"bold"},"text":"O","element":"span"},{"text":"ptimization (","element":"span"},{"style":{"fontWeight":"bold"},"text":"BRPO","element":"span"},{"text":"), to enhance the robustness and efficiency of the learning process when reinforcing the thinking policy. 
Empirical results in mathematical reasoning tasks demonstrate that our method consistently outperforms GRPO across all thinking budgets under various prior distributions, enhancing both training and token efficiency.","element":"span"}]]},{"heading":"1 Introduction","paragraphs":[[{"text":"OpenAI o1 [","element":"span"},{"href":"#id-0","referenceIndex":28,"text":"OpenAI","element":"a"},{"text":", ","element":"span"},{"href":"#id-0","referenceIndex":28,"text":"2024","element":"a"},{"text":"] and DeepSeek-R1 [","element":"span"},{"href":"#id-1","referenceIndex":9,"text":"Guo et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-1","referenceIndex":9,"text":"2025","element":"a"},{"text":"] have shown that scaling test-time compute via RL is crucial for LLM reasoning. This involves an extensive thinking process using the chain of thought (CoT) [","element":"span"},{"href":"#id-2","referenceIndex":36,"text":"Wei et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-2","referenceIndex":36,"text":"2022","element":"a"},{"text":"] before producing an answer. RL is then employed to maximize the outcome reward provided by a rule-based verifier to check the correctness of the generated answer. While RL for LLM reasoning is an active area of research, most existing work focuses on optimizing final performance based on the complete thinking process. This approach can be inefficient in both training and deployment, as long CoTs are costly, especially for online services.","element":"span"}],[{"text":"In our work, we focus on ","element":"span"},{"style":{"fontWeight":"bold"},"text":"optimizing anytime reasoning for LLMs via RL","element":"span"},{"text":". 
This is conceptually similar to the ","element":"span"},{"style":{"fontStyle":"italic"},"text":"anytime algorithms ","element":"span"},{"text":"introduced in ","element":"span"},{"href":"#id-3","referenceIndex":8,"text":"Dean and Boddy ","element":"a"},{"text":"[","element":"span"},{"href":"#id-3","referenceIndex":8,"text":"1988","element":"a"},{"text":"], ","element":"span"},{"href":"#id-4","referenceIndex":41,"text":"Zilberstein and Russell ","element":"a"},{"text":"[","element":"span"},{"href":"#id-4","referenceIndex":41,"text":"1995","element":"a"},{"text":"], where the system can be interrupted at any point during computation, providing the best possible solution so far and is expected to improve the solution quality when more resources are allocated. Concretely in LLM reasoning, we assume the thinking process can be interrupted at any time, and the model should be able to summarize the best solution from incomplete thinking. This capability can significantly extend the serving capacity for online services with limited computing resources. When there are too many requests to handle, the service can choose to interrupt in-progress requests once the thinking length is able to give sufficient accuracy, reserving longer thinking with better accuracy when resources are available. Moreover, users may want to control the thinking budget as in Gemini 2.5[","element":"span"},{"href":"#id-5","referenceIndex":35,"text":"Team et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-5","referenceIndex":35,"text":"2023","element":"a"},{"text":"], but the optimal budget is often agnostic. 
Compared to budgetaware reasoning[","element":"span"},{"href":"#id-6","referenceIndex":10,"text":"Han et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-6","referenceIndex":10,"text":"2024","element":"a"},{"text":"], our design supports an economical strategy by incrementally increasing the budget, as it allows for continued thinking and reuses the computation already spent.","element":"span"}],[{"text":"To achieve optimal performance for anytime reasoning, we propose ","element":"span"},{"style":{"fontWeight":"bold"},"text":"sampling the thinking budget from a prior distribution ","element":"span"},{"text":"while learning, rather than using a fixed, large budget as in prior work [","element":"span"},{"href":"#id-7","referenceIndex":23,"text":"Liu ","element":"a"},{"href":"#id-7","referenceIndex":23,"text":"et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-7","referenceIndex":23,"text":"2025","element":"a"},{"text":", ","element":"span"},{"href":"#id-8","referenceIndex":40,"text":"Zeng et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-8","referenceIndex":40,"text":"2025","element":"a"},{"text":", ","element":"span"},{"href":"#id-9","referenceIndex":24,"text":"Luo et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-9","referenceIndex":24,"text":"2025","element":"a"},{"text":"]. This approach makes the model performance robust to potential interruptions in the thinking process, while incentivizing it to reach correct answers more efficiently. 
By achieving a balance between token efficiency and thorough exploration [","element":"span"},{"href":"#id-10","referenceIndex":29,"text":"Qu et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":29,"text":"2025","element":"a"},{"text":"], these models are also able to obtain better performance when given larger budgets.","element":"span"}],[{"text":"We investigate how to efficiently train LLMs with RL under sampled thinking budgets. By forcing the model to summarize the answers at predefined thinking budgets (drawn from the support of the prior distribution), we introduce ","element":"span"},{"style":{"fontWeight":"bold"},"text":"verifiable dense rewards ","element":"span"},{"text":"into the reasoning process. These rewards provide richer signals and better credit assignment during training [","element":"span"},{"href":"#id-10","referenceIndex":29,"text":"Qu et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":29,"text":"2025","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":6,"text":"Cui ","element":"a"},{"href":"#id-11","referenceIndex":6,"text":"et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-11","referenceIndex":6,"text":"2025","element":"a"},{"text":"]. We also propose ","element":"span"},{"style":{"fontWeight":"bold"},"text":"a novel variance reduction technique termed Budget Relative Policy Optimization (BRPO) that advances beyond GRPO ","element":"span"},{"text":"[","element":"span"},{"href":"#id-12","referenceIndex":32,"text":"Shao et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":32,"text":"2024","element":"a"},{"text":"] to improve training stability and efficiency under this dense reward framework. 
As illustrate in Figure ","element":"span"},{"href":"#id-13","text":"1 ","element":"a"},{"text":"(right), we leverage rewards at previous budgets to compute the advantage function, combining with the average return of a group of reasoning trajectories. Empirically, we observe that generating a high-quality summary is critical for both final and anytime performance. Thus, we ","element":"span"},{"style":{"fontWeight":"bold"},"text":"decouple the optimization of the thinking and summary policies","element":"span"},{"text":", always sampling from a uniform distribution to derive a better summary policy, thereby improving training efficiency.","element":"span"}],[{"text":"We term our overall framework as ","element":"span"},{"style":{"fontStyle":"italic"},"text":"AnytimeReasoner","element":"span"},{"text":". Experimental results demonstrate that ","element":"span"},{"style":{"fontWeight":"bold"},"text":"AnytimeReasoner consistently surpasses GRPO in both final and anytime performance","element":"span"},{"text":". We conduct extensive ablation studies to evaluate the impact of each component. By independently incorporating decoupled optimization, variance reduction, and budget sampling into GRPO, we observe significant performance enhancements, underscoring the effectiveness of our methods. Notably, even when merely using the maximum token budget (without budget sampling), our method still outperforms GRPO in both standard and anytime reasoning, highlighting the robustness of our approach.","element":"span"}]]},{"heading":"2 Methodology","paragraphs":[[{"text":"In a training paradigm similar to R1-Zero [","element":"span"},{"href":"#id-1","referenceIndex":9,"text":"Guo et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-1","referenceIndex":9,"text":"2025","element":"a"},{"text":"], the model is tasked with generating a comprehensive CoT within a designated \"thinking box\" upon receiving a question. 
Subsequently, the model summarizes the answer based on this thinking process. A rule-based reward is then calculated according to the summarized answer. The RL objective is to maximize the expected reward:","element":"span"}],[{"style":{"width":"78%"},"width":1246,"height":109,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/1-0.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x ","element":"span"},{"text":"represents the question, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"z ","element":"span"},{"text":"denotes the thinking process, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"y ","element":"span"},{"text":"is the summarized answer, and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"r","element":"span"},{"text":"(","element":"span"},{"style":{"fontStyle":"italic"},"text":"x, y","element":"span"},{"text":") ","element":"span"},{"text":"is the reward function.","element":"span"}],[{"text":"In previous studies [","element":"span"},{"href":"#id-8","referenceIndex":40,"text":"Zeng et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-8","referenceIndex":40,"text":"2025","element":"a"},{"text":", ","element":"span"},{"href":"#id-7","referenceIndex":23,"text":"Liu et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-7","referenceIndex":23,"text":"2025","element":"a"},{"text":", ","element":"span"},{"href":"#id-9","referenceIndex":24,"text":"Luo et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-9","referenceIndex":24,"text":"2025","element":"a"},{"text":"], the generation of thinking process and summary are typically sampled together. If the thinking process exceeds the predefined generation limit, the response is considered a negative sample. We contend that this approach is impractical, particularly in online services where a valid summary should be provided even if the thinking process is incomplete. 
We propose decoupling the generation of the thinking process and its summary, allocating separate token budgets for each. When the thinking process is halted due to budget constraints, we insert ellipses followed by a ","element":"span"},{"style":{"fontStyle":"italic"},"text":" ","element":"span"},{"text":"to prompt the model to produce a summary (see Appendix ","element":"span"},{"text":"A","element":"span"},{"text":"), similar to ","element":"span"},{"href":"#id-14","referenceIndex":26,"text":"Muennighoff et al. ","element":"a"},{"text":"[","element":"span"},{"href":"#id-14","referenceIndex":26,"text":"2025","element":"a"},{"text":"] and ","element":"span"},{"href":"#id-10","referenceIndex":29,"text":"Qu et al. ","element":"a"},{"text":"[","element":"span"},{"href":"#id-10","referenceIndex":29,"text":"2025","element":"a"},{"text":"].","element":"span"}],[{"text":"To differentiate between the thinking and summary policies, we denote the thinking policy as ","element":"span"},{"style":{"height":9.19},"width":37.72,"height":22.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/2-0.png","element":"img","alt":" πθ","inline":true,"padRight":true},{"text":"and the summary policy as ","element":"span"},{"style":{"height":11.59},"width":43.72,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/2-1.png","element":"img","alt":" πϕ","inline":true},{"text":". 
By defining ","element":"span"},{"style":{"height":26.32},"width":499.58,"height":65.79,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/2-2.png","element":"img","alt":" rϕ(x, z) = Ey∼πϕ(·|x,z) [r(x, y)]","inline":true},{"text":", the objective can be ","element":"span"},{"text":"expressed as:","element":"span"}],[{"style":{"width":"68%"},"width":1086,"height":62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/2-3.png","element":"img"}],[{"text":"Given that ","element":"span"},{"style":{"height":16},"width":147.45,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/2-4.png","element":"img","alt":" |y| ≪ |z|","inline":true},{"text":", multiple summaries can be sampled to better estimate the expected reward for each thinking process, while incurring only a small computational overhead.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"2.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Optimizing Anytime Reasoning","element":"span"}],[{"text":"Test-time scaling [","element":"span"},{"href":"#id-0","referenceIndex":28,"text":"OpenAI","element":"a"},{"text":", ","element":"span"},{"href":"#id-0","referenceIndex":28,"text":"2024","element":"a"},{"text":"] is crucial for enhancing the reasoning capabilities of LLMs. This concept operates on the premise that increased computational effort during the reasoning process generally leads to better performance. However, in typical RL training setups like R1-Zero-like [","element":"span"},{"href":"#id-1","referenceIndex":9,"text":"Guo ","element":"a"},{"href":"#id-1","referenceIndex":9,"text":"et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-1","referenceIndex":9,"text":"2025","element":"a"},{"text":"], the performance on anytime reasoning is not guaranteed. 
The reward evaluation is based on the entire thinking process, lacking insight into whether incremental thinking consistently improves performance [","element":"span"},{"href":"#id-10","referenceIndex":29,"text":"Qu et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":29,"text":"2025","element":"a"},{"text":"].","element":"span"}],[{"text":"To optimize anytime reasoning, we propose sampling the thinking budget from a prior distribution rather than using a fixed token budget. Let ","element":"span"},{"style":{"fontStyle":"italic"},"text":"b ","element":"span"},{"text":"represent the token budget for thinking, sampled from a prior distribution ","element":"span"},{"style":{"height":10},"width":41.04,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/2-5.png","element":"img","alt":" pB","inline":true,"padRight":true},{"text":"over a set of increasing budgets ","element":"span"},{"style":{"height":16.79},"width":508.86,"height":41.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/2-6.png","element":"img","alt":" {b1, . . . , bm} (Pj = pB(b = bj)","inline":true,"padRight":true},{"text":"for simplicity). 
The anytime reasoning objective is:","element":"span"}],[{"id":"id-22","style":{"width":"96%"},"width":1530,"height":145,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/2-7.png","element":"img"}],[{"text":"where ","element":"span"},{"style":{"height":11.59},"width":57.44,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/2-8.png","element":"img","alt":" z≤b","inline":true,"padRight":true},{"text":"is the truncated thinking process at length of the token budget ","element":"span"},{"style":{"fontStyle":"italic"},"text":"b","element":"span"},{"text":",","element":"span"}],[{"style":{"width":"36%"},"width":573,"height":96,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/2-9.png","element":"img"}],[{"text":"Instead of focusing solely on the final score based on the entire thinking process as in standard reasoning task, we maximize the expected score over all possible budgets with distribution ","element":"span"},{"style":{"height":10},"width":41.05,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/2-10.png","element":"img","alt":" pB","inline":true},{"text":". As illustrated in Figure ","element":"span"},{"href":"#id-13","text":"1","element":"a"},{"text":", this is akin to maximizing the area under the score curve when ","element":"span"},{"style":{"height":10},"width":41.05,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/2-11.png","element":"img","alt":" pB","inline":true,"padRight":true},{"text":"is a uniform distribution across every token budget. 
However, evaluating for all token budgets is impractical and unnecessary, so we evaluate the score only at a small predefined budget support (with ","element":"span"},{"style":{"height":13.2},"width":212.92,"height":33,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/2-12.png","element":"img","alt":" m ≤ 8 in our","inline":true,"padRight":true},{"text":"experiments).","element":"span"}],[{"text":"It is important to note that this approach transforms the problem into a dense reward framework, introducing verifiable dense rewards for each thinking budget. This facilitates better credit assignment during RL training and enhances the identification of each component’s contribution to a successful reasoning process. As illustrated in Figure ","element":"span"},{"href":"#id-15","text":"2","element":"a"},{"text":", the dense rewards for budgets prior to reaching a correct answer are low. However, the cumulative return is relatively higher if the reasoning process ultimately arrives at a correct answer. In contrast, the cumulative return after the first correct answer is relatively low, localizing and highlighting the tokens that contributed to the initial correct answer. This approach is distinct from typical sparse reward RL training for standard reasoning tasks, where all tokens receive the same return. 
Such sparse reward structures typically lead to unstable and inefficient RL training, while our dense reward approach provides more informative learning signals throughout the entire reasoning process.","element":"span"}],[{"style":{"width":"40%"},"width":645,"height":30,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/3-0.png","element":"img"}],[{"id":"id-15","text":"Reasoning ","element":"span"},{"text":"process ","element":"span"},{"style":{"height":8.63},"width":191.01,"height":21.56,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/3-1.png","element":"img","alt":"𝑥 question","inline":true}],[{"style":{"width":"89%"},"width":1427,"height":155,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/3-2.png","element":"img"}],[{"text":"Figure 2: By introducing dense rewards, we achieve better credit assignment during RL training. We assume a uniform distribution over thinking budgets and omit the probability for simplicity.","element":"figcaption","subtype":"caption"}],[{"style":{"fontWeight":"bold"},"text":"Relation to Standard Reasoning Tasks ","element":"span"},{"text":"A larger thinking budget is supposed to yield better performance in expectation. 
Since ","element":"span"},{"style":{"height":11.59},"width":57.44,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/3-3.png","element":"img","alt":" z≤b","inline":true,"padRight":true},{"text":"is always a prefix of ","element":"span"},{"style":{"fontStyle":"italic"},"text":"z","element":"span"},{"text":", the optimal summary policy ","element":"span"},{"style":{"height":11.59},"width":56.88,"height":28.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/3-4.png","element":"img","alt":" πϕ∗","inline":true,"padRight":true},{"text":"should satisfy:","element":"span"}],[{"id":"id-56","style":{"width":"73%"},"width":1161,"height":62,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/3-5.png","element":"img"}],[{"text":"for any ","element":"span"},{"style":{"fontStyle":"italic"},"text":"b ","element":"span"},{"text":"and ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":". Then we have:","element":"span"}],[{"style":{"width":"63%"},"width":1005,"height":43,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/3-6.png","element":"img"}],[{"text":"This justifies the anytime reasoning objective as a lower bound of the standard reasoning objective. Therefore, maximizing performance in anytime reasoning should also enhance performance in standard reasoning tasks. 
In an extreme case where ","element":"span"},{"style":{"height":13.19},"width":129,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/3-7.png","element":"img","alt":" Pm = 1","inline":true,"padRight":true},{"text":"(training only with full reasoning length), ","element":"span"},{"style":{"height":15.59},"width":116.08,"height":38.97,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/3-8.png","element":"img","alt":"Janytime","inline":true,"padRight":true},{"text":"falls back to the standard reasoning objective ","element":"span"},{"style":{"fontStyle":"italic"},"text":"J ","element":"span"},{"text":". For detailed proof, refer to Appendix ","element":"span"},{"text":"C","element":"span"},{"text":".","element":"span"}],[{"id":"id-25","style":{"fontWeight":"bold"},"text":"2.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Budget Relative Policy Optimization","element":"span"}],[{"text":"By defining ","element":"span"},{"style":{"height":15.59},"width":337.12,"height":38.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/3-9.png","element":"img","alt":" jt = arg minj bj ≥ t","inline":true},{"text":", which represents the nearest token budget after ","element":"span"},{"style":{"fontStyle":"italic"},"text":"t","element":"span"},{"text":", the gradient for the thinking policy can be computed as follows:","element":"span"}],[{"style":{"width":"91%"},"width":1456,"height":130,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/3-10.png","element":"img"}],[{"text":"where","element":"span"}],[{"style":{"width":"33%"},"width":531,"height":107,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/3-11.png","element":"img"}],[{"text":"and ","element":"span"},{"style":{"height":16},"width":161.56,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/3-12.png","element":"img","alt":" V (x, z ","element":"span"},{"text":"or when exceeding 
8000 tokens). We sample 4 answers to calculate the average score at each thinking budget, which is used to compute the advantage function as in Dr. GRPO [","element":"span"},{"href":"#id-7","referenceIndex":23,"text":"Liu et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-7","referenceIndex":23,"text":"2025","element":"a"},{"text":"]. The summary length is restricted to 128 tokens. We extract the first answer and use a rule-based verifier to determine the 0/1 outcome reward. As detailed in Section ","element":"span"},{"href":"#id-26","text":"2.3","element":"a"},{"text":", we employ different prior distributions for the thinking and summary policies. Unless otherwise specified, the prior distribution ","element":"span"},{"style":{"height":11.19},"width":41.05,"height":27.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/5-1.png","element":"img","alt":" p′B ","inline":true,"padRight":true},{"text":"for the summary policy ","element":"span"},{"text":"is set to a uniform distribution.","element":"span"}],[{"text":"We fine-tuned DeepSeek-R1-Distill-Qwen-1.5B [","element":"span"},{"href":"#id-1","referenceIndex":9,"text":"Guo et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-1","referenceIndex":9,"text":"2025","element":"a"},{"text":"] on 40,315 math problems from DeepScaleR [","element":"span"},{"href":"#id-9","referenceIndex":24,"text":"Luo et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-9","referenceIndex":24,"text":"2025","element":"a"},{"text":"] for a single epoch, using a batch size of 64 questions per policy iteration. Our experiments were conducted on 8 NVIDIA A100 80G GPUs, with each experiment taking approximately 30 hours to complete (less than 10% overhead in total compared to GRPO). During training, we evaluate the average scores of AIME2024 and AMC2022 every 20 steps and report their performance curves, sampling 32 responses for each question. 
After training, we assess the final model using five benchmarks: AIME2024 [","element":"span"},{"href":"#id-27","referenceIndex":20,"text":"Li et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-27","referenceIndex":20,"text":"2024a","element":"a"},{"text":"], AMC2022 [","element":"span"},{"href":"#id-27","referenceIndex":20,"text":"Li et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-27","referenceIndex":20,"text":"2024a","element":"a"},{"text":"], MATH500 [","element":"span"},{"href":"#id-28","referenceIndex":12,"text":"Hendrycks et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-28","referenceIndex":12,"text":"2021","element":"a"},{"text":"], Minerva Math [","element":"span"},{"href":"#id-29","referenceIndex":19,"text":"Lewkowycz et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-29","referenceIndex":19,"text":"2022","element":"a"},{"text":"], and Olympiad Bench [","element":"span"},{"href":"#id-30","referenceIndex":11,"text":"He et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-30","referenceIndex":11,"text":"2024","element":"a"},{"text":"], with 32 uniform token budgets ranging from 0 to 8000. We compare our methods with GRPO [","element":"span"},{"href":"#id-12","referenceIndex":32,"text":"Shao et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":32,"text":"2024","element":"a"},{"text":"], incorporating the corrections introduced in Dr. 
GRPO [","element":"span"},{"href":"#id-7","referenceIndex":23,"text":"Liu ","element":"a"},{"href":"#id-7","referenceIndex":23,"text":"et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-7","referenceIndex":23,"text":"2025","element":"a"},{"text":"].","element":"span"}],[{"id":"id-60","style":{"fontWeight":"bold"},"text":"3.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Main Results","element":"span"}],[{"text":"We consider the following prior distributions ","element":"span"},{"style":{"height":10},"width":41.05,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/5-2.png","element":"img","alt":" pB","inline":true,"padRight":true},{"text":"when optimizing the thinking policy by equation ","element":"span"},{"href":"#id-22","text":"3","element":"a"},{"text":":","element":"span"}],[{"text":"• ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Base","element":"span"},{"text":": We only optimize the final performance as in standard reasoning task, namely ","element":"span"},{"style":{"height":13.19},"width":138.92,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/5-3.png","element":"img","alt":" Pm = 1.","inline":true}],[{"text":"• ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Uniform","element":"span"},{"text":": We set ","element":"span"},{"style":{"height":10},"width":41.05,"height":25,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/5-4.png","element":"img","alt":" pB","inline":true,"padRight":true},{"text":"as a uniform distribution.","element":"span"}],[{"text":"• ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Linear","element":"span"},{"text":": We assign probability proportional to the budget length, such that ","element":"span"},{"style":{"height":16},"width":172.08,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/5-5.png","element":"img","alt":" pB(b) ∝ 
b.","inline":true}],[{"text":"We evaluate the final models after training and plot the score curves under varying thinking budgets in Figure ","element":"span"},{"href":"#id-31","text":"4","element":"a"},{"text":". For each question in AMC and AIME, we sample 320 thinking processes to compute the average score. For other datasets, we sample 80 thinking processes per question.","element":"span"}],[{"text":"As shown in Figure ","element":"span"},{"href":"#id-31","text":"4","element":"a"},{"text":", all variants of our method consistently outperform GRPO by a large margin across varying prior distributions. With small budgets, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"AnytimeReasoner-uniform ","element":"span"},{"text":"excels by prioritizing optimization of these budgets. When the thinking budget is large, ","element":"span"},{"style":{"fontStyle":"italic"},"text":"AnytimeReasoner ","element":"span"},{"text":"with different prior distributions tends to converge to similar performance, demonstrating the robustness of our approach. Notably, even for ","element":"span"},{"style":{"fontStyle":"italic"},"text":"AnytimeReasoner-base","element":"span"},{"text":", where we optimize performance only under the maximum thinking budget as in the GRPO baseline, we still achieve significantly better performance at all thinking budgets. This improvement is due to the decoupled optimization and our variance reduction technique (discussed further in Section ","element":"span"},{"href":"#id-32","text":"3.2.3","element":"a"},{"text":"). 
More details can be found in Appendix ","element":"span"},{"href":"#id-33","text":"D.1","element":"a"},{"text":".","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"3.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Ablations","element":"span"}],[{"text":"To further investigate which aspects of our framework contribute to performance improvements, we conduct detailed ablations considering three factors: verifiable dense rewards (Section ","element":"span"},{"href":"#id-34","text":"3.2.1","element":"a"},{"text":"), decoupled optimization (Section ","element":"span"},{"href":"#id-35","text":"3.2.2","element":"a"},{"text":"), and variance reduction (Section ","element":"span"},{"href":"#id-32","text":"3.2.3","element":"a"},{"text":"). We report three metrics during training. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Anytime Accuracy","element":"span"},{"text":": the average accuracy over thinking budgets at {2000, 4000, 6000, 8000}. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Final Accuracy","element":"span"},{"text":": the accuracy at the maximum budget (8000). 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Average Thinking Length","element":"span"},{"text":": the average thinking length under the maximum budget (8000).","element":"span"}],[{"id":"id-34","style":{"fontWeight":"bold"},"text":"3.2.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Verifiable Dense Rewards","element":"span"}],[{"id":"id-37","style":{"width":"93%"},"width":1487,"height":731,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/6-0.png","element":"img"}],[{"text":"Figure 5: Ablation on verifiable dense rewards.","element":"figcaption","subtype":"caption"}],[{"text":"We investigate the effectiveness of verifiable dense rewards by modifying the objective of the thinking policy to equation ","element":"span"},{"href":"#id-22","text":"3 ","element":"a"},{"text":"with a ","element":"span"},{"style":{"fontStyle":"italic"},"text":"linear ","element":"span"},{"text":"prior distribution, while keeping the summary policy training consistent with GRPO. Specifically, we use ","element":"span"},{"style":{"height":13.19},"width":39.24,"height":32.98,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/6-1.png","element":"img","alt":" V2","inline":true,"padRight":true},{"text":"as the variance reduction term to align with GRPO and eliminate the influence of enhanced variance reduction. We also compare our method with reward shaping, where we add a length penalty for correct answer as an alternative to budget sampling. 
As in ","element":"span"},{"href":"#id-36","referenceIndex":1,"text":"Aggarwal and Welleck ","element":"a"},{"text":"[","element":"span"},{"href":"#id-36","referenceIndex":1,"text":"2025","element":"a"},{"text":"], the reward will be ","element":"span"},{"style":{"height":23.22},"width":149.28,"height":58.05,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/6-2.png","element":"img","alt":" 1 − 0.2|z|bm","inline":true,"padRight":true},{"text":"for the correct answer and 0 for wrong answer.","element":"span"}],[{"text":"As illustrated in Figure ","element":"span"},{"href":"#id-37","text":"5","element":"a"},{"text":", incorporating dense rewards improves both the anytime and final performance. Notably, since our objective diverges from directly optimizing final performance as in the GRPO baseline, the observed improvements can be attributed to enhanced credit assignment facilitated by dense rewards. Another prominent observation is that the average thinking length is clearly shorter than the GRPO baseline under the maximum budget. This is because the thinking policy is encouraged to arrive at a correct answer as quickly as possible, making the model favor shorter, correct responses. 
Although reward shaping with length penalty can also reduce the thinking length, it sacrifices the performance and is unstable during training.","element":"span"}],[{"id":"id-35","style":{"fontWeight":"bold"},"text":"3.2.2 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Decoupled Optimization","element":"span"}],[{"id":"id-38","style":{"width":"93%"},"width":1487,"height":636,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/7-0.png","element":"img"}],[{"text":"Figure 6: Ablation on decoupled optimization for summary policy.","element":"figcaption","subtype":"caption"}],[{"text":"To study the impact of decoupled optimization for thinking and summary policies (detailed in Section ","element":"span"},{"href":"#id-26","text":"2.3","element":"a"},{"text":"), we modify the training of summary policy in GRPO to align with ","element":"span"},{"style":{"fontStyle":"italic"},"text":"AnytimeReasoner","element":"span"},{"text":", while keeping the thinking policy training unchanged. Specifically, we sample 4 answers for each thinking budget in {2000, 4000, 6000, 8000}, applying GRPO within each summary group. This approach trains a summary policy under uniformly distributed thinking budgets, while the thinking policy optimizes performance only under the maximum budget (8000).","element":"span"}],[{"text":"As shown in Figure ","element":"span"},{"href":"#id-38","text":"6","element":"a"},{"text":", the decoupled GRPO clearly outperforms the vanilla GRPO, especially in the AMC benchmark. 
Notably, the significant improvement in anytime accuracy (the average score under sampled thinking budgets) indicates that decoupled optimization results in a better summary policy for anytime reasoning.","element":"span"}],[{"id":"id-32","style":{"fontWeight":"bold"},"text":"3.2.3 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Variance Reduction","element":"span"}],[{"style":{"width":"93%"},"width":1487,"height":636,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/7-1.png","element":"img"}],[{"text":"Figure 7: Ablation on variance reduction.","element":"figcaption","subtype":"caption"}],[{"text":"To evaluate the effectiveness of our BRPO variance reduction (as detailed in Section ","element":"span"},{"href":"#id-25","text":"2.2","element":"a"},{"text":"), we modified the training of the thinking policy by incorporating BRPO’s variance reduction techniques, while maintaining the summary policy training consistent with GRPO. Specifically, we set ","element":"span"},{"style":{"fontStyle":"italic"},"text":"m ","element":"span"},{"text":"= 4 ","element":"span"},{"text":"and ","element":"span"},{"style":{"height":16},"width":182.61,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/7-2.png","element":"img","alt":"P(bm) = 1","inline":true,"padRight":true},{"text":"in equation ","element":"span"},{"href":"#id-39","text":"7","element":"a"},{"text":", aligning the objective exactly with GRPO.","element":"span"}],[{"text":"Figure ","element":"span"},{"href":"#id-32","text":"7 ","element":"a"},{"text":"shows that our approach enhances performance on the AIME benchmark. As discussed in Section ","element":"span"},{"href":"#id-35","text":"3.2.2","element":"a"},{"text":", the suboptimal summary policy in GRPO may constrain the potential of BRPO’s effectiveness. 
To address this, we introduced decoupled optimization (detailed in Section ","element":"span"},{"href":"#id-26","text":"2.3","element":"a"},{"text":") to improve the summary policy, resulting in further performance gains.","element":"span"}]]},{"heading":"4 Related Works","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"Reinforcement Learning with Verifiable Rewards ","element":"span"},{"text":"Since the introduction of DeepSeek-R1 [","element":"span"},{"href":"#id-1","referenceIndex":9,"text":"Guo ","element":"a"},{"href":"#id-1","referenceIndex":9,"text":"et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-1","referenceIndex":9,"text":"2025","element":"a"},{"text":"], a growing body of research has adopted the reinforcement learning with verifiable rewards (RLVR) paradigm [","element":"span"},{"href":"#id-40","referenceIndex":17,"text":"Lambert et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-40","referenceIndex":17,"text":"2024","element":"a"},{"text":"] to improve the reasoning capabilities of large language models (LLMs). SimpleRL [","element":"span"},{"href":"#id-8","referenceIndex":40,"text":"Zeng et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-8","referenceIndex":40,"text":"2025","element":"a"},{"text":"] provides the first open-source replication of R1-Zero in mathematical domains and analyzes RL dynamics across various base models. ","element":"span"},{"href":"#id-41","referenceIndex":13,"text":"Hu ","element":"a"},{"href":"#id-41","referenceIndex":13,"text":"et al. 
","element":"a"},{"text":"[","element":"span"},{"href":"#id-41","referenceIndex":13,"text":"2025","element":"a"},{"text":"] demonstrate that removing the KL regularization used in RLHF [","element":"span"},{"href":"#id-42","referenceIndex":5,"text":"Christiano et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-42","referenceIndex":5,"text":"2017","element":"a"},{"text":"] improves both RL efficiency and asymptotic performance. ","element":"span"},{"href":"#id-7","referenceIndex":23,"text":"Liu et al. ","element":"a"},{"text":"[","element":"span"},{"href":"#id-7","referenceIndex":23,"text":"2025","element":"a"},{"text":"] identify an optimization bias in GRPO [","element":"span"},{"href":"#id-12","referenceIndex":32,"text":"Shao et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-12","referenceIndex":32,"text":"2024","element":"a"},{"text":"] and propose Dr. GRPO, which applies a Monte Carlo policy gradient method with a baseline [","element":"span"},{"href":"#id-16","referenceIndex":34,"text":"Sutton and Barto","element":"a"},{"text":", ","element":"span"},{"href":"#id-16","referenceIndex":34,"text":"2018","element":"a"},{"text":"]. While these works improve our understanding of R1-Zero-style training, they still depend on sparse outcome-based rewards, which pose challenges for credit assignment and learning efficiency [","element":"span"},{"href":"#id-18","referenceIndex":15,"text":"Kazemnejad et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-18","referenceIndex":15,"text":"2024","element":"a"},{"text":"]. 
In contrast, our method introduces a novel policy optimization framework that leverages cheaply estimated ","element":"span"},{"style":{"fontStyle":"italic"},"text":"verifiable dense rewards ","element":"span"},{"text":"to improve sample efficiency and learning stability.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Token Budget Efficiency of Reasoning Models ","element":"span"},{"text":"Previous efforts have studied budgeted reasoning by reducing response length through prompting [","element":"span"},{"href":"#id-43","referenceIndex":14,"text":"Jin et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-43","referenceIndex":14,"text":"2024","element":"a"},{"text":", ","element":"span"},{"href":"#id-44","referenceIndex":27,"text":"Nayab et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-44","referenceIndex":27,"text":"2024","element":"a"},{"text":", ","element":"span"},{"href":"#id-45","referenceIndex":18,"text":"Lee et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-45","referenceIndex":18,"text":"2025","element":"a"},{"text":", ","element":"span"},{"href":"#id-46","referenceIndex":25,"text":"Ma et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-46","referenceIndex":25,"text":"2025","element":"a"},{"text":"] or adaptive sampling [","element":"span"},{"href":"#id-47","referenceIndex":38,"text":"Yang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-47","referenceIndex":38,"text":"2025","element":"a"},{"text":"]. While these training-free approaches can shorten outputs, they often entail a trade-off between conciseness and task performance. More recent work explores token efficiency within online RL frameworks, enabling models to jointly optimize for accuracy and brevity. ","element":"span"},{"href":"#id-48","referenceIndex":39,"text":"Yeo et al. 
","element":"a"},{"text":"[","element":"span"},{"href":"#id-48","referenceIndex":39,"text":"2025","element":"a"},{"text":"] observe that the output lengths on harder questions tend to grow during RL training, and propose a cosine-shaped reward to constrain length. ","element":"span"},{"href":"#id-7","referenceIndex":23,"text":"Liu et al. ","element":"a"},{"text":"[","element":"span"},{"href":"#id-7","referenceIndex":23,"text":"2025","element":"a"},{"text":"] trace this issue to optimization bias in GRPO and show that correcting it enhances token efficiency. Further, ","element":"span"},{"href":"#id-49","referenceIndex":3,"text":"Arora and Zanette ","element":"a"},{"text":"[","element":"span"},{"href":"#id-49","referenceIndex":3,"text":"2025","element":"a"},{"text":"] and ","element":"span"},{"href":"#id-36","referenceIndex":1,"text":"Aggarwal and Welleck ","element":"a"},{"text":"[","element":"span"},{"href":"#id-36","referenceIndex":1,"text":"2025","element":"a"},{"text":"] apply explicit reward shaping to target shortened or fixed outputs. Our work differs by operating in an ","element":"span"},{"style":{"fontStyle":"italic"},"text":"anytime reasoning ","element":"span"},{"text":"framework, where the reasoning process can be interrupted at any time and the best-effort solution should be provided [","element":"span"},{"href":"#id-3","referenceIndex":8,"text":"Dean and Boddy","element":"a"},{"text":", ","element":"span"},{"href":"#id-3","referenceIndex":8,"text":"1988","element":"a"},{"text":", ","element":"span"},{"href":"#id-4","referenceIndex":41,"text":"Zilberstein and Russell","element":"a"},{"text":", ","element":"span"},{"href":"#id-4","referenceIndex":41,"text":"1995","element":"a"},{"text":"]. 
Despite not explicitly enforcing conciseness, our objective naturally encourages efficient reasoning, as demonstrated empirically.","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"Connection to MRT ","element":"span"},{"text":"An independent work to ours, MRT [","element":"span"},{"href":"#id-10","referenceIndex":29,"text":"Qu et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-10","referenceIndex":29,"text":"2025","element":"a"},{"text":"], optimizes test-time compute by minimizing cumulative regret relative to an oracle. Since the oracle is unknown, they employ meta-RL [","element":"span"},{"href":"#id-50","referenceIndex":37,"text":"Xiang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-50","referenceIndex":37,"text":"2025","element":"a"},{"text":", ","element":"span"},{"href":"#id-51","referenceIndex":4,"text":"Beck et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-51","referenceIndex":4,"text":"2023","element":"a"},{"text":"] as an approximation, aiming to maximize the \"progress\" of each newly generated ","element":"span"},{"style":{"fontStyle":"italic"},"text":"episode","element":"span"},{"text":". Despite sharing a similar high-level goal, our formulation fundamentally differs. Rather than minimizing regret, we optimize anytime performance by sampling the thinking budget from a prior distribution, remaining tractable with standard RL techniques. These foundational distinctions lead to significant methodological differences. Firstly, our approach operates on a per-token basis, instead of on ","element":"span"},{"style":{"fontStyle":"italic"},"text":"episode ","element":"span"},{"text":"which is ambiguous and can be hackable in RL if not well handled. Secondly, our method is grounded in principled RL, explicitly accounting for long-term returns. In contrast, MRT adopts a greedy strategy, optimizing the progress of immediate next episode only. 
Our experimental results also significantly outperform their reported outcomes. We achieve an accuracy of 32.7% compared to their reported 30.3% on AIME 2024.","element":"span"}]]},{"heading":"5 Conclusion","paragraphs":[[{"text":"The effectiveness of test-time scaling in LLM reasoning is commonly attributed to the generation-verification gap [","element":"span"},{"href":"#id-50","referenceIndex":37,"text":"Xiang et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-50","referenceIndex":37,"text":"2025","element":"a"},{"text":"], where verifying solutions is substantially easier than generating them. During reasoning, the model engages in an iterative search process, exploring potential solutions until a valid one is found. Once generated, the solution is verified for correctness, and this search-verification loop continues until a confident answer is produced.","element":"span"}],[{"text":"In this work, we present a framework that systematically exploits this generation-verification gap. Our approach is based on the key observation that verifying answers and extracting them from partial reasoning traces is easy and computationally cheap. Building on this insight, we design our framework to produce answers at some predefined thinking budgets, thereby introducing verifiable dense rewards to enhance RL training. Furthermore, we utilize these additional rewards to construct a more effective variance reduction baseline than GRPO, significantly improving the stability and efficiency of RL training. By integrating these techniques, our framework achieves superior performance in both standard and anytime reasoning tasks.","element":"span"}]]},{"heading":"References","paragraphs":[[{"id":"id-36","text":"Pranjal Aggarwal and Sean Welleck. L1: Controlling how long a reasoning model thinks with ","element":"span"},{"text":"reinforcement learning. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv: 2503.04697","element":"span"},{"text":", 2025.","element":"span"}],[{"id":"id-20","text":"Arash Ahmadian, Chris Cremer, Matthias Gallé, Marzieh Fadaee, Julia Kreutzer, Olivier Pietquin, ","element":"span"},{"text":"Ahmet Üstün, and Sara Hooker. Back to basics: Revisiting reinforce style optimization for learning from human feedback in llms. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2402.14740","element":"span"},{"text":", 2024.","element":"span"}],[{"id":"id-49","text":"Daman Arora and Andrea Zanette. Training language models to reason efficiently. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv: 2502.04463","element":"span"},{"text":", 2025.","element":"span"}],[{"id":"id-51","text":"Jacob Beck, Risto Vuorio, Evan Zheran Liu, Zheng Xiong, Luisa Zintgraf, Chelsea Finn, and Shimon ","element":"span"},{"text":"Whiteson. A survey of meta-reinforcement learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2301.08028","element":"span"},{"text":", 2023.","element":"span"}],[{"id":"id-42","text":"Paul F Christiano, Jan Leike, Tom Brown, Miljan Martic, Shane Legg, and Dario Amodei. Deep ","element":"span"},{"text":"reinforcement learning from human preferences. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in neural information processing systems","element":"span"},{"text":", 30, 2017.","element":"span"}],[{"id":"id-11","text":"Ganqu Cui, Lifan Yuan, Zefan Wang, Hanbin Wang, Wendi Li, Bingxiang He, Yuchen Fan, Tianyu ","element":"span"},{"text":"Yu, Qixin Xu, Weize Chen, et al. Process reinforcement through implicit rewards. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2502.01456","element":"span"},{"text":", 2025.","element":"span"}],[{"id":"id-54","text":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher Ré. 
Flashattention: Fast and memory- ","element":"span"},{"text":"efficient exact attention with io-awareness. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in neural information processing systems","element":"span"},{"text":", 35: 16344–16359, 2022.","element":"span"}],[{"id":"id-3","text":"Thomas L Dean and Mark S Boddy. An analysis of time-dependent planning. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"AAAI","element":"span"},{"text":", volume 88, pages 49–54, 1988.","element":"span"}],[{"id":"id-1","text":"Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, ","element":"span"},{"text":"Shirong Ma, Peiyi Wang, Xiao Bi, et al. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2501.12948","element":"span"},{"text":", 2025.","element":"span"}],[{"id":"id-6","text":"Tingxu Han, Zhenting Wang, Chunrong Fang, Shiyu Zhao, Shiqing Ma, and Zhenyu Chen. Token- ","element":"span"},{"text":"budget-aware llm reasoning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2412.18547","element":"span"},{"text":", 2024.","element":"span"}],[{"id":"id-30","text":"Chaoqun He, Renjie Luo, Yuzhuo Bai, Shengding Hu, Zhen Leng Thai, Junhao Shen, Jinyi Hu, ","element":"span"},{"text":"Xu Han, Yujie Huang, Yuxiang Zhang, et al. Olympiadbench: A challenging benchmark for promoting agi with olympiad-level bilingual multimodal scientific problems. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2402.14008","element":"span"},{"text":", 2024.","element":"span"}],[{"id":"id-28","text":"Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, ","element":"span"},{"text":"and Jacob Steinhardt. Measuring mathematical problem solving with the math dataset. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2103.03874","element":"span"},{"text":", 2021.","element":"span"}],[{"id":"id-41","text":"Jingcheng Hu, Yinmin Zhang, Qi Han, Daxin Jiang, and Heung-Yeung Shum Xiangyu Zhang. Open- ","element":"span"},{"text":"reasoner-zero: An open source approach to scaling reinforcement learning on the base model. ","element":"span"},{"href":"https://github.com/Open-Reasoner-Zero/Open-Reasoner-Zero","style":{"fontFamily":"monospace"},"text":"https://github.com/Open-Reasoner-Zero/Open-Reasoner-Zero","element":"a"},{"text":", 2025.","element":"span"}],[{"id":"id-43","text":"Mingyu Jin, Qinkai Yu, Dong Shu, Haiyan Zhao, Wenyue Hua, Yanda Meng, Yongfeng Zhang, and ","element":"span"},{"text":"Mengnan Du. The impact of reasoning step length on large language models. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"ACL (Findings)","element":"span"},{"text":", 2024.","element":"span"}],[{"id":"id-18","text":"Amirhossein Kazemnejad, Milad Aghajohari, Eva Portelance, Alessandro Sordoni, Siva Reddy, ","element":"span"},{"text":"Aaron Courville, and Nicolas Le Roux. Vineppo: Unlocking rl potential for llm reasoning through refined credit assignment. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2410.01679","element":"span"},{"text":", 2024.","element":"span"}],[{"id":"id-52","text":"Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph E. ","element":"span"},{"text":"Gonzalez, Hao Zhang, and Ion Stoica. Efficient memory management for large language model serving with pagedattention. 
In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles","element":"span"},{"text":", 2023.","element":"span"}],[{"id":"id-40","text":"Nathan Lambert, Jacob Morrison, Valentina Pyatkin, Shengyi Huang, Hamish Ivison, Faeze Brahman, ","element":"span"},{"text":"Lester James V Miranda, Alisa Liu, Nouha Dziri, Shane Lyu, et al. Tülu 3: Pushing frontiers in open language model post-training. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2411.15124","element":"span"},{"text":", 2024.","element":"span"}],[{"id":"id-45","text":"Ayeong Lee, Ethan Che, and Tianyi Peng. How well do llms compress their own chain-of-thought? a ","element":"span"},{"text":"token complexity approach. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv: 2503.01141","element":"span"},{"text":", 2025.","element":"span"}],[{"id":"id-29","text":"Aitor Lewkowycz, Anders Andreassen, David Dohan, Ethan Dyer, Henryk Michalewski, Vinay Ra- ","element":"span"},{"text":"masesh, Ambrose Slone, Cem Anil, Imanol Schlag, Theo Gutman-Solo, et al. Solving quantitative reasoning problems with language models. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in Neural Information Processing Systems","element":"span"},{"text":", 35:3843–3857, 2022.","element":"span"}],[{"id":"id-27","text":"Jia Li, Edward Beeching, Lewis Tunstall, Ben Lipkin, Roman Soletskyi, Shengyi Huang, Kashif ","element":"span"},{"text":"Rasul, Longhui Yu, Albert Q Jiang, Ziju Shen, et al. Numinamath: The largest public dataset in ai4maths with 860k pairs of competition math problems and solutions. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"Hugging Face repository","element":"span"},{"text":", 13:9, 2024a.","element":"span"}],[{"id":"id-55","text":"Junyan Li, Delin Chen, Tianle Cai, Peihao Chen, Yining Hong, Zhenfang Chen, Yikang Shen, and ","element":"span"},{"text":"Chuang Gan. Flexattention for efficient high-resolution vision-language models. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"European Conference on Computer Vision","element":"span"},{"text":", pages 286–302. Springer, 2024b.","element":"span"}],[{"id":"id-19","text":"Ziniu Li, Tian Xu, Yushun Zhang, Zhihang Lin, Yang Yu, Ruoyu Sun, and Zhi-Quan Luo. Remax: A ","element":"span"},{"text":"simple, effective, and efficient reinforcement learning method for aligning large language models. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2310.10505","element":"span"},{"text":", 2023.","element":"span"}],[{"id":"id-7","text":"Zichen Liu, Changyu Chen, Wenjun Li, Penghui Qi, Tianyu Pang, Chao Du, Wee Sun Lee, and Min ","element":"span"},{"text":"Lin. Understanding r1-zero-like training: A critical perspective. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2503.20783","element":"span"},{"text":", 2025.","element":"span"}],[{"id":"id-9","text":"Michael Luo, Sijun Tan, Justin Wong, Xiaoxiang Shi, William Y. Tang, Manan Roongta, Colin Cai, ","element":"span"},{"text":"Jeffrey Luo, Tianjun Zhang, Li Erran Li, Raluca Ada Popa, and Ion Stoica. Deepscaler: Surpassing o1-preview with a 1.5b model by scaling rl. 
","element":"span"},{"href":"https://github.com/agentica-project/deepscaler","style":{"fontFamily":"monospace"},"text":"https://github.com/agentica-project/ ","element":"a"},{"href":"https://github.com/agentica-project/deepscaler","style":{"fontFamily":"monospace"},"text":"deepscaler","element":"a"},{"text":", 2025.","element":"span"}],[{"id":"id-46","text":"Wenjie Ma, Jingxuan He, Charlie Snell, Tyler Griggs, Sewon Min, and Matei Zaharia. Reasoning ","element":"span"},{"text":"models can be effective without thinking. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2504.09858","element":"span"},{"text":", 2025.","element":"span"}],[{"id":"id-14","text":"Niklas Muennighoff, Zitong Yang, Weijia Shi, Xiang Lisa Li, Li Fei-Fei, Hannaneh Hajishirzi, Luke ","element":"span"},{"text":"Zettlemoyer, Percy Liang, Emmanuel Candès, and Tatsunori Hashimoto. s1: Simple test-time scaling. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2501.19393","element":"span"},{"text":", 2025.","element":"span"}],[{"id":"id-44","text":"Sania Nayab, Giulio Rossolini, Marco Simoni, Andrea Saracino, Giorgio Buttazzo, Nicolamaria ","element":"span"},{"text":"Manes, and Fabrizio Giacomelli. Concise thoughts: Impact of output length on llm reasoning and cost. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv: 2407.19825","element":"span"},{"text":", 2024.","element":"span"}],[{"id":"id-0","text":"OpenAI. ","element":"span"},{"text":"Learning to reason with llms, ","element":"span"},{"text":"2024. 
","element":"span"},{"text":"URL ","element":"span"},{"href":"https://openai.com/index/learning-to-reason-with-llms/","style":{"fontFamily":"monospace"},"text":"https://openai.com/index/ ","element":"a"},{"href":"https://openai.com/index/learning-to-reason-with-llms/","style":{"fontFamily":"monospace"},"text":"learning-to-reason-with-llms/","element":"a"},{"text":".","element":"span"}],[{"id":"id-10","text":"Yuxiao Qu, Matthew YR Yang, Amrith Setlur, Lewis Tunstall, Edward Emanuel Beeching, Ruslan ","element":"span"},{"text":"Salakhutdinov, and Aviral Kumar. Optimizing test-time compute via meta reinforcement finetuning. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2503.07572","element":"span"},{"text":", 2025.","element":"span"}],[{"id":"id-17","text":"John Schulman, Philipp Moritz, Sergey Levine, Michael Jordan, and Pieter Abbeel. High-dimensional ","element":"span"},{"text":"continuous control using generalized advantage estimation. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1506.02438","element":"span"},{"text":", 2015.","element":"span"}],[{"id":"id-24","text":"John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy ","element":"span"},{"text":"optimization algorithms. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:1707.06347","element":"span"},{"text":", 2017.","element":"span"}],[{"id":"id-12","text":"Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, ","element":"span"},{"text":"Mingchuan Zhang, YK Li, Y Wu, et al. Deepseekmath: Pushing the limits of mathematical reasoning in open language models. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2402.03300","element":"span"},{"text":", 2024.","element":"span"}],[{"id":"id-23","text":"Guangming Sheng, Chi Zhang, Zilingfeng Ye, Xibin Wu, Wang Zhang, Ru Zhang, Yanghua Peng, ","element":"span"},{"text":"Haibin Lin, and Chuan Wu. Hybridflow: A flexible and efficient rlhf framework. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2409.19256","element":"span"},{"text":", 2024.","element":"span"}],[{"id":"id-16","text":"Richard S. Sutton and Andrew G. Barto. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Reinforcement Learning: An Introduction","element":"span"},{"text":". The MIT Press, second edition, 2018.","element":"span"}],[{"id":"id-5","text":"Gemini Team, Rohan Anil, Sebastian Borgeaud, Jean-Baptiste Alayrac, Jiahui Yu, Radu Soricut, ","element":"span"},{"text":"Johan Schalkwyk, Andrew M Dai, Anja Hauth, Katie Millican, et al. Gemini: a family of highly capable multimodal models. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2312.11805","element":"span"},{"text":", 2023.","element":"span"}],[{"id":"id-2","text":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed Chi, Quoc V Le, Denny ","element":"span"},{"text":"Zhou, et al. Chain-of-thought prompting elicits reasoning in large language models. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Advances in neural information processing systems","element":"span"},{"text":", 35:24824–24837, 2022.","element":"span"}],[{"id":"id-50","text":"Violet Xiang, Charlie Snell, Kanishk Gandhi, Alon Albalak, Anikait Singh, Chase Blagden, Duy ","element":"span"},{"text":"Phung, Rafael Rafailov, Nathan Lile, Dakota Mahan, et al. Towards system 2 reasoning in llms: Learning how to think with meta chain-of-thought. 
","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2501.04682","element":"span"},{"text":", 2025.","element":"span"}],[{"id":"id-47","text":"Chenxu Yang, Qingyi Si, Yongjie Duan, Zheliang Zhu, Chenyu Zhu, Zheng Lin, Li Cao, and Weiping ","element":"span"},{"text":"Wang. Dynamic early exit in reasoning models. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv: 2504.15895","element":"span"},{"text":", 2025.","element":"span"}],[{"id":"id-48","text":"Edward Yeo, Yuxuan Tong, Morry Niu, Graham Neubig, and Xiang Yue. Demystifying long ","element":"span"},{"text":"chain-of-thought reasoning in llms. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2502.03373","element":"span"},{"text":", 2025.","element":"span"}],[{"id":"id-8","text":"Weihao Zeng, Yuzhen Huang, Qian Liu, Wei Liu, Keqing He, Zejun Ma, and Junxian He. Simplerl- ","element":"span"},{"text":"zoo: Investigating and taming zero reinforcement learning for open base models in the wild. ","element":"span"},{"style":{"fontStyle":"italic"},"text":"arXiv preprint arXiv:2503.18892","element":"span"},{"text":", 2025.","element":"span"}],[{"id":"id-4","text":"Shlomo Zilberstein and Stuart Russell. Approximate reasoning using anytime algorithms. In ","element":"span"},{"style":{"fontStyle":"italic"},"text":"Imprecise and approximate computation","element":"span"},{"text":", pages 43–62. 
Springer, 1995.","element":"span"}]]},{"heading":"Appendix Table of Contents","paragraphs":[[{"style":{"fontWeight":"bold"},"text":"A Implementation Details ","element":"span"},{"style":{"fontWeight":"bold"},"text":"13","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"B ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Tree-like Generation and Training ","element":"span"},{"style":{"fontWeight":"bold"},"text":"13","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"C Relation Between Standard and Anytime Reasoning ","element":"span"},{"style":{"fontWeight":"bold"},"text":"14","element":"span"}],[{"style":{"fontWeight":"bold"},"text":"D Experimental Results ","element":"span"},{"style":{"fontWeight":"bold"},"text":"15","element":"span"}],[{"text":"D.1 ","element":"span"},{"href":"#id-33","text":"Main Results ","element":"a"},{"text":". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ","element":"span"},{"text":"15","element":"span"}]]},{"heading":"A Implementation Details","paragraphs":[[{"text":"We illustrate the implementation details about how we truncate the reasoning process and prompt the model to output an answer.","element":"span"}],[{"style":{"width":"99%"},"width":1578,"height":680,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/12-0.png","element":"img"}],[{"text":"Figure 8: We decouple the generation of thinking and its summary. Given the question, the model first generates the ","element":"figcaption","subtype":"caption"},{"text":"thinking","element":"figcaption","subtype":"caption"},{"text":", which can be stopped by a special token ","element":"figcaption","subtype":"caption"},{"style":{"fontStyle":"italic"},"text":" ","element":"figcaption","subtype":"caption"},{"text":"or the budget limit. 
Then we insert ","element":"figcaption","subtype":"caption"},{"style":{"height":7.6},"width":48.77,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/12-1.png","element":"img","alt":" ∗ ∗","inline":true,"padRight":true},{"text":"Final Answer ","element":"figcaption","subtype":"caption"},{"style":{"height":7.6},"width":48.77,"height":19,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/12-2.png","element":"img","alt":" ∗ ∗","inline":true,"padRight":true},{"text":"(and two ellipsis ","element":"figcaption","subtype":"caption"},{"style":{"height":14.8},"width":306.98,"height":37,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/12-3.png","element":"img","alt":" · · · plus ","inline":true,"padRight":true},{"text":"for out of budget cases) to prompt the model to summarize the ","element":"figcaption","subtype":"caption"},{"text":"answer","element":"figcaption","subtype":"caption"},{"text":". In training, these inserted tokens will be ignored when calculating the loss.","element":"figcaption","subtype":"caption"}]]},{"heading":"B Tree-like Generation and Training","paragraphs":[[{"text":"Unlike previous methods with sequential question-response generation and training, our approach employs a tree-like structure. In this section, we introduce how to address implementation challenges for efficient training.","element":"span"}],[{"text":"During generation, we use the prefix caching feature of vLLM [","element":"span"},{"href":"#id-52","referenceIndex":16,"text":"Kwon et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-52","referenceIndex":16,"text":"2023","element":"a"},{"text":"] to reuse computations. 
We sample a complete thinking process ","element":"span"},{"style":{"fontStyle":"italic"},"text":"z ","element":"span"},{"text":"for a question ","element":"span"},{"style":{"fontStyle":"italic"},"text":"x","element":"span"},{"text":", then split it based on predefined token budgets (","element":"span"},{"style":{"fontStyle":"italic"},"text":"{","element":"span"},{"style":{"fontStyle":"italic"},"text":"i, j, k","element":"span"},{"style":{"fontStyle":"italic"},"text":"} ","element":"span"},{"text":"in Figure ","element":"span"},{"href":"#id-53","text":"9","element":"a"},{"text":"). Each partial thinking process is appended with a special end-of-think token (","element":"span"},{"style":{"fontStyle":"italic"},"text":"","element":"span"},{"text":"), and the model is prompted to output the answer directly (see Appendix ","element":"span"},{"text":"A ","element":"span"},{"text":"for more details).","element":"span"}],[{"id":"id-53","style":{"width":"93%"},"width":1480,"height":423,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/13-0.png","element":"img"}],[{"text":"Figure 9: Our methods utilize a tree-like structure for generation and training.","element":"figcaption","subtype":"caption"}],[{"text":"During training, each response is typically concatenated with its corresponding question using FlashAttention [","element":"span"},{"href":"#id-54","referenceIndex":7,"text":"Dao et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-54","referenceIndex":7,"text":"2022","element":"a"},{"text":"] for speed. However, this introduces significant duplicated computation for tree-like structures, making it impractical due to high computational demands for LLM training. We implement a tree structure attention mask based on FlexAttention [","element":"span"},{"href":"#id-55","referenceIndex":21,"text":"Li et al.","element":"a"},{"text":", ","element":"span"},{"href":"#id-55","referenceIndex":21,"text":"2024b","element":"a"},{"text":"]. 
As shown in Figure ","element":"span"},{"href":"#id-53","text":"9","element":"a"},{"text":", we append all summaries at the end of the thinking process and record their connection positions in a 1D tensor. This tensor is converted to a block mask by FlexAttention, avoiding 2D tensors that can cause out-of-memory issues for long generation lengths.","element":"span"}]]},{"heading":"C Relation Between Standard and Anytime Reasoning","paragraphs":[[{"text":"In this section, we provide a proof for the inequality below:","element":"span"}],[{"style":{"width":"49%"},"width":782,"height":88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/13-1.png","element":"img"}],[{"text":"According to equation ","element":"span"},{"href":"#id-56","text":"4","element":"a"},{"text":", we have:","element":"span"}],[{"style":{"width":"46%"},"width":735,"height":65,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/13-2.png","element":"img"}],[{"id":"id-57","text":"Thus, it follows that:","element":"span"}],[{"style":{"width":"76%"},"width":1207,"height":239,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/13-3.png","element":"img"}],[{"text":"Assuming ","element":"span"},{"style":{"height":16},"width":190.89,"height":40,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/13-4.png","element":"img","alt":" r(x, y) ≥ 0","inline":true},{"text":", which is always achievable by adding a constant to each reward, we also have:","element":"span"}],[{"style":{"width":"76%"},"width":1207,"height":324,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/13-5.png","element":"img"}],[{"id":"id-58","text":"Combining ","element":"span"},{"href":"#id-57","text":"11 ","element":"a"},{"text":"and ","element":"span"},{"href":"#id-58","text":"12","element":"a"},{"text":", we can 
get","element":"span"}],[{"style":{"width":"74%"},"width":1185,"height":88,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/13-6.png","element":"img"}],[{"text":"This completes the proof.","element":"span"}],[{"id":"id-61","style":{"width":"94%"},"width":1500,"height":286,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/14-0.png","element":"img"}],[{"text":"Table 1: The ","element":"figcaption","subtype":"caption"},{"style":{"fontWeight":"bold"},"text":"Final Accuracy ","element":"figcaption","subtype":"caption"},{"text":"by evaluating the maximum budget (8000) for the final models.","element":"figcaption","subtype":"caption"}],[{"id":"id-62","style":{"width":"94%"},"width":1500,"height":286,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/14-1.png","element":"img"}],[{"text":"Table 2: The ","element":"figcaption","subtype":"caption"},{"style":{"fontWeight":"bold"},"text":"Anytime Accuracy ","element":"figcaption","subtype":"caption"},{"text":"by evaluating 32 budgets (every 250 tokens) for the final models.","element":"figcaption","subtype":"caption"}]]},{"heading":"D Experimental Results","paragraphs":[[{"id":"id-33","style":{"fontWeight":"bold"},"text":"D.1 ","element":"span"},{"style":{"fontWeight":"bold"},"text":"Main Results","element":"span"}],[{"text":"We present the training curves of our ","element":"span"},{"style":{"fontStyle":"italic"},"text":"AnytimeReasoner ","element":"span"},{"text":"in Figure ","element":"span"},{"href":"#id-59","text":"10","element":"a"},{"text":", corresponding to the experiments in Section ","element":"span"},{"href":"#id-60","text":"3.1","element":"a"},{"text":". 
We also evaluate the performance of the models at training step of 600, and report the final accuracy in Table ","element":"span"},{"href":"#id-61","text":"1 ","element":"a"},{"text":"and the anytime accuracy in Table ","element":"span"},{"href":"#id-62","text":"2","element":"a"},{"text":".","element":"span"}],[{"id":"id-59","style":{"width":"93%"},"width":1487,"height":731,"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.13438/images/14-2.png","element":"img"}],[{"text":"Figure 10: The training curves for main results.","element":"figcaption","subtype":"caption"}]]}],"_version":"3.3.4"},"paperNode":"$1b:props:children:props:children:0:props:product"}]]]}]}]