2e:[[["$","$L34","0",{"heading":"Abstract","index":0,"length":13,"content":[["$","$L1a","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-45","style":"$undefined","children":"Recent advances in multimodal Reward Models (RMs) have shown significant "}]}],["$","$1","1",{"children":"promise in delivering reward signals to align vision models with human preferences. However, current RMs are generally restricted to providing direct responses or engaging in shallow reasoning processes with limited depth, often leading to inaccurate reward signals. We posit that incorporating explicit long chains of thought (CoT) into the reward reasoning process can significantly strengthen their reliability and robustness. Furthermore, we believe that once RMs internalize CoT reasoning, their direct response accuracy can also be improved through implicit reasoning capabilities. To this end, this paper proposes U"}],["$","$1","2",{"children":"NIFIED"}],["$","$1","3",{"children":"R"}],["$","$1","4",{"children":"EWARD"}],["$","$1","5",{"children":"-T"}],["$","$1","6",{"children":"HINK"}],["$","$1","7",{"children":", the first unified multimodal CoT-based reward model, capable of multi-dimensional, step-by-step long-chain reasoning for both visual understanding and generation reward tasks. 
Specifically, we adopt an exploration-driven reinforcement fine-tuning approach to elicit and incentivize the model’s latent complex reasoning"}]]}],["$","$L1a","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"$35"}]]}]]}],["$","$L34","1",{"heading":"1 Introduction","index":1,"length":13,"content":[["$","$L1a","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"In recent years, multimodal reward models (RMs) "}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:2"}]}],["$","$1","3",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:3"}]}],["$","$1","4",{"children":", "}],["$","$1","5",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:5"}]}],["$","$1","6",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:6"}]}],["$","$1","7",{"children":", 
"}],["$","$1","8",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:8"}]}],["$","$1","9",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:9"}]}],["$","$1","10",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:10"}]}],["$","$1","11",{"children":", "}],["$","$1","12",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:12"}]}],["$","$1","13",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:13"}]}],["$","$1","14",{"children":", "}],["$","$1","15",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:15"}]}],["$","$1","16",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:16"}]}],["$","$1","17",{"children":", "}],["$","$1","18",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:18"}]}],["$","$1","19",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:19"}]}],["$","$1","20",{"children":", "}],["$","$1","21",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:21"}]}],["$","$1","22",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:22"}]}],["$","$1","23",{"children":"have excelled at aligning vision model outputs with human preferences, providing crucial reward signals to guide model training 
"}],["$","$1","24",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:24"}]}],["$","$1","25",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:25"}]}],["$","$1","26",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:26"}]}],["$","$1","27",{"children":", "}],["$","$1","28",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:28"}]}],["$","$1","29",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:29"}]}],["$","$1","30",{"children":", "}],["$","$1","31",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:31"}]}],["$","$1","32",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:32"}]}],["$","$1","33",{"children":", "}],["$","$1","34",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:34"}]}],["$","$1","35",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:35"}]}],["$","$1","36",{"children":", "}],["$","$1","37",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:37"}]}],["$","$1","38",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:38"}]}],["$","$1","39",{"children":"and inference 
"}],["$","$1","40",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:40"}]}],["$","$1","41",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:41"}]}],["$","$1","42",{"children":", "}],["$","$1","43",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:43"}]}],["$","$1","44",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:44"}]}],["$","$1","45",{"children":". Traditional reward models are typically trained on large-scale human-annotated preference data through supervised fine-tuning (SFT). At test time, most methods "}],["$","$1","46",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:46"}]}],["$","$1","47",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:47"}]}],["$","$1","48",{"children":", "}],["$","$1","49",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:49"}]}],["$","$1","50",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:50"}]}],["$","$1","51",{"children":", "}],["$","$1","52",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:52"}]}],["$","$1","53",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:53"}]}],["$","$1","54",{"children":"directly assign scores or provide pairwise ranking for vision model outputs, relying on the knowledge and intuitions acquired from the training data. 
While effective, these methods tend to lack interpretability, which makes it difficult for users to understand the underlying reasoning process behind the assigned scores or rankings. To this end, recent studies "}],["$","$1","55",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:55"}]}],["$","$1","56",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:56"}]}],["$","$1","57",{"children":", "}],["$","$1","58",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:58"}]}],["$","$1","59",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:59"}]}],["$","$1","60",{"children":", "}],["$","$1","61",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:61"}]}],["$","$1","62",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:62"}]}],["$","$1","63",{"children":", "}],["$","$1","64",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:64"}]}],["$","$1","65",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:0:65"}]}],["$","$1","66",{"children":"leverage the generative capabilities of Visual-Language Models (VLMs), enabling RMs to provide concise justifications alongside the assigned reward signals. 
Despite their success, their reasoning processes often lack rigorous logical structure and the capacity for multi-dimensional, deep analysis, which may result in inaccurate reward signals in complex scenarios or misguided conclusions arising from flawed reasoning processes."}]]}],["$","$L1a","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"In light of these issues, we posit two key hypotheses: (1) Incorporating explicit long Chains-of-Thought (CoT) into the reward reasoning process is essential for significantly enhancing RM’s reliability and robustness; (2) Once the model internalizes this ability, the accuracy of its directly provided reward signals, even without CoT reasoning traces, can also be improved by leveraging its implicit logical thinking capabilities. However, equipping RMs with CoT reasoning using traditional training schemes like SFT poses a significant challenge due to the scarcity of large-scale CoT reward data, as manual annotation requires substantial human resources and time."}]]}],["$","$L1a","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"In this work, we argue that this challenge can be effectively addressed, as VLMs inherently possess prior knowledge of complex reasoning; what is needed is an effective strategy to elicit and incentivize this capability 
"}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:2"}]}],["$","$1","3",{"children":". Therefore, this paper proposes U"}],["$","$1","4",{"children":"NIFIED"}],["$","$1","5",{"children":"R"}],["$","$1","6",{"children":"EWARD"}],["$","$1","7",{"children":"-T"}],["$","$1","8",{"children":"HINK"}],["$","$1","9",{"children":", the first unified multimodal CoT-based reward model, capable of performing multi-dimensional, step-by-step long-chain reasoning across both visual understanding and generation reward tasks. The core idea is to activate the model’s latent long-chain reasoning capabilities through limited CoT reward data and to progressively reinforce and refine this capability through exploration-driven reinforcement fine-tuning that optimizes for accurate and robust reasoning patterns. Specifically, our training pipeline, as shown in Fig. "}],["$","$1","10",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:10"}]}],["$","$1","11",{"children":"consists of three stages: (1) "}],["$","$1","12",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:12:style","children":"Cold Start"}]}],["$","$1","13",{"children":". 
We first distill the reasoning process of GPT-4o "}],["$","$1","14",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:14"}]}],["$","$1","15",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:15"}]}],["$","$1","16",{"children":"using a small amount of image generation preference data, which serves to cold-start the model training, enabling it to learn the structure and format of long CoT reasoning. (2) "}],["$","$1","17",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:17:style","children":"Rejection Sampling"}]}],["$","$1","18",{"children":". Next, we prepare large-scale unified multimodal preference data to incentivize the model’s CoT reasoning outputs across various visual reward tasks. Correct reasoning trajectories are retained and utilized through rejection sampling to strengthen the distribution of correct reasoning patterns. (3) "}],["$","$1","19",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:19:style","children":"Group Relative Policy Optimization "}]}],["$","$1","20",{"children":"(GRPO). Finally, incorrectly reasoned samples are leveraged for GRPO-based reinforcement fine-tuning, enabling the model to explore diverse reasoning paths and optimize toward desirable outcomes defined by our verified rewards ("}],["$","$1","21",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:2:21:style","children":"i.e., "}]}],["$","$1","22",{"children":"format reward and accuracy reward). 
Unlike SFT, which merely imitates predefined answers, GRPO promotes trial-and-error"}]]}],["$","$L1a","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/2-0.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:3:0"}]]}]}]]}],["$","$L1a","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Figure 2: "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"id-14","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:4:1:style","children":"Method Overview. 
"}]}],["$","$1","2",{"children":"The training pipeline consists of three key stages: (1) "}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:4:3:style","children":"Cold Start"}]}],["$","$1","4",{"children":": We first distill GPT-4o’s reasoning process on a small amount of image generation preference data to initialize the model’s CoT reasoning format; (2) "}],["$","$1","5",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:4:5:style","children":"Rejection Sampling"}]}],["$","$1","6",{"children":": Then, we leverage the model’s generalization capabilities on large-scale unified multimodal preference data to elicit its CoT reasoning process across various vision tasks, using correctly predicted samples for rejection sampling to refine the model; (3) "}],["$","$1","7",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:1:paragraphs:4:7:style","children":"GRPO"}]}],["$","$1","8",{"children":": Finally, incorrectly predicted samples are utilized for GRPO-based reinforcement fine-tuning to further enhance the model’s step-by-step reasoning capabilities."}]]}],["$","$L1a","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"learning by evaluating and refining the model’s reasoning outputs based on verified rewards, thus encouraging deeper reasoning discovery rather than passive 
memorization."}]]}],["$","$L1a","6",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Extensive experiments demonstrate that incorporating long CoT reasoning significantly improves the accuracy and reliability of reward signals. Remarkably, experimental results also prove that once the model internalizes the CoT reasoning ability, it also exhibits strong implicit reasoning capabilities: even when providing direct reward outputs without explicit reasoning traces, it consistently outperforms existing baselines across all vision reward tasks."}]]}],["$","$L1a","7",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"In summary, our contributions are threefold: (1) We propose the first unified multimodal CoT-based reward model, U"}],["$","$1","1",{"children":"NIFIED"}],["$","$1","2",{"children":"R"}],["$","$1","3",{"children":"EWARD"}],["$","$1","4",{"children":"-T"}],["$","$1","5",{"children":"HINK"}],["$","$1","6",{"children":", capable of multi-dimensional, step-by-step long-chain reasoning across both visual understanding and generation tasks; (2) We demonstrate that explicit long CoT reasoning substantially enhances reward model reliability, and once mastered, also strengthens implicit reasoning, enabling more accurate reward signals even without explicit reasoning traces; (3) Extensive experiments validate the superiority of our method compared with existing baselines across all vision reward tasks. 
We hope our work can unlock reward models’ reasoning potential to enhance interpretability, generalization, and alignment, enabling more trustworthy and human-aligned reward signals for multimodal generation and understanding."}]]}]]}],["$","$L34","2",{"heading":"2 Related Work","index":2,"length":13,"content":[["$","$L1a","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:0:0:style","children":"2.1 "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:0:1:style","children":"Multimodal Reward Models"}]}]]}],["$","$L1a","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Multimodal reward models have become increasingly important for aligning vision understanding and generation models with human preferences. 
A dominant approach is to fine-tune visual-language "}],["$","$1","1",{"children":"models (VLMs) "}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:2"}]}],["$","$1","3",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:3"}]}],["$","$1","4",{"children":", "}],["$","$1","5",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:5"}]}],["$","$1","6",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:6"}]}],["$","$1","7",{"children":", exploiting their powerful multimodal alignment capabilities to learn human judgment-based reward functions. Earlier studies have explored reward models on visual generation "}],["$","$1","8",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:8"}]}],["$","$1","9",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:9"}]}],["$","$1","10",{"children":", "}],["$","$1","11",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:11"}]}],["$","$1","12",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:12"}]}],["$","$1","13",{"children":", "}],["$","$1","14",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:14"}]}],["$","$1","15",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:15"}]}],["$","$1","16",{"children":", 
"}],["$","$1","17",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:17"}]}],["$","$1","18",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:18"}]}],["$","$1","19",{"children":"and understanding "}],["$","$1","20",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:20"}]}],["$","$1","21",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:21"}]}],["$","$1","22",{"children":", "}],["$","$1","23",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:23"}]}],["$","$1","24",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:24"}]}],["$","$1","25",{"children":"tasks. For instance, "}],["$","$1","26",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:26"}]}],["$","$1","27",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:27"}]}],["$","$1","28",{"children":"collects human feedback and constructs human-rated video datasets to train the reward model, LiFTCritic, which measures how well the generated videos align with human expectations. 
"}],["$","$1","29",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:29"}]}],["$","$1","30",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:30"}]}],["$","$1","31",{"children":"develops an effective pipeline for constructing multimodal preference datasets and leverages existing high-quality data to train the reward model, IXC-2.5-Reward, enabling accurate evaluation of visual understanding outputs. However, these reward models are task-specific, limiting their adaptability across diverse visual understanding and generation tasks. To address this limitation, "}],["$","$1","32",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:32"}]}],["$","$1","33",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:1:33"}]}],["$","$1","34",{"children":"introduces UnifiedReward, a unified reward model capable of assessing both image and video generation and understanding tasks, demonstrating that joint learning across diverse visual tasks can yield substantial mutual benefits."}]]}],["$","$L1a","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Despite their effectiveness, these reward models are largely limited to providing direct responses 
"}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:2"}]}],["$","$1","3",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:3"}]}],["$","$1","4",{"children":", "}],["$","$1","5",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:5"}]}],["$","$1","6",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:6"}]}],["$","$1","7",{"children":", "}],["$","$1","8",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:8"}]}],["$","$1","9",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:9"}]}],["$","$1","10",{"children":", "}],["$","$1","11",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:11"}]}],["$","$1","12",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:12"}]}],["$","$1","13",{"children":"or engaging in shallow reasoning with limited depth "}],["$","$1","14",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:14"}]}],["$","$1","15",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:15"}]}],["$","$1","16",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:16"}]}],["$","$1","17",{"children":", 
"}],["$","$1","18",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:18"}]}],["$","$1","19",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:2:19"}]}],["$","$1","20",{"children":", often resulting in inaccurate or unreliable reward signals in complex scenarios or misguided conclusions arising from flawed reasoning processes. To this end, we propose U"}],["$","$1","21",{"children":"NIFIED"}],["$","$1","22",{"children":"R"}],["$","$1","23",{"children":"EWARD"}],["$","$1","24",{"children":"-T"}],["$","$1","25",{"children":"HINK"}],["$","$1","26",{"children":", the first unified multimodal long CoT-based reward model, enabling multi-dimensional long-chain reasoning for both visual understanding and generation tasks. The core idea is to use reinforcement learning to activate and enhance VLMs’ latent reasoning capabilities, which will be discussed in the following section."}]]}],["$","$L1a","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:3:0:style","children":"2.2 "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:3:1:style","children":"Reinforcement Learning"}]}]]}],["$","$L1a","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Recently, reinforcement learning (RL) techniques have been 
extensively used to enhance the reasoning capabilities of Large-Language Models (LLMs), enabling them to effectively solve complex problems "}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:2"}]}],["$","$1","3",{"children":", "}],["$","$1","4",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:4"}]}],["$","$1","5",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:5"}]}],["$","$1","6",{"children":", "}],["$","$1","7",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:7"}]}],["$","$1","8",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:8"}]}],["$","$1","9",{"children":", "}],["$","$1","10",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:10"}]}],["$","$1","11",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:11"}]}],["$","$1","12",{"children":", "}],["$","$1","13",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:13"}]}],["$","$1","14",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:14"}]}],["$","$1","15",{"children":", 
"}],["$","$1","16",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:16"}]}],["$","$1","17",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:17"}]}],["$","$1","18",{"children":", "}],["$","$1","19",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:19"}]}],["$","$1","20",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:20"}]}],["$","$1","21",{"children":", "}],["$","$1","22",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:22"}]}],["$","$1","23",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:23"}]}],["$","$1","24",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:24"}]}],["$","$1","25",{"children":". A significant advancement in this field is Deepseek-R1-Zero "}],["$","$1","26",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:26"}]}],["$","$1","27",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:27"}]}],["$","$1","28",{"children":", which introduced a novel approach for developing robust reasoning capabilities using Group Relative Policy Optimization (GRPO). By leveraging rule-based rewards, it enhances reasoning without the need for supervised fine-tuning (SFT). 
For VLMs, RL has mainly been applied to tasks such as mitigating hallucinations "}],["$","$1","29",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:29"}]}],["$","$1","30",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:30"}]}],["$","$1","31",{"children":", "}],["$","$1","32",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:32"}]}],["$","$1","33",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:33"}]}],["$","$1","34",{"children":", aligning models with human preferences "}],["$","$1","35",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:35"}]}],["$","$1","36",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:36"}]}],["$","$1","37",{"children":", "}],["$","$1","38",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:38"}]}],["$","$1","39",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:39"}]}],["$","$1","40",{"children":", improving reasoning and visual perception "}],["$","$1","41",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:41"}]}],["$","$1","42",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:42"}]}],["$","$1","43",{"children":", and visual quality assessment 
"}],["$","$1","44",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:44"}]}],["$","$1","45",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:2:paragraphs:4:45"}]}],["$","$1","46",{"children":"."}]]}],["$","$L1a","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"However, the application of RL in multimodal reward models is still under exploration. To address this gap, our work introduces a reinforcement fine-tuning strategy that leverages verifiable rewards with GRPO-based RL to integrate long CoT reasoning, enhancing the accuracy of reward signals by enabling multi-dimensional and step-by-step reasoning processes."}]]}]]}],["$","$L34","3",{"heading":"3 Method","index":3,"length":13,"content":[["$","$L1a","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:0:0:style","children":"3.1 "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:0:1:style","children":"Overview"}]}]]}],["$","$L1a","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"This work aims to incorporate long Chain-of-Thought (CoT) reasoning into the 
reward model’s decision-making process to enhance the reliability and robustness of reward signals. However, achieving this with traditional training methods like Supervised Fine-Tuning (SFT) remains highly challenging due to the scarcity of large-scale CoT-based reward data. This work posits that Visual-Language Models (VLMs) inherently possess prior knowledge of complex reasoning; the key challenge lies in devising an effective strategy to elicit and incentivize this capability. Therefore, we take the first step to propose the unified multimodal CoT-based reward model, U"}],["$","$1","1",{"children":"NIFIED"}],["$","$1","2",{"children":"R"}],["$","$1","3",{"children":"EWARD"}],["$","$1","4",{"children":"-T"}],["$","$1","5",{"children":"HINK"}],["$","$1","6",{"children":", adopting exploration-driven reinforcement fine-tuning to activate and refine the VLM’s multi-dimensional and step-by-step long chain reasoning across various vision reward tasks."}]]}],["$","$L1a","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Specifically, as shown in Fig. "}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:2:1"}]}],["$","$1","2",{"children":"our pipeline includes three key stages: (1) Cold Start: use a small amount of distilled CoT reward data to initialize the reward model with the format and structure of multi-step reasoning (Sec. 
"}],["$","$1","3",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:2:3"}]}],["$","$1","4",{"children":"; (2) Rejection Sampling: Utilize large-scale unified preference data to elicit the model’s generalizable CoT reasoning across diverse vision tasks; correctly reasoned samples are retained for rejection sampling to reinforce accurate reasoning patterns (Sec. "}],["$","$1","5",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:2:5"}]}],["$","$1","6",{"children":"; (3) Group"}]]}],["$","$L1a","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/4-0.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:3:0"}]]}]}]]}],["$","$L1a","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Figure 3: "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"id-46","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:4:1:style","children":"Qualitative Results of Video Generation CoT Reward Reasoning. 
"}]}],["$","$1","2",{"children":"Given a pair of videos and the corresponding caption, our model performs quality assessment across semantic consistency, temporal coherence, and authenticity through CoT reasoning."}]]}],["$","$L1a","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Relative Policy Optimization (GRPO): Leverage incorrectly reasoned samples for GRPO-based reinforcement fine-tuning to further improve the model’s CoT reasoning capabilities (Sec. "}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:5:1"}]}],["$","$1","2",{"children":"."}]]}],["$","$L1a","6",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-33","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:6:0:style","children":"3.2 "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:6:1:style","children":"Cold Start: Learning CoT Reward Format"}]}]]}],["$","$L1a","7",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"In this work, we hypothesize that VLMs inherently possess the potential for complex, long-chain reasoning. 
However, due to the absence of exposure to reward modeling during pre-training, they often lack a suitable format to articulate such reasoning in this context, which leads to inconsistent or shallow outputs. To address this, we initiate the training with a cold start phase, where a small amount of high-quality CoT reward data distilled from GPT-4o "}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:7:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:7:2"}]}],["$","$1","3",{"children":"is used to explicitly teach the model the desired reasoning format and structure. Specifically, we begin by preparing a small set of image generation preference data, each consisting of an image pair and a prompt ("}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:7:4:style","children":"i.e., "}]}],["$","$1","5",{"children":"instruction and image caption). These samples are then fed into GPT-4o to generate the detailed long-chain reasoning process and the corresponding final answer. 
Among the generated samples, we retain only those where the reasoning trajectory leads to correct answers, denoted as "}],["$","$1","6",{"children":"("}],["$","$1","7",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:7:7:style","children":"x"}]}],["$","$1","8",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:7:8:style","children":", "}]}],["$","$1","9",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:7:9:style","children":"y"}]}],["$","$1","10",{"children":") "}],["$","$1","11",{"children":"where "}],["$","$1","12",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:7:12:style","children":"x "}]}],["$","$1","13",{"children":"is the original input, and "}],["$","$1","14",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:7:14:style","children":"y "}]}],["$","$1","15",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:7:15"}]]}]}],["$","$1","16",{"children":"is the distilled output, which are subsequently used to cold-start the training. 
The objective function is defined as:"}]]}],["$","$L1a","8",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/4-2.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:8:0"}]]}]}]]}],["$","$L1a","9",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"where "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:9:1"}]]}]}],["$","$1","2",{"children":"represents the parameters of the reward model. 
This stage serves to initialize the model’s ability to follow a structured CoT reasoning format."}]]}],["$","$L1a","10",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-34","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:10:0:style","children":"3.3 "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:10:1:style","children":"Rejection Sampling: Unified CoT Reward Generalization Fine-Tuning"}]}]]}],["$","$L1a","11",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"After learning the CoT reasoning format through cold-start training, we further elicit and expand the model’s reasoning capabilities to vision understanding and generation tasks. 
This is inspired by a prior study "}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:11:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:11:2"}]}],["$","$1","3",{"children":"$39"}],["$","$1","4",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:11:4"}]}],["$","$1","5",{"children":"but it utilizes the filtered reasoning data of diverse vision tasks ("}],["$","$1","6",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:11:6:style","children":"x"}]}],["$","$1","7",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:11:7"}]]}]}],["$","$1","8",{"children":", "}],["$","$1","9",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:11:9:style","children":"y"}]}],["$","$1","10",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:11:10"}]]}]}],["$","$1","11",{"children":") obtained through rejection 
sampling."}]]}],["$","$L1a","12",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/5-0.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:12:0"}]]}]}]]}],["$","$L1a","13",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"1. "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:13:1:style","children":"Semantic accuracy"}]}],["$","$1","2",{"children":": - Answer 1 (8/10) - The answer captures the emotions of happiness and freedom well, supported by the boy's smile and the peaceful"}]]}],["$","$L1a","14",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"suburban setting. However, it could delve deeper into the visual details. 
- Answer 2 (9/10) - This answer effectively conveys the themes of youthful freedom and innocence, supported by the boy's expression"}]]}],["$","$L1a","15",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"2. "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:15:1:style","children":"Correctness"}]}],["$","$1","2",{"children":": - Answer 1 (7/10) - The interpretation is correct but somewhat simplistic. It focuses mainly on the boy's emotions without much depth. - Answer 2 (9/10) - The answer is factually correct and provides a nuanced understanding of the scene, including the setting and the"}]]}],["$","$L1a","16",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/5-1.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:16:0"}]]}]}]]}],["$","$L1a","17",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Figure 4: 
"}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"id-47","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:17:1:style","children":"Qualitative Cases of Image and Video Understanding CoT Reward Reasoning. "}]}],["$","$1","2",{"children":"Given an image or video, a query, and a pair of candidate answers, our model performs quality assessment across semantic accuracy, factual correctness, and clarity through CoT reasoning."}]]}],["$","$L1a","18",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"This process concentrates the training distribution around accurate reasoning patterns and enhances the model’s generalization ability across diverse visual domains."}]]}],["$","$L1a","19",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-35","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:19:0:style","children":"3.4 "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:19:1:style","children":"GRPO: Unified CoT Reward Reinforcement Fine-Tuning"}]}]]}],["$","$L1a","20",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"In the rejection sampling stage, a subset of challenging data featuring intricate reasoning patterns is filtered, which the model has yet to 
fully comprehend and master. To ensure the model fully learns the underlying knowledge in the training dataset and further enhances its reasoning ability, we introduce GRPO-based reinforcement fine-tuning "}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:20:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:20:2"}]}],["$","$1","3",{"children":". Specifically, in GRPO, the policy model "}],["$","$1","4",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:20:4"}]]}]}],["$","$1","5",{"children":"generates multiple candidate responses for a given input, which are evaluated using predefined verifiable reward functions, providing corresponding reward signals. These signals guide policy updates, encouraging alignment with high-quality reasoning while constraining deviations from the reference model "}],["$","$1","6",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:20:6"}]]}]}],["$","$1","7",{"children":". This approach enables the model to explore diverse reasoning processes, guiding it toward the correct reasoning trajectory and improving its ability to handle complex scenarios. 
In the following, we will first introduce the design of verifiable rewards in our work, followed by a detailed description of the GRPO training process."}]]}],["$","$L1a","21",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:21:0:style","children":"3.4.1 "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:21:1:style","children":"Verifiable Reward"}]}]]}],["$","$L1a","22",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"In GRPO, verifiable rewards are essential for guiding the model’s learning by offering rule-based feedback. 
In this work, we employ two types of verifiable rewards, "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:22:1:style","children":"i.e., "}]}],["$","$1","2",{"children":"format reward and accuracy reward, to ensure both the quality and accuracy of the model’s responses:"}]]}],["$","$L1a","23",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:23:0:style","children":"Format Reward "}]}],["$","$1","1",{"children":"ensures that the generated response follows a specific reasoning structure, which is critical for maintaining clarity and consistency in the reasoning process. In this case, the response is expected to contain two key tags: "}],["$","$1","2",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:23:2"}]]}]}],["$","$1","3",{"children":". These tags are used to delineate the model’s reasoning process and the final answer, respectively. 
If both tags are present and properly formatted in the response, the reward "}],["$","$1","4",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:23:4"}]]}]}],["$","$1","5",{"children":"is set to 1. Otherwise, the reward "}],["$","$1","6",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:23:6"}]]}]}],["$","$1","7",{"children":"is 0. This mechanism helps reinforce the importance of structuring the response."}]]}],["$","$L1a","24",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:24:0:style","children":"Accuracy Reward "}]}],["$","$1","1",{"children":"evaluates whether the final answer output "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:24:2:style","children":"o "}]}],["$","$1","3",{"children":"("}],["$","$1","4",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:24:4:style","children":"i.e., "}]}],["$","$1","5",{"children":"Image/Video/Answer 
"}],["$","$1","6",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:24:6:style","children":"X "}]}],["$","$1","7",{"children":"is better), enclosed within the "}],["$","$1","8",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:24:8"}]]}]}],["$","$1","9",{"children":"tag, exactly matches the ground truth. This reward serves as a reliable signal to ensure that the model produces correct answers. If "}],["$","$1","10",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:24:10:style","children":"o "}]}],["$","$1","11",{"children":"matches the ground truth precisely, the reward is set to 1; otherwise, it is 0. 
Formally, the accuracy reward is defined as:"}]]}],["$","$L1a","25",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/6-4.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:25:0"}]]}]}]]}],["$","$L1a","26",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"This is crucial for reinforcing accurate reasoning and encouraging the model to generate correct responses. 
Finally, the overall verifiable reward "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:26:1:style","children":"r "}]}],["$","$1","2",{"children":"is formulated as:"}]]}],["$","$L1a","27",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/6-5.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:27:0"}]]}]}]]}],["$","$L1a","28",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"By incorporating both the format and accuracy rewards into the GRPO training, we provide the model with explicit feedback that encourages it to generate responses that are both well-structured and factually accurate."}]]}],["$","$L1a","29",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:29:0:style","children":"3.4.2 
"}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:29:1:style","children":"Reinforcement Fine-Tuning"}]}]]}],["$","$L1a","30",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Given an input "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:30:1:style","children":"x"}]}],["$","$1","2",{"children":", GRPO first samples "}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:30:3:style","children":"N "}]}],["$","$1","4",{"children":"distinct responses "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:30:5"}]]}]}],["$","$1","6",{"children":"from the previous policy model "}],["$","$1","7",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:30:7"}]]}]}],["$","$1","8",{"children":". 
Each response is evaluated using our verifiable rewards, "}],["$","$1","9",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:30:9:style","children":"i.e., "}]}],["$","$1","10",{"children":"format and accuracy rewards, resulting in corresponding reward scores "}],["$","$1","11",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:30:11"}]]}]}],["$","$1","12",{"children":". Then, GRPO normalizes these scores and quantifies the relative quality of each response by computing the advantage of each response using the standardized reward:"}]]}],["$","$L1a","31",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/6-9.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:31:0"}]]}]}]]}],["$","$L1a","32",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"where 
"}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:32:1"}]]}]}],["$","$1","2",{"children":"quantifies the relative quality of the "}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:32:3:style","children":"i"}]}],["$","$1","4",{"children":"-th response in comparison to other candidates within the same sampled group."}]]}],["$","$L1a","33",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Then, GRPO estimates magnitude of the policy update by computing the likelihood ratio of each response under the new policy "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:33:1"}]]}]}],["$","$1","2",{"children":"relative to the old policy "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:33:3"}]]}]}],["$","$1","4",{"children":", defined 
as:"}]]}],["$","$L1a","34",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/6-13.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:34:0"}]]}]}]]}],["$","$L1a","35",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"To stabilize training and avoid overly aggressive updates, the ratio is clipped to a bounded interval "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:35:1"}]]}]}],["$","$1","2",{"children":". 
Moreover, to ensure that the updated policy does not diverge significantly from a fixed reference model "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:35:3"}]]}]}],["$","$1","4",{"children":", a KL divergence penalty term is introduced with a weighting factor "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:35:5"}]]}]}],["$","$1","6",{"children":". The final optimization objective is defined as:"}]]}],["$","$L1a","36",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/6-17.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:36:0"}]]}]}]]}],["$","$L1a","37",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"where 
"}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:37:1:style","children":"X "}]}],["$","$1","2",{"children":"denotes the set of training sample input and "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:3:paragraphs:37:3"}]]}]}],["$","$1","4",{"children":"denotes the KL divergence."}]]}],["$","$L1a","38",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"By integrating normalized reward advantages, clipped importance sampling, and reference-model regularization, GRPO enables stable and effective policy optimization, driving our model toward generating higher-quality, verifiably correct CoT reasoning paths."}]]}]]}],["$","$L34","4",{"heading":"4 Experiments","index":4,"length":13,"content":[["$","$L1a","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:0:0:style","children":"4.1 "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:0:1:style","children":"Experimental 
Setup"}]}]]}],["$","$L1a","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:1:0:style","children":"Datasets. "}]}],["$","$1","1",{"children":"For "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:1:2:style","children":"Image Generation"}]}],["$","$1","3",{"children":", we utilize HPD (25.6K) "}],["$","$1","4",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:1:4"}]}],["$","$1","5",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:1:5"}]}],["$","$1","6",{"children":", OIP (7.4K)"}],["$","$1","7",{"children":"3"}],["$","$1","8",{"children":", EvalMuse (3K) "}],["$","$1","9",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:1:9"}]}],["$","$1","10",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:1:10"}]}],["$","$1","11",{"children":", all preprocessed by 
"}],["$","$1","12",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:1:12"}]}],["$","$1","13",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:1:13"}]}],["$","$1","14",{"children":"—as"}]]}],["$","$L1a","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Table 1: "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"id-43","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:2:1:style","children":"Image Understanding Assessment Comparison. "}]}],["$","$1","2",{"children":"We evaluate baselines across different understanding aspects on VLRewardBench."}]]}],["$","$L1a","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/7-0.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:3:0"}]]}]}]]}],["$","$L1a","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Table 2: 
"}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"id-44","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:4:1:style","children":"Image and Video Generation Assessment Comparison. "}]}],["$","$1","2",{"children":"Evaluation is conducted on GenAI-Bench (Image and Video) and VideoGen-Reward. “tau” indicates that accuracy is calculated with ties, and “diff” excludes tied pairs when calculating accuracy."}]]}],["$","$L1a","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/7-1.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:5:0"}]]}]}]]}],["$","$L1a","6",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"well as OpenAI-4o_t2i_human_preference (6.7K) collected by Rapidata"}],["$","$1","1",{"children":"4"}],["$","$1","2",{"children":". 
For "}],["$","$1","3",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:3:style","children":"Video Generation"}]}],["$","$1","4",{"children":", we employ VideoDPO (10K) "}],["$","$1","5",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:5"}]}],["$","$1","6",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:6"}]}],["$","$1","7",{"children":"and Text2Video-Human Preferences (5.7K), also collected by Rapidata. For "}],["$","$1","8",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:8:style","children":"Image Understanding"}]}],["$","$1","9",{"children":", we sample 30K data from LLaVA-Critic-113K "}],["$","$1","10",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:10"}]}],["$","$1","11",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:11"}]}],["$","$1","12",{"children":". For "}],["$","$1","13",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:13:style","children":"Video Understanding"}]}],["$","$1","14",{"children":", we adopt ShareGPTVideo-DPO (17K) "}],["$","$1","15",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:15"}]}],["$","$1","16",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:16"}]}],["$","$1","17",{"children":". In the cold-start stage, we distill 5K image generation CoT reward reasoning samples from GPT-4o. 
The input data are randomly drawn from four image generation datasets, with the remaining data reserved for the subsequent training stages. "}],["$","$1","18",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:18:style","children":"Reward Model. "}]}],["$","$1","19",{"children":"We adopt UnifiedReward "}],["$","$1","20",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:20"}]}],["$","$1","21",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:21"}]}],["$","$1","22",{"children":"as our base model, which is capable of assessing both image/video generation and understanding. We leverage its strong performance and extensive prior knowledge in visual perception and generation, and further activate its latent capacity for long-chain CoT reasoning. "}],["$","$1","23",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:23:style","children":"Evaluations. "}]}],["$","$1","24",{"children":"We evaluate image and video understanding reward assessment on VLRewardBench "}],["$","$1","25",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:25"}]}],["$","$1","26",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:26"}]}],["$","$1","27",{"children":"and ShareGPTVideo "}],["$","$1","28",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:28"}]}],["$","$1","29",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:29"}]}],["$","$1","30",{"children":", using 5K test samples, respectively. 
For generation evaluation, we adopt GenAIBench "}],["$","$1","31",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:31"}]}],["$","$1","32",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:32"}]}],["$","$1","33",{"children":", which covers both image and video reward benchmarks. Additionally, we utilize VideoGen-RewardBench "}],["$","$1","34",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:34"}]}],["$","$1","35",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:6:35"}]}],["$","$1","36",{"children":"to further assess video generation."}]]}],["$","$L1a","7",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:7:0:style","children":"4.2 "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:7:1:style","children":"Comparison Results"}]}]]}],["$","$L1a","8",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"The quantitative results, presented in Tabs. 
"}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:8:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:8:2"}]}],["$","$1","3",{"children":"and Fig. "}],["$","$1","4",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:8:4"}]}],["$","$1","5",{"children":"$3a"}],["$","$1","6",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:8:6"}]}],["$","$1","7",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:8:7"}]}],["$","$1","8",{"children":"and "}],["$","$1","9",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:8:9"}]}]]}],["$","$L1a","9",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Table 3: "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"id-48","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:9:1:style","children":"Ablation Results of Image Understanding Assessment. 
"}]}],["$","$1","2",{"children":"We conduct ablation experiments under different settings and evaluate them across multiple aspects on VLRewardBench."}]]}],["$","$L1a","10",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/8-0.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:10:0"}]]}]}]]}],["$","$L1a","11",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Table 4: "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"id-49","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:11:1:style","children":"Ablation Results of Image and Video Generation Assessment Comparison. "}]}],["$","$1","2",{"children":"We conduct ablation experiments under different settings and evaluate them on GenAI-Bench (Image and Video) and VideoGen-Reward. 
“tau” indicates that accuracy is calculated with ties, and “diff” excludes tied pairs when calculating accuracy."}]]}],["$","$L1a","12",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/8-1.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:12:0"}]]}]}]]}],["$","$L1a","13",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:13:0:style","children":"4.3 "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:13:1:style","children":"Ablation Studies"}]}]]}],["$","$L1a","14",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:14:0:style","children":"Ablation of Each Training Stage. 
"}]}],["$","$1","1",{"children":"We conduct ablation studies to assess the effectiveness of each training stage. As shown in Tabs. "}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:14:2"}]}],["$","$1","3",{"children":"and "}],["$","$1","4",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:14:4"}]}],["$","$1","5",{"children":"after the cold start phase, though the model learns the CoT reasoning format, it still struggles with accurate reward prediction. Notably, introducing rejection sampling leads to clear improvements by retaining correctly reasoned samples for supervised fine-tuning, thereby reinforcing desirable reasoning patterns. Further, the GRPO stage yields the most substantial gains, as it focuses on previously mispredicted cases, allowing the model to explore multiple reasoning paths and converge on more accurate solutions. These results highlight the complementary roles of each stage and demonstrate how our staged training strategy progressively enhances CoT-based reward modeling."}]]}],["$","$L1a","15",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:15:0:style","children":"Ablation of GRPO without CoT Reasoning. "}]}],["$","$1","1",{"children":"To further validate the necessity of learning the CoT reasoning process, we evaluate a GRPO variant that removes CoT and directly optimizes reward predictions based on final answers. As shown in Tabs. 
"}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:15:2"}]}],["$","$1","3",{"children":"and "}],["$","$1","4",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:4:paragraphs:15:4"}]}],["$","$1","5",{"children":"although this approach yields slight improvements over the baseline, the gains are significantly limited. This suggests that learning from final answers alone fails to teach the model the underlying reasoning process. In contrast, our CoT-based GRPO guides the model to explore multiple reasoning trajectories and gradually converge toward the correct path, leading to deeper understanding and more robust generalization. These results show that the effectiveness of our GRPO-based reinforcement fine-tuning stems from explicitly modeling the reasoning process, rather than simply reinforcing the final answer."}]]}]]}],["$","$L34","5",{"heading":"5 Conclusion","index":5,"length":13,"content":[["$","$L1a","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"This work proposes U"}],["$","$1","1",{"children":"NIFIED"}],["$","$1","2",{"children":"R"}],["$","$1","3",{"children":"EWARD"}],["$","$1","4",{"children":"-T"}],["$","$1","5",{"children":"HINK"}],["$","$1","6",{"children":", the first unified multimodal CoT reward model capable of multi-dimensional, step-by-step reliable reward reasoning for both visual understanding and generation tasks. Specifically, we adopt an exploration-driven reinforcement fine-tuning approach to elicit and incentivize the model’s latent complex reasoning ability, including three key stages: cold start, rejection sampling, and GRPO-based reinforcement fine-tuning. 
Extensive experiments demonstrate that CoT reasoning not only improves the accuracy and robustness of reward signals but also equips the model with strong implicit reasoning capabilities, enabling superior performance even without explicit CoT outputs."}]]}]]}],["$","$L34","6",{"heading":"References","index":6,"length":13,"content":[["$","$L1a","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-17","style":"$undefined","children":"S. Bai, K. Chen, X. Liu, J. Wang, W. Ge, S. Song, K. Dang, P. Wang, S. Wang, J. Tang, et al. "}]}],["$","$1","1",{"children":"Qwen2.5-vl technical report. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:0:2:style","children":"arXiv preprint arXiv:2502.13923"}]}],["$","$1","3",{"children":", 2025."}]]}],["$","$L1a","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-37","style":"$undefined","children":"D. Christodoulou and M. Kuhlmann-Jørgensen. Finding the subjective truth: Collecting 2 million "}]}],["$","$1","1",{"children":"votes for comprehensive gen-ai model evaluation. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:1:2:style","children":"arXiv preprint arXiv:2409.11904"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-11","style":"$undefined","children":"C. Gulcehre, T. L. Paine, S. Srinivasan, K. Konyushkova, L. Weerts, A. Sharma, A. Siddhant, "}]}],["$","$1","1",{"children":"A. Ahern, M. Wang, C. Gu, et al. Reinforced self-training (rest) for language modeling. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:2:2:style","children":"arXiv preprint arXiv:2308.08998"}]}],["$","$1","3",{"children":", 2023."}]]}],["$","$L1a","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-13","style":"$undefined","children":"D. Guo, D. Yang, H. Zhang, J. Song, R. Zhang, R. Xu, Q. Zhu, S. Ma, P. Wang, X. Bi, et al. "}]}],["$","$1","1",{"children":"Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:3:2:style","children":"arXiv preprint arXiv:2501.12948"}]}],["$","$1","3",{"children":", 2025."}]]}],["$","$L1a","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-38","style":"$undefined","children":"S. Han, H. Fan, J. Fu, L. Li, T. Li, J. Cui, Y. Wang, Y. Tai, J. Sun, C. Guo, et al. Evalmuse-40k: A "}]}],["$","$1","1",{"children":"reliable and fine-grained benchmark with comprehensive human annotations for text-to-image generation model evaluation. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:4:2:style","children":"arXiv preprint arXiv:2412.18150"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-4","style":"$undefined","children":"X. He, D. Jiang, G. Zhang, M. Ku, A. Soni, S. Siu, H. Chen, A. Chandra, Z. Jiang, A. Arulraj, "}]}],["$","$1","1",{"children":"K. Wang, Q. D. Do, Y. Ni, B. Lyu, Y. Narsupalli, R. Fan, Z. Lyu, Y. Lin, and W. Chen. Videoscore: Building automatic metrics to simulate fine-grained human feedback for video generation. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:5:2:style","children":"arXiv preprint arXiv:2406.15252"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","6",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-24","style":"$undefined","children":"B. Hui, J. Yang, Z. Cui, J. Yang, D. Liu, L. Zhang, T. Liu, J. Zhang, B. Yu, K. Lu, et al. Qwen2. "}]}],["$","$1","1",{"children":"5-coder technical report. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:6:2:style","children":"arXiv preprint arXiv:2409.12186"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","7",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-15","style":"$undefined","children":"A. Hurst, A. Lerer, A. P. Goucher, A. Perelman, A. Ramesh, A. Clark, A. Ostrow, A. Welihinda, "}]}],["$","$1","1",{"children":"A. Hayes, A. Radford, et al. Gpt-4o system card. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:7:2:style","children":"arXiv preprint arXiv:2410.21276"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","8",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-19","style":"$undefined","children":"A. Jaech, A. Kalai, A. Lerer, A. Richardson, A. El-Kishky, A. Low, A. Helyar, A. Madry, A. Beutel, "}]}],["$","$1","1",{"children":"A. Carney, et al. Openai o1 system card. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:8:2:style","children":"arXiv preprint arXiv:2412.16720"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","9",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-42","style":"$undefined","children":"D. Jiang, M. Ku, T. Li, Y. Ni, S. Sun, R. Fan, and W. Chen. Genai arena: An open evaluation platform "}]}],["$","$1","1",{"children":"for generative models. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:9:2:style","children":"arXiv preprint arXiv:2406.04485"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","10",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-25","style":"$undefined","children":"F. Jiao, G. Guo, X. Zhang, N. F. Chen, S. Joty, and F. Wei. Preference optimization for reasoning "}]}],["$","$1","1",{"children":"with pseudo feedback. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:10:2:style","children":"arXiv preprint arXiv:2411.16345"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","11",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-50","style":"$undefined","children":"Y. Kirstain, A. Polyak, U. Singer, S. Matiana, J. Penna, and O. Levy. Pick-a-pic: An open dataset of "}]}],["$","$1","1",{"children":"user preferences for text-to-image generation. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:11:2:style","children":"NeurIPS"}]}],["$","$1","3",{"children":", 36:36652–36663, 2023."}]]}],["$","$L1a","12",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-16","style":"$undefined","children":"B. Li, Y. Zhang, D. Guo, R. Zhang, F. Li, H. Zhang, K. Zhang, Y. Li, Z. Liu, and C. Li. Llava- "}]}],["$","$1","1",{"children":"onevision: Easy visual task transfer. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:12:2:style","children":"arXiv preprint arXiv:2408.03326"}]}],["$","$1","3",{"children":", 2024a."}]]}],["$","$L1a","13",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-41","style":"$undefined","children":"L. Li, Y. Wei, Z. Xie, X. Yang, Y. Song, P. Wang, C. An, T. Liu, S. Li, B. Y. Lin, et al. Vlreward- "}]}],["$","$1","1",{"children":"bench: A challenging benchmark for vision-language generative reward models. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:13:2:style","children":"arXiv preprint arXiv:2411.17451"}]}],["$","$1","3",{"children":", 2024b."}]]}],["$","$L1a","14",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-7","style":"$undefined","children":"W. Li, X. Zhang, S. Zhao, Y. Zhang, J. Li, L. Zhang, and J. Zhang. Q-insight: Understanding image "}]}],["$","$1","1",{"children":"quality via visual reinforcement learning. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:14:2:style","children":"arXiv preprint arXiv:2503.22679"}]}],["$","$1","3",{"children":", 2025."}]]}],["$","$L1a","15",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-6","style":"$undefined","children":"J. Liu, G. Liu, J. Liang, Z. Yuan, X. Liu, M. Zheng, X. Wu, Q. Wang, W. Qin, M. Xia, X. Wang, "}]}],["$","$1","1",{"children":"X. Liu, F. Yang, P. Wan, D. Zhang, K. Gai, Y. Yang, and W. Ouyang. Improving video generation with human feedback. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:15:2:style","children":"arXiv preprint arXiv:2501.13918"}]}],["$","$1","3",{"children":", 2025a."}]]}],["$","$L1a","16",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-39","style":"$undefined","children":"R. Liu, H. Wu, Z. Ziqiang, C. Wei, Y. He, R. Pi, and Q. Chen. Videodpo: Omni-preference alignment "}]}],["$","$1","1",{"children":"for video diffusion generation. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:16:2:style","children":"arXiv preprint arXiv:2412.14167"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","17",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-32","style":"$undefined","children":"Z. Liu, Z. Sun, Y. Zang, X. Dong, Y. Cao, H. Duan, D. Lin, and J. Wang. Visual-rft: Visual "}]}],["$","$1","1",{"children":"reinforcement fine-tuning. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:17:2:style","children":"arXiv preprint arXiv:2503.01785"}]}],["$","$1","3",{"children":", 2025b."}]]}],["$","$L1a","18",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-20","style":"$undefined","children":"T. Q. Luong, X. Zhang, Z. Jie, P. Sun, X. Jin, and H. Li. Reft: Reasoning with reinforced fine-tuning. "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:18:1:style","children":"arXiv preprint arXiv:2401.08967"}]}],["$","$1","2",{"children":", 3, 2024."}]]}],["$","$L1a","19",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-8","style":"$undefined","children":"L. Ouyang, J. Wu, X. Jiang, D. Almeida, C. Wainwright, P. Mishkin, C. Zhang, S. Agarwal, K. Slama, "}]}],["$","$1","1",{"children":"A. Ray, et al. Training language models to follow instructions with human feedback. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:19:2:style","children":"NeurIPS"}]}],["$","$1","3",{"children":", 35: 27730–27744, 2022."}]]}],["$","$L1a","20",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-9","style":"$undefined","children":"R. Rafailov, A. Sharma, E. Mitchell, C. D. Manning, S. Ermon, and C. Finn. Direct preference "}]}],["$","$1","1",{"children":"optimization: Your language model is secretly a reward model. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:20:2:style","children":"NeurIPS"}]}],["$","$1","3",{"children":", 36:53728–53741, 2023."}]]}],["$","$L1a","21",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-10","style":"$undefined","children":"J. Schulman, F. Wolski, P. Dhariwal, A. Radford, and O. Klimov. Proximal policy optimization "}]}],["$","$1","1",{"children":"algorithms. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:21:2:style","children":"arXiv preprint arXiv:1707.06347"}]}],["$","$1","3",{"children":", 2017."}]]}],["$","$L1a","22",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-21","style":"$undefined","children":"Z. Shao, P. Wang, Q. Zhu, R. Xu, J. Song, X. Bi, H. Zhang, M. Zhang, Y. Li, Y. Wu, et al. "}]}],["$","$1","1",{"children":"Deepseekmath: Pushing the limits of mathematical reasoning in open language models. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:22:2:style","children":"arXiv preprint arXiv:2402.03300"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","23",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-12","style":"$undefined","children":"C. Snell, J. Lee, K. Xu, and A. Kumar. Scaling llm test-time compute optimally can be more effective "}]}],["$","$1","1",{"children":"than scaling model parameters. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:23:2:style","children":"arXiv preprint arXiv:2408.03314"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","24",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-28","style":"$undefined","children":"Z. Sun, S. Shen, S. Cao, H. Liu, C. Li, Y. Shen, C. Gan, L.-Y. Gui, Y.-X. Wang, Y. Yang, et al. "}]}],["$","$1","1",{"children":"Aligning large multimodal models with factually augmented rlhf. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:24:2:style","children":"arXiv preprint arXiv:2309.14525"}]}],["$","$1","3",{"children":", 2023."}]]}],["$","$L1a","25",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-0","style":"$undefined","children":"Y. Wang, Z. Tan, J. Wang, X. Yang, C. Jin, and H. Li. Lift: Leveraging human feedback for "}]}],["$","$1","1",{"children":"text-to-video model alignment. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:25:2:style","children":"arXiv preprint arXiv:2412.04814"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","26",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-1","style":"$undefined","children":"Y. Wang, Y. Zang, H. Li, C. Jin, and W. Jiaqi. Unified reward model for multimodal understanding "}]}],["$","$1","1",{"children":"and generation. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:26:2:style","children":"arXiv preprint arXiv:2503.05236"}]}],["$","$1","3",{"children":", 2025."}]]}],["$","$L1a","27",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-3","style":"$undefined","children":"T. Xiong, X. Wang, D. Guo, Q. Ye, H. Fan, Q. Gu, H. Huang, and C. Li. Llava-critic: Learning to "}]}],["$","$1","1",{"children":"evaluate multimodal models. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:27:2:style","children":"arXiv preprint arXiv:2410.02712"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","28",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-18","style":"$undefined","children":"J. Xu, X. Liu, Y. Wu, Y. Tong, Q. Li, M. Ding, J. Tang, and Y. Dong. Imagereward: Learning and "}]}],["$","$1","1",{"children":"evaluating human preferences for text-to-image generation. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:28:2:style","children":"NeurIPS"}]}],["$","$1","3",{"children":", 36:15903–15935, 2023."}]]}],["$","$L1a","29",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-5","style":"$undefined","children":"J. Xu, Y. Huang, J. Cheng, Y. Yang, J. Xu, Y. Wang, W. Duan, S. Yang, Q. Jin, S. Li, J. Teng, "}]}],["$","$1","1",{"children":"Z. Yang, W. Zheng, X. Liu, M. Ding, X. Zhang, X. Gu, S. Huang, M. Huang, J. Tang, and Y. Dong. Visionreward: Fine-grained multi-dimensional human preference learning for image and video generation. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:29:2:style","children":"arXiv preprint arXiv:2412.21059"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","30",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-22","style":"$undefined","children":"A. Yang, B. Zhang, B. Hui, B. Gao, B. Yu, C. Li, D. Liu, J. Tu, J. Zhou, J. Lin, et al. Qwen2. "}]}],["$","$1","1",{"children":"5-math technical report: Toward mathematical expert model via self-improvement. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:30:2:style","children":"arXiv preprint arXiv:2409.12122"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","31",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-23","style":"$undefined","children":"H. Ying, S. Zhang, L. Li, Z. Zhou, Y. Shao, Z. Fei, Y. Ma, J. Hong, K. Liu, Z. Wang, et al. "}]}],["$","$1","1",{"children":"Internlm-math: Open math large language models toward verifiable reasoning. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:31:2:style","children":"arXiv preprint arXiv:2402.06332"}]}],["$","$1","3",{"children":", 2024."}]]}],["$","$L1a","32",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-29","style":"$undefined","children":"T. Yu, Y. Yao, H. Zhang, T. He, Y. Han, G. Cui, J. Hu, Z. Liu, H.-T. Zheng, M. Sun, et al. Rlhf-v: "}]}],["$","$1","1",{"children":"Towards trustworthy mllms via behavior alignment from fine-grained correctional human feedback. In "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:32:2:style","children":"CVPR"}]}],["$","$1","3",{"children":", pages 13807–13816, 2024a."}]]}],["$","$L1a","33",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-30","style":"$undefined","children":"T. Yu, H. Zhang, Y. Yao, Y. Dang, D. Chen, X. Lu, G. Cui, T. He, Z. Liu, T.-S. Chua, et al. Rlaif-v: "}]}],["$","$1","1",{"children":"Aligning mllms through open-source ai feedback for super gpt-4v trustworthiness. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:33:2:style","children":"arXiv preprint arXiv:2405.17220"}]}],["$","$1","3",{"children":", 2024b."}]]}],["$","$L1a","34",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-55","style":"$undefined","children":"Y. Yue, Z. Chen, R. Lu, A. Zhao, Z. Wang, S. Song, and G. Huang. Does reinforcement learning really "}]}],["$","$1","1",{"children":"incentivize reasoning capacity in llms beyond the base model? "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:34:2:style","children":"arXiv preprint arXiv:2504.13837"}]}],["$","$1","3",{"children":", 2025."}]]}],["$","$L1a","35",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-2","style":"$undefined","children":"Y. Zang, X. Dong, P. Zhang, Y. Cao, Z. Liu, S. Ding, S. Wu, Y. Ma, H. Duan, W. Zhang, K. Chen, "}]}],["$","$1","1",{"children":"D. Lin, and J. Wang. Internlm-xcomposer2.5-reward: A simple yet effective multi-modal reward model. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:35:2:style","children":"arXiv preprint arXiv:2501.12368"}]}],["$","$1","3",{"children":", 2025."}]]}],["$","$L1a","36",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-26","style":"$undefined","children":"K. Zhang, G. Li, Y. Dong, J. Xu, J. Zhang, J. Su, Y. Liu, and Z. Jin. Codedpo: Aligning code models "}]}],["$","$1","1",{"children":"with self generated and verified source code. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:36:2:style","children":"arXiv preprint arXiv:2410.05605"}]}],["$","$1","3",{"children":", 2024a."}]]}],["$","$L1a","37",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-40","style":"$undefined","children":"R. Zhang, L. Gui, Z. Sun, Y. Feng, K. Xu, Y. Zhang, D. Fu, C. Li, A. Hauptmann, Y. Bisk, et al. "}]}],["$","$1","1",{"children":"Direct preference optimization of video large multimodal models from language model reward. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:37:2:style","children":"arXiv preprint arXiv:2404.01258"}]}],["$","$1","3",{"children":", 2024b."}]]}],["$","$L1a","38",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-27","style":"$undefined","children":"Y. Zhang, S. Wu, Y. Yang, J. Shu, J. Xiao, C. Kong, and J. Sang. o1-coder: an o1 replication for "}]}],["$","$1","1",{"children":"coding. "}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:38:2:style","children":"arXiv preprint arXiv:2412.00154"}]}],["$","$1","3",{"children":", 2024c."}]]}],["$","$L1a","39",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"id-31","style":"$undefined","children":"Y. Zhou, C. Cui, R. Rafailov, C. Finn, and H. Yao. Aligning modalities in vision large language "}]}],["$","$1","1",{"children":"models via preference fine-tuning. 
"}],["$","$1","2",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:6:paragraphs:39:2:style","children":"arXiv preprint arXiv:2402.11411"}]}],["$","$1","3",{"children":", 2024."}]]}]]}],["$","$L34","7",{"heading":"A Further Methodological Insights","index":7,"length":13,"content":[["$","$L1a","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:7:paragraphs:0:0:style","children":"Cold Start only with Image Generation Preference Data: Why It Works. "}]}],["$","$1","1",{"children":"Our experiments demonstrate that using a small amount of high-quality image generation CoT reward reasoning data, instead of distilling data for every task, is sufficient for the model to learn the CoT reasoning format and structure across all visual reward tasks. The underlying reason lies in the fact that video tasks can be seen as multi-image understanding problems. Video frames, like images, also require recognizing objects, spatial relationships, and context. Once the model masters CoT reasoning for images, it can naturally extend this reasoning to videos by leveraging its inherent prior knowledge of temporal dynamics and sequential visual understanding. 
Therefore, by learning CoT reasoning from images, the model can seamlessly generalize to both static and dynamic visual tasks, eliminating the need for separate distillation for each task."}]]}],["$","$L1a","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:7:paragraphs:1:0:style","children":"Rejection Sampling for Unified Reward Generalization Fine-Tuning: Why We Need It. "}]}],["$","$1","1",{"children":"After the cold start, the model has already internalized the format and structure of CoT reasoning. With prior knowledge across tasks, it can generate accurate CoT-based reward analyses for many simple scenarios. However, directly applying GRPO to the entire training set would be inefficient and computationally costly. Besides, GRPO offers limited gains on samples that the model has already mastered. Therefore, we first apply rejection sampling to filter out cases that the model already performs well on. This not only reduces training cost but also reinforces the distribution of correct reasoning patterns by prioritizing high-confidence outputs. 
More challenging or ambiguous samples are passed to the GRPO stage, where the model explores diverse reasoning trajectories and gradually learns to prefer more accurate solutions."}]]}],["$","$L1a","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:7:paragraphs:2:0:style","children":"Why We Trust the CoT Reasoning When the Final Answer is Correct during GRPO. "}]}],["$","$1","1",{"children":"In MLLMs, a common failure mode during CoT reasoning is the disconnect between the reasoning steps and the final answer: models may produce plausible reasoning chains but ultimately rely on shortcuts or intuition to generate the conclusion. To mitigate this, we explicitly structure the CoT process by having the model score each image across multiple dimensions and then aggregate these scores to derive the final decision. This enforces a step-by-step alignment between intermediate reasoning and outcome, ensuring that a correct final answer emerges from a coherent and interpretable reasoning process. 
As a result, in GRPO training, verifying the correctness of the final answer implicitly validates the reasoning trajectory, offering a principled yet efficient way to supervise complex CoT generation."}]]}]]}],["$","$L34","8",{"heading":"B More Experimental Details","index":8,"length":13,"content":[["$","$L1a","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:0:0:style","children":"B.1 "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:0:1:style","children":"Base Model."}]}]]}],["$","$L1a","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"This work adopts UnifiedReward "}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:1:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:1:2"}]}],["$","$1","3",{"children":"as our base architecture, which unifies image/video generation and understanding reward tasks, demonstrating the mutual benefits of multi-task learning and achieving strong performance across various vision reward benchmarks. However, its reasoning is limited to direct responses or shallow thinking, lacking the capability for long and structured CoT reasoning, which may lead to lower accuracy and weaker interpretability in complex scenarios. 
Inspired by this work, we build upon UnifiedReward and further activate its latent long CoT reasoning capabilities across different vision reward tasks, aiming to enhance the accuracy and robustness of the reward signals."}]]}],["$","$L1a","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:2:0:style","children":"B.2 "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:2:1:style","children":"Training Details."}]}]]}],["$","$L1a","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"For both the cold-start and rejection sampling stages, training is performed with a batch size of 1, 16 gradient accumulation steps, a learning rate of "}],["$","$1","1",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:3:1"}]]}]}],["$","$1","2",{"children":", and a warm-up ratio of 0.3, using 8 NVIDIA H100 (80GB) GPUs. 
For GRPO, training is conducted with a batch size of 1, a single gradient accumulation step, a learning rate of "}],["$","$1","3",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:3:3"}]]}]}],["$","$1","4",{"children":", and a KL penalty coefficient of "}],["$","$1","5",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"1ch","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[null,["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:3:5"}]]}]}],["$","$1","6",{"children":". The number of generated responses "}],["$","$1","7",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:3:7:style","children":"N "}]}],["$","$1","8",{"children":"is set to 8, using 64 NVIDIA H20 (97GB) GPUs."}]]}],["$","$L1a","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:4:0:style","children":"B.3 "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:4:1:style","children":"Reward Model 
Baselines."}]}]]}],["$","$L1a","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We compare our method against a series of strong reward model baselines across both image and video domains, covering generation and understanding tasks."}]]}],["$","$L1a","6",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:6:0:style","children":"PickScore "}]}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:6:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:6:2"}]}],["$","$1","3",{"children":"is a text-to-image preference model that integrates CLIP-based vision-language features with a reward modeling strategy inspired by InstructGPT. 
It is trained on the Pick-a-Pic dataset to align image generation outputs with human preferences."}]]}],["$","$L1a","7",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:7:0:style","children":"HPSv2 "}]}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:7:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:7:2"}]}],["$","$1","3",{"children":"builds upon CLIP and is fine-tuned using the HPD_v2 "}],["$","$1","4",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:7:4"}]}],["$","$1","5",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:7:5"}]}],["$","$1","6",{"children":"to predict human preferences over generated images. 
It demonstrates strong performance in pairwise ranking tasks and serves as a representative image generation reward baseline."}]]}],["$","$L1a","8",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:8:0:style","children":"ImageReward "}]}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:8:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:8:2"}]}],["$","$1","3",{"children":"is trained on a large-scale preference dataset containing 137k human expert comparisons through both rating and ranking. It is specifically designed to capture subtle aspects of human preferences in text-to-image generation quality."}]]}],["$","$L1a","9",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:9:0:style","children":"LLaVA-Critic "}]}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:9:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:9:2"}]}],["$","$1","3",{"children":"extends large language models for evaluating image understanding through a critic-style framework. 
It is trained on high-quality instruction-following data covering diverse criteria such as accuracy, relevance, and hallucination, supporting both pointwise and pairwise evaluation."}]]}],["$","$L1a","10",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:10:0:style","children":"VisionReward "}]}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:10:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:10:2"}]}],["$","$1","3",{"children":"introduces a fine-grained, multi-dimensional evaluation framework for both image and video domains. 
It trains separate reward models tailored to human preferences collected through carefully curated datasets, offering strong baselines for visual content assessment."}]]}],["$","$L1a","11",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:11:0:style","children":"VideoScore "}]}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:11:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:11:2"}]}],["$","$1","3",{"children":"focuses on assessing video generation quality. It is trained on the VideoFeedback dataset comprising human-annotated scores over 37.6K videos, each evaluated across multiple aspects including fidelity, consistency, and alignment."}]]}],["$","$L1a","12",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:12:0:style","children":"LiFT-Critic "}]}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:12:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:12:2"}]}],["$","$1","3",{"children":"is a reward model developed under the LiFT framework, which aligns text-to-video models 
using human feedback. Trained on LiFT-HRA—a dataset containing over 10K human-labeled samples with both scores and rationales—it captures detailed human evaluation signals across multiple dimensions."}]]}],["$","$L1a","13",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:13:0:style","children":"VideoReward "}]}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:13:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:13:2"}]}],["$","$1","3",{"children":"offers a multi-dimensional assessment for video generation tasks. It is trained on a large-scale 182K dataset of human-labeled comparisons collected from outputs of 12 video generation models, providing strong performance on complex video benchmarks."}]]}],["$","$L1a","14",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:14:0:style","children":"UnifiedReward "}]}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:14:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:14:2"}]}],["$","$1","3",{"children":"serves as our base architecture. 
It leverages multi-task learning across diverse image and video generation and understanding datasets. By unifying multimodal reward tasks into a single framework, UnifiedReward demonstrates mutual enhancement effects and establishes a solid baseline for holistic visual reward modeling."}]]}],["$","$L1a","15",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:15:0:style","children":"Our UnifiedReward-Think "}]}],["$","$1","1",{"children":"extends UnifiedReward by integrating explicit long CoT reasoning across both visual understanding and generation tasks. Through a three-stage training pipeline—including cold start to learn the CoT reward format, rejection sampling for unified CoT reward generalization fine-tuning, and GRPO for unified CoT reward reinforcement fine-tuning—the model achieves stronger accuracy and interpretability in reward assessment. 
It also generalizes well without explicit reasoning, leveraging implicit CoT capabilities for robust performance."}]]}],["$","$L1a","16",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:16:0:style","children":"B.4 "}]}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:16:1:style","children":"Evaluation Benchmarks"}]}]]}],["$","$L1a","17",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:17:0:style","children":"VLRewardBench "}]}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:17:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:17:2"}]}],["$","$1","3",{"children":"serves as a diverse benchmark for evaluating image understanding capabilities, featuring 1,250 carefully curated samples across general vision-language queries, hallucination detection, and complex reasoning. 
To ensure robust evaluation, response orders are randomly shuffled during testing."}]]}],["$","$L1a","18",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:18:0:style","children":"ShareGPTVideo "}]}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:18:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:18:2"}]}],["$","$1","3",{"children":"provides large-scale video-caption pairs and human preference data, covering various aspects of video understanding such as temporal reasoning, spatial relations, and factual grounding. We use 3K samples for evaluation in our reward modeling experiments."}]]}],["$","$L1a","19",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:19:0:style","children":"GenAI-Bench "}]}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:19:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:19:2"}]}],["$","$1","3",{"children":"is a multimodal generation benchmark designed to assess how well models align with human preferences across image and video generation tasks. 
We adopt its image and video generation subsets for evaluating generative reward performance."}]]}],["$","$L1a","20",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:20:0:style","children":"VideoGen-RewardBench "}]}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:20:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:8:paragraphs:20:2"}]}],["$","$1","3",{"children":"offers a large-scale benchmark tailored for evaluating video reward models, consisting of 26.5k video pairs labeled by humans. Each pair is ranked "}],["$","$1","4",{"children":"according to multiple criteria, and we use the Overall Quality scores to benchmark the performance of the model."}]]}]]}],["$","$L34","9",{"heading":"C Prompting Templates and More Qualitative Cases","index":9,"length":13,"content":[["$","$L1a","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"We provide more qualitative cases across diverse vision tasks with prompting templates in Figs. 
"}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:0:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:0:2"}]}],["$","$1","3",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:0:3"}]}],["$","$1","4",{"children":"and "}],["$","$1","5",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:9:paragraphs:0:5"}]}]]}]]}],["$","$L34","10",{"heading":"D Limitations and Future Works","index":10,"length":13,"content":[["$","$L1a","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"While our method introduces long-form CoT reasoning to improve reward modeling, this inevitably increases inference time during reasoning. However, we show that once the model has mastered CoT reasoning, it can leverage implicit reasoning to enhance answer accuracy even without explicitly generating CoT traces. This suggests strong internalization of the reasoning process. In future work, we aim to further optimize efficiency by exploring shorter or more efficient CoT formats without compromising reasoning quality. 
Besides, although our reinforcement fine-tuning strategy successfully activates the model’s latent long CoT reasoning ability using only a small amount of high-quality data, the prior study "}],["$","$1","1",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:10:paragraphs:0:1"}]}],["$","$1","2",{"children":["$","$L36",null,{"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:10:paragraphs:0:2"}]}],["$","$1","3",{"children":"has shown that reinforcement learning cannot fundamentally extend a model’s capability: it can only amplify the potential already acquired during supervised fine-tuning (SFT). Therefore, to further push the boundaries of CoT-based reward reasoning, scaling up high-quality CoT supervision still remains a promising direction."}]]}]]}],["$","$L34","11",{"heading":"E Societal Impacts","index":11,"length":13,"content":[["$","$L1a","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Our work introduces a unified multimodal CoT reward model capable of high-quality, interpretable assessment across diverse multimodal tasks. This advancement can significantly enhance the alignment of generative models with human preferences in real-world applications such as AI-assisted content creation and education. By improving both the accuracy and interpretability of reward signals, our method contributes to more transparent and controllable AI behaviors, potentially increasing public trust in generative technologies. However, as reward models become more capable and general, they may also be misused to reinforce harmful biases in generation models, especially if the training data or preference annotations reflect subjective or skewed human values. 
We encourage future work to further examine the ethical implications of large-scale reward modeling and to include fairness-aware training strategies."}]]}]]}],["$","$L34","12",{"heading":"F Ethical Statement","index":12,"length":13,"content":[["$","$L1a","0",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"In this work, we affirm our commitment to ethical research practices and responsible innovation. To the best of our knowledge, this study does not involve any data, methodologies, or applications that raise ethical concerns. All experiments and analyses were conducted in compliance with established ethical guidelines, ensuring the integrity and transparency of our research process."}]]}],["$","$L1a","1",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/15-0.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:1:0"}]]}]}]]}],["$","$L1a","2",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Image 1 (9/10): The image shows a boy eating ramen in a restaurant setting, which aligns well with the caption. 
The presence of a bowl of ramen and the restaurant environment are clear."}]]}],["$","$L1a","3",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Image 2 (8/10): This image also depicts a boy eating ramen in a restaurant. However, the focus on the boy's face is more intense, slightly reducing the emphasis on the restaurant setting. "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:3:1:style","children":"2. Aesthetics:"}]}]]}],["$","$L1a","4",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Image 1 (8/10): The composition is balanced, with warm lighting that enhances the dining atmosphere. The focus on the boy and the ramen is clear and visually appealing."}]]}],["$","$L1a","5",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Image 2 (7/10): The image has a good composition, but the focus on the boy's face might distract from the overall aesthetic. The lighting is slightly harsher, affecting the ambiance. "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:5:1:style","children":"3. 
Authenticity:"}]}]]}],["$","$L1a","6",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Image 1 (9/10): The details in the image, such as the texture of the ramen and the restaurant background, are realistic and convincing."}]]}],["$","$L1a","7",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Image 2 (7/10): While the image is realistic, the focus on the boy's face and the lighting make it slightly less authentic in terms of capturing a typical restaurant dining experience. "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:7:1:style","children":"Total score: "}]}],["$","$1","2",{"children":"Image 1: 9+8+9=26 Image 2: 8+7+7=22 "}],["$","$1","3",{"children":" "}],["$","$1","4",{"children":"Image 1 is better"}],["$","$1","5",{"children":""}]]}],["$","$L1a","8",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:8:0:style","children":"Image Generation "}]}],["$","$1","1",{"children":"Given a caption and two images generated based on this caption, please analyze in detail the two provided images. 
Evaluate them on various dimensions such as semantic consistency (how closely the image content aligns with the caption), aesthetics (composition, color usage, artistic expression), authenticity (realism and attention to detail), and any other factors you deem relevant. For each evaluation dimension, provide a score between 1-10 for both images (e.g., Image 1: 8/10, Image 2: 6/10) and provide a concise rationale for the score. Calculate the total score for each image by summing all dimension scores. Use a chain-of-thought process to detail your reasoning steps, and enclose all your detailed reasoning within <think> and </think> tags. Then, in the <answer> tag, output exactly one of the following strings: 'Image 1 is better' or 'Image 2 is better' based on the total scores. No additional text is allowed in the <answer> section."}]]}],["$","$L1a","9",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/15-1.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:9:0"}]]}]}]]}],["$","$L1a","10",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Figure 5: "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"id-51","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:10:1:style","children":"More Qualitative Results of Image Generation 
CoT Reward Reasoning. "}]}],["$","$1","2",{"children":"Given a pair of images and the corresponding caption, our model performs quality assessment across semantic consistency, aesthetics, and authenticity through CoT reasoning."}]]}],["$","$L1a","11",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/16-0.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:11:0"}]]}]}]]}],["$","$L1a","12",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:12:0:style","children":"1. Semantic consistency: "}]}],["$","$1","1",{"children":"- Video 1 (5/10): The video shows a woman in a green blouse, but her hair is dark, not blonde. The faucet is turned on, but the"}]]}],["$","$L1a","13",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"water flow is not visible. The setting is a modern kitchen, aligning with the caption. 
- Video 2 (9/10): The video features a woman with long blonde hair and a green blouse, matching the caption. The faucet is"}]]}],["$","$L1a","14",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:14:0:style","children":"2. Temporal coherence: "}]}],["$","$1","1",{"children":"- Video 1 (4/10): The woman's actions are not smooth, and the faucet turning on is not clearly depicted. The scene lacks a"}]]}],["$","$L1a","15",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"coherent flow. - Video 2 (9/10): The video shows a clear, smooth action of turning on the faucet with water flowing, maintaining a logical"}]]}],["$","$L1a","16",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:16:0:style","children":"3. Authenticity: "}]}],["$","$1","1",{"children":"- Video 1 (6/10): The setting and attire are realistic, but the lack of visible water flow reduces authenticity. 
- Video 2 (9/10): The video is realistic, with natural lighting and a believable scene of water flowing from the faucet."}]]}],["$","$L1a","17",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:17:0:style","children":"Video Generation "}]}],["$","$1","1",{"children":"Given a caption and two videos generated based on this caption, please analyze in detail the two provided videos. Evaluate them on various dimensions such as semantic consistency (how closely the video content aligns with the caption), temporal coherence (smoothness and logical flow of motion across frames), authenticity (realism and attention to detail), and any other factors you deem relevant. For each evaluation dimension, provide a score between 1-10 for both videos (e.g., Video 1: 8/10, Video 2: 6/10) and provide a concise rationale for the score. Calculate the total score for each video by summing all dimension scores. Use a chain-of-thought process to detail your reasoning steps, and enclose all your detailed reasoning within <think> and </think> tags. Then, in the <answer> tag, output exactly one of the following strings: 'Video 1 is better' or 'Video 2 is better' based on the total scores. No additional text is allowed in the <answer> section. Example output format: "}],["$","$1","2",{"children":" "}],["$","$1","3",{"children":"1. Semantic consistency: Video 1 (9/10) - ...; Video 2 (7/10) - ... 2. Temporal coherence: Video 1 (8/10) - ...; Video 2 (6/10) - ... 3. Authenticity: Video 1 (7/10) - ...; Video 2 (5/10) - ... [Additional dimensions if any]: Video 2 (8/10) - ...; Video 1 (6/10) - ... 
Total score: Video 1: 9+8+7+6=30 Video 2: 7+6+5+8=26 "}],["$","$1","4",{"children":" "}],["$","$1","5",{"children":"Video 1 is better"}],["$","$1","6",{"children":" "}],["$","$1","7",{"children":"Note: In the example above, scores and the final answer are placeholders meant only to demonstrate the format. Your actual evaluation should be based on the quality of two given videos. Your task is provided as follows: Text Caption: [{prompt}]"}]]}],["$","$L1a","18",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/16-1.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:18:0"}]]}]}]]}],["$","$L1a","19",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Figure 6: "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"id-52","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:19:1:style","children":"More Qualitative Results of Video Generation CoT Reward Reasoning. 
"}]}],["$","$1","2",{"children":"Given a pair of videos and the corresponding caption, our model performs quality assessment across semantic consistency, temporal coherence, and authenticity through CoT reasoning."}]]}],["$","$L1a","20",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/17-0.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:20:0"}]]}]}]]}],["$","$L1a","21",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Answer 1 (8/10) - Correctly describes the presence of cupcakes decorated with characters resembling Minions and a birthday cake. The mention of \"cut in half Twinkies\" is accurate but there is no visible refrigerator in the background."}]]}],["$","$L1a","22",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Answer 2 (6/10) - Describes a festive scene with balloons, party hats, and confetti, none of which are visible in the image. Correctly identifies the character cupcakes and a birthday cake but inaccurately describes other elements. 
"}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:22:1:style","children":"2. Correctness:"}]}]]}],["$","$L1a","23",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Answer 1 (7/10) - Generally correct in describing the cupcakes and cake, but the mention of a refrigerator and the broader setting is incorrect."}]]}],["$","$L1a","24",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Answer 2 (5/10) - Incorrectly adds elements like balloons and party hats that are not present, affecting overall correctness. "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:24:1:style","children":"3. 
Clarity:"}]}]]}],["$","$L1a","25",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Answer 1 (9/10) - Clear and concise, effectively communicates the main elements of the image, despite minor inaccuracies."}]]}],["$","$L1a","26",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Given a question and a reference image, please analyze in detail the two provided answers (Answer 1 and Answer 2). \" Evaluate them based on the following three core dimensions: 1. Semantic accuracy: How well the answer reflects the visual content of the image 2. Correctness: Whether the answer is logically and factually correct 3. Clarity: Whether the answer is clearly and fluently expressed You may also consider additional dimensions if you find them relevant (e.g., reasoning ability, attention to detail, multimodal grounding, etc.). For each dimension, provide a score from 1 to 10 for both answers, and briefly explain your reasoning. Then, compute the total score for each answer by explicitly adding the scores for all dimensions and showing the full calculation. Enclose your full reasoning within <think> and </think> tags. Then, in the <answer> tag, output exactly one of the following: 'Answer 1 is better' or 'Answer 2 is better'. No other text is allowed in the <answer> section. Example format "}],["$","$1","1",{"children":" "}],["$","$1","2",{"children":"1. Semantic accuracy: Answer 1 (9/10) - ...; Answer 2 (7/10) - ... 2. Correctness: Answer 1 (8/10) - ...; Answer 2 (7/10) - ... 3. Clarity: Answer 1 (9/10) - ...; Answer 2 (8/10) - ... [Additional dimensions if any]: Answer 1 (6/10) - ...; Answer 2 (7/10) - ... 
Total score:\\nAnswer 1: 9+8+9+6=32\\nAnswer 2: 7+7+8+7=29 "}],["$","$1","3",{"children":" "}],["$","$1","4",{"children":"Answer 1 is better"}],["$","$1","5",{"children":" "}],["$","$1","6",{"children":"Note: In the example above, scores and the final answer are placeholders meant only to demonstrate the format. Your actual evaluation should be based on the quality of two given answers. Your task is provided as follows: Question: [{Query}]\\nAnswer 1: [{R1}]\\nAnswer 2: [{R2}]\")"}]]}],["$","$L1a","27",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/17-1.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:27:0"}]]}]}]]}],["$","$L1a","28",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Figure 7: "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"id-53","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:28:1:style","children":"More Qualitative Cases of Image Understanding CoT Reward Reasoning. 
"}]}],["$","$1","2",{"children":"Given an image, a query, and a pair of candidate answers, our model performs quality assessment across semantic accuracy, factual correctness, and clarity through CoT reasoning."}]]}],["$","$L1a","29",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/18-0.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:29:0"}]]}]}]]}],["$","$L1a","30",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Answer 1 (8/10): The answer accurately describes the visual and cultural significance of fireworks in the lion dance performance. It mentions the enhancement of the festive atmosphere and the symbolic meaning of good luck and prosperity."}]]}],["$","$L1a","31",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Answer 2 (9/10): This answer provides a more detailed explanation, covering both the visual impact and cultural significance. It elaborates on the symbolic power of fireworks in relation to warding off evil spirits and welcoming good fortune. 
"}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:31:1:style","children":"2. Correctness:"}]}]]}],["$","$L1a","32",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Answer 1 (9/10): The explanation is factually correct, aligning well with the cultural context of lion dances and the role of fireworks."}]]}],["$","$L1a","33",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Answer 2 (10/10): This answer is also factually correct and provides a comprehensive understanding of the cultural and symbolic aspects of fireworks in the context of the lion dance. "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"$undefined","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:33:1:style","children":"3. 
Clarity:"}]}]]}],["$","$L1a","34",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"- Answer 1 (8/10): The answer is clear and well-structured, effectively conveying the importance of fireworks in the performance."}]]}],["$","$L1a","35",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":"Given a question and a reference video, please evaluate the two provided answers (Answer 1 and Answer 2). Judge them based on the following key dimensions: 1. Semantic accuracy: Does the answer align with the visual and temporal content in the video? 2. Correctness: Is the answer factually and logically correct? 3. Clarity: Is the answer expressed fluently, clearly, and coherently? You are encouraged to consider any additional dimensions if relevant (e.g., temporal reasoning, causal understanding, visual detail, emotional perception, etc.). For each dimension, assign a score from 1 to 10 for both answers and explain briefly. Then, compute and explicitly show the total score as an addition of all dimension scores. Wrap your full reasoning in <think> tags. In the <answer> tag, output exactly one of the following: 'Answer 1 is better' or 'Answer 2 is better'. No additional commentary is allowed in the <answer> section. Example format: "}],["$","$1","1",{"children":" "}],["$","$1","2",{"children":"1. Semantic accuracy: Answer 1 (8/10) - ...; Answer 2 (9/10) - ... 2. Correctness: Answer 1 (7/10) - ...; Answer 2 (6/10) - ... 3. Clarity: Answer 1 (9/10) - ...; Answer 2 (8/10) - ... [Additional dimensions if any]: Answer 1 (7/10) - ...; Answer 2 (6/10) - ... 
Total score:\\nAnswer 1: 8+7+9+7=31\\nAnswer 2: 9+6+8+6=29 "}],["$","$1","3",{"children":" "}],["$","$1","4",{"children":"Answer 1 is better"}],["$","$1","5",{"children":" "}],["$","$1","6",{"children":"Your task is provided as follows: Question: {Query}\\nAnswer 1: {R1}\\nAnswer 2: {R2}\\n"}]]}],["$","$L1a","36",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":{"compact":"left","expanded":"justify"},"typography":{"compact":"paperBody2","expanded":"paperBody1"}},"children":[["$","$1","0",{"children":["$","$Lb",null,{"component":"span","sx":{"verticalAlign":"middle","px":"$undefined","& img":{"imageRendering":"-webkit-optimize-contrast"}},"children":[["$","$L37",null,{"src":"https://cdn.bytez.com/mobilePapers/v2/arxiv/2505.03318/images/18-1.png","alt":"$undefined"}],["$","$L38",null,{"inAbstract":false,"imgScale":4,"avgLineHeight":10.64,"fragment":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:36:0"}]]}]}]]}],["$","$L1a","37",{"component":"p","variant":"paperBody1","overflow":"hidden","sx":{"pb":2,"textWrap":"pretty","textAlign":"center","color":"var(--secondary-color)","typography":"paperBody2"},"children":[["$","$1","0",{"children":"Figure 8: "}],["$","$1","1",{"children":["$","span",null,{"tabIndex":-1,"id":"id-54","style":"$1b:props:children:props:children:3:1:props:paperJSON:sections:12:paragraphs:37:1:style","children":"More Qualitative Cases of Video Understanding CoT Reward Reasoning. "}]}],["$","$1","2",{"children":"Given a video, a query, and a pair of candidate answers, our model performs quality assessment across semantic accuracy, factual correctness, and clarity through CoT reasoning."}]]}]]}]],["$","$L3b",null,{"paper":"$1b:props:children:props:children:0:props:product"}]]