Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
ddec8c3
add reward model
jingyuanlm Oct 23, 2025
c337494
Merge remote-tracking branch 'origin/main' into evo
jingyuanlm Oct 23, 2025
1065594
Merge remote-tracking branch 'origin/main' into evo
jingyuanlm Oct 23, 2025
5dbb355
Merge remote-tracking branch 'origin/main' into evo
jingyuanlm Oct 24, 2025
2fd918c
add avg win reward model
jingyuanlm Nov 17, 2025
98ca9cf
add path online
jingyuanlm Nov 17, 2025
7d3dc7f
fix bug
jingyuanlm Nov 17, 2025
3d24be0
add install transformers
jingyuanlm Nov 17, 2025
62de6d1
add
jingyuanlm Nov 18, 2025
0453af1
fix bug remove torch
jingyuanlm Nov 18, 2025
9111b0c
fix bug
jingyuanlm Nov 18, 2025
b45aac0
fix bug 2
jingyuanlm Nov 18, 2025
fabcc84
fix bug 3
jingyuanlm Nov 18, 2025
6258915
fix pydantic-ai version
jingyuanlm Nov 19, 2025
a62870c
fix pydantic-ai-bug :(
jingyuanlm Nov 19, 2025
1d7ee2c
fix bug 4
jingyuanlm Nov 19, 2025
f83cfb5
add two type reward model
jingyuanlm Nov 20, 2025
78e6b1a
fix bug 5
jingyuanlm Nov 20, 2025
9356071
fix bug 6
jingyuanlm Nov 21, 2025
1b28ffc
add more competition
jingyuanlm Nov 24, 2025
4784b75
fix bug 7
jingyuanlm Nov 24, 2025
d20781d
fix bug 8
jingyuanlm Nov 24, 2025
89a3a61
fix bug 9
jingyuanlm Nov 24, 2025
0fd51db
fix bug 10
jingyuanlm Nov 25, 2025
d0aa732
fix bug 11
jingyuanlm Nov 25, 2025
35c60d9
add new ckpt
jingyuanlm Dec 8, 2025
f8904d9
add ckpt 6
jingyuanlm Dec 18, 2025
38b9736
Evo (#1322)
jingyuanlm Jan 9, 2026
aa8637a
ckpt 7
jingyuanlm Jan 9, 2026
b6cd813
fix cuda problem/ckpt7
jingyuanlm Jan 13, 2026
1fc1a6e
ckpt 7 fix bug
jingyuanlm Jan 14, 2026
d344fd6
ckpt 8
jingyuanlm Jan 19, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ init-qlib-env:
@source $$(conda info --base)/etc/profile.d/conda.sh && conda activate qlibRDAgent && which pip && pip install pyqlib && pip install ruamel-yaml==0.17.21 && pip install torch==2.1.1 && pip install catboost==0.24.3 && conda deactivate

dev:
$(PIPRUN) pip install -e .[docs,lint,package,test] -c $(CONSTRAINTS_FILE)
$(PIPRUN) pip install -U pip setuptools wheel
$(PIPRUN) pip install -e .[docs,lint,package,test,torch,reward] -c $(CONSTRAINTS_FILE)
$(PIPRUN) pip install -U kaggle
if [ "$(CI)" != "true" ] && command -v pre-commit > /dev/null 2>&1; then pre-commit install --hook-type pre-push; fi

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ lint = {file = ["requirements/lint.txt"]}
package = {file = ["requirements/package.txt"]}
test = {file = ["requirements/test.txt"]}
torch = {file = ["requirements/torch.txt"]} # some agent algorithms need torch. pip install rdagent[torch]
reward = {file = ["requirements/reward.txt"]}

[tool.setuptools_scm]
local_scheme = "no-local-version"
Expand All @@ -121,4 +122,4 @@ version_scheme = "guess-next-dev"
[tool.tomlsort]
all = true
in_place = true
trailing_comma_inline_array = true
trailing_comma_inline_array = true
27 changes: 27 additions & 0 deletions rdagent/app/data_science/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,33 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
user_interaction_wait_seconds: int = 6000 # seconds to wait for user interaction
user_interaction_mid_folder: Path = Path.cwd() / "git_ignore_folder" / "RD-Agent_user_interaction"

#### reward model related
enable_reward_model_selection: bool = True
"""Enable reward model based hypothesis selection."""

reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_8" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"

#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
#"/data/userdata/v-lijingyuan/last_run_2"

#reward_model_path: str = "/data/userdata/v-lijingyuan/last_run_3"


"""The path to the reward model for hypothesis selection."""

competition_mapping_path: str = "/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/comp_to_scen.json"

#competition_mapping_path: str = "/data/userdata/v-lijingyuan/dpo/comp_to_scen.json"


reward_base_model: str = "Qwen/Qwen3-4B"
""" Backbone of the reward model"""

max_length : int = 2300
""" max_length of the reward model"""

reward_select_type: int = 1


DS_RD_SETTING = DataScienceBasePropSetting()

Expand Down
2 changes: 1 addition & 1 deletion rdagent/components/coder/data_science/pipeline/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import pandas as pd

from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.components.agent.context7 import Agent as DocAgent
from rdagent.components.coder.CoSTEER import CoSTEERMultiFeedback
from rdagent.components.coder.CoSTEER.evaluators import (
CoSTEEREvaluator,
Expand Down Expand Up @@ -307,6 +306,7 @@ def evaluate(
do_documentation_search = enable_mcp_documentation_search and wfb.requires_documentation_search

if do_documentation_search:
from rdagent.components.agent.context7 import Agent as DocAgent
# Use MCPAgent for clean, user-friendly interface
try:
# Create agent targeting Context7 service - model config comes from mcp_config.json
Expand Down
202 changes: 198 additions & 4 deletions rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from pydantic import BaseModel, Field

from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.components.agent.rag import Agent as RAGAgent
from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask
from rdagent.components.coder.data_science.feature.exp import FeatureTask
from rdagent.components.coder.data_science.model.exp import ModelTask
Expand Down Expand Up @@ -41,7 +40,7 @@
from rdagent.utils.agent.tpl import T
from rdagent.utils.repo.diff import generate_diff_from_dict
from rdagent.utils.workflow import wait_retry

import torch
_COMPONENT_META: Dict[str, Dict[str, Any]] = {
"DataLoadSpec": {
"target_name": "Data loader and specification generation",
Expand Down Expand Up @@ -313,7 +312,6 @@ def draft_exp_in_decomposition(scen: Scenario, trace: DSTrace) -> None | DSDraft
else:
return None


class DSProposalV1ExpGen(ExpGen):
def gen(
self,
Expand Down Expand Up @@ -649,6 +647,7 @@ def hypothesis_gen(

# knowledge retrieval
if DS_RD_SETTING.enable_research_rag:
from rdagent.components.agent.rag import Agent as RAGAgent
rag_agent = RAGAgent(
system_prompt="""You are a helpful assistant.
You help users retrieve relevant knowledge from community discussions and public code."""
Expand Down Expand Up @@ -1182,6 +1181,163 @@ def hypothesis_rank(
problem_label=problem_dict.get("label", "FEEDBACK_PROBLEM"),
appendix=hypothesis_dict[max_score_problem_name].get("appendix", None),
)

def reward_model_select_hypothesis(self, trace, hypothesis_dict: dict, problem_dict: dict) -> str:
    """Select the candidate hypothesis whose text scores highest under the reward model.

    Args:
        trace: DS trace; used to reconstruct the chain of previously accepted
            hypotheses leading to the current selection.
        hypothesis_dict: Mapping of problem name -> hypothesis data dict
            (each entry is expected to carry at least a "hypothesis" key).
        problem_dict: Mapping of problem name -> problem data. Currently unused;
            kept for interface symmetry with ``hypothesis_rank``.

    Returns:
        The raw hypothesis text of the highest-reward candidate. Falls back to
        the first candidate when there is no usable parent chain to condition on.
    """
    import os

    from transformers import AutoTokenizer

    from .reward_inference import RewardModelInference

    # Cheap fallback first: with no current selection (or a root selection of -1)
    # there is no chain context, so skip the expensive model load entirely.
    if not trace.current_selection or trace.current_selection[0] == -1:
        return next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided")

    # Map each trace node to its immediate parent (second-to-last entry of the
    # parent path), then translate record indices to loop ids when available.
    parent_nodes = {}
    for node in range(len(trace.hist)):
        parents = trace.get_parents(node)
        parent_nodes[node] = parents[-2] if len(parents) > 1 else None
    # FIXME: add the convert logic to method in trace
    if hasattr(trace, "idx2loop_id"):
        parent_nodes = {
            trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items()
        }

    current_parent_record_id = trace.current_selection[0]
    loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()}
    loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes)

    # Accepted hypotheses along the ancestry path, oldest first.
    hypothesis_list = [
        trace.hist[loop_id2idx[loop_id]][0].hypothesis.hypothesis
        for loop_id in loop_id_list
        if trace.hist[loop_id2idx[loop_id]][1].decision
    ][::-1]

    if not hypothesis_list:
        # No accepted ancestor hypotheses -> nothing to condition on; fall back.
        return next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided")

    # Candidate texts in hypothesis_dict order; the reward model scores the raw
    # candidate text (not the chain-concatenated form).
    texts = [data.get("hypothesis", "Hypothesis not provided") for data in hypothesis_dict.values()]

    # Load tokenizer + LoRA reward model lazily, only when we will actually score.
    logdir = DS_RD_SETTING.reward_model_path
    base_model = DS_RD_SETTING.reward_base_model
    adapter_path = os.path.join(logdir, "lora_adapter")
    reward_head_path = os.path.join(logdir, "reward_head.pt")

    tokenizer = AutoTokenizer.from_pretrained(base_model)
    if not getattr(tokenizer, "pad_token", None):
        tokenizer.pad_token = tokenizer.eos_token
    # NOTE(review): device is hard-coded to the second GPU — this fails on
    # single-GPU hosts; consider making it configurable via DS_RD_SETTING.
    device = torch.device("cuda:1")
    model = RewardModelInference(
        base_model_name=base_model,
        adapter_path=adapter_path,
        reward_head_path=reward_head_path,
        device=device,
    )
    model.eval()

    # Competition -> scenario description lookup used to condition the reward.
    with open(DS_RD_SETTING.competition_mapping_path, "r") as f:
        comp_dict = json.load(f)
    comp_description = comp_dict[trace.scen.competition]

    rewards = model.compute_reward(texts, tokenizer, comp_description)
    return texts[rewards.index(max(rewards))]

def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_dict) -> str:
    """Select the candidate hypothesis with the highest average win rate.

    Each candidate is appended to the chain of previously accepted hypotheses
    and scored by the reward model. A candidate's reward ``r`` is compared
    against every chain-prefix reward ``p`` via the win rate ``r / (r + p)``;
    the candidate with the highest mean win rate across all prefixes wins.

    Args:
        trace: DS trace; provides the ancestry of accepted hypotheses.
        hypothesis_dict: Mapping of problem name -> hypothesis data dict
            (each entry is expected to carry at least a "hypothesis" key).

    Returns:
        The raw hypothesis text of the winning candidate. Falls back to the
        first candidate when there is no usable parent chain.
    """
    import os

    from transformers import AutoTokenizer

    from .reward_inference import RewardModelInference

    # Cheap fallback first: no selection context means no chain to compare
    # against, so skip all trace traversal and model loading.
    if not trace.current_selection or trace.current_selection[0] == -1:
        return next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided")

    # Map each trace node to its immediate parent, translated to loop ids.
    parent_nodes = {}
    for node in range(len(trace.hist)):
        parents = trace.get_parents(node)
        parent_nodes[node] = parents[-2] if len(parents) > 1 else None
    # FIXME: add the convert logic to method in trace
    if hasattr(trace, "idx2loop_id"):
        parent_nodes = {
            trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items()
        }

    current_parent_record_id = trace.current_selection[0]
    loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()}
    loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes)

    # Accepted hypotheses along the ancestry path, oldest first.
    hypothesis_list = [
        trace.hist[loop_id2idx[loop_id]][0].hypothesis.hypothesis
        for loop_id in loop_id_list
        if trace.hist[loop_id2idx[loop_id]][1].decision
    ][::-1]
    sep = "->"

    # Cumulative chain prefixes: "h1", "h1->h2", "h1->h2->h3", ...
    hypothesis_chain_list = []
    accumulate = []
    for hyp in hypothesis_list:
        accumulate.append(hyp)
        hypothesis_chain_list.append(sep.join(accumulate))

    if not hypothesis_chain_list:
        return next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided")

    # For each candidate keep both the bare text (for the final return) and
    # the full-chain text (what the reward model actually scores).
    texts = []
    chained_texts = []
    for data in hypothesis_dict.values():
        candidate = data.get("hypothesis", "Hypothesis not provided")
        texts.append(candidate)
        chained_texts.append(hypothesis_chain_list[-1] + sep + candidate)

    # Load tokenizer + LoRA reward model only when we will actually score.
    logdir = DS_RD_SETTING.reward_model_path
    base_model = DS_RD_SETTING.reward_base_model
    adapter_path = os.path.join(logdir, "lora_adapter")
    reward_head_path = os.path.join(logdir, "reward_head.pt")

    tokenizer = AutoTokenizer.from_pretrained(base_model)
    if not getattr(tokenizer, "pad_token", None):
        tokenizer.pad_token = tokenizer.eos_token

    model = RewardModelInference(
        base_model_name=base_model,
        adapter_path=adapter_path,
        reward_head_path=reward_head_path,
    ).to("cuda")
    model.eval()

    with open(DS_RD_SETTING.competition_mapping_path, "r") as f:
        comp_dict = json.load(f)
    comp_description = comp_dict[trace.scen.competition]

    parent_rewards = model.compute_reward(hypothesis_chain_list, tokenizer, comp_description)
    current_rewards = model.compute_reward(chained_texts, tokenizer, comp_description)

    # Average pairwise win rate of each candidate against all chain prefixes.
    avg_win_rate = []
    for cur in current_rewards:
        win_rates = []
        for parent in parent_rewards:
            denom = cur + parent
            # Guard the degenerate 0/0 case; score it as a tie (0.5).
            win_rates.append(cur / denom if denom != 0 else 0.5)
        avg_win_rate.append(np.mean(win_rates))

    return texts[avg_win_rate.index(max(avg_win_rate))]


def task_gen(
self,
Expand Down Expand Up @@ -1473,7 +1629,45 @@ def gen(
)
pickled_problem_name = None
else:
pickled_problem_name, new_hypothesis = self.hypothesis_rank(
sota_flag = (hasattr(trace, "sota_exp_to_submit") and trace.sota_exp_to_submit is not None)
if DS_RD_SETTING.enable_reward_model_selection==True and sota_flag and not trace.is_selection_new_tree():
# logger.info("Selecting hypothesis using reward model.")
# selected_hypothesis_text = self.reward_model_select_hypothesis(
# hypothesis_dict=hypothesis_dict,
# problem_dict=all_problems,
# )
logger.info("Selecting hypothesis using reward model. (avg win)")
if DS_RD_SETTING.reward_select_type==1:
selected_hypothesis_text= self.reward_model_select_hypothesis_base_on_avg_win_rate(trace=trace, hypothesis_dict=hypothesis_dict)
elif DS_RD_SETTING.reward_select_type==2:
selected_hypothesis_text = self.reward_model_select_hypothesis(
trace=trace,
hypothesis_dict=hypothesis_dict,
problem_dict=all_problems,
)
# Find the problem name corresponding to the selected hypothesis text
pickled_problem_name = None
for problem_name, data in hypothesis_dict.items():
if data.get("hypothesis", "") == selected_hypothesis_text:
pickled_problem_name = problem_name
break
if pickled_problem_name is None:
raise ValueError("Selected hypothesis text does not match any known hypothesis.")
new_hypothesis = DSHypothesis(
component=hypothesis_dict[pickled_problem_name].get("component", "Model"),
hypothesis=hypothesis_dict[pickled_problem_name].get("hypothesis", "Hypothesis not provided"),
reason=hypothesis_dict[pickled_problem_name].get("reason", "Reason not provided"),
problem_name=pickled_problem_name,
problem_desc=all_problems.get(pickled_problem_name, {}).get(
"problem", "Problem description not provided"
),
problem_label=all_problems.get(pickled_problem_name, {}).get(
"label", "FEEDBACK_PROBLEM"
),
appendix=hypothesis_dict[pickled_problem_name].get("appendix", None),
)
else:
pickled_problem_name, new_hypothesis = self.hypothesis_rank(
hypothesis_dict=hypothesis_dict,
problem_dict=all_problems,
)
Expand Down
Loading