From ddec8c382fdb7a6a351753b8460008f1e2d10125 Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Thu, 23 Oct 2025 08:30:49 +0000
Subject: [PATCH 01/29] add reward model
---
rdagent/app/data_science/conf.py | 12 +++
.../data_science/proposal/exp_gen/proposal.py | 61 +++++++++++++-
.../proposal/exp_gen/reward_inference.py | 84 +++++++++++++++++++
3 files changed, 155 insertions(+), 2 deletions(-)
create mode 100644 rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
index 2d8ec6262..ef394e7a0 100644
--- a/rdagent/app/data_science/conf.py
+++ b/rdagent/app/data_science/conf.py
@@ -197,6 +197,18 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
user_interaction_wait_seconds: int = 6000 # seconds to wait for user interaction
user_interaction_mid_folder: Path = Path.cwd() / "git_ignore_folder" / "RD-Agent_user_interaction"
+ #### reward model related
+ enable_reward_model_selection: bool = True
+ """Enable reward model based hypothesis selection."""
+
+ reward_model_path: str = "/data/userdata/v-lijingyuan/logs/rm_bt_s1024_gc/tb/version_0"
+
+ """The path to the reward model for hypothesis selection."""
+
+
+
+
+
DS_RD_SETTING = DataScienceBasePropSetting()
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index 8a0343f27..33b62dfef 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -313,7 +313,6 @@ def draft_exp_in_decomposition(scen: Scenario, trace: DSTrace) -> None | DSDraft
else:
return None
-
class DSProposalV1ExpGen(ExpGen):
def gen(
self,
@@ -1182,6 +1181,36 @@ def hypothesis_rank(
problem_label=problem_dict.get("label", "FEEDBACK_PROBLEM"),
appendix=hypothesis_dict[max_score_problem_name].get("appendix", None),
)
+
+ def reward_model_select_hypothesis(self, hypothesis_dict: dict, problem_dict: dict) -> Tuple[str, DSHypothesis]:
+ """
+ Select hypothesis based on reward model scores.
+ """
+ from .reward_inference import RewardModelInference
+ from transformers import AutoTokenizer
+ import os
+ logdir = DS_RD_SETTING.reward_model_path
+ base_model = "gpt2"
+ adapter_path = os.path.join(logdir, "lora_adapter")
+ reward_head_path = os.path.join(logdir, "reward_head.pt")
+ calib_path = os.path.join(logdir, "calib.json")
+
+ tokenizer = AutoTokenizer.from_pretrained(base_model)
+ if not getattr(tokenizer, "pad_token", None):
+ tokenizer.pad_token = tokenizer.eos_token
+
+ model = RewardModelInference(
+ base_model_path=base_model,
+ adapter_path=adapter_path,
+ reward_head_path=reward_head_path,
+ calib_path=calib_path,
+ ).to("cuda")
+ texts = []
+ for name, data in hypothesis_dict.items():
+ texts.append(data.get("hypothesis", "Hypothesis not provided"))
+ rewards = model.compute_reward(texts, tokenizer)
+ max_idx = rewards.index(max(rewards))
+ return texts[max_idx]
def task_gen(
self,
@@ -1473,7 +1502,35 @@ def gen(
)
pickled_problem_name = None
else:
- pickled_problem_name, new_hypothesis = self.hypothesis_rank(
+ if DS_RD_SETTING.enable_reward_model_selection==True:
+ logger.info("Selecting hypothesis using reward model.")
+ selected_hypothesis_text = self.reward_model_select_hypothesis(
+ hypothesis_dict=hypothesis_dict,
+ problem_dict=all_problems,
+ )
+ # Find the problem name corresponding to the selected hypothesis text
+ pickled_problem_name = None
+ for problem_name, data in hypothesis_dict.items():
+ if data.get("hypothesis", "") == selected_hypothesis_text:
+ pickled_problem_name = problem_name
+ break
+ if pickled_problem_name is None:
+ raise ValueError("Selected hypothesis text does not match any known hypothesis.")
+ new_hypothesis = DSHypothesis(
+ component=hypothesis_dict[pickled_problem_name].get("component", "Model"),
+ hypothesis=hypothesis_dict[pickled_problem_name].get("hypothesis", "Hypothesis not provided"),
+ reason=hypothesis_dict[pickled_problem_name].get("reason", "Reason not provided"),
+ problem_name=pickled_problem_name,
+ problem_desc=all_problems.get(pickled_problem_name, {}).get(
+ "problem", "Problem description not provided"
+ ),
+ problem_label=all_problems.get(pickled_problem_name, {}).get(
+ "label", "FEEDBACK_PROBLEM"
+ ),
+ appendix=hypothesis_dict[pickled_problem_name].get("appendix", None),
+ )
+ else:
+ pickled_problem_name, new_hypothesis = self.hypothesis_rank(
hypothesis_dict=hypothesis_dict,
problem_dict=all_problems,
)
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
new file mode 100644
index 000000000..d9fedac5d
--- /dev/null
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
@@ -0,0 +1,84 @@
+import os
+import json
+import torch
+import torch.nn as nn
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from peft import PeftModel
+from rdagent.app.data_science.conf import DS_RD_SETTING
+# =====================
+# Reward Model Wrapper
+# =====================
+class RewardModelInference(nn.Module):
+ def __init__(self, base_model_path, adapter_path, reward_head_path, calib_path=None, use_bf16=False):
+ super().__init__()
+ dtype = torch.bfloat16 if use_bf16 else torch.float16
+ self.model = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=dtype)
+ self.model = PeftModel.from_pretrained(self.model, adapter_path)
+ self.model.eval()
+
+ # hidden size
+ hs = getattr(self.model.config, "hidden_size",
+ getattr(self.model.config, "n_embd",
+ getattr(self.model.config, "d_model", None)))
+ if hs is None:
+ hs = self.model.transformer.wte.embedding_dim
+
+ # reward head
+ self.reward_head = nn.Linear(hs, 1)
+
+ state_dict = torch.load(reward_head_path, map_location="cpu", weights_only=True)
+ self.reward_head.load_state_dict(state_dict)
+ self.reward_head = self.reward_head.to(dtype=self.model.dtype)
+ self.reward_head.eval()
+
+ # load calibration parameters
+ self.calib = {"a": 1.0, "b": 0.0, "tau": 1.0}
+ if calib_path and os.path.exists(calib_path):
+ with open(calib_path, "r", encoding="utf-8") as f:
+ self.calib = json.load(f)
+
+ # ✅ 打印调试信息,确认精度一致
+ print(f"[INFO] Model dtype: {self.model.dtype}, Reward head dtype: {next(self.reward_head.parameters()).dtype}")
+
+ @torch.no_grad()
+ def forward(self, input_ids, attention_mask):
+ out = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ output_hidden_states=True,
+ use_cache=False,
+ )
+ last_hidden = out.hidden_states[-1]
+ lengths = attention_mask.sum(dim=1) - 1
+ lengths = lengths.clamp(min=0)
+ idx = lengths.view(-1, 1, 1).expand(-1, 1, last_hidden.size(-1))
+ pooled = last_hidden.gather(1, idx).squeeze(1)
+ reward = self.reward_head(pooled).squeeze(-1)
+ return reward
+
+ def compute_reward(self, texts, tokenizer, system_prompt=None, device="cuda"):
+ if system_prompt is None:
+ system_prompt = (
+ "You are an experienced data science competition judge. "
+ "Evaluate the quality, effectiveness, and innovation of the proposed solutions."
+ )
+
+ inputs = [f"{system_prompt} Solution: {t}{tokenizer.eos_token}" for t in texts]
+
+ enc = tokenizer(
+ inputs,
+ truncation=True,
+ padding=True,
+ max_length=1024,
+ return_tensors="pt"
+ ).to(device)
+
+ rewards = self.forward(enc["input_ids"], enc["attention_mask"])
+ # Apply calibration
+ rewards = self.calib["a"] * rewards + self.calib["b"]
+ return rewards.cpu().exp().tolist()
+
+# =====================
+# Example Usage
+# =====================
+
From 2fd918c7033a62861561a2dd19e2d7a6a482a2ed Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Mon, 17 Nov 2025 08:51:55 +0000
Subject: [PATCH 02/29] add avg win reward model
---
rdagent/app/data_science/conf.py | 8 +-
.../data_science/proposal/exp_gen/proposal.py | 85 ++++++++++++-
.../proposal/exp_gen/reward_inference.py | 112 ++++++++++--------
3 files changed, 149 insertions(+), 56 deletions(-)
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
index ef394e7a0..88cf1e693 100644
--- a/rdagent/app/data_science/conf.py
+++ b/rdagent/app/data_science/conf.py
@@ -201,12 +201,14 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
enable_reward_model_selection: bool = True
"""Enable reward model based hypothesis selection."""
- reward_model_path: str = "/data/userdata/v-lijingyuan/logs/rm_bt_s1024_gc/tb/version_0"
-
+ reward_model_path: str = "/data/userdata/v-lijingyuan/last_run_2"
"""The path to the reward model for hypothesis selection."""
+ reward_base_model: str = "Qwen/Qwen3-0.6B"
+ """ Backbone of the reward model"""
-
+ max_length = 2200
+ """ max_length of the reward model"""
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index 33b62dfef..432a2ecbd 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -1212,6 +1212,77 @@ def reward_model_select_hypothesis(self, hypothesis_dict: dict, problem_dict: di
max_idx = rewards.index(max(rewards))
return texts[max_idx]
+ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_dict):
+ """
+ Select hypothesis based on avg win rate
+ """
+ parent_nodes = {}
+ for node in range(len(trace.hist)):
+ parents = trace.get_parents(node)
+ parent_nodes[node] = parents[-2] if len(parents) > 1 else None
+ # FIXME: add the convert logic to method in trace
+ if hasattr(trace, "idx2loop_id"):
+ parent_nodes = {
+ trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items()
+ }
+ #if trace.current_selection:
+ # current_parent_record_id = trace.current_selection[0] # record id
+ current_parent_record_id = trace.current_selection[0]
+ loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()}
+ loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes)
+
+ hypothesis_list = [
+ trace.hist[loop_id2idx[loop_id]][0].hypothesis.hypothesis
+ for loop_id in loop_id_list
+ if trace.hist[loop_id2idx[loop_id]][1].decision == True
+ ][::-1]
+ sep = "->"
+
+ hypothesis_chain_list = []
+ accumulate = []
+ for hyp in hypothesis_list:
+ accumulate.append(hyp)
+ hypothesis_chain_list.append(sep.join(accumulate))
+
+ last_text = []
+ texts = []
+ for name, data in hypothesis_dict.items():
+ last_text.append(hypothesis_chain_list[-1] + sep + data.get("hypothesis", "Hypothesis not provided"))
+ texts.append(data.get("hypothesis", "Hypothesis not provided"))
+
+ from .reward_inference import RewardModelInference
+ from transformers import AutoTokenizer
+ import os
+ logdir = DS_RD_SETTING.reward_model_path
+ base_model = DS_RD_SETTING.reward_base_model
+
+ adapter_path = os.path.join(logdir, "lora_adapter")
+ reward_head_path = os.path.join(logdir, "reward_head.pt")
+
+ tokenizer = AutoTokenizer.from_pretrained(base_model)
+ if not getattr(tokenizer, "pad_token", None):
+ tokenizer.pad_token = tokenizer.eos_token
+
+ model = RewardModelInference(
+ base_model_path=base_model,
+ adapter_path=adapter_path,
+ reward_head_path=reward_head_path,
+ ).to("cuda")
+
+ parent_rewards = model(hypothesis_chain_list,tokenizer)
+ currnet_rewards = model(last_text,tokenizer)
+
+ avg_win_rate = []
+ for re in currnet_rewards:
+ win_rate = []
+ for p_re in parent_rewards:
+ current_win_rate = re/(re + p_re)
+ win_rate.append(current_win_rate)
+ avg_win_rate.append(np.mean(win_rate))
+ max_idx = avg_win_rate.index(max(avg_win_rate))
+ return texts[max_idx]
+
+
def task_gen(
self,
component_desc: str,
@@ -1503,11 +1574,15 @@ def gen(
pickled_problem_name = None
else:
if DS_RD_SETTING.enable_reward_model_selection==True:
- logger.info("Selecting hypothesis using reward model.")
- selected_hypothesis_text = self.reward_model_select_hypothesis(
- hypothesis_dict=hypothesis_dict,
- problem_dict=all_problems,
- )
+ # logger.info("Selecting hypothesis using reward model.")
+ # selected_hypothesis_text = self.reward_model_select_hypothesis(
+ # hypothesis_dict=hypothesis_dict,
+ # problem_dict=all_problems,
+ # )
+ logger.info("Selecting hypothesis using reward model. (avg win)")
+
+ selected_hypothesis_text= self.reward_model_select_hypothesis_base_on_avg_win_rate(trace=trace, hypothesis_dict=hypothesis_dict)
+
# Find the problem name corresponding to the selected hypothesis text
pickled_problem_name = None
for problem_name, data in hypothesis_dict.items():
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
index d9fedac5d..458a4f074 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
@@ -5,80 +5,96 @@
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from rdagent.app.data_science.conf import DS_RD_SETTING
+from rdagent.app.data_science.conf import DS_RD_SETTING
+
+
# =====================
# Reward Model Wrapper
# =====================
class RewardModelInference(nn.Module):
- def __init__(self, base_model_path, adapter_path, reward_head_path, calib_path=None, use_bf16=False):
+ def __init__(self, base_model_name, adapter_path, reward_head_path, device="cuda"):
super().__init__()
- dtype = torch.bfloat16 if use_bf16 else torch.float16
- self.model = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=dtype)
- self.model = PeftModel.from_pretrained(self.model, adapter_path)
- self.model.eval()
-
- # hidden size
- hs = getattr(self.model.config, "hidden_size",
- getattr(self.model.config, "n_embd",
- getattr(self.model.config, "d_model", None)))
+ self.device = device
+ self.base = AutoModelForCausalLM.from_pretrained(base_model_name)
+ self.base = PeftModel.from_pretrained(self.base, adapter_path)
+ if hasattr(self.base, "gradient_checkpointing_enable"):
+ self.base.gradient_checkpointing_enable()
+ if hasattr(self.base.config, "use_cache"):
+ self.base.config.use_cache = False
+ hs = getattr(self.base.config, "hidden_size",
+ getattr(self.base.config, "n_embd",
+ getattr(self.base.config, "d_model", None)))
if hs is None:
- hs = self.model.transformer.wte.embedding_dim
-
- # reward head
- self.reward_head = nn.Linear(hs, 1)
-
- state_dict = torch.load(reward_head_path, map_location="cpu", weights_only=True)
- self.reward_head.load_state_dict(state_dict)
- self.reward_head = self.reward_head.to(dtype=self.model.dtype)
- self.reward_head.eval()
+ hs = self.base.get_input_embeddings().embedding_dim
- # load calibration parameters
- self.calib = {"a": 1.0, "b": 0.0, "tau": 1.0}
- if calib_path and os.path.exists(calib_path):
- with open(calib_path, "r", encoding="utf-8") as f:
- self.calib = json.load(f)
+ self.reward_head = nn.Linear(hs, 1).to(device)
+ self.reward_head.load_state_dict(torch.load(reward_head_path, map_location=device))
- # ✅ 打印调试信息,确认精度一致
- print(f"[INFO] Model dtype: {self.model.dtype}, Reward head dtype: {next(self.reward_head.parameters()).dtype}")
+ @staticmethod
+ def pool_last_nonpad(last_hidden: torch.Tensor, attn_mask: torch.Tensor) -> torch.Tensor:
+ lengths = attn_mask.sum(dim=1) - 1
+ lengths = lengths.clamp(min=0)
+ idx = lengths.view(-1, 1, 1).expand(-1, 1, last_hidden.size(-1))
+ return last_hidden.gather(1, idx).squeeze(1)
- @torch.no_grad()
def forward(self, input_ids, attention_mask):
- out = self.model(
- input_ids=input_ids,
- attention_mask=attention_mask,
+ out = self.base(
+ input_ids=input_ids.to(self.device),
+ attention_mask=attention_mask.to(self.device),
output_hidden_states=True,
- use_cache=False,
+ use_cache=False
)
last_hidden = out.hidden_states[-1]
- lengths = attention_mask.sum(dim=1) - 1
- lengths = lengths.clamp(min=0)
- idx = lengths.view(-1, 1, 1).expand(-1, 1, last_hidden.size(-1))
- pooled = last_hidden.gather(1, idx).squeeze(1)
+ pooled = self.pool_last_nonpad(last_hidden, attention_mask)
reward = self.reward_head(pooled).squeeze(-1)
return reward
def compute_reward(self, texts, tokenizer, system_prompt=None, device="cuda"):
- if system_prompt is None:
- system_prompt = (
- "You are an experienced data science competition judge. "
- "Evaluate the quality, effectiveness, and innovation of the proposed solutions."
+ if system_prompt is not None:
+ self.system_prompt = system_prompt
+ elif not hasattr(self, "system_prompt"):
+ self.system_prompt = (
+ "You are a senior data science competition judge and solution expert.\n"
+ "Your task is to evaluate the quality, reasoning progression, and innovation of hypothesis chains.\n"
+ "A hypothesis chain shows iterative improvement of solutions.\n"
+ "You should assess:\n"
+ "1) reasoning correctness and consistency across steps,\n"
+ "2) improvement and refinement through the chain,\n"
+ "3) final hypothesis quality and practicality.\n"
+ "Be strict and fair. Provide expert-level insight."
)
- inputs = [f"{system_prompt} Solution: {t}{tokenizer.eos_token}" for t in texts]
+ inputs = []
+ for s in texts:
+ prompt = (
+ f"{self.system_prompt}\n\n"
+ "Hypothesis Chain (each step separated by '->'):\n"
+ f"{s}\n\n"
+ "\n"
+ "Analyze the evolution of hypotheses, step-by-step, identifying strengths, weaknesses, and logical progression.\n"
+ "Focus on clarity, correctness, and improvement.\n"
+ "Make sure to consider the chain direction from earliest to latest.\n"
+ "\n\n"
+ "Final Evaluation:\n"
+ )
+ inputs.append(prompt)
enc = tokenizer(
inputs,
truncation=True,
padding=True,
- max_length=1024,
+ max_length=DS_RD_SETTING.max_length,
return_tensors="pt"
- ).to(device)
+ )
+
+ enc = {k: v.to(device) for k, v in enc.items()}
rewards = self.forward(enc["input_ids"], enc["attention_mask"])
- # Apply calibration
- rewards = self.calib["a"] * rewards + self.calib["b"]
- return rewards.cpu().exp().tolist()
-# =====================
-# Example Usage
-# =====================
+ return torch.exp(rewards).cpu().tolist()
+
+
+
+
+
From 98ca9cf455bf87bb9fc2643bf3ef6fd662dc4532 Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Mon, 17 Nov 2025 10:11:56 +0000
Subject: [PATCH 03/29] add path online
---
rdagent/app/data_science/conf.py | 4 +++-
rdagent/scenarios/data_science/proposal/exp_gen/proposal.py | 1 +
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
index 88cf1e693..b8dc589e2 100644
--- a/rdagent/app/data_science/conf.py
+++ b/rdagent/app/data_science/conf.py
@@ -201,7 +201,9 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
enable_reward_model_selection: bool = True
"""Enable reward model based hypothesis selection."""
- reward_model_path: str = "/data/userdata/v-lijingyuan/last_run_2"
+ reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
+
+ #"/data/userdata/v-lijingyuan/last_run_2"
"""The path to the reward model for hypothesis selection."""
reward_base_model: str = "Qwen/Qwen3-0.6B"
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index 432a2ecbd..7c62e4b13 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -1268,6 +1268,7 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_
adapter_path=adapter_path,
reward_head_path=reward_head_path,
).to("cuda")
+ model.eval()
parent_rewards = model(hypothesis_chain_list,tokenizer)
currnet_rewards = model(last_text,tokenizer)
From 7d3dc7f40bd30990bc0f9995f5abb74e8858a051 Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Mon, 17 Nov 2025 11:12:09 +0000
Subject: [PATCH 04/29] fix bug: use configured reward base model, annotate
 max_length as int, and call compute_reward instead of forward
---
rdagent/app/data_science/conf.py | 6 +--
.../data_science/proposal/exp_gen/proposal.py | 46 +++++++++++++---
.../proposal/exp_gen/reward_inference.py | 54 +++++++++++++++++--
3 files changed, 91 insertions(+), 15 deletions(-)
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
index b8dc589e2..d15d47bac 100644
--- a/rdagent/app/data_science/conf.py
+++ b/rdagent/app/data_science/conf.py
@@ -201,15 +201,15 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
enable_reward_model_selection: bool = True
"""Enable reward model based hypothesis selection."""
- reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
-
+ reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
+ #"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
#"/data/userdata/v-lijingyuan/last_run_2"
"""The path to the reward model for hypothesis selection."""
reward_base_model: str = "Qwen/Qwen3-0.6B"
""" Backbone of the reward model"""
- max_length = 2200
+ max_length : int = 2200
""" max_length of the reward model"""
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index 7c62e4b13..c18811646 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -1182,7 +1182,7 @@ def hypothesis_rank(
appendix=hypothesis_dict[max_score_problem_name].get("appendix", None),
)
- def reward_model_select_hypothesis(self, hypothesis_dict: dict, problem_dict: dict) -> Tuple[str, DSHypothesis]:
+ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_dict: dict) -> Tuple[str, DSHypothesis]:
"""
Select hypothesis based on reward model scores.
"""
@@ -1190,7 +1190,7 @@ def reward_model_select_hypothesis(self, hypothesis_dict: dict, problem_dict: di
from transformers import AutoTokenizer
import os
logdir = DS_RD_SETTING.reward_model_path
- base_model = "gpt2"
+ base_model = DS_RD_SETTING.reward_base_model
adapter_path = os.path.join(logdir, "lora_adapter")
reward_head_path = os.path.join(logdir, "reward_head.pt")
calib_path = os.path.join(logdir, "calib.json")
@@ -1203,11 +1203,43 @@ def reward_model_select_hypothesis(self, hypothesis_dict: dict, problem_dict: di
base_model_path=base_model,
adapter_path=adapter_path,
reward_head_path=reward_head_path,
- calib_path=calib_path,
).to("cuda")
+ model.eval()
+
+ parent_nodes = {}
+ for node in range(len(trace.hist)):
+ parents = trace.get_parents(node)
+ parent_nodes[node] = parents[-2] if len(parents) > 1 else None
+ # FIXME: add the convert logic to method in trace
+ if hasattr(trace, "idx2loop_id"):
+ parent_nodes = {
+ trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items()
+ }
+ #if trace.current_selection:
+ # current_parent_record_id = trace.current_selection[0] # record id
+ current_parent_record_id = trace.current_selection[0]
+ loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()}
+ loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes)
+
+ hypothesis_list = [
+ trace.hist[loop_id2idx[loop_id]][0].hypothesis.hypothesis
+ for loop_id in loop_id_list
+ if trace.hist[loop_id2idx[loop_id]][1].decision == True
+ ][::-1]
+ sep = "->"
+
+ hypothesis_chain_list = []
+ accumulate = []
+ for hyp in hypothesis_list:
+ accumulate.append(hyp)
+ hypothesis_chain_list.append(sep.join(accumulate))
+
+ last_text = []
texts = []
for name, data in hypothesis_dict.items():
+ last_text.append(hypothesis_chain_list[-1] + sep + data.get("hypothesis", "Hypothesis not provided"))
texts.append(data.get("hypothesis", "Hypothesis not provided"))
+
rewards = model.compute_reward(texts, tokenizer)
max_idx = rewards.index(max(rewards))
return texts[max_idx]
@@ -1264,14 +1296,14 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_
tokenizer.pad_token = tokenizer.eos_token
model = RewardModelInference(
- base_model_path=base_model,
+ base_model_name=base_model,
adapter_path=adapter_path,
reward_head_path=reward_head_path,
).to("cuda")
model.eval()
- parent_rewards = model(hypothesis_chain_list,tokenizer)
- currnet_rewards = model(last_text,tokenizer)
+ parent_rewards = model.compute_reward(hypothesis_chain_list,tokenizer)
+ currnet_rewards = model.compute_reward(last_text,tokenizer)
avg_win_rate = []
for re in currnet_rewards:
@@ -1574,7 +1606,7 @@ def gen(
)
pickled_problem_name = None
else:
- if DS_RD_SETTING.enable_reward_model_selection==True:
+ if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection:
# logger.info("Selecting hypothesis using reward model.")
# selected_hypothesis_text = self.reward_model_select_hypothesis(
# hypothesis_dict=hypothesis_dict,
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
index 458a4f074..872f05adf 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
@@ -93,8 +93,52 @@ def compute_reward(self, texts, tokenizer, system_prompt=None, device="cuda"):
return torch.exp(rewards).cpu().tolist()
-
-
-
-
-
+ # @torch.no_grad()
+ # def compute_reward(
+ # self,
+ # texts: list[str],
+ # tokenizer,
+ # batch_size: int = 1,
+ # ) -> list[float]:
+ # """
+ # 直接对字符串列表计算 reward。
+ # 自动 tokenizer、batch、GPU 支持
+ # """
+ # if device is None:
+ # device = self.device
+
+ # rewards_all = []
+
+ # # 分 batch 处理
+ # for i in range(0, len(texts), batch_size):
+ # batch_texts = texts[i:i+batch_size]
+ # # 构建 prompt
+ # batch_prompts = [
+ # (
+ # "You are a senior data science competition judge and solution expert.\n"
+ # "Your task is to evaluate the quality, reasoning progression, and innovation of hypothesis chains.\n"
+ # "Hypothesis Chain (each step separated by '->'):\n"
+ # f"{s}\n\n"
+ # "\n"
+ # "Analyze the evolution of hypotheses, step-by-step, identifying strengths, weaknesses, and logical progression.\n"
+ # "Focus on clarity, correctness, and improvement.\n"
+ # "Make sure to consider the chain direction from earliest to latest.\n"
+ # "\n\n"
+ # "Final Evaluation:\n"
+ # )
+ # for s in batch_texts
+ # ]
+
+ # enc = tokenizer(
+ # batch_prompts,
+ # truncation=True,
+ # padding=True,
+ # max_length=DS_RD_SETTING.max_length,
+ # return_tensors="pt"
+ # )
+ # enc = {k: v.to(device) for k, v in enc.items()}
+
+ # rewards = self.forward(enc["input_ids"], enc["attention_mask"])
+ # rewards_all.extend(torch.exp(rewards).cpu().tolist())
+
+ # return rewards_all
From 3d24be04b75b1865a05c156af76e348f82499056 Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Mon, 17 Nov 2025 14:57:26 +0000
Subject: [PATCH 05/29] add reward extras: transformers and peft requirements
---
pyproject.toml | 1 +
requirements/reward.txt | 3 +++
2 files changed, 4 insertions(+)
create mode 100644 requirements/reward.txt
diff --git a/pyproject.toml b/pyproject.toml
index 184f517d0..135b61cbd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -113,6 +113,7 @@ lint = {file = ["requirements/lint.txt"]}
package = {file = ["requirements/package.txt"]}
test = {file = ["requirements/test.txt"]}
torch = {file = ["requirements/torch.txt"]} # some agent algorithms need torch. pip install rdagent[torch]
+torch = {file = ["requirements/reward.txt"]}
[tool.setuptools_scm]
local_scheme = "no-local-version"
diff --git a/requirements/reward.txt b/requirements/reward.txt
new file mode 100644
index 000000000..89e792420
--- /dev/null
+++ b/requirements/reward.txt
@@ -0,0 +1,3 @@
+torch
+transformers==4.53.1
+peft==0.6.1
\ No newline at end of file
From 62de6d10f2d983c0bff9953c8c7f66375d6e0aac Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Tue, 18 Nov 2025 02:24:49 +0000
Subject: [PATCH 06/29] add pydantic_ai to test requirements
---
requirements/test.txt | 1 +
1 file changed, 1 insertion(+)
diff --git a/requirements/test.txt b/requirements/test.txt
index 4b4fbc0b8..441c1bfab 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -1,3 +1,4 @@
# Requirements for test.
coverage
pytest
+pydantic_ai
\ No newline at end of file
From 0453af1f12561355827611ffad4badc36de9a77e Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Tue, 18 Nov 2025 07:29:47 +0000
Subject: [PATCH 07/29] fix bug remove torch
---
pyproject.toml | 1 +
requirements/reward.txt | 1 -
2 files changed, 1 insertion(+), 1 deletion(-)
diff --git a/pyproject.toml b/pyproject.toml
index 135b61cbd..c6652ebdc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -123,3 +123,4 @@ version_scheme = "guess-next-dev"
all = true
in_place = true
trailing_comma_inline_array = true
+c2defc195e9665a86ad7614f14f0fb8ca4f248f4
\ No newline at end of file
diff --git a/requirements/reward.txt b/requirements/reward.txt
index 89e792420..df9afb2a3 100644
--- a/requirements/reward.txt
+++ b/requirements/reward.txt
@@ -1,3 +1,2 @@
-torch
transformers==4.53.1
peft==0.6.1
\ No newline at end of file
From 9111b0cd367f397e2c0694fb5dc3c91c9ac85ff6 Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Tue, 18 Nov 2025 07:30:19 +0000
Subject: [PATCH 08/29] fix bug: remove stray commit hash from pyproject.toml
---
pyproject.toml | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index c6652ebdc..3232a4d99 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -122,5 +122,4 @@ version_scheme = "guess-next-dev"
[tool.tomlsort]
all = true
in_place = true
-trailing_comma_inline_array = true
-c2defc195e9665a86ad7614f14f0fb8ca4f248f4
\ No newline at end of file
+trailing_comma_inline_array = true
\ No newline at end of file
From b45aac0ad801af365e8b5108e8f82eda4ee87cb0 Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Tue, 18 Nov 2025 07:33:57 +0000
Subject: [PATCH 09/29] fix bug: rename duplicate torch extras key to reward
---
pyproject.toml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pyproject.toml b/pyproject.toml
index 3232a4d99..70cc31e14 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -113,7 +113,7 @@ lint = {file = ["requirements/lint.txt"]}
package = {file = ["requirements/package.txt"]}
test = {file = ["requirements/test.txt"]}
torch = {file = ["requirements/torch.txt"]} # some agent algorithms need torch. pip install rdagent[torch]
-torch = {file = ["requirements/reward.txt"]}
+reward = {file = ["requirements/reward.txt"]}
[tool.setuptools_scm]
local_scheme = "no-local-version"
From fabcc846a260977b6f9af8536d39da60301d174d Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Tue, 18 Nov 2025 08:12:44 +0000
Subject: [PATCH 10/29] fix bug: install torch and reward extras in make dev
---
Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Makefile b/Makefile
index a6787a88c..10d143b87 100644
--- a/Makefile
+++ b/Makefile
@@ -67,7 +67,7 @@ init-qlib-env:
@source $$(conda info --base)/etc/profile.d/conda.sh && conda activate qlibRDAgent && which pip && pip install pyqlib && pip install ruamel-yaml==0.17.21 && pip install torch==2.1.1 && pip install catboost==0.24.3 && conda deactivate
dev:
- $(PIPRUN) pip install -e .[docs,lint,package,test] -c $(CONSTRAINTS_FILE)
+ $(PIPRUN) pip install -e .[docs,lint,package,test,torch,reward] -c $(CONSTRAINTS_FILE)
$(PIPRUN) pip install -U kaggle
if [ "$(CI)" != "true" ] && command -v pre-commit > /dev/null 2>&1; then pre-commit install --hook-type pre-push; fi
From 6258915da3970e80039fa76c595f1f6a4eecd9bd Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Wed, 19 Nov 2025 07:57:15 +0000
Subject: [PATCH 11/29] fix pydantic-ai version
---
requirements.txt | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/requirements.txt b/requirements.txt
index 619b19fa8..abb51499a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -72,7 +72,7 @@ azureml-mlflow
types-pytz
# Agent
-pydantic-ai-slim[mcp,openai,prefect]
+pydantic-ai-slim[mcp,openai,prefect]==1.9.1
nest-asyncio
-prefect
\ No newline at end of file
+prefect==3.5.0
\ No newline at end of file
From a62870c83bd1f983f23c1e765a4408a7feb75dec Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Wed, 19 Nov 2025 10:16:01 +0000
Subject: [PATCH 12/29] defer RAG agent import and drop pydantic-ai-slim requirement
---
rdagent/scenarios/data_science/proposal/exp_gen/proposal.py | 2 +-
requirements.txt | 1 -
2 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index c18811646..ae6a2bfed 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -9,7 +9,6 @@
from pydantic import BaseModel, Field
from rdagent.app.data_science.conf import DS_RD_SETTING
-from rdagent.components.agent.rag import Agent as RAGAgent
from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask
from rdagent.components.coder.data_science.feature.exp import FeatureTask
from rdagent.components.coder.data_science.model.exp import ModelTask
@@ -648,6 +647,7 @@ def hypothesis_gen(
# knowledge retrieval
if DS_RD_SETTING.enable_research_rag:
+ from rdagent.components.agent.rag import Agent as RAGAgent
rag_agent = RAGAgent(
system_prompt="""You are a helpful assistant.
You help users retrieve relevant knowledge from community discussions and public code."""
diff --git a/requirements.txt b/requirements.txt
index abb51499a..0c4e5ffa4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -72,7 +72,6 @@ azureml-mlflow
types-pytz
# Agent
-pydantic-ai-slim[mcp,openai,prefect]==1.9.1
nest-asyncio
prefect==3.5.0
\ No newline at end of file
From 1d7ee2c7d30ec64a12fb50d31a18a5a09f8c8980 Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Wed, 19 Nov 2025 16:50:13 +0000
Subject: [PATCH 13/29] defer context7 DocAgent import; disable pydantic_ai test requirement
---
rdagent/components/coder/data_science/pipeline/eval.py | 2 +-
requirements/test.txt | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/rdagent/components/coder/data_science/pipeline/eval.py b/rdagent/components/coder/data_science/pipeline/eval.py
index f296d986f..a5ee17744 100644
--- a/rdagent/components/coder/data_science/pipeline/eval.py
+++ b/rdagent/components/coder/data_science/pipeline/eval.py
@@ -8,7 +8,6 @@
import pandas as pd
from rdagent.app.data_science.conf import DS_RD_SETTING
-from rdagent.components.agent.context7 import Agent as DocAgent
from rdagent.components.coder.CoSTEER import CoSTEERMultiFeedback
from rdagent.components.coder.CoSTEER.evaluators import (
CoSTEEREvaluator,
@@ -307,6 +306,7 @@ def evaluate(
do_documentation_search = enable_mcp_documentation_search and wfb.requires_documentation_search
if do_documentation_search:
+ from rdagent.components.agent.context7 import Agent as DocAgent
# Use MCPAgent for clean, user-friendly interface
try:
# Create agent targeting Context7 service - model config comes from mcp_config.json
diff --git a/requirements/test.txt b/requirements/test.txt
index 441c1bfab..30699b666 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -1,4 +1,4 @@
# Requirements for test.
coverage
pytest
-pydantic_ai
\ No newline at end of file
+#pydantic_ai
\ No newline at end of file
From f83cfb5db3bf31c26bf8e46345bad9b85c3255ef Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Thu, 20 Nov 2025 08:25:10 +0000
Subject: [PATCH 14/29] add reward_select_type config to choose between two reward-model selection strategies
---
Makefile | 1 +
rdagent/app/data_science/conf.py | 1 +
.../data_science/proposal/exp_gen/proposal.py | 11 ++++++++---
3 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/Makefile b/Makefile
index 10d143b87..16f10d24b 100644
--- a/Makefile
+++ b/Makefile
@@ -67,6 +67,7 @@ init-qlib-env:
@source $$(conda info --base)/etc/profile.d/conda.sh && conda activate qlibRDAgent && which pip && pip install pyqlib && pip install ruamel-yaml==0.17.21 && pip install torch==2.1.1 && pip install catboost==0.24.3 && conda deactivate
dev:
+ $(PIPRUN) pip install -U pip setuptools wheel
$(PIPRUN) pip install -e .[docs,lint,package,test,torch,reward] -c $(CONSTRAINTS_FILE)
$(PIPRUN) pip install -U kaggle
if [ "$(CI)" != "true" ] && command -v pre-commit > /dev/null 2>&1; then pre-commit install --hook-type pre-push; fi
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
index d15d47bac..38a791c43 100644
--- a/rdagent/app/data_science/conf.py
+++ b/rdagent/app/data_science/conf.py
@@ -212,6 +212,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
max_length : int = 2200
""" max_length of the reward model"""
+ reward_select_type: int = 1
DS_RD_SETTING = DataScienceBasePropSetting()
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index ae6a2bfed..72c5d20de 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -1613,9 +1613,14 @@ def gen(
# problem_dict=all_problems,
# )
logger.info("Selecting hypothesis using reward model. (avg win)")
-
- selected_hypothesis_text= self.reward_model_select_hypothesis_base_on_avg_win_rate(trace=trace, hypothesis_dict=hypothesis_dict)
-
+ if DS_RD_SETTING.reward_select_type==1:
+ selected_hypothesis_text= self.reward_model_select_hypothesis_base_on_avg_win_rate(trace=trace, hypothesis_dict=hypothesis_dict)
+ elif DS_RD_SETTING.reward_select_type==2:
+ selected_hypothesis_text = self.reward_model_select_hypothesis(
+ trace=trace,
+ hypothesis_dict=hypothesis_dict,
+ problem_dict=all_problems,
+ )
# Find the problem name corresponding to the selected hypothesis text
pickled_problem_name = None
for problem_name, data in hypothesis_dict.items():
From 78e6b1ad3a3fc62ba621dae4ba0334d08cc8e765 Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Thu, 20 Nov 2025 09:32:43 +0000
Subject: [PATCH 15/29] fix RewardModelInference keyword argument: base_model_path -> base_model_name
---
rdagent/scenarios/data_science/proposal/exp_gen/proposal.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index 72c5d20de..6794bb21c 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -1200,7 +1200,7 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di
tokenizer.pad_token = tokenizer.eos_token
model = RewardModelInference(
- base_model_path=base_model,
+ base_model_name=base_model,
adapter_path=adapter_path,
reward_head_path=reward_head_path,
).to("cuda")
From 9356071cc93b80394e31e32e617b372ba60e2239 Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Fri, 21 Nov 2025 10:45:56 +0000
Subject: [PATCH 16/29] skip reward-model selection when the current selection starts a new tree
---
rdagent/scenarios/data_science/proposal/exp_gen/proposal.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index 6794bb21c..c993e373d 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -1606,7 +1606,7 @@ def gen(
)
pickled_problem_name = None
else:
- if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection:
+ if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection and not trace.is_selection_new_tree():
# logger.info("Selecting hypothesis using reward model.")
# selected_hypothesis_text = self.reward_model_select_hypothesis(
# hypothesis_dict=hypothesis_dict,
From 1b28ffcb41bd7f1a45fc55100e8bd603e334240e Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Mon, 24 Nov 2025 08:38:31 +0000
Subject: [PATCH 17/29] include competition description in reward prompts via competition mapping file
---
rdagent/app/data_science/conf.py | 14 ++++-
.../data_science/proposal/exp_gen/proposal.py | 20 +++++--
.../proposal/exp_gen/reward_inference.py | 56 ++-----------------
3 files changed, 31 insertions(+), 59 deletions(-)
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
index 38a791c43..21f8b1280 100644
--- a/rdagent/app/data_science/conf.py
+++ b/rdagent/app/data_science/conf.py
@@ -201,15 +201,25 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
enable_reward_model_selection: bool = True
"""Enable reward model based hypothesis selection."""
- reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
+ reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_3" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
+
#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
#"/data/userdata/v-lijingyuan/last_run_2"
+
+ #reward_model_path: str = "/data/userdata/v-lijingyuan/last_run_3"
+
+
"""The path to the reward model for hypothesis selection."""
+ competition_mapping_path: str = "/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/comp_to_scen.json"
+
+ #competition_mapping_path: str = "/data/userdata/v-lijingyuan/dpo/comp_to_scen.json"
+
+
reward_base_model: str = "Qwen/Qwen3-0.6B"
""" Backbone of the reward model"""
- max_length : int = 2200
+ max_length : int = 2300
""" max_length of the reward model"""
reward_select_type: int = 1
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index c993e373d..075dad165 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -1240,7 +1240,12 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di
last_text.append(hypothesis_chain_list[-1] + sep + data.get("hypothesis", "Hypothesis not provided"))
texts.append(data.get("hypothesis", "Hypothesis not provided"))
- rewards = model.compute_reward(texts, tokenizer)
+ comp_dict_path = DS_RD_SETTING.competition_mapping_path
+ with open(comp_dict_path, "r") as f:
+ comp_dict = json.load(f)
+ competition = trace.scen.competition
+ comp_description = comp_dict[competition]
+ rewards = model.compute_reward(texts, tokenizer,comp_description)
max_idx = rewards.index(max(rewards))
return texts[max_idx]
@@ -1301,9 +1306,13 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_
reward_head_path=reward_head_path,
).to("cuda")
model.eval()
-
- parent_rewards = model.compute_reward(hypothesis_chain_list,tokenizer)
- currnet_rewards = model.compute_reward(last_text,tokenizer)
+ comp_dict_path = DS_RD_SETTING.competition_mapping_path
+ with open(comp_dict_path, "r") as f:
+ comp_dict = json.load(f)
+ competition = trace.scen.competition
+ comp_description = comp_dict[competition]
+ parent_rewards = model.compute_reward(hypothesis_chain_list,tokenizer,comp_description)
+ currnet_rewards = model.compute_reward(last_text,tokenizer,comp_description)
avg_win_rate = []
for re in currnet_rewards:
@@ -1606,7 +1615,8 @@ def gen(
)
pickled_problem_name = None
else:
- if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection and not trace.is_selection_new_tree():
+ sota_flag = (hasattr(trace, "sota_exp_to_submit") and trace.sota_exp_to_submit is not None)
+ if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection and sota_flag:
# logger.info("Selecting hypothesis using reward model.")
# selected_hypothesis_text = self.reward_model_select_hypothesis(
# hypothesis_dict=hypothesis_dict,
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
index 872f05adf..fa70c282b 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
@@ -49,7 +49,7 @@ def forward(self, input_ids, attention_mask):
reward = self.reward_head(pooled).squeeze(-1)
return reward
- def compute_reward(self, texts, tokenizer, system_prompt=None, device="cuda"):
+ def compute_reward(self, texts, tokenizer,comp_description, system_prompt=None, device="cuda"):
if system_prompt is not None:
self.system_prompt = system_prompt
elif not hasattr(self, "system_prompt"):
@@ -68,6 +68,7 @@ def compute_reward(self, texts, tokenizer, system_prompt=None, device="cuda"):
for s in texts:
prompt = (
f"{self.system_prompt}\n\n"
+ f"Competition description:\n{comp_description}\n\n"
"Hypothesis Chain (each step separated by '->'):\n"
f"{s}\n\n"
"\n"
@@ -77,6 +78,7 @@ def compute_reward(self, texts, tokenizer, system_prompt=None, device="cuda"):
"\n\n"
"Final Evaluation:\n"
)
+
inputs.append(prompt)
enc = tokenizer(
@@ -91,54 +93,4 @@ def compute_reward(self, texts, tokenizer, system_prompt=None, device="cuda"):
rewards = self.forward(enc["input_ids"], enc["attention_mask"])
- return torch.exp(rewards).cpu().tolist()
-
- # @torch.no_grad()
- # def compute_reward(
- # self,
- # texts: list[str],
- # tokenizer,
- # batch_size: int = 1,
- # ) -> list[float]:
- # """
- # 直接对字符串列表计算 reward。
- # 自动 tokenizer、batch、GPU 支持
- # """
- # if device is None:
- # device = self.device
-
- # rewards_all = []
-
- # # 分 batch 处理
- # for i in range(0, len(texts), batch_size):
- # batch_texts = texts[i:i+batch_size]
- # # 构建 prompt
- # batch_prompts = [
- # (
- # "You are a senior data science competition judge and solution expert.\n"
- # "Your task is to evaluate the quality, reasoning progression, and innovation of hypothesis chains.\n"
- # "Hypothesis Chain (each step separated by '->'):\n"
- # f"{s}\n\n"
- # "\n"
- # "Analyze the evolution of hypotheses, step-by-step, identifying strengths, weaknesses, and logical progression.\n"
- # "Focus on clarity, correctness, and improvement.\n"
- # "Make sure to consider the chain direction from earliest to latest.\n"
- # "\n\n"
- # "Final Evaluation:\n"
- # )
- # for s in batch_texts
- # ]
-
- # enc = tokenizer(
- # batch_prompts,
- # truncation=True,
- # padding=True,
- # max_length=DS_RD_SETTING.max_length,
- # return_tensors="pt"
- # )
- # enc = {k: v.to(device) for k, v in enc.items()}
-
- # rewards = self.forward(enc["input_ids"], enc["attention_mask"])
- # rewards_all.extend(torch.exp(rewards).cpu().tolist())
-
- # return rewards_all
+ return torch.exp(rewards).cpu().tolist()
\ No newline at end of file
From 4784b75fec16838026f6e1b6b328f02c404aa36d Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Mon, 24 Nov 2025 09:31:34 +0000
Subject: [PATCH 18/29] also require non-new-tree selection alongside SOTA flag for reward-model selection
---
rdagent/scenarios/data_science/proposal/exp_gen/proposal.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index 075dad165..fbeaaf7a2 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -1616,7 +1616,7 @@ def gen(
pickled_problem_name = None
else:
sota_flag = (hasattr(trace, "sota_exp_to_submit") and trace.sota_exp_to_submit is not None)
- if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection and sota_flag:
+ if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection and sota_flag and not trace.is_selection_new_tree():
# logger.info("Selecting hypothesis using reward model.")
# selected_hypothesis_text = self.reward_model_select_hypothesis(
# hypothesis_dict=hypothesis_dict,
From d20781d44ab71c58b178f51d0d92d09e0e2afe3f Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Mon, 24 Nov 2025 11:17:08 +0000
Subject: [PATCH 19/29] handle empty trace.current_selection in reward-model selection helpers
---
.../data_science/proposal/exp_gen/proposal.py | 19 ++++++++++++-------
1 file changed, 12 insertions(+), 7 deletions(-)
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index fbeaaf7a2..c771b0191 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -1215,9 +1215,12 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di
parent_nodes = {
trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items()
}
- #if trace.current_selection:
- # current_parent_record_id = trace.current_selection[0] # record id
- current_parent_record_id = trace.current_selection[0]
+ if trace.current_selection:
+ current_parent_record_id = trace.current_selection[0] # record id
+ else:
+ return texts[0]
+ #
+ #current_parent_record_id = trace.current_selection[0]
loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()}
loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes)
@@ -1262,9 +1265,11 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_
parent_nodes = {
trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items()
}
- #if trace.current_selection:
- # current_parent_record_id = trace.current_selection[0] # record id
- current_parent_record_id = trace.current_selection[0]
+ if trace.current_selection:
+ current_parent_record_id = trace.current_selection[0] # record id
+ else:
+ return texts[0]
+
loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()}
loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes)
@@ -1616,7 +1621,7 @@ def gen(
pickled_problem_name = None
else:
sota_flag = (hasattr(trace, "sota_exp_to_submit") and trace.sota_exp_to_submit is not None)
- if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection and sota_flag and not trace.is_selection_new_tree():
+ if DS_RD_SETTING.enable_reward_model_selection==True and sota_flag and not trace.is_selection_new_tree():
# logger.info("Selecting hypothesis using reward model.")
# selected_hypothesis_text = self.reward_model_select_hypothesis(
# hypothesis_dict=hypothesis_dict,
From 89a3a610c4b786b365e89fd5f38b0b1bb1a3f511 Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Mon, 24 Nov 2025 16:22:47 +0000
Subject: [PATCH 20/29] fall back to first hypothesis when current selection is the virtual root (-1)
---
.../scenarios/data_science/proposal/exp_gen/proposal.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index c771b0191..45c3b1f63 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -1216,7 +1216,9 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di
trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items()
}
if trace.current_selection:
- current_parent_record_id = trace.current_selection[0] # record id
+ current_parent_record_id = trace.current_selection[0] # record id
+ if current_parent_record_id == -1:
+ return texts[0]
else:
return texts[0]
#
@@ -1266,7 +1268,9 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_
trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items()
}
if trace.current_selection:
- current_parent_record_id = trace.current_selection[0] # record id
+ current_parent_record_id = trace.current_selection[0] # record id
+ if current_parent_record_id == -1:
+ return texts[0]
else:
return texts[0]
From 0fd51dbed37a9f31fbd0d426916e0107cf644a26 Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Tue, 25 Nov 2025 09:20:27 +0000
Subject: [PATCH 21/29] fall back to first hypothesis when the hypothesis chain is empty
---
rdagent/scenarios/data_science/proposal/exp_gen/proposal.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index 45c3b1f63..03d07be94 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -1239,6 +1239,9 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di
accumulate.append(hyp)
hypothesis_chain_list.append(sep.join(accumulate))
+ if not hypothesis_chain_list:
+ return texts[0]
+
last_text = []
texts = []
for name, data in hypothesis_dict.items():
@@ -1290,6 +1293,9 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_
accumulate.append(hyp)
hypothesis_chain_list.append(sep.join(accumulate))
+ if not hypothesis_chain_list:
+ return texts[0]
+
last_text = []
texts = []
for name, data in hypothesis_dict.items():
From d0aa7324fa4f2f37cb74c7df62d9203d4cfbfa37 Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Tue, 25 Nov 2025 10:43:01 +0000
Subject: [PATCH 22/29] fix use-before-assignment of texts[0]: return first hypothesis_dict entry in fallback paths
---
.../data_science/proposal/exp_gen/proposal.py | 33 +++++++++----------
1 file changed, 16 insertions(+), 17 deletions(-)
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index 03d07be94..b4f49f76d 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -1215,14 +1215,12 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di
parent_nodes = {
trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items()
}
- if trace.current_selection:
- current_parent_record_id = trace.current_selection[0] # record id
- if current_parent_record_id == -1:
- return texts[0]
- else:
- return texts[0]
- #
- #current_parent_record_id = trace.current_selection[0]
+
+ if not trace.current_selection or trace.current_selection[0] == -1:
+ first_text = next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided")
+ return first_text
+
+ current_parent_record_id = trace.current_selection[0]
loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()}
loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes)
@@ -1240,8 +1238,9 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di
hypothesis_chain_list.append(sep.join(accumulate))
if not hypothesis_chain_list:
- return texts[0]
-
+ first_text = next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided")
+ return first_text
+
last_text = []
texts = []
for name, data in hypothesis_dict.items():
@@ -1270,12 +1269,11 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_
parent_nodes = {
trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items()
}
- if trace.current_selection:
- current_parent_record_id = trace.current_selection[0] # record id
- if current_parent_record_id == -1:
- return texts[0]
- else:
- return texts[0]
+
+ if not trace.current_selection or trace.current_selection[0] == -1:
+ first_text = next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided")
+ return first_text
+ current_parent_record_id = trace.current_selection[0]
loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()}
loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes)
@@ -1294,7 +1292,8 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_
hypothesis_chain_list.append(sep.join(accumulate))
if not hypothesis_chain_list:
- return texts[0]
+ first_text = next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided")
+ return first_text
last_text = []
texts = []
From 35c60d901535cb675870dda22f13198aafa58bbd Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Mon, 8 Dec 2025 10:09:38 +0000
Subject: [PATCH 23/29] add new ckpt
---
rdagent/app/data_science/conf.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
index 21f8b1280..242127685 100644
--- a/rdagent/app/data_science/conf.py
+++ b/rdagent/app/data_science/conf.py
@@ -201,7 +201,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
enable_reward_model_selection: bool = True
"""Enable reward model based hypothesis selection."""
- reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_3" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
+ reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_5" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
#"/data/userdata/v-lijingyuan/last_run_2"
From f8904d97396968085f078d205107c32a688002e7 Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Thu, 18 Dec 2025 10:23:29 +0000
Subject: [PATCH 24/29] add ckpt 6
---
rdagent/app/data_science/conf.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
index 242127685..4406b61ab 100644
--- a/rdagent/app/data_science/conf.py
+++ b/rdagent/app/data_science/conf.py
@@ -201,7 +201,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
enable_reward_model_selection: bool = True
"""Enable reward model based hypothesis selection."""
- reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_5" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
+ reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_6" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
#"/data/userdata/v-lijingyuan/last_run_2"
From 38b9736590fdc91430f02bc6259ffa14fe588bc4 Mon Sep 17 00:00:00 2001
From: Star dust <93254841+jingyuanlm@users.noreply.github.com>
Date: Fri, 9 Jan 2026 15:33:10 +0800
Subject: [PATCH 25/29] Evo (#1322)
* add ckpt 7
* add ckpt 8
---
rdagent/app/data_science/conf.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
index 4406b61ab..2c792372e 100644
--- a/rdagent/app/data_science/conf.py
+++ b/rdagent/app/data_science/conf.py
@@ -201,7 +201,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
enable_reward_model_selection: bool = True
"""Enable reward model based hypothesis selection."""
- reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_6" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
+ reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_8" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
#"/data/userdata/v-lijingyuan/last_run_2"
@@ -216,7 +216,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
#competition_mapping_path: str = "/data/userdata/v-lijingyuan/dpo/comp_to_scen.json"
- reward_base_model: str = "Qwen/Qwen3-0.6B"
+ reward_base_model: str = "Qwen/Qwen3-4B"
""" Backbone of the reward model"""
max_length : int = 2300
From aa8637a10ca82f60be4dccc9b880d1cd9de7307d Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Fri, 9 Jan 2026 07:50:51 +0000
Subject: [PATCH 26/29] ckpt 7
---
rdagent/app/data_science/conf.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
index 2c792372e..d714255d6 100644
--- a/rdagent/app/data_science/conf.py
+++ b/rdagent/app/data_science/conf.py
@@ -201,7 +201,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
enable_reward_model_selection: bool = True
"""Enable reward model based hypothesis selection."""
- reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_8" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
+ reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_7" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
#"/data/userdata/v-lijingyuan/last_run_2"
From b6cd8133c43aec0d29170328f4aab9e77c7be1e8 Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Tue, 13 Jan 2026 07:36:06 +0000
Subject: [PATCH 27/29] pin reward model to cuda:1 and move tokenized inputs via self.device (ckpt7)
---
.../scenarios/data_science/proposal/exp_gen/proposal.py | 8 ++++----
.../data_science/proposal/exp_gen/reward_inference.py | 4 ++--
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index b4f49f76d..f40d6bdc8 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -40,7 +40,7 @@
from rdagent.utils.agent.tpl import T
from rdagent.utils.repo.diff import generate_diff_from_dict
from rdagent.utils.workflow import wait_retry
-
+import torch
_COMPONENT_META: Dict[str, Dict[str, Any]] = {
"DataLoadSpec": {
"target_name": "Data loader and specification generation",
@@ -1198,12 +1198,12 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di
tokenizer = AutoTokenizer.from_pretrained(base_model)
if not getattr(tokenizer, "pad_token", None):
tokenizer.pad_token = tokenizer.eos_token
-
+ device = torch.device("cuda:1")
model = RewardModelInference(
base_model_name=base_model,
adapter_path=adapter_path,
- reward_head_path=reward_head_path,
- ).to("cuda")
+ reward_head_path=reward_head_path,device=device
+ )
model.eval()
parent_nodes = {}
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
index fa70c282b..8910476fb 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
@@ -49,7 +49,7 @@ def forward(self, input_ids, attention_mask):
reward = self.reward_head(pooled).squeeze(-1)
return reward
- def compute_reward(self, texts, tokenizer,comp_description, system_prompt=None, device="cuda"):
+ def compute_reward(self, texts, tokenizer,comp_description, system_prompt=None):
if system_prompt is not None:
self.system_prompt = system_prompt
elif not hasattr(self, "system_prompt"):
@@ -89,7 +89,7 @@ def compute_reward(self, texts, tokenizer,comp_description, system_prompt=None,
return_tensors="pt"
)
- enc = {k: v.to(device) for k, v in enc.items()}
+ enc = {k: v.to(self.device) for k, v in enc.items()}
rewards = self.forward(enc["input_ids"], enc["attention_mask"])
From 1fc1a6e14ffa13f16eff09d460a2d173f404499b Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Wed, 14 Jan 2026 10:45:35 +0000
Subject: [PATCH 28/29] load reward head on CPU then move base model and head to self.device (ckpt7)
---
.../proposal/exp_gen/reward_inference.py | 24 +++++++++++++------
1 file changed, 17 insertions(+), 7 deletions(-)
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
index 8910476fb..ab33009df 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py
@@ -12,23 +12,33 @@
# Reward Model Wrapper
# =====================
class RewardModelInference(nn.Module):
- def __init__(self, base_model_name, adapter_path, reward_head_path, device="cuda"):
+ def __init__(self, base_model_name, adapter_path, reward_head_path, device="cuda:1"):
super().__init__()
- self.device = device
+ self.device = torch.device(device)
+
self.base = AutoModelForCausalLM.from_pretrained(base_model_name)
self.base = PeftModel.from_pretrained(self.base, adapter_path)
+ self.base.to(self.device)
+
if hasattr(self.base, "gradient_checkpointing_enable"):
self.base.gradient_checkpointing_enable()
if hasattr(self.base.config, "use_cache"):
self.base.config.use_cache = False
- hs = getattr(self.base.config, "hidden_size",
- getattr(self.base.config, "n_embd",
- getattr(self.base.config, "d_model", None)))
+
+ hs = getattr(
+ self.base.config,
+ "hidden_size",
+ getattr(self.base.config, "n_embd",
+ getattr(self.base.config, "d_model", None))
+ )
if hs is None:
hs = self.base.get_input_embeddings().embedding_dim
- self.reward_head = nn.Linear(hs, 1).to(device)
- self.reward_head.load_state_dict(torch.load(reward_head_path, map_location=device))
+ self.reward_head = nn.Linear(hs, 1)
+ self.reward_head.load_state_dict(
+ torch.load(reward_head_path, map_location="cpu")
+ )
+ self.reward_head.to(self.device)
@staticmethod
def pool_last_nonpad(last_hidden: torch.Tensor, attn_mask: torch.Tensor) -> torch.Tensor:
From d344fd680fb91f14b04667fbc3c6c7f776910d1f Mon Sep 17 00:00:00 2001
From: jingyuanlm <842442862@qq.com>
Date: Mon, 19 Jan 2026 08:20:14 +0000
Subject: [PATCH 29/29] ckpt 8
---
rdagent/app/data_science/conf.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
index d714255d6..2c792372e 100644
--- a/rdagent/app/data_science/conf.py
+++ b/rdagent/app/data_science/conf.py
@@ -201,7 +201,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
enable_reward_model_selection: bool = True
"""Enable reward model based hypothesis selection."""
- reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_7" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
+ reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_8" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
#"/data/userdata/v-lijingyuan/last_run_2"