From ddec8c382fdb7a6a351753b8460008f1e2d10125 Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Thu, 23 Oct 2025 08:30:49 +0000 Subject: [PATCH 01/29] add reward model --- rdagent/app/data_science/conf.py | 12 +++ .../data_science/proposal/exp_gen/proposal.py | 61 +++++++++++++- .../proposal/exp_gen/reward_inference.py | 84 +++++++++++++++++++ 3 files changed, 155 insertions(+), 2 deletions(-) create mode 100644 rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index 2d8ec6262..ef394e7a0 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -197,6 +197,18 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): user_interaction_wait_seconds: int = 6000 # seconds to wait for user interaction user_interaction_mid_folder: Path = Path.cwd() / "git_ignore_folder" / "RD-Agent_user_interaction" + #### reward model related + enable_reward_model_selection: bool = True + """Enable reward model based hypothesis selection.""" + + reward_model_path: str = "/data/userdata/v-lijingyuan/logs/rm_bt_s1024_gc/tb/version_0" + + """The path to the reward model for hypothesis selection.""" + + + + + DS_RD_SETTING = DataScienceBasePropSetting() diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index 8a0343f27..33b62dfef 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -313,7 +313,6 @@ def draft_exp_in_decomposition(scen: Scenario, trace: DSTrace) -> None | DSDraft else: return None - class DSProposalV1ExpGen(ExpGen): def gen( self, @@ -1182,6 +1181,36 @@ def hypothesis_rank( problem_label=problem_dict.get("label", "FEEDBACK_PROBLEM"), appendix=hypothesis_dict[max_score_problem_name].get("appendix", None), ) + + def reward_model_select_hypothesis(self, hypothesis_dict: dict, problem_dict: dict) -> Tuple[str, DSHypothesis]: + """ + Select hypothesis based on reward model scores. + """ + from .reward_inference import RewardModelInference + from transformers import AutoTokenizer + import os + logdir = DS_RD_SETTING.reward_model_path + base_model = "gpt2" + adapter_path = os.path.join(logdir, "lora_adapter") + reward_head_path = os.path.join(logdir, "reward_head.pt") + calib_path = os.path.join(logdir, "calib.json") + + tokenizer = AutoTokenizer.from_pretrained(base_model) + if not getattr(tokenizer, "pad_token", None): + tokenizer.pad_token = tokenizer.eos_token + + model = RewardModelInference( + base_model_path=base_model, + adapter_path=adapter_path, + reward_head_path=reward_head_path, + calib_path=calib_path, + ).to("cuda") + texts = [] + for name, data in hypothesis_dict.items(): + texts.append(data.get("hypothesis", "Hypothesis not provided")) + rewards = model.compute_reward(texts, tokenizer) + max_idx = rewards.index(max(rewards)) + return texts[max_idx] def task_gen( self, @@ -1473,7 +1502,35 @@ def gen( ) pickled_problem_name = None else: - pickled_problem_name, new_hypothesis = self.hypothesis_rank( + if DS_RD_SETTING.enable_reward_model_selection==True: + logger.info("Selecting hypothesis using reward model.") + selected_hypothesis_text = self.reward_model_select_hypothesis( + hypothesis_dict=hypothesis_dict, + problem_dict=all_problems, + ) + # Find the problem name corresponding to the selected hypothesis text + pickled_problem_name = None + for problem_name, data in hypothesis_dict.items(): + if data.get("hypothesis", "") == selected_hypothesis_text: + pickled_problem_name = problem_name + break + if pickled_problem_name is None: + raise ValueError("Selected hypothesis text does not match any known hypothesis.") + new_hypothesis = DSHypothesis( + component=hypothesis_dict[pickled_problem_name].get("component", "Model"), + hypothesis=hypothesis_dict[pickled_problem_name].get("hypothesis", "Hypothesis not provided"), + reason=hypothesis_dict[pickled_problem_name].get("reason", "Reason not provided"), + problem_name=pickled_problem_name, + problem_desc=all_problems.get(pickled_problem_name, {}).get( + "problem", "Problem description not provided" + ), + problem_label=all_problems.get(pickled_problem_name, {}).get( + "label", "FEEDBACK_PROBLEM" + ), + appendix=hypothesis_dict[pickled_problem_name].get("appendix", None), + ) + else: + pickled_problem_name, new_hypothesis = self.hypothesis_rank( hypothesis_dict=hypothesis_dict, problem_dict=all_problems, ) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py new file mode 100644 index 000000000..d9fedac5d --- /dev/null +++ b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py @@ -0,0 +1,84 @@ +import os +import json +import torch +import torch.nn as nn +from transformers import AutoTokenizer, AutoModelForCausalLM +from peft import PeftModel +from rdagent.app.data_science.conf import DS_RD_SETTING +# ===================== +# Reward Model Wrapper +# ===================== +class RewardModelInference(nn.Module): + def __init__(self, base_model_path, adapter_path, reward_head_path, calib_path=None, use_bf16=False): + super().__init__() + dtype = torch.bfloat16 if use_bf16 else torch.float16 + self.model = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=dtype) + self.model = PeftModel.from_pretrained(self.model, adapter_path) + self.model.eval() + + # hidden size + hs = getattr(self.model.config, "hidden_size", + getattr(self.model.config, "n_embd", + getattr(self.model.config, "d_model", None))) + if hs is None: + hs = self.model.transformer.wte.embedding_dim + + # reward head + self.reward_head = nn.Linear(hs, 1) + + state_dict = torch.load(reward_head_path, map_location="cpu", weights_only=True) + self.reward_head.load_state_dict(state_dict) + self.reward_head = self.reward_head.to(dtype=self.model.dtype) + self.reward_head.eval() + + # load calibration parameters + self.calib = {"a": 1.0, "b": 0.0, "tau": 1.0} + if calib_path and os.path.exists(calib_path): + with open(calib_path, "r", encoding="utf-8") as f: + self.calib = json.load(f) + + # ✅ 打印调试信息,确认精度一致 + print(f"[INFO] Model dtype: {self.model.dtype}, Reward head dtype: {next(self.reward_head.parameters()).dtype}") + + @torch.no_grad() + def forward(self, input_ids, attention_mask): + out = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + output_hidden_states=True, + use_cache=False, + ) + last_hidden = out.hidden_states[-1] + lengths = attention_mask.sum(dim=1) - 1 + lengths = lengths.clamp(min=0) + idx = lengths.view(-1, 1, 1).expand(-1, 1, last_hidden.size(-1)) + pooled = last_hidden.gather(1, idx).squeeze(1) + reward = self.reward_head(pooled).squeeze(-1) + return reward + + def compute_reward(self, texts, tokenizer, system_prompt=None, device="cuda"): + if system_prompt is None: + system_prompt = ( + "You are an experienced data science competition judge. " + "Evaluate the quality, effectiveness, and innovation of the proposed solutions." + ) + + inputs = [f"{system_prompt} Solution: {t}{tokenizer.eos_token}" for t in texts] + + enc = tokenizer( + inputs, + truncation=True, + padding=True, + max_length=1024, + return_tensors="pt" + ).to(device) + + rewards = self.forward(enc["input_ids"], enc["attention_mask"]) + # Apply calibration + rewards = self.calib["a"] * rewards + self.calib["b"] + return rewards.cpu().exp().tolist() + +# ===================== +# Example Usage +# ===================== + From 2fd918c7033a62861561a2dd19e2d7a6a482a2ed Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Mon, 17 Nov 2025 08:51:55 +0000 Subject: [PATCH 02/29] add avg win reward model --- rdagent/app/data_science/conf.py | 8 +- .../data_science/proposal/exp_gen/proposal.py | 85 ++++++++++++- .../proposal/exp_gen/reward_inference.py | 112 ++++++++++-------- 3 files changed, 149 insertions(+), 56 deletions(-) diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index ef394e7a0..88cf1e693 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -201,12 +201,14 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): enable_reward_model_selection: bool = True """Enable reward model based hypothesis selection.""" - reward_model_path: str = "/data/userdata/v-lijingyuan/logs/rm_bt_s1024_gc/tb/version_0" - + reward_model_path: str = "/data/userdata/v-lijingyuan/last_run_2" """The path to the reward model for hypothesis selection.""" + reward_base_model: str = "Qwen/Qwen3-0.6B" + """ Backbone of the reward model""" - + max_length = 2200 + """ max_length of the reward model""" diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index 33b62dfef..432a2ecbd 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -1212,6 +1212,77 @@ def reward_model_select_hypothesis(self, hypothesis_dict: dict, problem_dict: di max_idx = rewards.index(max(rewards)) return texts[max_idx] + def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_dict): + """ + Select hypothesis based on avg win rate + """ + parent_nodes = {} + for node in range(len(trace.hist)): + parents = trace.get_parents(node) + parent_nodes[node] = parents[-2] if len(parents) > 1 else None + # FIXME: add the convert logic to method in trace + if hasattr(trace, "idx2loop_id"): + parent_nodes = { + trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items() + } + #if trace.current_selection: + # current_parent_record_id = trace.current_selection[0] # record id + current_parent_record_id = trace.current_selection[0] + loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()} + loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes) + + hypothesis_list = [ + trace.hist[loop_id2idx[loop_id]][0].hypothesis.hypothesis + for loop_id in loop_id_list + if trace.hist[loop_id2idx[loop_id]][1].decision == True + ][::-1] + sep = "->" + + hypothesis_chain_list = [] + accumulate = [] + for hyp in hypothesis_list: + accumulate.append(hyp) + hypothesis_chain_list.append(sep.join(accumulate)) + + last_text = [] + texts = [] + for name, data in hypothesis_dict.items(): + last_text.append(hypothesis_chain_list[-1] + sep + data.get("hypothesis", "Hypothesis not provided")) + texts.append(data.get("hypothesis", "Hypothesis not provided")) + + from .reward_inference import RewardModelInference + from transformers import AutoTokenizer + import os + logdir = DS_RD_SETTING.reward_model_path + base_model = DS_RD_SETTING.reward_base_model + + adapter_path = os.path.join(logdir, "lora_adapter") + reward_head_path = os.path.join(logdir, "reward_head.pt") + + tokenizer = AutoTokenizer.from_pretrained(base_model) + if not getattr(tokenizer, "pad_token", None): + tokenizer.pad_token = tokenizer.eos_token + + model = RewardModelInference( + base_model_path=base_model, + adapter_path=adapter_path, + reward_head_path=reward_head_path, + ).to("cuda") + + parent_rewards = model(hypothesis_chain_list,tokenizer) + currnet_rewards = model(last_text,tokenizer) + + avg_win_rate = [] + for re in currnet_rewards: + win_rate = [] + for p_re in parent_rewards: + current_win_rate = re/(re + p_re) + win_rate.append(current_win_rate) + avg_win_rate.append(np.mean(win_rate)) + max_idx = avg_win_rate.index(max(avg_win_rate)) + return texts[max_idx] + + def task_gen( self, component_desc: str, @@ -1503,11 +1574,15 @@ def gen( pickled_problem_name = None else: if DS_RD_SETTING.enable_reward_model_selection==True: - logger.info("Selecting hypothesis using reward model.") - selected_hypothesis_text = self.reward_model_select_hypothesis( - hypothesis_dict=hypothesis_dict, - problem_dict=all_problems, - ) + # logger.info("Selecting hypothesis using reward model.") + # selected_hypothesis_text = self.reward_model_select_hypothesis( + # hypothesis_dict=hypothesis_dict, + # problem_dict=all_problems, + # ) + logger.info("Selecting hypothesis using reward model. (avg win)") + + selected_hypothesis_text= self.reward_model_select_hypothesis_base_on_avg_win_rate(trace=trace, hypothesis_dict=hypothesis_dict) + # Find the problem name corresponding to the selected hypothesis text pickled_problem_name = None for problem_name, data in hypothesis_dict.items(): diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py index d9fedac5d..458a4f074 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py @@ -5,80 +5,96 @@ from transformers import AutoTokenizer, AutoModelForCausalLM from peft import PeftModel from rdagent.app.data_science.conf import DS_RD_SETTING +from rdagent.app.data_science.conf import DS_RD_SETTING + + # ===================== # Reward Model Wrapper # ===================== class RewardModelInference(nn.Module): - def __init__(self, base_model_path, adapter_path, reward_head_path, calib_path=None, use_bf16=False): + def __init__(self, base_model_name, adapter_path, reward_head_path, device="cuda"): super().__init__() - dtype = torch.bfloat16 if use_bf16 else torch.float16 - self.model = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=dtype) - self.model = PeftModel.from_pretrained(self.model, adapter_path) - self.model.eval() - - # hidden size - hs = getattr(self.model.config, "hidden_size", - getattr(self.model.config, "n_embd", - getattr(self.model.config, "d_model", None))) + self.device = device + self.base = AutoModelForCausalLM.from_pretrained(base_model_name) + self.base = PeftModel.from_pretrained(self.base, adapter_path) + if hasattr(self.base, "gradient_checkpointing_enable"): + self.base.gradient_checkpointing_enable() + if hasattr(self.base.config, "use_cache"): + self.base.config.use_cache = False + hs = getattr(self.base.config, "hidden_size", + getattr(self.base.config, "n_embd", + getattr(self.base.config, "d_model", None))) if hs is None: - hs = self.model.transformer.wte.embedding_dim - - # reward head - self.reward_head = nn.Linear(hs, 1) - - state_dict = torch.load(reward_head_path, map_location="cpu", weights_only=True) - self.reward_head.load_state_dict(state_dict) - self.reward_head = self.reward_head.to(dtype=self.model.dtype) - self.reward_head.eval() + hs = self.base.get_input_embeddings().embedding_dim - # load calibration parameters - self.calib = {"a": 1.0, "b": 0.0, "tau": 1.0} - if calib_path and os.path.exists(calib_path): - with open(calib_path, "r", encoding="utf-8") as f: - self.calib = json.load(f) + self.reward_head = nn.Linear(hs, 1).to(device) + self.reward_head.load_state_dict(torch.load(reward_head_path, map_location=device)) - # ✅ 打印调试信息,确认精度一致 - print(f"[INFO] Model dtype: {self.model.dtype}, Reward head dtype: {next(self.reward_head.parameters()).dtype}") + @staticmethod + def pool_last_nonpad(last_hidden: torch.Tensor, attn_mask: torch.Tensor) -> torch.Tensor: + lengths = attn_mask.sum(dim=1) - 1 + lengths = lengths.clamp(min=0) + idx = lengths.view(-1, 1, 1).expand(-1, 1, last_hidden.size(-1)) + return last_hidden.gather(1, idx).squeeze(1) - @torch.no_grad() def forward(self, input_ids, attention_mask): - out = self.model( - input_ids=input_ids, - attention_mask=attention_mask, + out = self.base( + input_ids=input_ids.to(self.device), + attention_mask=attention_mask.to(self.device), output_hidden_states=True, - use_cache=False, + use_cache=False ) last_hidden = out.hidden_states[-1] - lengths = attention_mask.sum(dim=1) - 1 - lengths = lengths.clamp(min=0) - idx = lengths.view(-1, 1, 1).expand(-1, 1, last_hidden.size(-1)) - pooled = last_hidden.gather(1, idx).squeeze(1) + pooled = self.pool_last_nonpad(last_hidden, attention_mask) reward = self.reward_head(pooled).squeeze(-1) return reward def compute_reward(self, texts, tokenizer, system_prompt=None, device="cuda"): - if system_prompt is None: - system_prompt = ( - "You are an experienced data science competition judge. " - "Evaluate the quality, effectiveness, and innovation of the proposed solutions." + if system_prompt is not None: + self.system_prompt = system_prompt + elif not hasattr(self, "system_prompt"): + self.system_prompt = ( + "You are a senior data science competition judge and solution expert.\n" + "Your task is to evaluate the quality, reasoning progression, and innovation of hypothesis chains.\n" + "A hypothesis chain shows iterative improvement of solutions.\n" + "You should assess:\n" + "1) reasoning correctness and consistency across steps,\n" + "2) improvement and refinement through the chain,\n" + "3) final hypothesis quality and practicality.\n" + "Be strict and fair. Provide expert-level insight." ) - inputs = [f"{system_prompt} Solution: {t}{tokenizer.eos_token}" for t in texts] + inputs = [] + for s in texts: + prompt = ( + f"{self.system_prompt}\n\n" + "Hypothesis Chain (each step separated by '->'):\n" + f"{s}\n\n" + "\n" + "Analyze the evolution of hypotheses, step-by-step, identifying strengths, weaknesses, and logical progression.\n" + "Focus on clarity, correctness, and improvement.\n" + "Make sure to consider the chain direction from earliest to latest.\n" + "\n\n" + "Final Evaluation:\n" + ) + inputs.append(prompt) enc = tokenizer( inputs, truncation=True, padding=True, - max_length=1024, + max_length=DS_RD_SETTING.max_length, return_tensors="pt" - ).to(device) + ) + + enc = {k: v.to(device) for k, v in enc.items()} rewards = self.forward(enc["input_ids"], enc["attention_mask"]) - # Apply calibration - rewards = self.calib["a"] * rewards + self.calib["b"] - return rewards.cpu().exp().tolist() -# ===================== -# Example Usage -# ===================== + return torch.exp(rewards).cpu().tolist() + + + + + From 98ca9cf455bf87bb9fc2643bf3ef6fd662dc4532 Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Mon, 17 Nov 2025 10:11:56 +0000 Subject: [PATCH 03/29] add path online --- rdagent/app/data_science/conf.py | 4 +++- rdagent/scenarios/data_science/proposal/exp_gen/proposal.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index 88cf1e693..b8dc589e2 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -201,7 +201,9 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): enable_reward_model_selection: bool = True """Enable reward model based hypothesis selection.""" - reward_model_path: str = "/data/userdata/v-lijingyuan/last_run_2" + reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" + + #"/data/userdata/v-lijingyuan/last_run_2" """The path to the reward model for hypothesis selection.""" reward_base_model: str = "Qwen/Qwen3-0.6B" diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index 432a2ecbd..7c62e4b13 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -1268,6 +1268,7 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_ adapter_path=adapter_path, reward_head_path=reward_head_path, ).to("cuda") + model.eval() parent_rewards = model(hypothesis_chain_list,tokenizer) currnet_rewards = model(last_text,tokenizer) From 7d3dc7f40bd30990bc0f9995f5abb74e8858a051 Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Mon, 17 Nov 2025 11:12:09 +0000 Subject: [PATCH 04/29] fix bug --- rdagent/app/data_science/conf.py | 6 +-- .../data_science/proposal/exp_gen/proposal.py | 46 +++++++++++++--- .../proposal/exp_gen/reward_inference.py | 54 +++++++++++++++++-- 3 files changed, 91 insertions(+), 15 deletions(-) diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index b8dc589e2..d15d47bac 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -201,15 +201,15 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): enable_reward_model_selection: bool = True """Enable reward model based hypothesis selection.""" - reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" - + reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" + #"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/userdata/v-lijingyuan/last_run_2" """The path to the reward model for hypothesis selection.""" reward_base_model: str = "Qwen/Qwen3-0.6B" """ Backbone of the reward model""" - max_length = 2200 + max_length : int = 2200 """ max_length of the reward model""" diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index 7c62e4b13..c18811646 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -1182,7 +1182,7 @@ def hypothesis_rank( appendix=hypothesis_dict[max_score_problem_name].get("appendix", None), ) - def reward_model_select_hypothesis(self, hypothesis_dict: dict, problem_dict: dict) -> Tuple[str, DSHypothesis]: + def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_dict: dict) -> Tuple[str, DSHypothesis]: """ Select hypothesis based on reward model scores. """ @@ -1190,7 +1190,7 @@ def reward_model_select_hypothesis(self, hypothesis_dict: dict, problem_dict: di from transformers import AutoTokenizer import os logdir = DS_RD_SETTING.reward_model_path - base_model = "gpt2" + base_model = DS_RD_SETTING.reward_base_model adapter_path = os.path.join(logdir, "lora_adapter") reward_head_path = os.path.join(logdir, "reward_head.pt") calib_path = os.path.join(logdir, "calib.json") @@ -1203,11 +1203,43 @@ def reward_model_select_hypothesis(self, hypothesis_dict: dict, problem_dict: di base_model_path=base_model, adapter_path=adapter_path, reward_head_path=reward_head_path, - calib_path=calib_path, ).to("cuda") + model.eval() + + parent_nodes = {} + for node in range(len(trace.hist)): + parents = trace.get_parents(node) + parent_nodes[node] = parents[-2] if len(parents) > 1 else None + # FIXME: add the convert logic to method in trace + if hasattr(trace, "idx2loop_id"): + parent_nodes = { + trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items() + } + #if trace.current_selection: + # current_parent_record_id = trace.current_selection[0] # record id + current_parent_record_id = trace.current_selection[0] + loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()} + loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes) + + hypothesis_list = [ + trace.hist[loop_id2idx[loop_id]][0].hypothesis.hypothesis + for loop_id in loop_id_list + if trace.hist[loop_id2idx[loop_id]][1].decision == True + ][::-1] + sep = "->" + + hypothesis_chain_list = [] + accumulate = [] + for hyp in hypothesis_list: + accumulate.append(hyp) + hypothesis_chain_list.append(sep.join(accumulate)) + + last_text = [] texts = [] for name, data in hypothesis_dict.items(): + last_text.append(hypothesis_chain_list[-1] + sep + data.get("hypothesis", "Hypothesis not provided")) texts.append(data.get("hypothesis", "Hypothesis not provided")) + rewards = model.compute_reward(texts, tokenizer) max_idx = rewards.index(max(rewards)) return texts[max_idx] @@ -1264,14 +1296,14 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_ tokenizer.pad_token = tokenizer.eos_token model = RewardModelInference( - base_model_path=base_model, + base_model_name=base_model, adapter_path=adapter_path, reward_head_path=reward_head_path, ).to("cuda") model.eval() - parent_rewards = model(hypothesis_chain_list,tokenizer) - currnet_rewards = model(last_text,tokenizer) + parent_rewards = model.compute_reward(hypothesis_chain_list,tokenizer) + currnet_rewards = model.compute_reward(last_text,tokenizer) avg_win_rate = [] for re in currnet_rewards: @@ -1574,7 +1606,7 @@ def gen( ) pickled_problem_name = None else: - if DS_RD_SETTING.enable_reward_model_selection==True: + if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection: # logger.info("Selecting hypothesis using reward model.") # selected_hypothesis_text = self.reward_model_select_hypothesis( # hypothesis_dict=hypothesis_dict, diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py index 458a4f074..872f05adf 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py @@ -93,8 +93,52 @@ def compute_reward(self, texts, tokenizer, system_prompt=None, device="cuda"): return torch.exp(rewards).cpu().tolist() - - - - - + # @torch.no_grad() + # def compute_reward( + # self, + # texts: list[str], + # tokenizer, + # batch_size: int = 1, + # ) -> list[float]: + # """ + # 直接对字符串列表计算 reward。 + # 自动 tokenizer、batch、GPU 支持 + # """ + # if device is None: + # device = self.device + + # rewards_all = [] + + # # 分 batch 处理 + # for i in range(0, len(texts), batch_size): + # batch_texts = texts[i:i+batch_size] + # # 构建 prompt + # batch_prompts = [ + # ( + # "You are a senior data science competition judge and solution expert.\n" + # "Your task is to evaluate the quality, reasoning progression, and innovation of hypothesis chains.\n" + # "Hypothesis Chain (each step separated by '->'):\n" + # f"{s}\n\n" + # "\n" + # "Analyze the evolution of hypotheses, step-by-step, identifying strengths, weaknesses, and logical progression.\n" + # "Focus on clarity, correctness, and improvement.\n" + # "Make sure to consider the chain direction from earliest to latest.\n" + # "\n\n" + # "Final Evaluation:\n" + # ) + # for s in batch_texts + # ] + + # enc = tokenizer( + # batch_prompts, + # truncation=True, + # padding=True, + # max_length=DS_RD_SETTING.max_length, + # return_tensors="pt" + # ) + # enc = {k: v.to(device) for k, v in enc.items()} + + # rewards = self.forward(enc["input_ids"], enc["attention_mask"]) + # rewards_all.extend(torch.exp(rewards).cpu().tolist()) + + # return rewards_all From 3d24be04b75b1865a05c156af76e348f82499056 Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Mon, 17 Nov 2025 14:57:26 +0000 Subject: [PATCH 05/29] add install transformers --- pyproject.toml | 1 + requirements/reward.txt | 3 +++ 2 files changed, 4 insertions(+) create mode 100644 requirements/reward.txt diff --git a/pyproject.toml b/pyproject.toml index 184f517d0..135b61cbd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -113,6 +113,7 @@ lint = {file = ["requirements/lint.txt"]} package = {file = ["requirements/package.txt"]} test = {file = ["requirements/test.txt"]} torch = {file = ["requirements/torch.txt"]} # some agent algorithms need torch. pip install rdagent[torch] +torch = {file = ["requirements/reward.txt"]} [tool.setuptools_scm] local_scheme = "no-local-version" diff --git a/requirements/reward.txt b/requirements/reward.txt new file mode 100644 index 000000000..89e792420 --- /dev/null +++ b/requirements/reward.txt @@ -0,0 +1,3 @@ +torch +transformers==4.53.1 +peft==0.6.1 \ No newline at end of file From 62de6d10f2d983c0bff9953c8c7f66375d6e0aac Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Tue, 18 Nov 2025 02:24:49 +0000 Subject: [PATCH 06/29] add --- requirements/test.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/test.txt b/requirements/test.txt index 4b4fbc0b8..441c1bfab 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,3 +1,4 @@ # Requirements for test. coverage pytest +pydantic_ai \ No newline at end of file From 0453af1f12561355827611ffad4badc36de9a77e Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Tue, 18 Nov 2025 07:29:47 +0000 Subject: [PATCH 07/29] fix bug remove torch --- pyproject.toml | 1 + requirements/reward.txt | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 135b61cbd..c6652ebdc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -123,3 +123,4 @@ version_scheme = "guess-next-dev" all = true in_place = true trailing_comma_inline_array = true +c2defc195e9665a86ad7614f14f0fb8ca4f248f4 \ No newline at end of file diff --git a/requirements/reward.txt b/requirements/reward.txt index 89e792420..df9afb2a3 100644 --- a/requirements/reward.txt +++ b/requirements/reward.txt @@ -1,3 +1,2 @@ -torch transformers==4.53.1 peft==0.6.1 \ No newline at end of file From 9111b0cd367f397e2c0694fb5dc3c91c9ac85ff6 Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Tue, 18 Nov 2025 07:30:19 +0000 Subject: [PATCH 08/29] fix bug --- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c6652ebdc..3232a4d99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -122,5 +122,4 @@ version_scheme = "guess-next-dev" [tool.tomlsort] all = true in_place = true -trailing_comma_inline_array = true -c2defc195e9665a86ad7614f14f0fb8ca4f248f4 \ No newline at end of file +trailing_comma_inline_array = true \ No newline at end of file From b45aac0ad801af365e8b5108e8f82eda4ee87cb0 Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Tue, 18 Nov 2025 07:33:57 +0000 Subject: [PATCH 09/29] fix bug 2 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3232a4d99..70cc31e14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -113,7 +113,7 @@ lint = {file = ["requirements/lint.txt"]} package = {file = ["requirements/package.txt"]} test = {file = ["requirements/test.txt"]} torch = {file = ["requirements/torch.txt"]} # some agent algorithms need torch. pip install rdagent[torch] -torch = {file = ["requirements/reward.txt"]} +reward = {file = ["requirements/reward.txt"]} [tool.setuptools_scm] local_scheme = "no-local-version" From fabcc846a260977b6f9af8536d39da60301d174d Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Tue, 18 Nov 2025 08:12:44 +0000 Subject: [PATCH 10/29] fix bug 3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a6787a88c..10d143b87 100644 --- a/Makefile +++ b/Makefile @@ -67,7 +67,7 @@ init-qlib-env: @source $$(conda info --base)/etc/profile.d/conda.sh && conda activate qlibRDAgent && which pip && pip install pyqlib && pip install ruamel-yaml==0.17.21 && pip install torch==2.1.1 && pip install catboost==0.24.3 && conda deactivate dev: - $(PIPRUN) pip install -e .[docs,lint,package,test] -c $(CONSTRAINTS_FILE) + $(PIPRUN) pip install -e .[docs,lint,package,test,torch,reward] -c $(CONSTRAINTS_FILE) $(PIPRUN) pip install -U kaggle if [ "$(CI)" != "true" ] && command -v pre-commit > /dev/null 2>&1; then pre-commit install --hook-type pre-push; fi From 6258915da3970e80039fa76c595f1f6a4eecd9bd Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Wed, 19 Nov 2025 07:57:15 +0000 Subject: [PATCH 11/29] fix pydantic-ai version --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 619b19fa8..abb51499a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -72,7 +72,7 @@ azureml-mlflow types-pytz # Agent -pydantic-ai-slim[mcp,openai,prefect] +pydantic-ai-slim[mcp,openai,prefect]==1.9.1 nest-asyncio -prefect \ No newline at end of file +prefect==3.5.0 \ No newline at end of file From a62870c83bd1f983f23c1e765a4408a7feb75dec Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Wed, 19 Nov 2025 10:16:01 +0000 Subject: [PATCH 12/29] fix pydantic-ai-bug :( --- rdagent/scenarios/data_science/proposal/exp_gen/proposal.py | 2 +- requirements.txt | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index c18811646..ae6a2bfed 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -9,7 +9,6 @@ from pydantic import BaseModel, Field from rdagent.app.data_science.conf import DS_RD_SETTING -from rdagent.components.agent.rag import Agent as RAGAgent from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.components.coder.data_science.model.exp import ModelTask @@ -648,6 +647,7 @@ def hypothesis_gen( # knowledge retrieval if DS_RD_SETTING.enable_research_rag: + from rdagent.components.agent.rag import Agent as RAGAgent rag_agent = RAGAgent( system_prompt="""You are a helpful assistant. You help users retrieve relevant knowledge from community discussions and public code.""" diff --git a/requirements.txt b/requirements.txt index abb51499a..0c4e5ffa4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -72,7 +72,6 @@ azureml-mlflow types-pytz # Agent -pydantic-ai-slim[mcp,openai,prefect]==1.9.1 nest-asyncio prefect==3.5.0 \ No newline at end of file From 1d7ee2c7d30ec64a12fb50d31a18a5a09f8c8980 Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Wed, 19 Nov 2025 16:50:13 +0000 Subject: [PATCH 13/29] fix bug 4 --- rdagent/components/coder/data_science/pipeline/eval.py | 2 +- requirements/test.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rdagent/components/coder/data_science/pipeline/eval.py b/rdagent/components/coder/data_science/pipeline/eval.py index f296d986f..a5ee17744 100644 --- a/rdagent/components/coder/data_science/pipeline/eval.py +++ b/rdagent/components/coder/data_science/pipeline/eval.py @@ -8,7 +8,6 @@ import pandas as pd from rdagent.app.data_science.conf import DS_RD_SETTING -from rdagent.components.agent.context7 import Agent as DocAgent from rdagent.components.coder.CoSTEER import CoSTEERMultiFeedback from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, @@ -307,6 +306,7 @@ def evaluate( do_documentation_search = enable_mcp_documentation_search and wfb.requires_documentation_search if do_documentation_search: + from rdagent.components.agent.context7 import Agent as DocAgent # Use MCPAgent for clean, user-friendly interface try: # Create agent targeting Context7 service - model config comes from mcp_config.json diff --git a/requirements/test.txt b/requirements/test.txt index 441c1bfab..30699b666 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,4 +1,4 @@ # Requirements for test. coverage pytest -pydantic_ai \ No newline at end of file +#pydantic_ai \ No newline at end of file From f83cfb5db3bf31c26bf8e46345bad9b85c3255ef Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Thu, 20 Nov 2025 08:25:10 +0000 Subject: [PATCH 14/29] add two type reward model --- Makefile | 1 + rdagent/app/data_science/conf.py | 1 + .../data_science/proposal/exp_gen/proposal.py | 11 ++++++++--- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 10d143b87..16f10d24b 100644 --- a/Makefile +++ b/Makefile @@ -67,6 +67,7 @@ init-qlib-env: @source $$(conda info --base)/etc/profile.d/conda.sh && conda activate qlibRDAgent && which pip && pip install pyqlib && pip install ruamel-yaml==0.17.21 && pip install torch==2.1.1 && pip install catboost==0.24.3 && conda deactivate dev: + $(PIPRUN) pip install -U pip setuptools wheel $(PIPRUN) pip install -e .[docs,lint,package,test,torch,reward] -c $(CONSTRAINTS_FILE) $(PIPRUN) pip install -U kaggle if [ "$(CI)" != "true" ] && command -v pre-commit > /dev/null 2>&1; then pre-commit install --hook-type pre-push; fi diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index d15d47bac..38a791c43 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -212,6 +212,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): max_length : int = 2200 """ max_length of the reward model""" + reward_select_type: int = 1 DS_RD_SETTING = DataScienceBasePropSetting() diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index ae6a2bfed..72c5d20de 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -1613,9 +1613,14 @@ def gen( # problem_dict=all_problems, # ) logger.info("Selecting hypothesis using reward model. (avg win)") - - selected_hypothesis_text= self.reward_model_select_hypothesis_base_on_avg_win_rate(trace=trace, hypothesis_dict=hypothesis_dict) - + if DS_RD_SETTING.reward_select_type==1: + selected_hypothesis_text= self.reward_model_select_hypothesis_base_on_avg_win_rate(trace=trace, hypothesis_dict=hypothesis_dict) + elif DS_RD_SETTING.reward_select_type==2: + selected_hypothesis_text = self.reward_model_select_hypothesis( + trace=trace, + hypothesis_dict=hypothesis_dict, + problem_dict=all_problems, + ) # Find the problem name corresponding to the selected hypothesis text pickled_problem_name = None for problem_name, data in hypothesis_dict.items(): From 78e6b1ad3a3fc62ba621dae4ba0334d08cc8e765 Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Thu, 20 Nov 2025 09:32:43 +0000 Subject: [PATCH 15/29] fix bug 5 --- rdagent/scenarios/data_science/proposal/exp_gen/proposal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index 72c5d20de..6794bb21c 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -1200,7 +1200,7 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di tokenizer.pad_token = tokenizer.eos_token model = RewardModelInference( - base_model_path=base_model, + base_model_name=base_model, adapter_path=adapter_path, reward_head_path=reward_head_path, ).to("cuda") From 9356071cc93b80394e31e32e617b372ba60e2239 Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Fri, 21 Nov 2025 10:45:56 +0000 Subject: [PATCH 16/29] fix bug 6 --- rdagent/scenarios/data_science/proposal/exp_gen/proposal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index 6794bb21c..c993e373d 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -1606,7 +1606,7 @@ def gen( ) pickled_problem_name = None else: - if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection: + if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection and not trace.is_selection_new_tree(): # logger.info("Selecting hypothesis using reward model.") # selected_hypothesis_text = self.reward_model_select_hypothesis( # hypothesis_dict=hypothesis_dict, From 1b28ffcb41bd7f1a45fc55100e8bd603e334240e Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Mon, 24 Nov 2025 08:38:31 +0000 Subject: [PATCH 17/29] add more competition --- rdagent/app/data_science/conf.py | 14 ++++- .../data_science/proposal/exp_gen/proposal.py | 20 +++++-- .../proposal/exp_gen/reward_inference.py | 56 ++----------------- 3 files changed, 31 insertions(+), 59 deletions(-) diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index 38a791c43..21f8b1280 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -201,15 +201,25 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): enable_reward_model_selection: bool = True """Enable reward model based hypothesis selection.""" - reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" + reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_3" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" + #"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/userdata/v-lijingyuan/last_run_2" + + #reward_model_path: str = "/data/userdata/v-lijingyuan/last_run_3" + + """The path to the reward model for hypothesis selection.""" + competition_mapping_path: str = "/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/comp_to_scen.json" + + #competition_mapping_path: str = "/data/userdata/v-lijingyuan/dpo/comp_to_scen.json" + + reward_base_model: str = "Qwen/Qwen3-0.6B" """ Backbone of the reward model""" - max_length : int = 2200 + max_length : int = 2300 """ max_length of the reward model""" reward_select_type: int = 1 diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index c993e373d..075dad165 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -1240,7 +1240,12 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di last_text.append(hypothesis_chain_list[-1] + sep + data.get("hypothesis", "Hypothesis not provided")) texts.append(data.get("hypothesis", "Hypothesis not provided")) - rewards = model.compute_reward(texts, tokenizer) + comp_dict_path = DS_RD_SETTING.competition_mapping_path + with open(comp_dict_path, "r") as f: + comp_dict = json.load(f) + competition = trace.scen.competition + comp_description = comp_dict[competition] + rewards = model.compute_reward(texts, tokenizer,comp_description) max_idx = rewards.index(max(rewards)) return texts[max_idx] @@ -1301,9 +1306,13 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_ reward_head_path=reward_head_path, ).to("cuda") model.eval() - - parent_rewards = model.compute_reward(hypothesis_chain_list,tokenizer) - currnet_rewards = model.compute_reward(last_text,tokenizer) + comp_dict_path = DS_RD_SETTING.competition_mapping_path + with open(comp_dict_path, "r") as f: + comp_dict = json.load(f) + competition = trace.scen.competition + comp_description = comp_dict[competition] + parent_rewards = model.compute_reward(hypothesis_chain_list,tokenizer,comp_description) + currnet_rewards = model.compute_reward(last_text,tokenizer,comp_description) avg_win_rate = [] for re in currnet_rewards: @@ -1606,7 +1615,8 @@ def gen( ) pickled_problem_name = None else: - if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection and not trace.is_selection_new_tree(): + sota_flag = (hasattr(trace, "sota_exp_to_submit") and trace.sota_exp_to_submit is not None) + if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection and sota_flag: # logger.info("Selecting hypothesis using reward model.") # selected_hypothesis_text = self.reward_model_select_hypothesis( # hypothesis_dict=hypothesis_dict, diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py index 872f05adf..fa70c282b 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py @@ -49,7 +49,7 @@ def forward(self, input_ids, attention_mask): reward = self.reward_head(pooled).squeeze(-1) return reward - def compute_reward(self, texts, tokenizer, system_prompt=None, device="cuda"): + def compute_reward(self, texts, tokenizer,comp_description, system_prompt=None, device="cuda"): if system_prompt is not None: self.system_prompt = system_prompt elif not hasattr(self, "system_prompt"): @@ -68,6 +68,7 @@ def compute_reward(self, texts, tokenizer, system_prompt=None, device="cuda"): for s in texts: prompt = ( f"{self.system_prompt}\n\n" + f"Competition description:\n{comp_description}\n\n" "Hypothesis Chain (each step separated by '->'):\n" f"{s}\n\n" "\n" @@ -77,6 +78,7 @@ def compute_reward(self, texts, tokenizer, system_prompt=None, device="cuda"): "\n\n" "Final Evaluation:\n" ) + inputs.append(prompt) enc = tokenizer( @@ -91,54 +93,4 @@ def compute_reward(self, texts, tokenizer, system_prompt=None, device="cuda"): rewards = self.forward(enc["input_ids"], enc["attention_mask"]) - return torch.exp(rewards).cpu().tolist() - - # @torch.no_grad() - # def compute_reward( - # self, - # texts: list[str], - # tokenizer, - # batch_size: int = 1, - # ) -> list[float]: - # """ - # 直接对字符串列表计算 reward。 - # 自动 tokenizer、batch、GPU 支持 - # """ - # if device is None: - # device = self.device - - # rewards_all = [] - - # # 分 batch 处理 - # for i in range(0, len(texts), batch_size): - # batch_texts = texts[i:i+batch_size] - # # 构建 prompt - # batch_prompts = [ - # ( - # "You are a senior data science competition judge and solution expert.\n" - # "Your task is to evaluate the quality, reasoning progression, and innovation of hypothesis chains.\n" - # "Hypothesis Chain (each step separated by '->'):\n" - # f"{s}\n\n" - # "\n" - # "Analyze the evolution of hypotheses, step-by-step, identifying strengths, weaknesses, and logical progression.\n" - # "Focus on clarity, correctness, and improvement.\n" - # "Make sure to consider the chain direction from earliest to latest.\n" - # "\n\n" - # "Final Evaluation:\n" - # ) - # for s in batch_texts - # ] - - # enc = tokenizer( - # batch_prompts, - # truncation=True, - # padding=True, - # max_length=DS_RD_SETTING.max_length, - # return_tensors="pt" - # ) - # enc = {k: v.to(device) for k, v in enc.items()} - - # rewards = self.forward(enc["input_ids"], enc["attention_mask"]) - # rewards_all.extend(torch.exp(rewards).cpu().tolist()) - - # return rewards_all + return torch.exp(rewards).cpu().tolist() \ No newline at end of file From 4784b75fec16838026f6e1b6b328f02c404aa36d Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Mon, 24 Nov 2025 09:31:34 +0000 Subject: [PATCH 18/29] fix bug 7 --- rdagent/scenarios/data_science/proposal/exp_gen/proposal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index 075dad165..fbeaaf7a2 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -1616,7 +1616,7 @@ def gen( pickled_problem_name = None else: sota_flag = (hasattr(trace, "sota_exp_to_submit") and trace.sota_exp_to_submit is not None) - if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection and sota_flag: + if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection and sota_flag and not trace.is_selection_new_tree(): # logger.info("Selecting hypothesis using reward model.") # selected_hypothesis_text = self.reward_model_select_hypothesis( # hypothesis_dict=hypothesis_dict, From d20781d44ab71c58b178f51d0d92d09e0e2afe3f Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Mon, 24 Nov 2025 11:17:08 +0000 Subject: [PATCH 19/29] fix bug 8 --- .../data_science/proposal/exp_gen/proposal.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index fbeaaf7a2..c771b0191 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -1215,9 +1215,12 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di parent_nodes = { trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items() } - #if trace.current_selection: - # current_parent_record_id = trace.current_selection[0] # record id - current_parent_record_id = trace.current_selection[0] + if trace.current_selection: + current_parent_record_id = trace.current_selection[0] # record id + else: + return texts[0] + # + #current_parent_record_id = trace.current_selection[0] loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()} loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes) @@ -1262,9 +1265,11 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_ parent_nodes = { trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items() } - #if trace.current_selection: - # current_parent_record_id = trace.current_selection[0] # record id - current_parent_record_id = trace.current_selection[0] + if trace.current_selection: + current_parent_record_id = trace.current_selection[0] # record id + else: + return texts[0] + loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()} loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes) @@ -1616,7 +1621,7 @@ def gen( pickled_problem_name = None else: sota_flag = (hasattr(trace, "sota_exp_to_submit") and trace.sota_exp_to_submit is not None) - if DS_RD_SETTING.enable_reward_model_selection==True and trace.current_selection and sota_flag and not trace.is_selection_new_tree(): + if DS_RD_SETTING.enable_reward_model_selection==True and sota_flag and not trace.is_selection_new_tree(): # logger.info("Selecting hypothesis using reward model.") # selected_hypothesis_text = self.reward_model_select_hypothesis( # hypothesis_dict=hypothesis_dict, From 89a3a610c4b786b365e89fd5f38b0b1bb1a3f511 Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Mon, 24 Nov 2025 16:22:47 +0000 Subject: [PATCH 20/29] fix bug 9 --- .../scenarios/data_science/proposal/exp_gen/proposal.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index c771b0191..45c3b1f63 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -1216,7 +1216,9 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items() } if trace.current_selection: - current_parent_record_id = trace.current_selection[0] # record id + current_parent_record_id = trace.current_selection[0] # record id + if current_parent_record_id == -1: + return texts[0] else: return texts[0] # @@ -1266,7 +1268,9 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_ trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items() } if trace.current_selection: - current_parent_record_id = trace.current_selection[0] # record id + current_parent_record_id = trace.current_selection[0] # record id + if current_parent_record_id == -1: + return texts[0] else: return texts[0] From 0fd51dbed37a9f31fbd0d426916e0107cf644a26 Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Tue, 25 Nov 2025 09:20:27 +0000 Subject: [PATCH 21/29] fix bug 10 --- rdagent/scenarios/data_science/proposal/exp_gen/proposal.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index 45c3b1f63..03d07be94 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -1239,6 +1239,9 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di accumulate.append(hyp) hypothesis_chain_list.append(sep.join(accumulate)) + if not hypothesis_chain_list: + return texts[0] + last_text = [] texts = [] for name, data in hypothesis_dict.items(): @@ -1290,6 +1293,9 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_ accumulate.append(hyp) hypothesis_chain_list.append(sep.join(accumulate)) + if not hypothesis_chain_list: + return texts[0] + last_text = [] texts = [] for name, data in hypothesis_dict.items(): From d0aa7324fa4f2f37cb74c7df62d9203d4cfbfa37 Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Tue, 25 Nov 2025 10:43:01 +0000 Subject: [PATCH 22/29] fix bug 11 --- .../data_science/proposal/exp_gen/proposal.py | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index 03d07be94..b4f49f76d 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -1215,14 +1215,12 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di parent_nodes = { trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items() } - if trace.current_selection: - current_parent_record_id = trace.current_selection[0] # record id - if current_parent_record_id == -1: - return texts[0] - else: - return texts[0] - # - #current_parent_record_id = trace.current_selection[0] + + if not trace.current_selection or trace.current_selection[0] == -1: + first_text = next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided") + return first_text + + current_parent_record_id = trace.current_selection[0] loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()} loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes) @@ -1240,8 +1238,9 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di hypothesis_chain_list.append(sep.join(accumulate)) if not hypothesis_chain_list: - return texts[0] - + first_text = next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided") + return first_text + last_text = [] texts = [] for name, data in hypothesis_dict.items(): @@ -1270,12 +1269,11 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_ parent_nodes = { trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items() } - if trace.current_selection: - current_parent_record_id = trace.current_selection[0] # record id - if current_parent_record_id == -1: - return texts[0] - else: - return texts[0] + + if not trace.current_selection or trace.current_selection[0] == -1: + first_text = next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided") + return first_text + current_parent_record_id = trace.current_selection[0] loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()} loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes) @@ -1294,7 +1292,8 @@ def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_ hypothesis_chain_list.append(sep.join(accumulate)) if not hypothesis_chain_list: - return texts[0] + first_text = next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided") + return first_text last_text = [] texts = [] From 35c60d901535cb675870dda22f13198aafa58bbd Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Mon, 8 Dec 2025 10:09:38 +0000 Subject: [PATCH 23/29] add new ckpt --- rdagent/app/data_science/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index 21f8b1280..242127685 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -201,7 +201,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): enable_reward_model_selection: bool = True """Enable reward model based hypothesis selection.""" - reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_3" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" + reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_5" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/userdata/v-lijingyuan/last_run_2" From f8904d97396968085f078d205107c32a688002e7 Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Thu, 18 Dec 2025 10:23:29 +0000 Subject: [PATCH 24/29] add ckpt 6 --- rdagent/app/data_science/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index 242127685..4406b61ab 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -201,7 +201,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): enable_reward_model_selection: bool = True """Enable reward model based hypothesis selection.""" - reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_5" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" + reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_6" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/userdata/v-lijingyuan/last_run_2" From 38b9736590fdc91430f02bc6259ffa14fe588bc4 Mon Sep 17 00:00:00 2001 From: Star dust <93254841+jingyuanlm@users.noreply.github.com> Date: Fri, 9 Jan 2026 15:33:10 +0800 Subject: [PATCH 25/29] Evo (#1322) * add ckpt 7 * add ckpt 8 --- rdagent/app/data_science/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index 4406b61ab..2c792372e 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -201,7 +201,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): enable_reward_model_selection: bool = True """Enable reward model based hypothesis selection.""" - reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_6" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" + reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_8" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/userdata/v-lijingyuan/last_run_2" @@ -216,7 +216,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): #competition_mapping_path: str = "/data/userdata/v-lijingyuan/dpo/comp_to_scen.json" - reward_base_model: str = "Qwen/Qwen3-0.6B" + reward_base_model: str = "Qwen/Qwen3-4B" """ Backbone of the reward model""" max_length : int = 2300 From aa8637a10ca82f60be4dccc9b880d1cd9de7307d Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Fri, 9 Jan 2026 07:50:51 +0000 Subject: [PATCH 26/29] ckpt 7 --- rdagent/app/data_science/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index 2c792372e..d714255d6 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -201,7 +201,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): enable_reward_model_selection: bool = True """Enable reward model based hypothesis selection.""" - reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_8" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" + reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_7" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/userdata/v-lijingyuan/last_run_2" From b6cd8133c43aec0d29170328f4aab9e77c7be1e8 Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Tue, 13 Jan 2026 07:36:06 +0000 Subject: [PATCH 27/29] fix cuda problem/ckpt7 --- .../scenarios/data_science/proposal/exp_gen/proposal.py | 8 ++++---- .../data_science/proposal/exp_gen/reward_inference.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index b4f49f76d..f40d6bdc8 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -40,7 +40,7 @@ from rdagent.utils.agent.tpl import T from rdagent.utils.repo.diff import generate_diff_from_dict from rdagent.utils.workflow import wait_retry - +import torch _COMPONENT_META: Dict[str, Dict[str, Any]] = { "DataLoadSpec": { "target_name": "Data loader and specification generation", @@ -1198,12 +1198,12 @@ def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_di tokenizer = AutoTokenizer.from_pretrained(base_model) if not getattr(tokenizer, "pad_token", None): tokenizer.pad_token = tokenizer.eos_token - + device = torch.device("cuda:1") model = RewardModelInference( base_model_name=base_model, adapter_path=adapter_path, - reward_head_path=reward_head_path, - ).to("cuda") + reward_head_path=reward_head_path,device=device + ) model.eval() parent_nodes = {} diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py index fa70c282b..8910476fb 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py @@ -49,7 +49,7 @@ def forward(self, input_ids, attention_mask): reward = self.reward_head(pooled).squeeze(-1) return reward - def compute_reward(self, texts, tokenizer,comp_description, system_prompt=None, device="cuda"): + def compute_reward(self, texts, tokenizer,comp_description, system_prompt=None): if system_prompt is not None: self.system_prompt = system_prompt elif not hasattr(self, "system_prompt"): @@ -89,7 +89,7 @@ def compute_reward(self, texts, tokenizer,comp_description, system_prompt=None, return_tensors="pt" ) - enc = {k: v.to(device) for k, v in enc.items()} + enc = {k: v.to(self.device) for k, v in enc.items()} rewards = self.forward(enc["input_ids"], enc["attention_mask"]) From 1fc1a6e14ffa13f16eff09d460a2d173f404499b Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Wed, 14 Jan 2026 10:45:35 +0000 Subject: [PATCH 28/29] ckpt 7 fix bug --- .../proposal/exp_gen/reward_inference.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py index 8910476fb..ab33009df 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/reward_inference.py @@ -12,23 +12,33 @@ # Reward Model Wrapper # ===================== class RewardModelInference(nn.Module): - def __init__(self, base_model_name, adapter_path, reward_head_path, device="cuda"): + def __init__(self, base_model_name, adapter_path, reward_head_path, device="cuda:1"): super().__init__() - self.device = device + self.device = torch.device(device) + self.base = AutoModelForCausalLM.from_pretrained(base_model_name) self.base = PeftModel.from_pretrained(self.base, adapter_path) + self.base.to(self.device) + if hasattr(self.base, "gradient_checkpointing_enable"): self.base.gradient_checkpointing_enable() if hasattr(self.base.config, "use_cache"): self.base.config.use_cache = False - hs = getattr(self.base.config, "hidden_size", - getattr(self.base.config, "n_embd", - getattr(self.base.config, "d_model", None))) + + hs = getattr( + self.base.config, + "hidden_size", + getattr(self.base.config, "n_embd", + getattr(self.base.config, "d_model", None)) + ) if hs is None: hs = self.base.get_input_embeddings().embedding_dim - self.reward_head = nn.Linear(hs, 1).to(device) - self.reward_head.load_state_dict(torch.load(reward_head_path, map_location=device)) + self.reward_head = nn.Linear(hs, 1) + self.reward_head.load_state_dict( + torch.load(reward_head_path, map_location="cpu") + ) + self.reward_head.to(self.device) @staticmethod def pool_last_nonpad(last_hidden: torch.Tensor, attn_mask: torch.Tensor) -> torch.Tensor: From d344fd680fb91f14b04667fbc3c6c7f776910d1f Mon Sep 17 00:00:00 2001 From: jingyuanlm <842442862@qq.com> Date: Mon, 19 Jan 2026 08:20:14 +0000 Subject: [PATCH 29/29] ckpt 8 --- rdagent/app/data_science/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index d714255d6..2c792372e 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -201,7 +201,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): enable_reward_model_selection: bool = True """Enable reward model based hypothesis selection.""" - reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_7" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" + reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_8" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" #"/data/userdata/v-lijingyuan/last_run_2"