diff --git a/Makefile b/Makefile index a6787a88c..16f10d24b 100644 --- a/Makefile +++ b/Makefile @@ -67,7 +67,8 @@ init-qlib-env: @source $$(conda info --base)/etc/profile.d/conda.sh && conda activate qlibRDAgent && which pip && pip install pyqlib && pip install ruamel-yaml==0.17.21 && pip install torch==2.1.1 && pip install catboost==0.24.3 && conda deactivate dev: - $(PIPRUN) pip install -e .[docs,lint,package,test] -c $(CONSTRAINTS_FILE) + $(PIPRUN) pip install -U pip setuptools wheel + $(PIPRUN) pip install -e .[docs,lint,package,test,torch,reward] -c $(CONSTRAINTS_FILE) $(PIPRUN) pip install -U kaggle if [ "$(CI)" != "true" ] && command -v pre-commit > /dev/null 2>&1; then pre-commit install --hook-type pre-push; fi diff --git a/pyproject.toml b/pyproject.toml index 184f517d0..70cc31e14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -113,6 +113,7 @@ lint = {file = ["requirements/lint.txt"]} package = {file = ["requirements/package.txt"]} test = {file = ["requirements/test.txt"]} torch = {file = ["requirements/torch.txt"]} # some agent algorithms need torch. 
pip install rdagent[torch] +reward = {file = ["requirements/reward.txt"]} [tool.setuptools_scm] local_scheme = "no-local-version" @@ -121,4 +122,4 @@ version_scheme = "guess-next-dev" [tool.tomlsort] all = true in_place = true -trailing_comma_inline_array = true +trailing_comma_inline_array = true \ No newline at end of file diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py index 2d8ec6262..2c792372e 100644 --- a/rdagent/app/data_science/conf.py +++ b/rdagent/app/data_science/conf.py @@ -197,6 +197,33 @@ class DataScienceBasePropSetting(KaggleBasePropSetting): user_interaction_wait_seconds: int = 6000 # seconds to wait for user interaction user_interaction_mid_folder: Path = Path.cwd() / "git_ignore_folder" / "RD-Agent_user_interaction" + #### reward model related + enable_reward_model_selection: bool = True + """Enable reward model based hypothesis selection.""" + + reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_8" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" + + #"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2" + #"/data/userdata/v-lijingyuan/last_run_2" + + #reward_model_path: str = "/data/userdata/v-lijingyuan/last_run_3" + + + """The path to the reward model for hypothesis selection.""" + + competition_mapping_path: str = "/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/comp_to_scen.json" + + #competition_mapping_path: str = "/data/userdata/v-lijingyuan/dpo/comp_to_scen.json" + + + reward_base_model: str = "Qwen/Qwen3-4B" + """ Backbone of the reward model""" + + max_length : int = 2300 + """ max_length of the reward model""" + + reward_select_type: int = 1 + DS_RD_SETTING = DataScienceBasePropSetting() diff --git a/rdagent/components/coder/data_science/pipeline/eval.py b/rdagent/components/coder/data_science/pipeline/eval.py index f296d986f..a5ee17744 100644 --- 
a/rdagent/components/coder/data_science/pipeline/eval.py +++ b/rdagent/components/coder/data_science/pipeline/eval.py @@ -8,7 +8,6 @@ import pandas as pd from rdagent.app.data_science.conf import DS_RD_SETTING -from rdagent.components.agent.context7 import Agent as DocAgent from rdagent.components.coder.CoSTEER import CoSTEERMultiFeedback from rdagent.components.coder.CoSTEER.evaluators import ( CoSTEEREvaluator, @@ -307,6 +306,7 @@ def evaluate( do_documentation_search = enable_mcp_documentation_search and wfb.requires_documentation_search if do_documentation_search: + from rdagent.components.agent.context7 import Agent as DocAgent # Use MCPAgent for clean, user-friendly interface try: # Create agent targeting Context7 service - model config comes from mcp_config.json diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py index 8a0343f27..f40d6bdc8 100644 --- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py +++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py @@ -9,7 +9,6 @@ from pydantic import BaseModel, Field from rdagent.app.data_science.conf import DS_RD_SETTING -from rdagent.components.agent.rag import Agent as RAGAgent from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask from rdagent.components.coder.data_science.feature.exp import FeatureTask from rdagent.components.coder.data_science.model.exp import ModelTask @@ -41,7 +40,7 @@ from rdagent.utils.agent.tpl import T from rdagent.utils.repo.diff import generate_diff_from_dict from rdagent.utils.workflow import wait_retry - +import torch _COMPONENT_META: Dict[str, Dict[str, Any]] = { "DataLoadSpec": { "target_name": "Data loader and specification generation", @@ -313,7 +312,6 @@ def draft_exp_in_decomposition(scen: Scenario, trace: DSTrace) -> None | DSDraft else: return None - class DSProposalV1ExpGen(ExpGen): def gen( self, @@ -649,6 +647,7 @@ def hypothesis_gen( # knowledge 
retrieval if DS_RD_SETTING.enable_research_rag: + from rdagent.components.agent.rag import Agent as RAGAgent rag_agent = RAGAgent( system_prompt="""You are a helpful assistant. You help users retrieve relevant knowledge from community discussions and public code.""" @@ -1182,6 +1181,163 @@ def hypothesis_rank( problem_label=problem_dict.get("label", "FEEDBACK_PROBLEM"), appendix=hypothesis_dict[max_score_problem_name].get("appendix", None), ) + + def reward_model_select_hypothesis(self,trace, hypothesis_dict: dict, problem_dict: dict) -> Tuple[str, DSHypothesis]: + """ + Select hypothesis based on reward model scores. + """ + from .reward_inference import RewardModelInference + from transformers import AutoTokenizer + import os + logdir = DS_RD_SETTING.reward_model_path + base_model = DS_RD_SETTING.reward_base_model + adapter_path = os.path.join(logdir, "lora_adapter") + reward_head_path = os.path.join(logdir, "reward_head.pt") + calib_path = os.path.join(logdir, "calib.json") + + tokenizer = AutoTokenizer.from_pretrained(base_model) + if not getattr(tokenizer, "pad_token", None): + tokenizer.pad_token = tokenizer.eos_token + device = torch.device("cuda:1") + model = RewardModelInference( + base_model_name=base_model, + adapter_path=adapter_path, + reward_head_path=reward_head_path,device=device + ) + model.eval() + + parent_nodes = {} + for node in range(len(trace.hist)): + parents = trace.get_parents(node) + parent_nodes[node] = parents[-2] if len(parents) > 1 else None + # FIXME: add the convert logic to method in trace + if hasattr(trace, "idx2loop_id"): + parent_nodes = { + trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items() + } + + if not trace.current_selection or trace.current_selection[0] == -1: + first_text = next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided") + return first_text + + current_parent_record_id = trace.current_selection[0] + loop_id2idx = {v: k for k, v in 
trace.idx2loop_id.items()} + loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes) + + hypothesis_list = [ + trace.hist[loop_id2idx[loop_id]][0].hypothesis.hypothesis + for loop_id in loop_id_list + if trace.hist[loop_id2idx[loop_id]][1].decision == True + ][::-1] + sep = "->" + + hypothesis_chain_list = [] + accumulate = [] + for hyp in hypothesis_list: + accumulate.append(hyp) + hypothesis_chain_list.append(sep.join(accumulate)) + + if not hypothesis_chain_list: + first_text = next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided") + return first_text + + last_text = [] + texts = [] + for name, data in hypothesis_dict.items(): + last_text.append(hypothesis_chain_list[-1] + sep + data.get("hypothesis", "Hypothesis not provided")) + texts.append(data.get("hypothesis", "Hypothesis not provided")) + + comp_dict_path = DS_RD_SETTING.competition_mapping_path + with open(comp_dict_path, "r") as f: + comp_dict = json.load(f) + competition = trace.scen.competition + comp_description = comp_dict[competition] + rewards = model.compute_reward(texts, tokenizer,comp_description) + max_idx = rewards.index(max(rewards)) + return texts[max_idx] + + def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_dict): + """ + Select hypothesis based on avg win rate + """ + parent_nodes = {} + for node in range(len(trace.hist)): + parents = trace.get_parents(node) + parent_nodes[node] = parents[-2] if len(parents) > 1 else None + # FIXME: add the convert logic to method in trace + if hasattr(trace, "idx2loop_id"): + parent_nodes = { + trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items() + } + + if not trace.current_selection or trace.current_selection[0] == -1: + first_text = next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided") + return first_text + current_parent_record_id = trace.current_selection[0] + + loop_id2idx = {v: k for 
k, v in trace.idx2loop_id.items()} + loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes) + + hypothesis_list = [ + trace.hist[loop_id2idx[loop_id]][0].hypothesis.hypothesis + for loop_id in loop_id_list + if trace.hist[loop_id2idx[loop_id]][1].decision == True + ][::-1] + sep = "->" + + hypothesis_chain_list = [] + accumulate = [] + for hyp in hypothesis_list: + accumulate.append(hyp) + hypothesis_chain_list.append(sep.join(accumulate)) + + if not hypothesis_chain_list: + first_text = next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided") + return first_text + + last_text = [] + texts = [] + for name, data in hypothesis_dict.items(): + last_text.append(hypothesis_chain_list[-1] + sep + data.get("hypothesis", "Hypothesis not provided")) + texts.append(data.get("hypothesis", "Hypothesis not provided")) + + from .reward_inference import RewardModelInference + from transformers import AutoTokenizer + import os + logdir = DS_RD_SETTING.reward_model_path + base_model = DS_RD_SETTING.reward_base_model + + adapter_path = os.path.join(logdir, "lora_adapter") + reward_head_path = os.path.join(logdir, "reward_head.pt") + + tokenizer = AutoTokenizer.from_pretrained(base_model) + if not getattr(tokenizer, "pad_token", None): + tokenizer.pad_token = tokenizer.eos_token + + model = RewardModelInference( + base_model_name=base_model, + adapter_path=adapter_path, + reward_head_path=reward_head_path, + ).to("cuda") + model.eval() + comp_dict_path = DS_RD_SETTING.competition_mapping_path + with open(comp_dict_path, "r") as f: + comp_dict = json.load(f) + competition = trace.scen.competition + comp_description = comp_dict[competition] + parent_rewards = model.compute_reward(hypothesis_chain_list,tokenizer,comp_description) + currnet_rewards = model.compute_reward(last_text,tokenizer,comp_description) + + avg_win_rate = [] + for re in currnet_rewards: + win_rate = [] + for p_re in parent_rewards: + current_win_rate = 
re/(re + p_re) + win_rate.append(current_win_rate) + avg_win_rate.append(np.mean(win_rate)) + max_idx = avg_win_rate.index(max(avg_win_rate)) + return texts[max_idx] + def task_gen( self, @@ -1473,7 +1629,45 @@ def gen( ) pickled_problem_name = None else: - pickled_problem_name, new_hypothesis = self.hypothesis_rank( + sota_flag = (hasattr(trace, "sota_exp_to_submit") and trace.sota_exp_to_submit is not None) + if DS_RD_SETTING.enable_reward_model_selection==True and sota_flag and not trace.is_selection_new_tree(): + # logger.info("Selecting hypothesis using reward model.") + # selected_hypothesis_text = self.reward_model_select_hypothesis( + # hypothesis_dict=hypothesis_dict, + # problem_dict=all_problems, + # ) + logger.info("Selecting hypothesis using reward model. (avg win)") + if DS_RD_SETTING.reward_select_type==1: + selected_hypothesis_text= self.reward_model_select_hypothesis_base_on_avg_win_rate(trace=trace, hypothesis_dict=hypothesis_dict) + elif DS_RD_SETTING.reward_select_type==2: + selected_hypothesis_text = self.reward_model_select_hypothesis( + trace=trace, + hypothesis_dict=hypothesis_dict, + problem_dict=all_problems, + ) + # Find the problem name corresponding to the selected hypothesis text + pickled_problem_name = None + for problem_name, data in hypothesis_dict.items(): + if data.get("hypothesis", "") == selected_hypothesis_text: + pickled_problem_name = problem_name + break + if pickled_problem_name is None: + raise ValueError("Selected hypothesis text does not match any known hypothesis.") + new_hypothesis = DSHypothesis( + component=hypothesis_dict[pickled_problem_name].get("component", "Model"), + hypothesis=hypothesis_dict[pickled_problem_name].get("hypothesis", "Hypothesis not provided"), + reason=hypothesis_dict[pickled_problem_name].get("reason", "Reason not provided"), + problem_name=pickled_problem_name, + problem_desc=all_problems.get(pickled_problem_name, {}).get( + "problem", "Problem description not provided" + ), + 
import torch
import torch.nn as nn
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

from rdagent.app.data_science.conf import DS_RD_SETTING


# =====================
# Reward Model Wrapper
# =====================
class RewardModelInference(nn.Module):
    """Inference-only wrapper: LoRA-adapted causal LM + scalar reward head.

    The base model is loaded from ``base_model_name``, the LoRA adapter from
    ``adapter_path`` and the linear reward-head weights from
    ``reward_head_path``. ``compute_reward`` builds a judging prompt per
    hypothesis chain and returns ``exp(score)`` for each input text.
    """

    def __init__(self, base_model_name, adapter_path, reward_head_path, device="cuda:1"):
        super().__init__()
        self.device = torch.device(device)

        self.base = AutoModelForCausalLM.from_pretrained(base_model_name)
        self.base = PeftModel.from_pretrained(self.base, adapter_path)
        self.base.to(self.device)

        # KV cache is useless for single-pass scoring; disable it to save memory.
        # (Gradient checkpointing was previously enabled here too — dropped, as
        # it only matters when backpropagating, which this wrapper never does.)
        if hasattr(self.base.config, "use_cache"):
            self.base.config.use_cache = False

        # Hidden-size lookup across model families (hidden_size / n_embd / d_model),
        # falling back to the embedding width when the config exposes none of them.
        hs = getattr(
            self.base.config,
            "hidden_size",
            getattr(self.base.config, "n_embd", getattr(self.base.config, "d_model", None)),
        )
        if hs is None:
            hs = self.base.get_input_embeddings().embedding_dim

        self.reward_head = nn.Linear(hs, 1)
        self.reward_head.load_state_dict(torch.load(reward_head_path, map_location="cpu"))
        self.reward_head.to(self.device)

    @staticmethod
    def pool_last_nonpad(last_hidden: torch.Tensor, attn_mask: torch.Tensor) -> torch.Tensor:
        """Gather the hidden state of the last non-padding token per sequence.

        Assumes right padding (mask is 1s followed by 0s) — confirm the
        tokenizer's padding side matches. ``clamp(min=0)`` guards the
        degenerate all-padding row.
        """
        lengths = (attn_mask.sum(dim=1) - 1).clamp(min=0)
        idx = lengths.view(-1, 1, 1).expand(-1, 1, last_hidden.size(-1))
        return last_hidden.gather(1, idx).squeeze(1)

    def forward(self, input_ids, attention_mask):
        # Move inputs once up front so the pooling mask lives on the same device
        # as the hidden states (previously only the base-model call moved the
        # mask, leaving pool_last_nonpad with the caller's original tensor).
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)
        out = self.base(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            use_cache=False,
        )
        pooled = self.pool_last_nonpad(out.hidden_states[-1], attention_mask)
        return self.reward_head(pooled).squeeze(-1)

    def compute_reward(self, texts, tokenizer, comp_description, system_prompt=None):
        """Score each hypothesis-chain string in ``texts``.

        Returns a plain list of ``exp(score)`` floats (strictly positive, so
        downstream win-rate ratios are well defined). ``system_prompt`` is
        sticky: once set it is reused on later calls.
        """
        if system_prompt is not None:
            self.system_prompt = system_prompt
        elif not hasattr(self, "system_prompt"):
            self.system_prompt = (
                "You are a senior data science competition judge and solution expert.\n"
                "Your task is to evaluate the quality, reasoning progression, and innovation of hypothesis chains.\n"
                "A hypothesis chain shows iterative improvement of solutions.\n"
                "You should assess:\n"
                "1) reasoning correctness and consistency across steps,\n"
                "2) improvement and refinement through the chain,\n"
                "3) final hypothesis quality and practicality.\n"
                "Be strict and fair. Provide expert-level insight."
            )

        inputs = []
        for s in texts:
            # NOTE(review): the bare "\n" / "\n\n" segments below look like the
            # residue of stripped markup around the analysis section — confirm
            # against the prompt the reward model was trained with.
            prompt = (
                f"{self.system_prompt}\n\n"
                f"Competition description:\n{comp_description}\n\n"
                "Hypothesis Chain (each step separated by '->'):\n"
                f"{s}\n\n"
                "\n"
                "Analyze the evolution of hypotheses, step-by-step, identifying strengths, weaknesses, and logical progression.\n"
                "Focus on clarity, correctness, and improvement.\n"
                "Make sure to consider the chain direction from earliest to latest.\n"
                "\n\n"
                "Final Evaluation:\n"
            )
            inputs.append(prompt)

        enc = tokenizer(
            inputs,
            truncation=True,
            padding=True,
            max_length=DS_RD_SETTING.max_length,
            return_tensors="pt",
        )
        enc = {k: v.to(self.device) for k, v in enc.items()}

        # Inference only: skip autograd bookkeeping (the original built a full
        # graph per scoring call for nothing).
        with torch.inference_mode():
            rewards = self.forward(enc["input_ids"], enc["attention_mask"])

        return torch.exp(rewards).cpu().tolist()