Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
ddec8c3
add reward model
jingyuanlm Oct 23, 2025
c337494
Merge remote-tracking branch 'origin/main' into evo
jingyuanlm Oct 23, 2025
1065594
Merge remote-tracking branch 'origin/main' into evo
jingyuanlm Oct 23, 2025
5dbb355
Merge remote-tracking branch 'origin/main' into evo
jingyuanlm Oct 24, 2025
2fd918c
add avg win reward model
jingyuanlm Nov 17, 2025
98ca9cf
add path online
jingyuanlm Nov 17, 2025
7d3dc7f
fix bug
jingyuanlm Nov 17, 2025
3d24be0
add install transformers
jingyuanlm Nov 17, 2025
62de6d1
add
jingyuanlm Nov 18, 2025
0453af1
fix bug remove torch
jingyuanlm Nov 18, 2025
9111b0c
fix bug
jingyuanlm Nov 18, 2025
b45aac0
fix bug 2
jingyuanlm Nov 18, 2025
fabcc84
fix bug 3
jingyuanlm Nov 18, 2025
6258915
fix pydantic-ai version
jingyuanlm Nov 19, 2025
a62870c
fix pydantic-ai-bug :(
jingyuanlm Nov 19, 2025
1d7ee2c
fix bug 4
jingyuanlm Nov 19, 2025
f83cfb5
add two type reward model
jingyuanlm Nov 20, 2025
78e6b1a
fix bug 5
jingyuanlm Nov 20, 2025
9356071
fix bug 6
jingyuanlm Nov 21, 2025
1b28ffc
add more competition
jingyuanlm Nov 24, 2025
4784b75
fix bug 7
jingyuanlm Nov 24, 2025
d20781d
fix bug 8
jingyuanlm Nov 24, 2025
89a3a61
fix bug 9
jingyuanlm Nov 24, 2025
0fd51db
fix bug 10
jingyuanlm Nov 25, 2025
d0aa732
fix bug 11
jingyuanlm Nov 25, 2025
35c60d9
add new ckpt
jingyuanlm Dec 8, 2025
f8904d9
add ckpt 6
jingyuanlm Dec 18, 2025
38b9736
Evo (#1322)
jingyuanlm Jan 9, 2026
aa8637a
ckpt 7
jingyuanlm Jan 9, 2026
b6cd813
fix cuda problem/ckpt7
jingyuanlm Jan 13, 2026
1fc1a6e
ckpt 7 fix bug
jingyuanlm Jan 14, 2026
d344fd6
ckpt 8
jingyuanlm Jan 19, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ init-qlib-env:
@source $$(conda info --base)/etc/profile.d/conda.sh && conda activate qlibRDAgent && which pip && pip install pyqlib && pip install ruamel-yaml==0.17.21 && pip install torch==2.1.1 && pip install catboost==0.24.3 && conda deactivate

dev:
$(PIPRUN) pip install -e .[docs,lint,package,test] -c $(CONSTRAINTS_FILE)
$(PIPRUN) pip install -U pip setuptools wheel
$(PIPRUN) pip install -e .[docs,lint,package,test,torch,reward] -c $(CONSTRAINTS_FILE)
$(PIPRUN) pip install -U kaggle
if [ "$(CI)" != "true" ] && command -v pre-commit > /dev/null 2>&1; then pre-commit install --hook-type pre-push; fi

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ lint = {file = ["requirements/lint.txt"]}
package = {file = ["requirements/package.txt"]}
test = {file = ["requirements/test.txt"]}
torch = {file = ["requirements/torch.txt"]} # some agent algorithms need torch. pip install rdagent[torch]
reward = {file = ["requirements/reward.txt"]}

[tool.setuptools_scm]
local_scheme = "no-local-version"
Expand All @@ -121,4 +122,4 @@ version_scheme = "guess-next-dev"
[tool.tomlsort]
all = true
in_place = true
trailing_comma_inline_array = true
trailing_comma_inline_array = true
27 changes: 27 additions & 0 deletions rdagent/app/data_science/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,33 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
user_interaction_wait_seconds: int = 6000 # seconds to wait for user interaction
user_interaction_mid_folder: Path = Path.cwd() / "git_ignore_folder" / "RD-Agent_user_interaction"

#### reward model related
enable_reward_model_selection: bool = True
"""Enable reward model based hypothesis selection."""

reward_model_path: str ="/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_8" #"/data/userdata/v-lijingyuan/last_run_2"# "/data/userdata/v-lijingyuan/last_run_2"#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"

#"/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/last_run_2"
#"/data/userdata/v-lijingyuan/last_run_2"

#reward_model_path: str = "/data/userdata/v-lijingyuan/last_run_3"


"""The path to the reward model for hypothesis selection."""

competition_mapping_path: str = "/data/Blob_EastUS/FinetuneAgenticLLM/reward_ckpt/comp_to_scen.json"

#competition_mapping_path: str = "/data/userdata/v-lijingyuan/dpo/comp_to_scen.json"


reward_base_model: str = "Qwen/Qwen3-4B"
""" Backbone of the reward model"""

max_length : int = 2300
""" max_length of the reward model"""

reward_select_type: int = 1


DS_RD_SETTING = DataScienceBasePropSetting()

Expand Down
2 changes: 1 addition & 1 deletion rdagent/components/coder/data_science/pipeline/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import pandas as pd

from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.components.agent.context7 import Agent as DocAgent
from rdagent.components.coder.CoSTEER import CoSTEERMultiFeedback
from rdagent.components.coder.CoSTEER.evaluators import (
CoSTEEREvaluator,
Expand Down Expand Up @@ -307,6 +306,7 @@ def evaluate(
do_documentation_search = enable_mcp_documentation_search and wfb.requires_documentation_search

if do_documentation_search:
from rdagent.components.agent.context7 import Agent as DocAgent
# Use MCPAgent for clean, user-friendly interface
try:
# Create agent targeting Context7 service - model config comes from mcp_config.json
Expand Down
202 changes: 198 additions & 4 deletions rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from pydantic import BaseModel, Field

from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.components.agent.rag import Agent as RAGAgent
from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask
from rdagent.components.coder.data_science.feature.exp import FeatureTask
from rdagent.components.coder.data_science.model.exp import ModelTask
Expand Down Expand Up @@ -41,7 +40,7 @@
from rdagent.utils.agent.tpl import T
from rdagent.utils.repo.diff import generate_diff_from_dict
from rdagent.utils.workflow import wait_retry

import torch
_COMPONENT_META: Dict[str, Dict[str, Any]] = {
"DataLoadSpec": {
"target_name": "Data loader and specification generation",
Expand Down Expand Up @@ -313,7 +312,6 @@ def draft_exp_in_decomposition(scen: Scenario, trace: DSTrace) -> None | DSDraft
else:
return None


class DSProposalV1ExpGen(ExpGen):
def gen(
self,
Expand Down Expand Up @@ -649,6 +647,7 @@ def hypothesis_gen(

# knowledge retrieval
if DS_RD_SETTING.enable_research_rag:
from rdagent.components.agent.rag import Agent as RAGAgent
rag_agent = RAGAgent(
system_prompt="""You are a helpful assistant.
You help users retrieve relevant knowledge from community discussions and public code."""
Expand Down Expand Up @@ -1182,6 +1181,163 @@ def hypothesis_rank(
problem_label=problem_dict.get("label", "FEEDBACK_PROBLEM"),
appendix=hypothesis_dict[max_score_problem_name].get("appendix", None),
)

def reward_model_select_hypothesis(self, trace, hypothesis_dict: dict, problem_dict: dict) -> str:
    """Select the candidate hypothesis whose text scores highest under the reward model.

    Args:
        trace: DS trace; used to reconstruct the chain of previously accepted
            hypotheses leading to the current selection.
        hypothesis_dict: Mapping of problem name -> hypothesis data dict
            (each entry is expected to carry at least a "hypothesis" key).
        problem_dict: Mapping of problem name -> problem data. Currently unused;
            kept for interface symmetry with ``hypothesis_rank``.

    Returns:
        The raw hypothesis text of the highest-reward candidate. Falls back to
        the first candidate when there is no usable parent chain to condition on.
    """
    import os

    from transformers import AutoTokenizer

    from .reward_inference import RewardModelInference

    # Cheap fallback first: with no current selection (or a root selection of -1)
    # there is no chain context, so skip the expensive model load entirely.
    if not trace.current_selection or trace.current_selection[0] == -1:
        return next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided")

    # Map each trace node to its immediate parent (second-to-last entry of the
    # parent path), then translate record indices to loop ids when available.
    parent_nodes = {}
    for node in range(len(trace.hist)):
        parents = trace.get_parents(node)
        parent_nodes[node] = parents[-2] if len(parents) > 1 else None
    # FIXME: add the convert logic to method in trace
    if hasattr(trace, "idx2loop_id"):
        parent_nodes = {
            trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items()
        }

    current_parent_record_id = trace.current_selection[0]
    loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()}
    loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes)

    # Accepted hypotheses along the ancestry path, oldest first.
    hypothesis_list = [
        trace.hist[loop_id2idx[loop_id]][0].hypothesis.hypothesis
        for loop_id in loop_id_list
        if trace.hist[loop_id2idx[loop_id]][1].decision
    ][::-1]

    if not hypothesis_list:
        # No accepted ancestor hypotheses -> nothing to condition on; fall back.
        return next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided")

    # Candidate texts in hypothesis_dict order; the reward model scores the raw
    # candidate text (not the chain-concatenated form).
    texts = [data.get("hypothesis", "Hypothesis not provided") for data in hypothesis_dict.values()]

    # Load tokenizer + LoRA reward model lazily, only when we will actually score.
    logdir = DS_RD_SETTING.reward_model_path
    base_model = DS_RD_SETTING.reward_base_model
    adapter_path = os.path.join(logdir, "lora_adapter")
    reward_head_path = os.path.join(logdir, "reward_head.pt")

    tokenizer = AutoTokenizer.from_pretrained(base_model)
    if not getattr(tokenizer, "pad_token", None):
        tokenizer.pad_token = tokenizer.eos_token
    # NOTE(review): device is hard-coded to the second GPU — this fails on
    # single-GPU hosts; consider making it configurable via DS_RD_SETTING.
    device = torch.device("cuda:1")
    model = RewardModelInference(
        base_model_name=base_model,
        adapter_path=adapter_path,
        reward_head_path=reward_head_path,
        device=device,
    )
    model.eval()

    # Competition -> scenario description lookup used to condition the reward.
    with open(DS_RD_SETTING.competition_mapping_path, "r") as f:
        comp_dict = json.load(f)
    comp_description = comp_dict[trace.scen.competition]

    rewards = model.compute_reward(texts, tokenizer, comp_description)
    return texts[rewards.index(max(rewards))]

def reward_model_select_hypothesis_base_on_avg_win_rate(self, trace, hypothesis_dict) -> str:
    """Select the candidate hypothesis with the highest average win rate.

    Each candidate is appended to the chain of previously accepted hypotheses
    and scored by the reward model. A candidate's reward ``r`` is compared
    against every chain-prefix reward ``p`` via the win rate ``r / (r + p)``;
    the candidate with the highest mean win rate across all prefixes wins.

    Args:
        trace: DS trace; provides the ancestry of accepted hypotheses.
        hypothesis_dict: Mapping of problem name -> hypothesis data dict
            (each entry is expected to carry at least a "hypothesis" key).

    Returns:
        The raw hypothesis text of the winning candidate. Falls back to the
        first candidate when there is no usable parent chain.
    """
    import os

    from transformers import AutoTokenizer

    from .reward_inference import RewardModelInference

    # Cheap fallback first: no selection context means no chain to compare
    # against, so skip all trace traversal and model loading.
    if not trace.current_selection or trace.current_selection[0] == -1:
        return next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided")

    # Map each trace node to its immediate parent, translated to loop ids.
    parent_nodes = {}
    for node in range(len(trace.hist)):
        parents = trace.get_parents(node)
        parent_nodes[node] = parents[-2] if len(parents) > 1 else None
    # FIXME: add the convert logic to method in trace
    if hasattr(trace, "idx2loop_id"):
        parent_nodes = {
            trace.idx2loop_id[n]: trace.idx2loop_id[r] if r is not None else r for n, r in parent_nodes.items()
        }

    current_parent_record_id = trace.current_selection[0]
    loop_id2idx = {v: k for k, v in trace.idx2loop_id.items()}
    loop_id_list = self._get_path(trace.idx2loop_id[current_parent_record_id], parent_nodes)

    # Accepted hypotheses along the ancestry path, oldest first.
    hypothesis_list = [
        trace.hist[loop_id2idx[loop_id]][0].hypothesis.hypothesis
        for loop_id in loop_id_list
        if trace.hist[loop_id2idx[loop_id]][1].decision
    ][::-1]
    sep = "->"

    # Cumulative chain prefixes: "h1", "h1->h2", "h1->h2->h3", ...
    hypothesis_chain_list = []
    accumulate = []
    for hyp in hypothesis_list:
        accumulate.append(hyp)
        hypothesis_chain_list.append(sep.join(accumulate))

    if not hypothesis_chain_list:
        return next(iter(hypothesis_dict.values())).get("hypothesis", "Hypothesis not provided")

    # For each candidate keep both the bare text (for the final return) and
    # the full-chain text (what the reward model actually scores).
    texts = []
    chained_texts = []
    for data in hypothesis_dict.values():
        candidate = data.get("hypothesis", "Hypothesis not provided")
        texts.append(candidate)
        chained_texts.append(hypothesis_chain_list[-1] + sep + candidate)

    # Load tokenizer + LoRA reward model only when we will actually score.
    logdir = DS_RD_SETTING.reward_model_path
    base_model = DS_RD_SETTING.reward_base_model
    adapter_path = os.path.join(logdir, "lora_adapter")
    reward_head_path = os.path.join(logdir, "reward_head.pt")

    tokenizer = AutoTokenizer.from_pretrained(base_model)
    if not getattr(tokenizer, "pad_token", None):
        tokenizer.pad_token = tokenizer.eos_token

    model = RewardModelInference(
        base_model_name=base_model,
        adapter_path=adapter_path,
        reward_head_path=reward_head_path,
    ).to("cuda")
    model.eval()

    with open(DS_RD_SETTING.competition_mapping_path, "r") as f:
        comp_dict = json.load(f)
    comp_description = comp_dict[trace.scen.competition]

    parent_rewards = model.compute_reward(hypothesis_chain_list, tokenizer, comp_description)
    current_rewards = model.compute_reward(chained_texts, tokenizer, comp_description)

    # Average pairwise win rate of each candidate against all chain prefixes.
    avg_win_rate = []
    for cur in current_rewards:
        win_rates = []
        for parent in parent_rewards:
            denom = cur + parent
            # Guard the degenerate 0/0 case; score it as a tie (0.5).
            win_rates.append(cur / denom if denom != 0 else 0.5)
        avg_win_rate.append(np.mean(win_rates))

    return texts[avg_win_rate.index(max(avg_win_rate))]


def task_gen(
self,
Expand Down Expand Up @@ -1473,7 +1629,45 @@ def gen(
)
pickled_problem_name = None
else:
pickled_problem_name, new_hypothesis = self.hypothesis_rank(
sota_flag = (hasattr(trace, "sota_exp_to_submit") and trace.sota_exp_to_submit is not None)
if DS_RD_SETTING.enable_reward_model_selection==True and sota_flag and not trace.is_selection_new_tree():
# logger.info("Selecting hypothesis using reward model.")
# selected_hypothesis_text = self.reward_model_select_hypothesis(
# hypothesis_dict=hypothesis_dict,
# problem_dict=all_problems,
# )
logger.info("Selecting hypothesis using reward model. (avg win)")
if DS_RD_SETTING.reward_select_type==1:
selected_hypothesis_text= self.reward_model_select_hypothesis_base_on_avg_win_rate(trace=trace, hypothesis_dict=hypothesis_dict)
elif DS_RD_SETTING.reward_select_type==2:
selected_hypothesis_text = self.reward_model_select_hypothesis(
trace=trace,
hypothesis_dict=hypothesis_dict,
problem_dict=all_problems,
)
# Find the problem name corresponding to the selected hypothesis text
pickled_problem_name = None
for problem_name, data in hypothesis_dict.items():
if data.get("hypothesis", "") == selected_hypothesis_text:
pickled_problem_name = problem_name
break
if pickled_problem_name is None:
raise ValueError("Selected hypothesis text does not match any known hypothesis.")
new_hypothesis = DSHypothesis(
component=hypothesis_dict[pickled_problem_name].get("component", "Model"),
hypothesis=hypothesis_dict[pickled_problem_name].get("hypothesis", "Hypothesis not provided"),
reason=hypothesis_dict[pickled_problem_name].get("reason", "Reason not provided"),
problem_name=pickled_problem_name,
problem_desc=all_problems.get(pickled_problem_name, {}).get(
"problem", "Problem description not provided"
),
problem_label=all_problems.get(pickled_problem_name, {}).get(
"label", "FEEDBACK_PROBLEM"
),
appendix=hypothesis_dict[pickled_problem_name].get("appendix", None),
)
else:
pickled_problem_name, new_hypothesis = self.hypothesis_rank(
hypothesis_dict=hypothesis_dict,
problem_dict=all_problems,
)
Expand Down
Loading