Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/workspaces/elle-elle-aime/benchmarks/run_bug_run/buggy_test_results.tgz filter=lfs diff=lfs merge=lfs -text
Binary file added benchmarks/run_bug_run/buggy_test_results.tgz
Binary file not shown.
Empty file.
137 changes: 137 additions & 0 deletions elleelleaime/core/benchmarks/runbugrun/output_matcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import re
from decimal import Decimal


class ParseError(Exception):
    """Raised when a whitespace-separated output token cannot be parsed."""

    pass


class OutputParser:
    """Tokenise a program's textual output into lines of typed elements.

    Each line is split on whitespace; tokens that look like optionally
    signed decimal numbers become ``Decimal`` (or ``int`` in strict mode
    for integer-valued tokens), while every other token stays a string.
    """

    def __init__(self, output, strict=False):
        self.output = output
        self.lines = output.splitlines()
        self.strict = strict

    def parse(self):
        """Parse every line of the output; returns a list of token lists."""
        return [self.parse_line(text) for text in self.lines]

    def parse_line(self, line):
        """Split *line* on whitespace and convert each token to an element."""
        parsed = []
        for raw in re.findall(r"\S+", line):
            converted = self.parse_element(raw)
            # Defensive guard kept from the original contract: a token that
            # cannot be classified at all is a parse error.
            if converted is None:
                raise ParseError(f"Failed to match element at '{raw}'")
            parsed.append(converted)
        return parsed

    def parse_element(self, token):
        """Return the numeric value of *token*, or the raw token itself."""
        number = self.parse_number(token)
        return token if number is None else number

    def parse_number(self, token):
        """Convert a numeric-looking token to Decimal/int; None otherwise."""
        if not re.match(r"-?\d+(?:\.\d+)?$", token):
            return None
        # Strict mode keeps exact-integer tokens as Python ints.
        if self.strict and re.match(r"^-?\d+$", token):
            return int(token)
        return Decimal(token)


# Absolute tolerance used when comparing numeric output tokens.
DEFAULT_FLOAT_EPS = 1e-4
# Per-problem overrides of the default tolerance, keyed by problem id.
# These problems state a specific required precision in their statements,
# so the comparison epsilon must be tightened (or loosened) accordingly.
FLOAT_EPS = {
    "p02400": 1e-5,
    "p02008": 1e-6,
    "p03882": 1e-9,
    "p02805": 1e-6,
    "p03585": 1e-9,
    "p03619": 1e-11,
    "p01562": 1e-6,
    "p03428": 1e-5,
    "p01837": 1e-6,
    "p03135": 1e-3,
    "p02764": 1e-6,
    "p03888": 1e-6,
    "p03110": 1e-5,
    "p03901": 1e-6,
    "p01836": 1e-8,
    "p00973": 1e-6,
    "p03043": 1e-9,
    "p01948": 1e-6,
    "p01800": 1e-6,
    "p03304": 1e-6,
    "p01704": 1e-4,
    "p03001": 1e-9,
    "p02072": 1e-3,
    "p02897": 1e-6,
    "p03754": 1e-6,
    "p02731": 1e-6,
    "p03879": 1e-9,
    "p02677": 1e-9,
    "p03953": 1e-9,
    "p02894": 1e-9,
    "p02705": 1e-2,
    "p01825": 1e-6,
    "p03514": 1e-9,
    "p01672": 1e-8,
    "p02882": 1e-6,
    "p03881": 1e-9,
    "p02075": 1e-9,
    "p00988": 1e-7,
    "p03744": 1e-6,
    "p01685": 1e-6,
    "p03872": 1e-9,
    "p01703": 1e-8,
    "p03869": 1e-9,
    "p02884": 1e-6,
    "p03866": 1e-9,
    "p02780": 1e-6,
    "p01568": 1e-6,
    "p01705": 1e-4,
    "p01576": 1e-8,
    "p02935": 1e-5,
    "p03004": 1e-9,
    "p02011": 1e-6,
    "p01708": 1e-2,
    "p03776": 1e-6,
    "p02934": 1e-5,
    "p01363": 1e-6,
    "p01510": 1e-9,
    "p03871": 1e-9,
    "p02379": 1e-4,
}


def match(expected_output, actual_output, problem_id):
    """Check whether a program's actual output matches the expected output.

    An exact string match (ignoring trailing newlines) succeeds
    immediately. Otherwise both outputs are tokenised with
    ``OutputParser`` and compared element-wise: numeric tokens may
    differ by a per-problem absolute epsilon (``FLOAT_EPS`` falling back
    to ``DEFAULT_FLOAT_EPS``), all other tokens must be equal. Returns
    False when ``actual_output`` is None or the shapes differ.
    """
    if actual_output is None:
        return False

    expected_output = expected_output.rstrip("\n")
    actual_output = actual_output.rstrip("\n")
    # Fast path: byte-identical output needs no parsing.
    if expected_output == actual_output:
        return True

    expected_lines = OutputParser(expected_output).parse()
    actual_lines = OutputParser(actual_output).parse()
    if len(expected_lines) != len(actual_lines):
        return False

    eps = FLOAT_EPS.get(problem_id, DEFAULT_FLOAT_EPS)

    for want_line, got_line in zip(expected_lines, actual_lines):
        if len(want_line) != len(got_line):
            return False
        for want, got in zip(want_line, got_line):
            if isinstance(want, Decimal) and isinstance(got, Decimal):
                # Numeric comparison with absolute tolerance.
                if abs(got - want) > eps:
                    return False
            elif got != want:
                # Non-numeric (or mixed-type) tokens must match exactly.
                return False

    return True
194 changes: 194 additions & 0 deletions elleelleaime/core/benchmarks/runbugrun/runbugrun.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
from pathlib import Path
from unidiff import PatchSet
from elleelleaime.core.benchmarks.benchmark import Benchmark
from elleelleaime.core.benchmarks.runbugrun.runbugrunbug import RunBugRunBug
from elleelleaime.core.benchmarks.runbugrun.output_matcher import match as match_output

import os
import json
import subprocess
import logging
from tqdm import tqdm
import pandas as pd
import concurrent.futures


class RunBugRun(Benchmark):
    """
    The class for representing the RunBugRun benchmark.

    Reads the buggy/fixed Python submissions and their test cases from
    JSONL files under the benchmark path, materialises each submission
    as a source file, diffs fixed vs. buggy, and registers one
    RunBugRunBug per buggy submission that has at least one failing test.
    """

    def __init__(self, path: Path = Path("benchmarks/run_bug_run").absolute()) -> None:
        super().__init__("runbugrun", path)

    def initialize(self) -> None:
        """
        Initializes the RunBugRun benchmark object by collecting all the bugs.
        """
        logging.info("Initializing RunBugRun benchmark...")

        # Source data: the valid buggy/fixed submissions and the shared tests.
        python_path = Path(self.get_path(), "python_valid0.jsonl")
        test_path = Path(self.get_path(), "tests_all.jsonl")

        # NOTE(review): the handles returned by open() are never closed;
        # pd.read_json accepts a path-like directly — consider passing the Path.
        python_df = pd.read_json(open(python_path), lines=True).set_index("problem_id")
        test_df = pd.read_json(open(test_path), lines=True).set_index("id")

        # Create the target directories for the buggy and fixed sources.
        subprocess.run(
            f"mkdir -p {self.path}/buggy",
            shell=True,
            capture_output=True,
            check=True,
        )

        subprocess.run(
            f"mkdir -p {self.path}/fixed",
            shell=True,
            capture_output=True,
            check=True,
        )

        # One bug per unique buggy submission id.
        buggy_submissions = python_df.drop_duplicates(
            subset=["buggy_submission_id"]
        )  # .head(105)
        pbar = tqdm(
            buggy_submissions[
                [
                    "buggy_submission_id",
                    "buggy_code",
                    "fixed_submission_id",
                    "fixed_code",
                    "errors",
                ]
            ].iterrows(),
            total=len(buggy_submissions),
        )

        # iterrows() yields (problem_id, row); the row Series unpacks into
        # the five selected columns in order.
        for prob_id, (
            buggy_submission_id,
            buggy_code,
            fixed_submission_id,
            fixed_code,
            errors,
        ) in pbar:
            buggy_file = Path(self.path, "buggy", f"{prob_id}_{buggy_submission_id}.py")
            fixed_file = Path(
                self.path, "fixed", f"{prob_id}_{buggy_submission_id}.py"
            )  # using buggy id for both to maintain file correspondence

            pbar.set_postfix({"file": buggy_file})
            pbar.update()

            # Materialise both versions on disk (trailing newline for diff).
            with open(buggy_file, "w") as f:
                f.write(buggy_code)
                f.write("\n")

            with open(fixed_file, "w") as f:
                f.write(fixed_code)
                f.write("\n")

            # Unified diff fixed -> buggy, i.e. the patch that introduces
            # the bug; check=True is deliberately omitted since diff exits
            # non-zero when the files differ.
            run = subprocess.run(
                f"""cd {self.get_path()} &&
            diff --unified {fixed_file.relative_to(self.path)} {buggy_file.relative_to(self.path)}""",
                shell=True,
                capture_output=True,
            )

            diff = PatchSet(run.stdout.decode("utf-8"))
            # Change the source file path to point to the buggy version
            diff[0].source_file = f"{buggy_file.relative_to(self.path)}"

            # All test cases for this problem (shared by its submissions).
            test_rows = test_df[test_df.problem_id == prob_id][["input", "output"]]
            failing_tests = self.get_failing_tests(
                buggy_file, errors, test_rows, prob_id
            )
            # Only register the bug when at least one test actually fails.
            if failing_tests:
                self.add_bug(
                    RunBugRunBug(
                        self,
                        f"{prob_id}_{buggy_submission_id}",
                        str(diff),
                        failing_tests,
                    )
                )

    def get_failing_tests(self, buggy_file, errors, test_rows, prob_id):
        """
        Compute the failing tests for a buggy submission.

        Returns a dict mapping ``test_<id>`` to a human-readable failure
        cause. If the dataset already records runtime errors for the
        submission, those are reported without executing anything.
        Otherwise the test cases are executed in a thread pool and the
        (returncode, output) results are cached next to the buggy file as
        a .jsonl; subsequent calls replay the cached results.
        """
        failing_tests = {}
        # NOTE(review): test_results is a list when executing fresh but a
        # DataFrame when loaded from cache — the two branches below never mix.
        test_results = []

        # Cache path: buggy/<prob>_<submission>.jsonl. Since buggy_file is
        # already absolute, the first Path() argument is effectively ignored.
        results_path = Path(self.get_path(), buggy_file.with_suffix(".jsonl"))
        already_cached = os.path.exists(results_path)
        if already_cached:
            test_results = pd.read_json(open(results_path), lines=True).set_index("id")

        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            futures = []
            futures_to_tests = {}

            for test_id, (test_input, test_output) in test_rows.iterrows():
                test_input = test_input.strip()
                test_output = test_output.strip()

                if isinstance(errors, list):
                    # The dataset already recorded a runtime error for this
                    # submission; reuse it for every test case (errors[0]).
                    result = errors[0]["exception"] + "\n" + errors[0]["output"]
                    cause = f"""Function with input:\n{test_input}\nexpected to output:\n{test_output}\nfailed with error:\n{result.strip()}"""
                    failing_tests[f"""test_{test_id}"""] = cause
                elif (
                    not already_cached
                ):  # if there isn't a runtime exception, need to execute to get the cause of test failure
                    futures.append(
                        executor.submit(
                            RunBugRunBug.execute_test_case, buggy_file, test_input
                        )
                    )
                    futures_to_tests[futures[-1]] = (
                        test_id,
                        test_input.strip(),
                        test_output.strip(),
                    )
                else:
                    pass

            if not already_cached:
                # Collect execution results as they complete, classify each
                # as error / wrong output / pass, and record it for caching.
                for future in concurrent.futures.as_completed(futures):
                    returncode, result = future.result()
                    test_id, test_input, test_output = futures_to_tests[future]
                    if returncode:
                        cause = f"""Function with input:\n{test_input}\nexpected to output:\n{test_output}\nfailed with error:\n{result.strip()}"""
                        failing_tests[f"""test_{test_id}"""] = cause
                    elif not match_output(test_output, result, prob_id):
                        cause = f"""Function with input:\n{test_input}\nexpected to output:\n{test_output}\nbut got:\n{result}"""
                        failing_tests[f"""test_{test_id}"""] = cause
                    else:
                        pass
                    test_results.append(
                        {
                            "id": test_id,
                            "input": test_input,
                            "output": test_output,
                            "returncode": returncode,
                            "result": result,
                        }
                    )

                # Persist the raw execution results so later runs can skip
                # re-executing the submission.
                if test_results:
                    pd.DataFrame(test_results).to_json(
                        results_path, orient="records", lines=True
                    )

            else:
                # Replay cached results. NOTE(review): this path compares with
                # plain `!=` rather than match_output(), so float-tolerant
                # matches may be classified differently than on a fresh run.
                for test_id, (
                    test_input,
                    test_output,
                    returncode,
                    result,
                ) in test_results.iterrows():
                    if returncode:
                        cause = f"""Function with input:\n{test_input}\nexpected to output:\n{test_output}\nfailed with error:\n{result.strip()}"""
                        failing_tests[f"""test_{test_id}"""] = cause
                    elif result != test_output:
                        cause = f"""Function with input:\n{test_input}\nexpected to output:\n{test_output}\nbut got:\n{result}"""
                        failing_tests[f"""test_{test_id}"""] = cause
                    else:
                        continue

        return failing_tests
Loading
Loading