Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/workspaces/elle-elle-aime/benchmarks/run_bug_run/buggy_test_results.tgz filter=lfs diff=lfs merge=lfs -text
Binary file added benchmarks/run_bug_run/buggy_test_results.tgz
Binary file not shown.
Empty file.
137 changes: 137 additions & 0 deletions elleelleaime/core/benchmarks/runbugrun/output_matcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import re
from decimal import Decimal


class ParseError(Exception):
    """Raised when a whitespace-separated output token cannot be parsed."""

    pass


class OutputParser:
    """Tokenise a program's textual output into lines of typed elements.

    Each line is split on whitespace; tokens that look like optionally
    signed decimal numbers become ``Decimal`` (or ``int`` in strict mode
    for integer-valued tokens), while every other token stays a string.
    """

    def __init__(self, output, strict=False):
        self.output = output
        self.lines = output.splitlines()
        self.strict = strict

    def parse(self):
        """Parse every line of the output; returns a list of token lists."""
        return [self.parse_line(text) for text in self.lines]

    def parse_line(self, line):
        """Split *line* on whitespace and convert each token to an element."""
        parsed = []
        for raw in re.findall(r"\S+", line):
            converted = self.parse_element(raw)
            # Defensive guard kept from the original contract: a token that
            # cannot be classified at all is a parse error.
            if converted is None:
                raise ParseError(f"Failed to match element at '{raw}'")
            parsed.append(converted)
        return parsed

    def parse_element(self, token):
        """Return the numeric value of *token*, or the raw token itself."""
        number = self.parse_number(token)
        return token if number is None else number

    def parse_number(self, token):
        """Convert a numeric-looking token to Decimal/int; None otherwise."""
        if not re.match(r"-?\d+(?:\.\d+)?$", token):
            return None
        # Strict mode keeps exact-integer tokens as Python ints.
        if self.strict and re.match(r"^-?\d+$", token):
            return int(token)
        return Decimal(token)


# Absolute tolerance used when comparing numeric output tokens.
DEFAULT_FLOAT_EPS = 1e-4
# Per-problem overrides of the default tolerance, keyed by problem id.
# These problems state a specific required precision in their statements,
# so the comparison epsilon must be tightened (or loosened) accordingly.
FLOAT_EPS = {
    "p02400": 1e-5,
    "p02008": 1e-6,
    "p03882": 1e-9,
    "p02805": 1e-6,
    "p03585": 1e-9,
    "p03619": 1e-11,
    "p01562": 1e-6,
    "p03428": 1e-5,
    "p01837": 1e-6,
    "p03135": 1e-3,
    "p02764": 1e-6,
    "p03888": 1e-6,
    "p03110": 1e-5,
    "p03901": 1e-6,
    "p01836": 1e-8,
    "p00973": 1e-6,
    "p03043": 1e-9,
    "p01948": 1e-6,
    "p01800": 1e-6,
    "p03304": 1e-6,
    "p01704": 1e-4,
    "p03001": 1e-9,
    "p02072": 1e-3,
    "p02897": 1e-6,
    "p03754": 1e-6,
    "p02731": 1e-6,
    "p03879": 1e-9,
    "p02677": 1e-9,
    "p03953": 1e-9,
    "p02894": 1e-9,
    "p02705": 1e-2,
    "p01825": 1e-6,
    "p03514": 1e-9,
    "p01672": 1e-8,
    "p02882": 1e-6,
    "p03881": 1e-9,
    "p02075": 1e-9,
    "p00988": 1e-7,
    "p03744": 1e-6,
    "p01685": 1e-6,
    "p03872": 1e-9,
    "p01703": 1e-8,
    "p03869": 1e-9,
    "p02884": 1e-6,
    "p03866": 1e-9,
    "p02780": 1e-6,
    "p01568": 1e-6,
    "p01705": 1e-4,
    "p01576": 1e-8,
    "p02935": 1e-5,
    "p03004": 1e-9,
    "p02011": 1e-6,
    "p01708": 1e-2,
    "p03776": 1e-6,
    "p02934": 1e-5,
    "p01363": 1e-6,
    "p01510": 1e-9,
    "p03871": 1e-9,
    "p02379": 1e-4,
}


def match(expected_output, actual_output, problem_id):
    """Check whether a program's actual output matches the expected output.

    An exact string match (ignoring trailing newlines) succeeds
    immediately. Otherwise both outputs are tokenised with
    ``OutputParser`` and compared element-wise: numeric tokens may
    differ by a per-problem absolute epsilon (``FLOAT_EPS`` falling back
    to ``DEFAULT_FLOAT_EPS``), all other tokens must be equal. Returns
    False when ``actual_output`` is None or the shapes differ.
    """
    if actual_output is None:
        return False

    expected_output = expected_output.rstrip("\n")
    actual_output = actual_output.rstrip("\n")
    # Fast path: byte-identical output needs no parsing.
    if expected_output == actual_output:
        return True

    expected_lines = OutputParser(expected_output).parse()
    actual_lines = OutputParser(actual_output).parse()
    if len(expected_lines) != len(actual_lines):
        return False

    eps = FLOAT_EPS.get(problem_id, DEFAULT_FLOAT_EPS)

    for want_line, got_line in zip(expected_lines, actual_lines):
        if len(want_line) != len(got_line):
            return False
        for want, got in zip(want_line, got_line):
            if isinstance(want, Decimal) and isinstance(got, Decimal):
                # Numeric comparison with absolute tolerance.
                if abs(got - want) > eps:
                    return False
            elif got != want:
                # Non-numeric (or mixed-type) tokens must match exactly.
                return False

    return True
194 changes: 194 additions & 0 deletions elleelleaime/core/benchmarks/runbugrun/runbugrun.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
from pathlib import Path
from unidiff import PatchSet
from elleelleaime.core.benchmarks.benchmark import Benchmark
from elleelleaime.core.benchmarks.runbugrun.runbugrunbug import RunBugRunBug
from elleelleaime.core.benchmarks.runbugrun.output_matcher import match as match_output

import os
import json
import subprocess
import logging
from tqdm import tqdm
import pandas as pd
import concurrent.futures


class RunBugRun(Benchmark):
    """
    The class for representing the RunBugRun benchmark.

    Reads the buggy/fixed Python submissions and their test cases from
    JSONL files under the benchmark path, materialises each submission
    as a source file, diffs fixed vs. buggy, and registers one
    RunBugRunBug per buggy submission that has at least one failing test.
    """

    def __init__(self, path: Path = Path("benchmarks/run_bug_run").absolute()) -> None:
        super().__init__("runbugrun", path)

    def initialize(self) -> None:
        """
        Initializes the RunBugRun benchmark object by collecting all the bugs.
        """
        logging.info("Initializing RunBugRun benchmark...")

        # Source data: the valid buggy/fixed submissions and the shared tests.
        python_path = Path(self.get_path(), "python_valid0.jsonl")
        test_path = Path(self.get_path(), "tests_all.jsonl")

        # NOTE(review): the handles returned by open() are never closed;
        # pd.read_json accepts a path-like directly — consider passing the Path.
        python_df = pd.read_json(open(python_path), lines=True).set_index("problem_id")
        test_df = pd.read_json(open(test_path), lines=True).set_index("id")

        # Create the target directories for the buggy and fixed sources.
        subprocess.run(
            f"mkdir -p {self.path}/buggy",
            shell=True,
            capture_output=True,
            check=True,
        )

        subprocess.run(
            f"mkdir -p {self.path}/fixed",
            shell=True,
            capture_output=True,
            check=True,
        )

        # One bug per unique buggy submission id.
        buggy_submissions = python_df.drop_duplicates(
            subset=["buggy_submission_id"]
        )  # .head(105)
        pbar = tqdm(
            buggy_submissions[
                [
                    "buggy_submission_id",
                    "buggy_code",
                    "fixed_submission_id",
                    "fixed_code",
                    "errors",
                ]
            ].iterrows(),
            total=len(buggy_submissions),
        )

        # iterrows() yields (problem_id, row); the row Series unpacks into
        # the five selected columns in order.
        for prob_id, (
            buggy_submission_id,
            buggy_code,
            fixed_submission_id,
            fixed_code,
            errors,
        ) in pbar:
            buggy_file = Path(self.path, "buggy", f"{prob_id}_{buggy_submission_id}.py")
            fixed_file = Path(
                self.path, "fixed", f"{prob_id}_{buggy_submission_id}.py"
            )  # using buggy id for both to maintain file correspondence

            pbar.set_postfix({"file": buggy_file})
            pbar.update()

            # Materialise both versions on disk (trailing newline for diff).
            with open(buggy_file, "w") as f:
                f.write(buggy_code)
                f.write("\n")

            with open(fixed_file, "w") as f:
                f.write(fixed_code)
                f.write("\n")

            # Unified diff fixed -> buggy, i.e. the patch that introduces
            # the bug; check=True is deliberately omitted since diff exits
            # non-zero when the files differ.
            run = subprocess.run(
                f"""cd {self.get_path()} &&
            diff --unified {fixed_file.relative_to(self.path)} {buggy_file.relative_to(self.path)}""",
                shell=True,
                capture_output=True,
            )

            diff = PatchSet(run.stdout.decode("utf-8"))
            # Change the source file path to point to the buggy version
            diff[0].source_file = f"{buggy_file.relative_to(self.path)}"

            # All test cases for this problem (shared by its submissions).
            test_rows = test_df[test_df.problem_id == prob_id][["input", "output"]]
            failing_tests = self.get_failing_tests(
                buggy_file, errors, test_rows, prob_id
            )
            # Only register the bug when at least one test actually fails.
            if failing_tests:
                self.add_bug(
                    RunBugRunBug(
                        self,
                        f"{prob_id}_{buggy_submission_id}",
                        str(diff),
                        failing_tests,
                    )
                )

    def get_failing_tests(self, buggy_file, errors, test_rows, prob_id):
        """
        Compute the failing tests for a buggy submission.

        Returns a dict mapping ``test_<id>`` to a human-readable failure
        cause. If the dataset already records runtime errors for the
        submission, those are reported without executing anything.
        Otherwise the test cases are executed in a thread pool and the
        (returncode, output) results are cached next to the buggy file as
        a .jsonl; subsequent calls replay the cached results.
        """
        failing_tests = {}
        # NOTE(review): test_results is a list when executing fresh but a
        # DataFrame when loaded from cache — the two branches below never mix.
        test_results = []

        # Cache path: buggy/<prob>_<submission>.jsonl. Since buggy_file is
        # already absolute, the first Path() argument is effectively ignored.
        results_path = Path(self.get_path(), buggy_file.with_suffix(".jsonl"))
        already_cached = os.path.exists(results_path)
        if already_cached:
            test_results = pd.read_json(open(results_path), lines=True).set_index("id")

        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            futures = []
            futures_to_tests = {}

            for test_id, (test_input, test_output) in test_rows.iterrows():
                test_input = test_input.strip()
                test_output = test_output.strip()

                if isinstance(errors, list):
                    # The dataset already recorded a runtime error for this
                    # submission; reuse it for every test case (errors[0]).
                    result = errors[0]["exception"] + "\n" + errors[0]["output"]
                    cause = f"""Function with input:\n{test_input}\nexpected to output:\n{test_output}\nfailed with error:\n{result.strip()}"""
                    failing_tests[f"""test_{test_id}"""] = cause
                elif (
                    not already_cached
                ):  # if there isn't a runtime exception, need to execute to get the cause of test failure
                    futures.append(
                        executor.submit(
                            RunBugRunBug.execute_test_case, buggy_file, test_input
                        )
                    )
                    futures_to_tests[futures[-1]] = (
                        test_id,
                        test_input.strip(),
                        test_output.strip(),
                    )
                else:
                    pass

            if not already_cached:
                # Collect execution results as they complete, classify each
                # as error / wrong output / pass, and record it for caching.
                for future in concurrent.futures.as_completed(futures):
                    returncode, result = future.result()
                    test_id, test_input, test_output = futures_to_tests[future]
                    if returncode:
                        cause = f"""Function with input:\n{test_input}\nexpected to output:\n{test_output}\nfailed with error:\n{result.strip()}"""
                        failing_tests[f"""test_{test_id}"""] = cause
                    elif not match_output(test_output, result, prob_id):
                        cause = f"""Function with input:\n{test_input}\nexpected to output:\n{test_output}\nbut got:\n{result}"""
                        failing_tests[f"""test_{test_id}"""] = cause
                    else:
                        pass
                    test_results.append(
                        {
                            "id": test_id,
                            "input": test_input,
                            "output": test_output,
                            "returncode": returncode,
                            "result": result,
                        }
                    )

                # Persist the raw execution results so later runs can skip
                # re-executing the submission.
                if test_results:
                    pd.DataFrame(test_results).to_json(
                        results_path, orient="records", lines=True
                    )

            else:
                # Replay cached results. NOTE(review): this path compares with
                # plain `!=` rather than match_output(), so float-tolerant
                # matches may be classified differently than on a fresh run.
                for test_id, (
                    test_input,
                    test_output,
                    returncode,
                    result,
                ) in test_results.iterrows():
                    if returncode:
                        cause = f"""Function with input:\n{test_input}\nexpected to output:\n{test_output}\nfailed with error:\n{result.strip()}"""
                        failing_tests[f"""test_{test_id}"""] = cause
                    elif result != test_output:
                        cause = f"""Function with input:\n{test_input}\nexpected to output:\n{test_output}\nbut got:\n{result}"""
                        failing_tests[f"""test_{test_id}"""] = cause
                    else:
                        continue

        return failing_tests
Loading
Loading