ASSERT-KTH · andre15silva · Mar 19, 2025 · Oct 27, 2024 · Oct 30, 2024 · Nov 13, 2024
diff --git a/.gitmodules b/.gitmodules
@@ -13,3 +13,6 @@
 [submodule "cache"]
 	path = cache
 	url = https://github.com/ASSERT-KTH/elle-elle-aime-cache.git
+# [submodule "benchmarks/run_bug_run"]
+# 	path = benchmarks/run_bug_run
+# 	url = https://github.com/ASSERT-KTH/run_bug_run.git
diff --git a/benchmarks/run_bug_run/buggy_test_results.tgz b/benchmarks/run_bug_run/buggy_test_results.tgz
diff --git a/elleelleaime/core/benchmarks/runbugrun/__init__.py b/elleelleaime/core/benchmarks/runbugrun/__init__.py
diff --git a/elleelleaime/core/benchmarks/runbugrun/output_matcher.py b/elleelleaime/core/benchmarks/runbugrun/output_matcher.py
@@ -0,0 +1,80 @@
+import re
+from decimal import Decimal
+
+class ParseError(Exception):
+    """Raised when a token of program output cannot be turned into an element."""
+
+class OutputParser:
+    """Split program output into lines of typed tokens.
+
+    Every line of ``output`` is cut into whitespace-separated tokens. Tokens
+    that look like decimal numbers are converted to numbers, everything else
+    is kept as ``str``.
+
+    Args:
+        output: the raw text to parse.
+        strict: when True, pure integer tokens become ``int`` instead of
+            ``Decimal``.
+    """
+
+    # Optional minus sign, digits, optional fraction and optional exponent.
+    # The exponent form matters because Python prints small or large floats
+    # as e.g. ``1e-05``. The exponent is capped at three digits so that
+    # arithmetic on the parsed value stays within Decimal's default context.
+    _NUMBER_RE = re.compile(r'-?\d+(?:\.\d+)?(?:[eE][+-]?\d{1,3})?')
+    _INTEGER_RE = re.compile(r'-?\d+')
+    _TOKEN_RE = re.compile(r'\S+')
+
+    def __init__(self, output, strict=False):
+        self.output = output
+        self.lines = output.splitlines()
+        self.strict = strict
+
+    def parse(self):
+        """Return one list of parsed elements per line of the output."""
+        return [self.parse_line(line) for line in self.lines]
+
+    def parse_line(self, line):
+        """Parse the whitespace-separated tokens of a single line.
+
+        Raises:
+            ParseError: if ``parse_element`` returns None for a token.
+        """
+        elements = []
+        for token in self._TOKEN_RE.findall(line):
+            element = self.parse_element(token)
+            if element is None:
+                raise ParseError(f"Failed to match element at '{token}'")
+            elements.append(element)
+        return elements
+
+    def parse_element(self, token):
+        """Return the numeric value of ``token``, or the token itself."""
+        number = self.parse_number(token)
+        return token if number is None else number
+
+    def parse_number(self, token):
+        """Convert ``token`` to a number, or return None if it is not one.
+
+        The whole token has to be numeric; partial matches are rejected.
+        """
+        if self._NUMBER_RE.fullmatch(token) is None:
+            return None
+        if self.strict and self._INTEGER_RE.fullmatch(token):
+            return int(token)
+        return Decimal(token)
+
+# Absolute tolerance that match() applies to numeric tokens when the problem
+# has no dedicated entry in FLOAT_EPS.
+DEFAULT_FLOAT_EPS = 1e-4
+# Per-problem absolute tolerances, keyed by problem id; match() looks the
+# problem up here and falls back to DEFAULT_FLOAT_EPS.
+FLOAT_EPS = {
+    'p02400': 1e-5, 'p02008': 1e-6, 'p03882': 1e-9, 'p02805': 1e-6, 'p03585': 1e-9,
+    'p03619': 1e-11, 'p01562': 1e-6, 'p03428': 1e-5, 'p01837': 1e-6, 'p03135': 1e-3,
+    'p02764': 1e-6, 'p03888': 1e-6, 'p03110': 1e-5, 'p03901': 1e-6, 'p01836': 1e-8,
+    'p00973': 1e-6, 'p03043': 1e-9, 'p01948': 1e-6, 'p01800': 1e-6, 'p03304': 1e-6,
+    'p01704': 1e-4, 'p03001': 1e-9, 'p02072': 1e-3, 'p02897': 1e-6, 'p03754': 1e-6,
+    'p02731': 1e-6, 'p03879': 1e-9, 'p02677': 1e-9, 'p03953': 1e-9, 'p02894': 1e-9,
+    'p02705': 1e-2, 'p01825': 1e-6, 'p03514': 1e-9, 'p01672': 1e-8, 'p02882': 1e-6,
+    'p03881': 1e-9, 'p02075': 1e-9, 'p00988': 1e-7, 'p03744': 1e-6, 'p01685': 1e-6,
+    'p03872': 1e-9, 'p01703': 1e-8, 'p03869': 1e-9, 'p02884': 1e-6, 'p03866': 1e-9,
+    'p02780': 1e-6, 'p01568': 1e-6, 'p01705': 1e-4, 'p01576': 1e-8, 'p02935': 1e-5,
+    'p03004': 1e-9, 'p02011': 1e-6, 'p01708': 1e-2, 'p03776': 1e-6, 'p02934': 1e-5,
+    'p01363': 1e-6, 'p01510': 1e-9, 'p03871': 1e-9, 'p02379': 1e-4
+}
+
+def match(expected_output, actual_output, problem_id):
+    """Decide whether a program's output is acceptable for a test case.
+
+    Trailing newlines are ignored. Outputs that are not textually identical
+    are parsed into lines of tokens: the line count and the token count of
+    every line must agree, numeric tokens must be within the problem's
+    tolerance of each other, and every other token must be equal.
+
+    Args:
+        expected_output: reference output of the test case.
+        actual_output: output produced by the program, or None.
+        problem_id: key used to look up the tolerance in FLOAT_EPS.
+
+    Returns:
+        True if the outputs match, False otherwise (always False when
+        ``actual_output`` is None).
+    """
+    if actual_output is None:
+        return False
+
+    wanted_text = expected_output.rstrip('\n')
+    got_text = actual_output.rstrip('\n')
+    if wanted_text == got_text:
+        return True
+
+    wanted_rows = OutputParser(wanted_text).parse()
+    got_rows = OutputParser(got_text).parse()
+    if len(wanted_rows) != len(got_rows):
+        return False
+
+    tolerance = FLOAT_EPS.get(problem_id, DEFAULT_FLOAT_EPS)
+
+    def same(wanted, got):
+        # Two numbers are compared with tolerance, anything else exactly.
+        if isinstance(wanted, Decimal) and isinstance(got, Decimal):
+            return not (abs(got - wanted) > tolerance)
+        return not (got != wanted)
+
+    return all(
+        len(wanted_row) == len(got_row)
+        and all(same(wanted, got) for wanted, got in zip(wanted_row, got_row))
+        for wanted_row, got_row in zip(wanted_rows, got_rows)
+    )
diff --git a/elleelleaime/core/benchmarks/runbugrun/runbugrun.py b/elleelleaime/core/benchmarks/runbugrun/runbugrun.py
@@ -0,0 +1,193 @@
+from pathlib import Path
+from unidiff import PatchSet
+from elleelleaime.core.benchmarks.benchmark import Benchmark
+from elleelleaime.core.benchmarks.runbugrun.runbugrunbug import RunBugRunBug
+from elleelleaime.core.benchmarks.runbugrun.output_matcher import match as match_output
+
+import os
+import json
+import subprocess
+import logging
+from tqdm import tqdm
+import pandas as pd
+import concurrent.futures
+
+
+class RunBugRun(Benchmark):
+    """
+    The class for representing the RunBugRun benchmark.
+    """
+
+    def __init__(self, path: Path = Path("benchmarks/run_bug_run").absolute()) -> None:
+        super().__init__("runbugrun", path)
+
+    def initialize(self) -> None:
+        """
+        Initializes the RunBugRun benchmark object by collecting all the bugs.
+
+        Reads the submission pairs from ``python_valid0.jsonl`` and the test
+        cases from ``tests_all.jsonl``, writes every pair to
+        ``buggy/<id>.py`` and ``fixed/<id>.py``, diffs the two files and
+        registers a ``RunBugRunBug`` for each submission that has at least one
+        failing test.
+        """
+        logging.info("Initializing RunBugRun benchmark...")
+
+        python_path = Path(self.get_path(), "python_valid0.jsonl")
+        test_path = Path(self.get_path(), "tests_all.jsonl")
+
+        # Context managers make sure the input files are closed again.
+        with open(python_path) as python_file:
+            python_df = pd.read_json(python_file, lines=True).set_index("problem_id")
+        with open(test_path) as test_file:
+            test_df = pd.read_json(test_file, lines=True).set_index("id")
+
+        Path(self.path, "buggy").mkdir(parents=True, exist_ok=True)
+        Path(self.path, "fixed").mkdir(parents=True, exist_ok=True)
+
+        buggy_submissions = python_df.drop_duplicates(subset=["buggy_submission_id"])
+        columns = [
+            "buggy_submission_id",
+            "buggy_code",
+            "fixed_submission_id",
+            "fixed_code",
+            "errors",
+        ]
+        pbar = tqdm(
+            buggy_submissions[columns].iterrows(),
+            total=len(buggy_submissions),
+        )
+
+        for prob_id, (
+            buggy_submission_id,
+            buggy_code,
+            _fixed_submission_id,
+            fixed_code,
+            errors,
+        ) in pbar:
+            buggy_file = Path(self.path, "buggy", f"{prob_id}_{buggy_submission_id}.py")
+            fixed_file = Path(
+                self.path, "fixed", f"{prob_id}_{buggy_submission_id}.py"
+            )  # using buggy id for both to maintain file correspondence
+
+            # Iterating over the bar already advances it by one per item, so
+            # no explicit pbar.update() here (it would double-count).
+            pbar.set_postfix({"file": buggy_file})
+
+            with open(buggy_file, "w") as f:
+                f.write(buggy_code)
+                f.write("\n")
+
+            with open(fixed_file, "w") as f:
+                f.write(fixed_code)
+                f.write("\n")
+
+            # Argument list instead of a shell string: paths are passed
+            # verbatim. diff exits with 1 when the files differ, so the return
+            # code is deliberately not checked.
+            run = subprocess.run(
+                [
+                    "diff",
+                    "--unified",
+                    str(fixed_file.relative_to(self.path)),
+                    str(buggy_file.relative_to(self.path)),
+                ],
+                cwd=self.get_path(),
+                capture_output=True,
+            )
+
+            diff = PatchSet(run.stdout.decode("utf-8"))
+            if len(diff) == 0:
+                # Identical buggy/fixed code (or a failed diff): nothing to repair.
+                logging.warning(
+                    "No diff for %s_%s, skipping", prob_id, buggy_submission_id
+                )
+                continue
+            # Change the source file path to point to the buggy version
+            diff[0].source_file = f"{buggy_file.relative_to(self.path)}"
+
+            test_rows = test_df[test_df.problem_id == prob_id][["input", "output"]]
+            failing_tests = self.get_failing_tests(buggy_file, errors, test_rows, prob_id)
+            if failing_tests:
+                self.add_bug(
+                    RunBugRunBug(
+                        self,
+                        f"{prob_id}_{buggy_submission_id}",
+                        str(diff),
+                        failing_tests,
+                    )
+                )
+
+    def get_failing_tests(self, buggy_file, errors, test_rows, prob_id):
+        """
+        Collects the failing tests of one buggy submission.
+
+        If ``errors`` is a list, every test is reported as failing with the
+        first recorded error. Otherwise the tests are executed and the raw
+        results are cached next to ``buggy_file`` (same name, ``.jsonl``
+        suffix); when that cache file already exists it is read instead of
+        running anything.
+
+        Args:
+            buggy_file: path of the buggy program.
+            errors: list of recorded runtime errors (only the first entry is
+                used), or a non-list value if there is none.
+            test_rows: frame indexed by test id with the columns ``input``
+                and ``output``.
+            prob_id: problem id, selects the numeric tolerance for matching.
+
+        Returns:
+            Dict mapping ``test_<id>`` to a textual failure cause. The layout
+            of that text is parsed back by ``RunBugRunBug.test``.
+        """
+        failing_tests = {}
+
+        results_path = Path(self.get_path(), buggy_file.with_suffix(".jsonl"))
+        already_cached = results_path.exists()
+        has_recorded_error = isinstance(errors, list)
+
+        if has_recorded_error:
+            for test_id, (test_input, test_output) in test_rows.iterrows():
+                test_input = test_input.strip()
+                test_output = test_output.strip()
+                result = errors[0]["exception"] + "\n" + errors[0]["output"]
+                cause = f"""Function with input:\n{test_input}\nexpected to output:\n{test_output}\nfailed with error:\n{result.strip()}"""
+                failing_tests[f"""test_{test_id}"""] = cause
+
+        if already_cached:
+            failing_tests.update(self._failing_tests_from_cache(results_path, prob_id))
+        elif not has_recorded_error:
+            # if there isn't a runtime exception, need to execute to get the
+            # cause of test failure
+            failing_tests.update(
+                self._run_and_cache_tests(buggy_file, test_rows, prob_id, results_path)
+            )
+
+        return failing_tests
+
+    @staticmethod
+    def _failure_cause(test_input, test_output, returncode, result, prob_id):
+        """Returns the failure text for one executed test, or None if it passed."""
+        if returncode:
+            return f"""Function with input:\n{test_input}\nexpected to output:\n{test_output}\nfailed with error:\n{result.strip()}"""
+        if not match_output(test_output, result, prob_id):
+            return f"""Function with input:\n{test_input}\nexpected to output:\n{test_output}\nbut got:\n{result}"""
+        return None
+
+    def _run_and_cache_tests(self, buggy_file, test_rows, prob_id, results_path):
+        """Runs all tests against ``buggy_file`` and stores the raw results."""
+        failing_tests = {}
+        test_results = []
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
+            futures_to_tests = {}
+            for test_id, (test_input, test_output) in test_rows.iterrows():
+                test_input = test_input.strip()
+                test_output = test_output.strip()
+                future = executor.submit(
+                    RunBugRunBug.execute_test_case, buggy_file, test_input
+                )
+                futures_to_tests[future] = (test_id, test_input, test_output)
+
+            for future in concurrent.futures.as_completed(futures_to_tests):
+                returncode, result = future.result()
+                test_id, test_input, test_output = futures_to_tests[future]
+                cause = self._failure_cause(
+                    test_input, test_output, returncode, result, prob_id
+                )
+                if cause is not None:
+                    failing_tests[f"""test_{test_id}"""] = cause
+                test_results.append(
+                    {
+                        "id": test_id,
+                        "input": test_input,
+                        "output": test_output,
+                        "returncode": returncode,
+                        "result": result,
+                    }
+                )
+
+        if test_results:
+            pd.DataFrame(test_results).to_json(
+                results_path, orient="records", lines=True
+            )
+
+        return failing_tests
+
+    def _failing_tests_from_cache(self, results_path, prob_id):
+        """Rebuilds the failing tests from a previously written results file."""
+        # dtype=False keeps inputs/outputs as the strings that were written;
+        # pandas' dtype inference may otherwise turn numeric-looking text
+        # columns into numbers.
+        with open(results_path) as results_file:
+            cached = pd.read_json(results_file, lines=True, dtype=False).set_index("id")
+
+        failing_tests = {}
+        for test_id, row in cached.iterrows():
+            # Same tolerant comparison as for freshly executed tests, so that
+            # cached and uncached runs agree on which tests fail.
+            cause = self._failure_cause(
+                row["input"], row["output"], row["returncode"], row["result"], prob_id
+            )
+            if cause is not None:
+                failing_tests[f"""test_{test_id}"""] = cause
+        return failing_tests
diff --git a/elleelleaime/core/benchmarks/runbugrun/runbugrunbug.py b/elleelleaime/core/benchmarks/runbugrun/runbugrunbug.py
@@ -0,0 +1,102 @@
+import subprocess
+import shutil
+import os
+from pathlib import Path
+import re
+
+from elleelleaime.core.benchmarks.benchmark import Benchmark
+from elleelleaime.core.benchmarks.bug import RichBug
+from elleelleaime.core.benchmarks.test_result import TestResult
+from elleelleaime.core.benchmarks.compile_result import CompileResult
+from elleelleaime.core.benchmarks.runbugrun.output_matcher import match as match_output
+
+
+class RunBugRunBug(RichBug):
+    """
+    The class for representing RunBugRun bugs
+    """
+
+    # Layout of the failure causes stored in ``failing_tests``. Non-greedy
+    # groups plus the complete marker lines keep the expected output from
+    # swallowing parts of an error text that itself contains a line starting
+    # with "failed" or "but got".
+    _CAUSE_RE = re.compile(
+        "Function with input:\n(.*?)\nexpected to output:\n(.*?)\n(?:failed with error|but got):\n",
+        re.DOTALL,
+    )
+
+    def checkout(self, path: str, fixed: bool = False) -> bool:
+        """
+        Recreates ``path`` and copies the bug's source file into it.
+
+        The file is taken from the benchmark's ``fixed`` folder when ``fixed``
+        is True and from ``buggy`` otherwise. It always lands in
+        ``<path>/buggy``, which is where ``compile`` and ``test`` look for it.
+
+        Returns:
+            True when the copy succeeded.
+
+        Raises:
+            subprocess.CalledProcessError: if creating the directory or
+                copying the file fails.
+        """
+        # Remove the directory if it exists
+        shutil.rmtree(path, ignore_errors=True)
+
+        # Argument lists instead of one shell string: paths are passed
+        # verbatim and every step is checked, not only the last one.
+        target_dir = Path(path, "buggy")
+        subprocess.run(
+            ["mkdir", "-p", str(target_dir)],
+            capture_output=True,
+            check=True,
+        )
+
+        # Checkout the bug is the same as copying its source file
+        subfolder = "fixed" if fixed else "buggy"
+        source = Path(self.benchmark.get_path(), subfolder, f"{self.identifier}.py")
+        run = subprocess.run(
+            ["cp", str(source), str(target_dir)],
+            capture_output=True,
+            check=True,
+        )
+
+        return run.returncode == 0
+
+    def compile(self, path: str) -> CompileResult:
+        """
+        Checks that the checked-out source file is compilable Python.
+
+        The file is only byte-compiled with the builtin ``compile``; it is
+        not executed.
+        """
+        file_path = Path(path, "buggy", f"{self.get_identifier()}.py")
+        assert file_path.exists()
+
+        with open(file_path) as f:
+            bug_code = f.read()
+        assert bug_code
+
+        try:
+            compile(bug_code, file_path, "exec")
+        except Exception:
+            # Whatever the builtin raises means the code does not compile;
+            # unlike a bare except this lets KeyboardInterrupt/SystemExit pass.
+            return CompileResult(False)
+        return CompileResult(True)
+
+    def test(self, path: str) -> TestResult:
+        """
+        Re-runs the bug's failing tests against the checked-out source file.
+
+        Input and expected output of every test are recovered from the stored
+        failure cause. The result is successful only if every test exits with
+        code 0 and its output matches the expected one.
+
+        Raises:
+            ValueError: if a stored failure cause does not have the expected
+                layout.
+        """
+        file_path = Path(path, "buggy", f"{self.get_identifier()}.py")
+        assert file_path.exists()
+
+        problem_id = self.get_identifier().split('_')[0]
+
+        for test_case, cause in self.failing_tests.items():
+            parsed = self._CAUSE_RE.search(cause)
+            if parsed is None:
+                raise ValueError(f"Malformed failure cause for {test_case}")
+            test_input, test_output = parsed.group(1), parsed.group(2)
+            error_code, result = RunBugRunBug.execute_test_case(file_path, test_input)
+
+            if error_code:
+                return TestResult(False)
+            if not match_output(test_output.strip(), result.strip(), problem_id):
+                return TestResult(False)
+
+        return TestResult(True)
+
+    @staticmethod
+    def execute_test_case(code_path, test_input, timeout=1):
+        """
+        Runs ``python code_path`` with ``test_input`` on its standard input.
+
+        The input is piped to the process directly instead of being spliced
+        into a shell command, so quotes, ``$``, backticks and backslashes in
+        it reach the program unchanged and long inputs cannot exceed the
+        argument-length limit.
+
+        Args:
+            code_path: path of the program to run.
+            test_input: text fed to the program; a trailing newline is added
+                unless the input is blank, in which case stdin is empty.
+            timeout: seconds to wait before the run is aborted.
+
+        Returns:
+            Tuple ``(returncode, text)`` where ``text`` is the stripped
+            stderr if the return code is non-zero and the stripped stdout
+            otherwise. A timeout yields return code 1, a failure to start the
+            process yields 255.
+        """
+        stdin_text = f"{test_input}\n" if test_input.strip() else ""
+        try:
+            run = subprocess.run(
+                ["python", str(code_path)],
+                input=stdin_text.encode("utf-8"),
+                capture_output=True,
+                check=False,
+                timeout=timeout,
+            )
+        except OSError as error:
+            return 255, f"OSError: {error}"
+        except subprocess.TimeoutExpired:
+            return 1, f"Command 'python {code_path}' timed out after {timeout} seconds"
+
+        stream = run.stderr if run.returncode else run.stdout
+        # errors="replace": output that is not valid UTF-8 must not abort the run.
+        return run.returncode, stream.decode("utf-8", errors="replace").strip()
+
+    def get_src_test_dir(self, path: str) -> str:
+        """Returns ``path`` unchanged."""
+        return path