From b762e659d1e8e5316dcd4eaaa124f96e17db6026 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Wed, 20 Mar 2024 10:08:51 +0100 Subject: [PATCH 001/124] Added explicit jobid logging to srun --- contrib/slurm/slurmexecutor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index 961b8cb8b..0ad5660ae 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -204,7 +204,7 @@ def execute(self, run): return None -jobid_pattern = re.compile(r"job (\d*) queued") +jobid_pattern = re.compile(r"job (\d*) started") def wait_for(func, timeout_sec=None, poll_interval_sec=1): @@ -288,7 +288,7 @@ def run_slurm(benchmark, args, log_file): [ "sh", "-c", - f"{' '.join(map(util.escape_string_shell, args))}; echo $? > exitcode", + f"echo job $SLURM_JOB_ID started; {' '.join(map(util.escape_string_shell, args))}; echo $? > exitcode", ] ) From 702f525f2426c359e57c389f64851fc61252a982 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Wed, 20 Mar 2024 10:11:43 +0100 Subject: [PATCH 002/124] Added logging to retry --- contrib/slurm/slurmexecutor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index 0ad5660ae..a114b1a85 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -311,6 +311,7 @@ def run_slurm(benchmark, args, log_file): if jobid_match: jobid = int(jobid_match.group(1)) break + logging.debug("Pattern not found in log line: %s", line) seff_command = ["seff", str(jobid)] logging.debug( From 6037e7f4191f56371a04dd2947fcd3eb56021ed1 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Wed, 20 Mar 2024 11:04:55 +0100 Subject: [PATCH 003/124] Trying with small wait before seff --- contrib/slurm/slurmexecutor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index a114b1a85..ce773cead 100644 --- 
a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -318,6 +318,8 @@ def run_slurm(benchmark, args, log_file): "Command to run: %s", " ".join(map(util.escape_string_shell, seff_command)) ) + time.sleep(10) + def get_checked_seff_result(): seff_result = subprocess.run( seff_command, From 512fc56daa5c016e7e34142e01567e40fa9b475a Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Mar 2024 13:35:17 +0100 Subject: [PATCH 004/124] Fixed cancelling flow, as well as job id matching --- contrib/slurm/slurmexecutor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index ce773cead..6f821d7f2 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -216,7 +216,7 @@ def wait_for(func, timeout_sec=None, poll_interval_sec=1): """ start_time = time.monotonic() - while True: + while not STOPPED_BY_INTERRUPT: ret = func() if ret is not None: return ret @@ -296,7 +296,7 @@ def run_slurm(benchmark, args, log_file): "Command to run: %s", " ".join(map(util.escape_string_shell, srun_command)) ) jobid = None - while jobid is None: + while jobid is None and not STOPPED_BY_INTERRUPT: with open(tmp_log, "w") as tmp_log_f: subprocess.run( srun_command, @@ -304,9 +304,9 @@ def run_slurm(benchmark, args, log_file): stderr=subprocess.STDOUT, ) - # we try to read back the log, in the first two lines there should be the jobid + # we try to read back the log, in the first three lines, there should be the jobid with open(tmp_log, "r") as tmp_log_f: - for line in itertools.islice(tmp_log_f, 2): + for line in itertools.islice(tmp_log_f, 3): jobid_match = jobid_pattern.search(line) if jobid_match: jobid = int(jobid_match.group(1)) From d8f14956e13ae69213a8567eaf995dfba89fd79b Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Mar 2024 16:02:19 +0100 Subject: [PATCH 005/124] Using sacct instead of seff by default --- contrib/slurm-benchmark.py | 6 
++ contrib/slurm/slurmexecutor.py | 109 +++++++++++++++++++++++++-------- 2 files changed, 90 insertions(+), 25 deletions(-) diff --git a/contrib/slurm-benchmark.py b/contrib/slurm-benchmark.py index d91e298b8..794b40393 100755 --- a/contrib/slurm-benchmark.py +++ b/contrib/slurm-benchmark.py @@ -63,6 +63,12 @@ def create_argument_parser(self): default="0", help="Retry killed jobs this many times. Use -1 for unbounded retry attempts.", ) + slurm_args.add_argument( + "--use-seff", + dest="seff", + action="store_true", + help="Use seff instead of sacct for resource measurement data.", + ) return parser diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index 6f821d7f2..b192a25ce 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -230,6 +230,8 @@ def wait_for(func, timeout_sec=None, poll_interval_sec=1): def run_slurm(benchmark, args, log_file): + global STOPPED_BY_INTERRUPT + timelimit = benchmark.rlimits.cputime cpus = benchmark.rlimits.cpu_cores memory = benchmark.rlimits.memory @@ -304,6 +306,9 @@ def run_slurm(benchmark, args, log_file): stderr=subprocess.STDOUT, ) + if STOPPED_BY_INTERRUPT: # job cancelled while srun was running, log not necessarily finalized + return + # we try to read back the log, in the first three lines, there should be the jobid with open(tmp_log, "r") as tmp_log_f: for line in itertools.islice(tmp_log_f, 3): @@ -313,30 +318,10 @@ def run_slurm(benchmark, args, log_file): break logging.debug("Pattern not found in log line: %s", line) - seff_command = ["seff", str(jobid)] - logging.debug( - "Command to run: %s", " ".join(map(util.escape_string_shell, seff_command)) - ) - - time.sleep(10) - - def get_checked_seff_result(): - seff_result = subprocess.run( - seff_command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - ) - if "exit code" in str(seff_result.stdout): - return seff_result - else: - return None + if STOPPED_BY_INTERRUPT: # job was cancelled during log parsing, no 
job id present + return - # sometimes `seff` needs a few extra seconds to realize the task has ended - result = wait_for(get_checked_seff_result, 30, 2) - - slurm_status, exit_code, cpu_time, wall_time, memory_usage = parse_seff( - str(result.stdout) - ) + raw_output, slurm_status, exit_code, cpu_time, wall_time, memory_usage = run_seff(jobid) if benchmark.config.seff else run_sacct(jobid) if os.path.exists(exitcode_file): with open(exitcode_file, "r") as f: @@ -379,12 +364,86 @@ def get_checked_seff_result(): if benchmark.config.debug: with open(log_file + ".debug_info", "w+") as file: file.write(f"jobid: {jobid}\n") - file.write(f"seff output: {str(result.stdout)}\n") + file.write(f"seff output: {str(raw_output)}\n") file.write(f"Parsed data: {str(ret)}\n") return ret +time_pattern = re.compile(r"(?:(\d+):)?(\d+):(\d+)(?:\.(\d+))?") + + +def get_seconds_from_time(time_str): + time_match = time_pattern.search(time_str) + if time_match: + hours, minutes, seconds, millis = time_match.groups() + return int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(millis) / 1000 + + +def run_sacct(jobid): + global STOPPED_BY_INTERRUPT + + sacct_command = ["sacct", "-j", str(jobid), "-n", "--format=State,ExitCode,TotalCpu,Elapsed,MaxRSS"] + logging.debug( + "Command to run: %s", " ".join(map(util.escape_string_shell, sacct_command)) + ) + + def get_checked_seff_result(): + sacct_result = subprocess.run( + sacct_command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + try: + lines = str(sacct_result.stdout).split("\n") + if len(lines) < 2: + return None # jobs not yet ready + parent_job = lines[0].split() # State is read from here + child_job = lines[1].split() # ExitCode, TotalCPU, Elapsed and MaxRSS read from here + if parent_job[0] in ["RUNNING", "PENDING", "REQUEUED", "RESIZING", "SUSPENDED", "R", "PD", "RQ", "RS", "S"]: + return None # not finished + return (sacct_result.stdout, + parent_job[0], # State + child_job[1].split(":")[0], # ExitCode + 
get_seconds_from_time(child_job[2]), #TotalCPU in seconds + get_seconds_from_time(child_job[3]), #Elapsed in seconds + float(child_job[4][:-1])*1000) # MaxRSS in K * 1000 -> Bytes + except ValueError: + return None + + # sometimes `seff` needs a few extra seconds to realize the task has ended + return wait_for(get_checked_seff_result, 30, 2) + + +def run_seff(jobid): + global STOPPED_BY_INTERRUPT + + seff_command = ["seff", str(jobid)] + logging.debug( + "Command to run: %s", " ".join(map(util.escape_string_shell, seff_command)) + ) + + time.sleep(10) + + def get_checked_seff_result(): + seff_result = subprocess.run( + seff_command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + if "exit code" in str(seff_result.stdout): + return seff_result + else: + return None + + # sometimes `seff` needs a few extra seconds to realize the task has ended + result = wait_for(get_checked_seff_result, 30, 2) + if STOPPED_BY_INTERRUPT: # job was cancelled + return + + return result.stdout, *parse_seff(str(result.stdout)) + + exit_code_pattern = re.compile(r"State: ([A-Z-_]*) \(exit code (\d+)\)") cpu_time_pattern = re.compile(r"CPU Utilized: (\d+):(\d+):(\d+)") wall_time_pattern = re.compile(r"Job Wall-clock time: (\d+):(\d+):(\d+)") @@ -417,4 +476,4 @@ def parse_seff(result): f"Exit code: {exit_code}, memory usage: {memory_usage}, walltime: {wall_time}, cpu time: {cpu_time}" ) - return slurm_status, exit_code, cpu_time, wall_time, memory_usage + return slurm_status, exit_code, cpu_time, wall_time, memory_usage \ No newline at end of file From 5431275be23bc53681f02c6d1c750095f94fee2e Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Mar 2024 16:07:53 +0100 Subject: [PATCH 006/124] Added logging --- contrib/slurm/slurmexecutor.py | 35 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index b192a25ce..c7a6790da 100644 --- 
a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -388,31 +388,30 @@ def run_sacct(jobid): "Command to run: %s", " ".join(map(util.escape_string_shell, sacct_command)) ) - def get_checked_seff_result(): + def get_checked_sacct_result(): sacct_result = subprocess.run( sacct_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, ) - try: - lines = str(sacct_result.stdout).split("\n") - if len(lines) < 2: - return None # jobs not yet ready - parent_job = lines[0].split() # State is read from here - child_job = lines[1].split() # ExitCode, TotalCPU, Elapsed and MaxRSS read from here - if parent_job[0] in ["RUNNING", "PENDING", "REQUEUED", "RESIZING", "SUSPENDED", "R", "PD", "RQ", "RS", "S"]: - return None # not finished - return (sacct_result.stdout, - parent_job[0], # State - child_job[1].split(":")[0], # ExitCode - get_seconds_from_time(child_job[2]), #TotalCPU in seconds - get_seconds_from_time(child_job[3]), #Elapsed in seconds - float(child_job[4][:-1])*1000) # MaxRSS in K * 1000 -> Bytes - except ValueError: - return None + lines = str(sacct_result.stdout).split("\n") + if len(lines) < 2: + logging.debug("Sacct output not yet ready: %s", sacct_result.stdout) + return None # jobs not yet ready + parent_job = lines[0].split() # State is read from here + child_job = lines[1].split() # ExitCode, TotalCPU, Elapsed and MaxRSS read from here + if parent_job[0] in ["RUNNING", "PENDING", "REQUEUED", "RESIZING", "SUSPENDED", "R", "PD", "RQ", "RS", "S"]: + logging.debug("Sacct output not yet ready due to state: %s", parent_job[0]) + return None # not finished + return (sacct_result.stdout, + parent_job[0], # State + child_job[1].split(":")[0], # ExitCode + get_seconds_from_time(child_job[2]), #TotalCPU in seconds + get_seconds_from_time(child_job[3]), #Elapsed in seconds + float(child_job[4][:-1])*1000) # MaxRSS in K * 1000 -> Bytes # sometimes `seff` needs a few extra seconds to realize the task has ended - return 
wait_for(get_checked_seff_result, 30, 2) + return wait_for(get_checked_sacct_result, 30, 2) def run_seff(jobid): From 18a827e1b9fcc49ca39c169521a58f5f044e5c05 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Mar 2024 16:09:26 +0100 Subject: [PATCH 007/124] Fixd log --- contrib/slurm/slurmexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index c7a6790da..d0ad23e45 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -396,7 +396,7 @@ def get_checked_sacct_result(): ) lines = str(sacct_result.stdout).split("\n") if len(lines) < 2: - logging.debug("Sacct output not yet ready: %s", sacct_result.stdout) + logging.debug("Sacct output not yet ready: %s", lines) return None # jobs not yet ready parent_job = lines[0].split() # State is read from here child_job = lines[1].split() # ExitCode, TotalCPU, Elapsed and MaxRSS read from here From 5e5f74acef608d28cc854c1147e2f7d6fb2ba02f Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Mar 2024 16:12:03 +0100 Subject: [PATCH 008/124] Fixed line splitting --- contrib/slurm/slurmexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index d0ad23e45..f36f2addb 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -394,7 +394,7 @@ def get_checked_sacct_result(): stdout=subprocess.PIPE, stderr=subprocess.STDOUT, ) - lines = str(sacct_result.stdout).split("\n") + lines = sacct_result.stdout.splitlines() if len(lines) < 2: logging.debug("Sacct output not yet ready: %s", lines) return None # jobs not yet ready From 796ef71c103e9463855acd86e0c751efe7aa42f6 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Mar 2024 16:13:40 +0100 Subject: [PATCH 009/124] str() --- contrib/slurm/slurmexecutor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index f36f2addb..fd76180d5 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -404,11 +404,11 @@ def get_checked_sacct_result(): logging.debug("Sacct output not yet ready due to state: %s", parent_job[0]) return None # not finished return (sacct_result.stdout, - parent_job[0], # State - child_job[1].split(":")[0], # ExitCode - get_seconds_from_time(child_job[2]), #TotalCPU in seconds - get_seconds_from_time(child_job[3]), #Elapsed in seconds - float(child_job[4][:-1])*1000) # MaxRSS in K * 1000 -> Bytes + str(parent_job[0]), # State + str(child_job[1]).split(":")[0], # ExitCode + get_seconds_from_time(str(child_job[2])), #TotalCPU in seconds + get_seconds_from_time(str(child_job[3])), #Elapsed in seconds + float(str(child_job[4])[:-1])*1000) # MaxRSS in K * 1000 -> Bytes # sometimes `seff` needs a few extra seconds to realize the task has ended return wait_for(get_checked_sacct_result, 30, 2) From 093b44823261a7f315b9fe2c5e6ad3848ab14a3c Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Mar 2024 16:16:09 +0100 Subject: [PATCH 010/124] Added none checks to int() --- contrib/slurm/slurmexecutor.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index fd76180d5..ceb4afc9a 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -377,6 +377,14 @@ def get_seconds_from_time(time_str): time_match = time_pattern.search(time_str) if time_match: hours, minutes, seconds, millis = time_match.groups() + if hours is None: + hours = 0 + if minutes is None: + minutes = 0 # realistically never None, but doesn't hurt + if seconds is None: + seconds = 0 # realistically never None, but doesn't hurt + if millis is None: + millis = 0 return int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(millis) / 1000 From cafd1d97abe46fd424c3e61a71543b0be5e23cfa Mon Sep 17 00:00:00 
2001 From: Levente Bajczi Date: Thu, 21 Mar 2024 16:21:37 +0100 Subject: [PATCH 011/124] Added log --- contrib/slurm/slurmexecutor.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index ceb4afc9a..41d0e7a88 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -408,15 +408,16 @@ def get_checked_sacct_result(): return None # jobs not yet ready parent_job = lines[0].split() # State is read from here child_job = lines[1].split() # ExitCode, TotalCPU, Elapsed and MaxRSS read from here + logging.debug("Sacct data: parent: %s; child: %s", parent_job, child_job) if parent_job[0] in ["RUNNING", "PENDING", "REQUEUED", "RESIZING", "SUSPENDED", "R", "PD", "RQ", "RS", "S"]: logging.debug("Sacct output not yet ready due to state: %s", parent_job[0]) return None # not finished return (sacct_result.stdout, str(parent_job[0]), # State - str(child_job[1]).split(":")[0], # ExitCode - get_seconds_from_time(str(child_job[2])), #TotalCPU in seconds - get_seconds_from_time(str(child_job[3])), #Elapsed in seconds - float(str(child_job[4])[:-1])*1000) # MaxRSS in K * 1000 -> Bytes + str(child_job[1]).split(":")[0], # ExitCode + get_seconds_from_time(str(child_job[2])), #TotalCPU in seconds + get_seconds_from_time(str(child_job[3])), #Elapsed in seconds + float(str(child_job[4])[:-1])*1000) # MaxRSS in K * 1000 -> Bytes # sometimes `seff` needs a few extra seconds to realize the task has ended return wait_for(get_checked_sacct_result, 30, 2) From 35bfb00f0983665d4c67add0b1905768a1b43e57 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Mar 2024 16:27:44 +0100 Subject: [PATCH 012/124] str() -> .decode() --- contrib/slurm/slurmexecutor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index 41d0e7a88..4a783b7eb 100644 --- a/contrib/slurm/slurmexecutor.py +++ 
b/contrib/slurm/slurmexecutor.py @@ -413,11 +413,11 @@ def get_checked_sacct_result(): logging.debug("Sacct output not yet ready due to state: %s", parent_job[0]) return None # not finished return (sacct_result.stdout, - str(parent_job[0]), # State - str(child_job[1]).split(":")[0], # ExitCode - get_seconds_from_time(str(child_job[2])), #TotalCPU in seconds - get_seconds_from_time(str(child_job[3])), #Elapsed in seconds - float(str(child_job[4])[:-1])*1000) # MaxRSS in K * 1000 -> Bytes + parent_job[0].decode(), # State + child_job[1].decode().split(":")[0], # ExitCode + get_seconds_from_time(child_job[2].decode()), #TotalCPU in seconds + get_seconds_from_time(child_job[3].decode()), #Elapsed in seconds + float(child_job[4].decode()[:-1])*1000) # MaxRSS in K * 1000 -> Bytes # sometimes `seff` needs a few extra seconds to realize the task has ended return wait_for(get_checked_sacct_result, 30, 2) From 25958b1cabb4b848d18df4e4a41cc0104fa32901 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Mar 2024 16:30:01 +0100 Subject: [PATCH 013/124] Missing decode() --- contrib/slurm/slurmexecutor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index 4a783b7eb..18b2bb207 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -409,8 +409,8 @@ def get_checked_sacct_result(): parent_job = lines[0].split() # State is read from here child_job = lines[1].split() # ExitCode, TotalCPU, Elapsed and MaxRSS read from here logging.debug("Sacct data: parent: %s; child: %s", parent_job, child_job) - if parent_job[0] in ["RUNNING", "PENDING", "REQUEUED", "RESIZING", "SUSPENDED", "R", "PD", "RQ", "RS", "S"]: - logging.debug("Sacct output not yet ready due to state: %s", parent_job[0]) + if parent_job[0].decode() in ["RUNNING", "PENDING", "REQUEUED", "RESIZING", "SUSPENDED", "R", "PD", "RQ", "RS", "S"]: + logging.debug("Sacct output not yet ready due to state: %s", 
parent_job[0].decode()) return None # not finished return (sacct_result.stdout, parent_job[0].decode(), # State From 1c47cd02d63645552695e037f2a12c3fa52bd048 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Mar 2024 16:31:54 +0100 Subject: [PATCH 014/124] Check for not available memory --- contrib/slurm/slurmexecutor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index 18b2bb207..197734d46 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -412,6 +412,9 @@ def get_checked_sacct_result(): if parent_job[0].decode() in ["RUNNING", "PENDING", "REQUEUED", "RESIZING", "SUSPENDED", "R", "PD", "RQ", "RS", "S"]: logging.debug("Sacct output not yet ready due to state: %s", parent_job[0].decode()) return None # not finished + if len(child_job) < 5: + logging.debug("Sacct output not yet ready due to memory not available: %s", child_job) + return None # not finished return (sacct_result.stdout, parent_job[0].decode(), # State child_job[1].decode().split(":")[0], # ExitCode From 83ca9195ffa23b46da65c995b65ac0045b967534 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Mar 2024 16:34:27 +0100 Subject: [PATCH 015/124] Removed sleep() --- contrib/slurm/slurmexecutor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index 197734d46..48a4c3981 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -434,8 +434,6 @@ def run_seff(jobid): "Command to run: %s", " ".join(map(util.escape_string_shell, seff_command)) ) - time.sleep(10) - def get_checked_seff_result(): seff_result = subprocess.run( seff_command, From 1ab722db43cb5ed2c0575a4c728ea43b822789d8 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Mar 2024 17:29:12 +0100 Subject: [PATCH 016/124] Formatted --- contrib/slurm/slurmexecutor.py | 73 ++++++++++++++++++++++++---------- 1 file changed, 52 
insertions(+), 21 deletions(-) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index 48a4c3981..5aa77b59b 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -306,7 +306,9 @@ def run_slurm(benchmark, args, log_file): stderr=subprocess.STDOUT, ) - if STOPPED_BY_INTERRUPT: # job cancelled while srun was running, log not necessarily finalized + if ( + STOPPED_BY_INTERRUPT + ): # job cancelled while srun was running, log not necessarily finalized return # we try to read back the log, in the first three lines, there should be the jobid @@ -318,10 +320,14 @@ def run_slurm(benchmark, args, log_file): break logging.debug("Pattern not found in log line: %s", line) - if STOPPED_BY_INTERRUPT: # job was cancelled during log parsing, no job id present + if ( + STOPPED_BY_INTERRUPT + ): # job was cancelled during log parsing, no job id present return - raw_output, slurm_status, exit_code, cpu_time, wall_time, memory_usage = run_seff(jobid) if benchmark.config.seff else run_sacct(jobid) + raw_output, slurm_status, exit_code, cpu_time, wall_time, memory_usage = ( + run_seff(jobid) if benchmark.config.seff else run_sacct(jobid) + ) if os.path.exists(exitcode_file): with open(exitcode_file, "r") as f: @@ -380,9 +386,9 @@ def get_seconds_from_time(time_str): if hours is None: hours = 0 if minutes is None: - minutes = 0 # realistically never None, but doesn't hurt + minutes = 0 # realistically never None, but doesn't hurt if seconds is None: - seconds = 0 # realistically never None, but doesn't hurt + seconds = 0 # realistically never None, but doesn't hurt if millis is None: millis = 0 return int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(millis) / 1000 @@ -391,7 +397,13 @@ def get_seconds_from_time(time_str): def run_sacct(jobid): global STOPPED_BY_INTERRUPT - sacct_command = ["sacct", "-j", str(jobid), "-n", "--format=State,ExitCode,TotalCpu,Elapsed,MaxRSS"] + sacct_command = [ + "sacct", + "-j", + str(jobid), 
+ "-n", + "--format=State,ExitCode,TotalCpu,Elapsed,MaxRSS", + ] logging.debug( "Command to run: %s", " ".join(map(util.escape_string_shell, sacct_command)) ) @@ -405,22 +417,41 @@ def get_checked_sacct_result(): lines = sacct_result.stdout.splitlines() if len(lines) < 2: logging.debug("Sacct output not yet ready: %s", lines) - return None # jobs not yet ready - parent_job = lines[0].split() # State is read from here - child_job = lines[1].split() # ExitCode, TotalCPU, Elapsed and MaxRSS read from here + return None # jobs not yet ready + parent_job = lines[0].split() # State is read from here + child_job = lines[ + 1 + ].split() # ExitCode, TotalCPU, Elapsed and MaxRSS read from here logging.debug("Sacct data: parent: %s; child: %s", parent_job, child_job) - if parent_job[0].decode() in ["RUNNING", "PENDING", "REQUEUED", "RESIZING", "SUSPENDED", "R", "PD", "RQ", "RS", "S"]: - logging.debug("Sacct output not yet ready due to state: %s", parent_job[0].decode()) - return None # not finished + if parent_job[0].decode() in [ + "RUNNING", + "PENDING", + "REQUEUED", + "RESIZING", + "SUSPENDED", + "R", + "PD", + "RQ", + "RS", + "S", + ]: + logging.debug( + "Sacct output not yet ready due to state: %s", parent_job[0].decode() + ) + return None # not finished if len(child_job) < 5: - logging.debug("Sacct output not yet ready due to memory not available: %s", child_job) - return None # not finished - return (sacct_result.stdout, - parent_job[0].decode(), # State - child_job[1].decode().split(":")[0], # ExitCode - get_seconds_from_time(child_job[2].decode()), #TotalCPU in seconds - get_seconds_from_time(child_job[3].decode()), #Elapsed in seconds - float(child_job[4].decode()[:-1])*1000) # MaxRSS in K * 1000 -> Bytes + logging.debug( + "Sacct output not yet ready due to memory not available: %s", child_job + ) + return None # not finished + return ( + sacct_result.stdout, + parent_job[0].decode(), # State + child_job[1].decode().split(":")[0], # ExitCode + 
get_seconds_from_time(child_job[2].decode()), # TotalCPU in seconds + get_seconds_from_time(child_job[3].decode()), # Elapsed in seconds + float(child_job[4].decode()[:-1]) * 1000, + ) # MaxRSS in K * 1000 -> Bytes # sometimes `seff` needs a few extra seconds to realize the task has ended return wait_for(get_checked_sacct_result, 30, 2) @@ -485,4 +516,4 @@ def parse_seff(result): f"Exit code: {exit_code}, memory usage: {memory_usage}, walltime: {wall_time}, cpu time: {cpu_time}" ) - return slurm_status, exit_code, cpu_time, wall_time, memory_usage \ No newline at end of file + return slurm_status, exit_code, cpu_time, wall_time, memory_usage From 2b297b9d5beee84e4ca3cf0c2f50e4a3a52a879c Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sun, 21 Apr 2024 20:57:38 +0200 Subject: [PATCH 017/124] Fixed stopping --- contrib/slurm/slurmexecutor.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index 5aa77b59b..596685929 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -170,20 +170,23 @@ def execute(self, run): args, run.log_file, ) - if ( - "terminationreason" not in run_result - or not run_result["terminationreason"] == "killed" - or (attempts >= self.benchmark.config.retry >= 0) - or STOPPED_BY_INTERRUPT - ): - break - attempts += 1 - time.sleep(1) # as to not overcrowd a failing scheduler - logging.debug( - "Retrying after %d attempts, limit: %d", - attempts, - self.benchmark.config.retry, - ) + if run_result is None: + stop() + else: + if ( + "terminationreason" not in run_result + or not run_result["terminationreason"] == "killed" + or (attempts >= self.benchmark.config.retry >= 0) + or STOPPED_BY_INTERRUPT + ): + break + attempts += 1 + time.sleep(1) # as to not overcrowd a failing scheduler + logging.debug( + "Retrying after %d attempts, limit: %d", + attempts, + self.benchmark.config.retry, + ) except 
KeyboardInterrupt: # If the run was interrupted, we ignore the result and cleanup. From 86aff971bf9df68573c284434338f579f1a217e7 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 14 Oct 2024 12:08:20 +0200 Subject: [PATCH 018/124] using slurm arrayjob --- contrib/slurm/slurmexecutor.py | 457 +++++++++++++++------------------ 1 file changed, 205 insertions(+), 252 deletions(-) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index 596685929..5ecec5d4c 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -30,7 +30,11 @@ def init(config, benchmark): tool_locator = tooladapter.create_tool_locator(config) benchmark.executable = benchmark.tool.executable(tool_locator) - benchmark.tool_version = benchmark.tool.version(benchmark.executable) + try: + benchmark.tool_version = benchmark.tool.version(benchmark.executable) + except Exception as e: + logging.warning("could not determine version due to error: %s", e) + benchmark.tool_version = None def get_system_info(): @@ -70,142 +74,75 @@ def _execute_run_set( benchmark, output_handler, ): + global STOPPED_BY_INTERRUPT + # get times before runSet walltime_before = time.monotonic() output_handler.output_before_run_set(runSet) - # put all runs into a queue - for run in runSet.runs: - _Worker.working_queue.put(run) - - # keep a counter of unfinished runs for the below assertion - unfinished_runs = len(runSet.runs) - unfinished_runs_lock = threading.Lock() + if not benchmark.config.scratchdir: + sys.exit("No scratchdir present. Please specify using --scratchdir .") + elif not os.path.exists(benchmark.config.scratchdir): + os.makedirs(benchmark.config.scratchdir) + logging.debug(f"Created scratchdir: {benchmark.config.scratchdir}") + elif not os.path.isdir(benchmark.config.scratchdir): + sys.exit( + f"Scratchdir {benchmark.config.scratchdir} not a directory. Please specify using --scratchdir ." 
+ ) - def run_finished(): - nonlocal unfinished_runs - with unfinished_runs_lock: - unfinished_runs -= 1 + os.makedirs("tmp") - # create some workers - for _ in range(min(benchmark.num_of_threads, unfinished_runs)): - if STOPPED_BY_INTERRUPT: - break - WORKER_THREADS.append(_Worker(benchmark, output_handler, run_finished)) - - # wait until workers are finished (all tasks done or STOPPED_BY_INTERRUPT) - for worker in WORKER_THREADS: - worker.join() - assert unfinished_runs == 0 or STOPPED_BY_INTERRUPT - - # get times after runSet - walltime_after = time.monotonic() - usedWallTime = walltime_after - walltime_before - - if STOPPED_BY_INTERRUPT: - output_handler.set_error("interrupted", runSet) - output_handler.output_after_run_set( - runSet, - walltime=usedWallTime, - ) + with tempfile.TemporaryDirectory(dir=benchmark.config.scratchdir) as tempdir: + tempdir = "tmp" + batch_lines = ["#!/bin/sh"] + for setting in get_resource_limits(benchmark, tempdir): + batch_lines.extend(["\n#SBATCH " + str(setting)]) -def stop(): - global STOPPED_BY_INTERRUPT - STOPPED_BY_INTERRUPT = True + batch_lines.extend([f"\n#SBATCH --array=0-{len(runSet.runs) - 1}%{benchmark.num_of_threads}"]) + batch_lines.extend(["\n\ncase $SLURM_ARRAY_TASK_ID in"]) + # put all runs into a queue + for i, run in enumerate(runSet.runs): + batch_lines.extend(["\n" + str(i) + ") " + str(get_run_cli(benchmark, run.cmdline(), os.path.join(tempdir, str(i)))) + ";;"]) -class _Worker(threading.Thread): - """ - A Worker is a deamonic thread, that takes jobs from the working_queue and runs them. 
- """ + batch_lines.extend(["\nesac"]) - working_queue = queue.Queue() - - def __init__(self, benchmark, output_handler, run_finished_callback): - threading.Thread.__init__(self) # constuctor of superclass - self.run_finished_callback = run_finished_callback - self.benchmark = benchmark - self.output_handler = output_handler - self.setDaemon(True) - - self.start() - - def run(self): - while not STOPPED_BY_INTERRUPT: - try: - currentRun = _Worker.working_queue.get_nowait() - except queue.Empty: - return - - try: - logging.debug('Executing run "%s"', currentRun.identifier) - self.execute(currentRun) - logging.debug('Finished run "%s"', currentRun.identifier) - except SystemExit as e: - logging.critical(e) - except BenchExecException as e: - logging.critical(e) - except BaseException: - logging.exception("Exception during run execution") - self.run_finished_callback() - _Worker.working_queue.task_done() - - def execute(self, run): - """ - This function executes the tool with a sourcefile with options. - It also calls functions for output before and after the run. 
- """ - self.output_handler.output_before_run(run) - - args = run.cmdline() - logging.debug("Command line of run is %s", args) + batchfile = os.path.join(tempdir, "array.sbatch") + with open(batchfile, "w") as f: + f.writelines(batch_lines) try: - attempts = 0 - while True: - run_result = run_slurm( - self.benchmark, - args, - run.log_file, - ) - if run_result is None: - stop() - else: - if ( - "terminationreason" not in run_result - or not run_result["terminationreason"] == "killed" - or (attempts >= self.benchmark.config.retry >= 0) - or STOPPED_BY_INTERRUPT - ): - break - attempts += 1 - time.sleep(1) # as to not overcrowd a failing scheduler - logging.debug( - "Retrying after %d attempts, limit: %d", - attempts, - self.benchmark.config.retry, - ) - + sbatch_cmd = ["sbatch", "--wait", str(batchfile)] + logging.debug( + "Command to run: %s", " ".join(map(util.escape_string_shell, sbatch_cmd)) + ) + sbatch_result = subprocess.run( + sbatch_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) except KeyboardInterrupt: # If the run was interrupted, we ignore the result and cleanup. 
- stop() + STOPPED_BY_INTERRUPT = True + - if STOPPED_BY_INTERRUPT: - try: - if self.benchmark.config.debug: - os.rename(run.log_file, run.log_file + ".killed") - else: - os.remove(run.log_file) - except OSError: - pass - return 1 - run.set_result(run_result) - self.output_handler.output_after_run(run) - return None + for i, run in enumerate(runSet.runs): + run.set_result(get_run_result(benchmark, os.path.join(tempdir, str(i)), run)) + output_handler.output_after_run(run) + # get times after runSet + walltime_after = time.monotonic() + usedWallTime = walltime_after - walltime_before + + if STOPPED_BY_INTERRUPT: + output_handler.set_error("interrupted", runSet) + output_handler.output_after_run_set( + runSet, + walltime=usedWallTime, + ) jobid_pattern = re.compile(r"job (\d*) started") @@ -232,151 +169,125 @@ def wait_for(func, timeout_sec=None, poll_interval_sec=1): time.sleep(poll_interval_sec) -def run_slurm(benchmark, args, log_file): - global STOPPED_BY_INTERRUPT - +def get_resource_limits(benchmark, tempdir): timelimit = benchmark.rlimits.cputime cpus = benchmark.rlimits.cpu_cores memory = benchmark.rlimits.memory + os.makedirs(os.path.join(tempdir, "logs"), exist_ok=True) srun_timelimit_h = int(timelimit / 3600) srun_timelimit_m = int((timelimit % 3600) / 60) srun_timelimit_s = int(timelimit % 60) - srun_timelimit = f"{srun_timelimit_h}:{srun_timelimit_m}:{srun_timelimit_s}" + srun_timelimit = f"{srun_timelimit_h:02d}:{srun_timelimit_m:02d}:{srun_timelimit_s:02d}" - if not benchmark.config.scratchdir: - sys.exit("No scratchdir present. Please specify using --scratchdir .") - elif not os.path.exists(benchmark.config.scratchdir): - os.makedirs(benchmark.config.scratchdir) - logging.debug(f"Created scratchdir: {benchmark.config.scratchdir}") - elif not os.path.isdir(benchmark.config.scratchdir): - sys.exit( - f"Scratchdir {benchmark.config.scratchdir} not a directory. Please specify using --scratchdir ." 
- ) + ret = [f"--output={tempdir}/logs/%A_%a.out", + "--time=" + str(srun_timelimit), + "--cpus-per-task=" + str(cpus), + "--mem=" + str(int(memory / 1000000)) + "M", + "--threads-per-core=1", # --use_hyperthreading=False is always given here + "--mincpus=" + str(cpus), + "--ntasks=1"] + return ret - with tempfile.TemporaryDirectory(dir=benchmark.config.scratchdir) as tempdir: - tmp_log = os.path.join(tempdir, "log") - - os.makedirs(os.path.join(tempdir, "upper")) - os.makedirs(os.path.join(tempdir, "work")) - exitcode_file = f"{tempdir}/upper/exitcode" +def get_run_cli(benchmark, args, tempdir): + os.makedirs(os.path.join(tempdir, "upper")) + os.makedirs(os.path.join(tempdir, "work")) + cli = [] - srun_command = [ - "srun", - "--quit-on-interrupt", - "-t", - str(srun_timelimit), - "-c", - str(cpus), - "--mem", - str(int(memory / 1000000)) + "M", - "--threads-per-core=1", # --use_hyperthreading=False is always given here - "--ntasks=1", - ] - if benchmark.config.singularity: - srun_command.extend( - [ - "singularity", - "exec", - "-B", - "./:/lower", - "--no-home", - "-B", - f"{tempdir}:/overlay", - "--fusemount", - f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work /home/{os.getlogin()}", - benchmark.config.singularity, - ] - ) - srun_command.extend( + if benchmark.config.singularity: + cli.extend( [ - "sh", - "-c", - f"echo job $SLURM_JOB_ID started; {' '.join(map(util.escape_string_shell, args))}; echo $? > exitcode", + "singularity", + "exec", + "-B", + "./:/lower", + "--no-home", + "-B", + f"{tempdir}:/overlay", + "--fusemount", + f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work /home/{os.getlogin()}", + benchmark.config.singularity, ] ) + cli.extend( + [ + "sh", + "-c", + f"echo $SLURM_JOB_ID > jobid; {' '.join(map(util.escape_string_shell, args))} > log 2>&1; echo $? 
> exitcode", + ] + ) - logging.debug( - "Command to run: %s", " ".join(map(util.escape_string_shell, srun_command)) - ) - jobid = None - while jobid is None and not STOPPED_BY_INTERRUPT: - with open(tmp_log, "w") as tmp_log_f: - subprocess.run( - srun_command, - stdout=tmp_log_f, - stderr=subprocess.STDOUT, - ) - - if ( - STOPPED_BY_INTERRUPT - ): # job cancelled while srun was running, log not necessarily finalized - return - - # we try to read back the log, in the first three lines, there should be the jobid - with open(tmp_log, "r") as tmp_log_f: - for line in itertools.islice(tmp_log_f, 3): - jobid_match = jobid_pattern.search(line) - if jobid_match: - jobid = int(jobid_match.group(1)) - break - logging.debug("Pattern not found in log line: %s", line) - - if ( - STOPPED_BY_INTERRUPT - ): # job was cancelled during log parsing, no job id present - return - - raw_output, slurm_status, exit_code, cpu_time, wall_time, memory_usage = ( - run_seff(jobid) if benchmark.config.seff else run_sacct(jobid) - ) + logging.debug( + "Command to run: %s", " ".join(map(util.escape_string_shell, cli)) + ) + return " ".join(map(util.escape_string_shell, cli)) + +def get_run_result(benchmark, tempdir, run): + exitcode_file = f"{tempdir}/upper/exitcode" + jobid_file = f"{tempdir}/upper/jobid" + tmp_log = f"{tempdir}/upper/log" + with open(jobid_file, "r") as f: + jobid = int(f.read()) + + raw_output, slurm_status, exit_code, cpu_time, wall_time, memory_usage = ( + run_seff(jobid) if benchmark.config.seff else run_sacct(jobid) + ) + + def get_returncode(): if os.path.exists(exitcode_file): with open(exitcode_file, "r") as f: returncode = int(f.read()) logging.debug("Exit code in file %s: %d", exitcode_file, returncode) + return returncode else: - assert ( - slurm_status != "COMPLETED" - ), "Should never happen: exit code not found, but task was reported COMPLETED." 
- logging.debug("Exit code not found in file: %s", exitcode_file) - returncode = 0 - - ret = { - "walltime": wall_time, - "cputime": cpu_time, - "memory": memory_usage, - "exitcode": ProcessExitCode.create(value=returncode), - } - - if slurm_status != "COMPLETED": - ret["terminationreason"] = { - "OUT_OF_MEMORY": "memory", - "OUT_OF_ME+": "memory", - "TIMEOUT": "cputime", - "ERROR": "failed", - "FAILED": "killed", - "CANCELLED": "killed", - }.get(slurm_status, slurm_status) - - # Runexec would populate the first 6 lines with metadata - with open(log_file, "w+") as file: - with open(tmp_log, "r") as log_source: - content = log_source.read() - file.write(f"{' '.join(map(util.escape_string_shell, args))}") - file.write("\n\n\n" + "-" * 80 + "\n\n\n") - file.write(content) - if content == "": - file.write("Original log file did not contain anything.") - - if benchmark.config.debug: - with open(log_file + ".debug_info", "w+") as file: - file.write(f"jobid: {jobid}\n") - file.write(f"seff output: {str(raw_output)}\n") - file.write(f"Parsed data: {str(ret)}\n") - - return ret + return None + + + if slurm_status == "COMPLETED": + try: + returncode = wait_for(get_returncode, 30, 2) + except Exception as e: + print(tempdir) + raise e + else: + returncode = 0 + + ret = { + "walltime": wall_time, + "cputime": cpu_time, + "memory": memory_usage, + "exitcode": ProcessExitCode.create(value=returncode), + } + + if slurm_status != "COMPLETED": + ret["terminationreason"] = { + "OUT_OF_MEMORY": "memory", + "OUT_OF_ME+": "memory", + "TIMEOUT": "cputime", + "ERROR": "failed", + "FAILED": "killed", + "CANCELLED": "killed", + }.get(slurm_status, slurm_status) + + # Runexec would populate the first 6 lines with metadata + with open(run.log_file, "w+") as file: + with open(tmp_log, "r") as log_source: + content = log_source.read() + file.write(f"{' '.join(map(util.escape_string_shell, run.cmdline()))}") + file.write("\n\n\n" + "-" * 80 + "\n\n\n") + file.write(content) + if content == "": 
+ file.write("Original log file did not contain anything.") + + if benchmark.config.debug: + with open(run.log_file + ".debug_info", "w+") as file: + file.write(f"jobid: {jobid}\n") + file.write(f"seff output: {str(raw_output)}\n") + file.write(f"Parsed data: {str(ret)}\n") + + return ret time_pattern = re.compile(r"(?:(\d+):)?(\d+):(\d+)(?:\.(\d+))?") @@ -405,7 +316,7 @@ def run_sacct(jobid): "-j", str(jobid), "-n", - "--format=State,ExitCode,TotalCpu,Elapsed,MaxRSS", + "--format=State,ExitCode,TotalCpu,Elapsed,MaxVMSize", ] logging.debug( "Command to run: %s", " ".join(map(util.escape_string_shell, sacct_command)) @@ -421,9 +332,9 @@ def get_checked_sacct_result(): if len(lines) < 2: logging.debug("Sacct output not yet ready: %s", lines) return None # jobs not yet ready - parent_job = lines[0].split() # State is read from here + parent_job = lines[-2].split() # State is read from here child_job = lines[ - 1 + -1 ].split() # ExitCode, TotalCPU, Elapsed and MaxRSS read from here logging.debug("Sacct data: parent: %s; child: %s", parent_job, child_job) if parent_job[0].decode() in [ @@ -447,14 +358,56 @@ def get_checked_sacct_result(): "Sacct output not yet ready due to memory not available: %s", child_job ) return None # not finished + + stdout = sacct_result.stdout + try: + state = parent_job[0].decode() + except Exception as e: + logging.warning( + "Could not get state due to error: %s", e + ) + state = "" + + try: + exitcode = child_job[1].decode().split(":")[0] + except Exception as e: + logging.warning( + "Could not get exitcode due to error: %s", e + ) + exitcode = "-1" + + try: + totalcpu = get_seconds_from_time(child_job[2].decode()) + except Exception as e: + logging.warning( + "Could not get TotalCPU due to error: %s", e + ) + totalcpu = 0 + + try: + elapsed = get_seconds_from_time(child_job[3].decode()) + except Exception as e: + logging.warning( + "Could not get Elapsed due to error: %s", e + ) + elapsed = 0 + + try: + maxvmsize = 
float(child_job[4].decode()[:-1]) * 1000 + except Exception as e: + logging.warning( + "Could not get MaxVMSize due to error: %s", e + ) + maxvmsize = 0 + return ( - sacct_result.stdout, - parent_job[0].decode(), # State - child_job[1].decode().split(":")[0], # ExitCode - get_seconds_from_time(child_job[2].decode()), # TotalCPU in seconds - get_seconds_from_time(child_job[3].decode()), # Elapsed in seconds - float(child_job[4].decode()[:-1]) * 1000, - ) # MaxRSS in K * 1000 -> Bytes + stdout, + state, + exitcode, + totalcpu, + elapsed, + maxvmsize + ) # sometimes `seff` needs a few extra seconds to realize the task has ended return wait_for(get_checked_sacct_result, 30, 2) From c96316319293ebbe0de67c435531e8a03977e41d Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 17 Oct 2024 12:13:14 +0200 Subject: [PATCH 019/124] Added slurm with array-based aggregation --- contrib/slurm/slurmexecutor.py | 357 +++++++-------------------------- 1 file changed, 78 insertions(+), 279 deletions(-) diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index 5ecec5d4c..b1d1a1769 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -68,6 +68,9 @@ def execute_benchmark(benchmark, output_handler): output_handler.output_after_benchmark(STOPPED_BY_INTERRUPT) +aggregation_factor = 25 + +sbatch_pattern = re.compile(r"Submitted batch job (\d+)") def _execute_run_set( runSet, @@ -91,21 +94,29 @@ def _execute_run_set( f"Scratchdir {benchmark.config.scratchdir} not a directory. Please specify using --scratchdir ." 
) - os.makedirs("tmp") + number_of_bins = int(len(runSet.runs) / aggregation_factor) + 1 with tempfile.TemporaryDirectory(dir=benchmark.config.scratchdir) as tempdir: - tempdir = "tmp" - batch_lines = ["#!/bin/sh"] + batch_lines = ["#!/bin/bash"] for setting in get_resource_limits(benchmark, tempdir): batch_lines.extend(["\n#SBATCH " + str(setting)]) - batch_lines.extend([f"\n#SBATCH --array=0-{len(runSet.runs) - 1}%{benchmark.num_of_threads}"]) + batch_lines.extend([f"\n#SBATCH --array=0-{number_of_bins - 1}%{benchmark.num_of_threads}"]) batch_lines.extend(["\n\ncase $SLURM_ARRAY_TASK_ID in"]) + bins={} # put all runs into a queue for i, run in enumerate(runSet.runs): - batch_lines.extend(["\n" + str(i) + ") " + str(get_run_cli(benchmark, run.cmdline(), os.path.join(tempdir, str(i)))) + ";;"]) + if i % number_of_bins not in bins: + bins[i % number_of_bins] = [] + bins[i % number_of_bins].append((i, run)) + + for bin in bins: + batch_lines.extend(["\n" + str(bin) + ") "]) + for (i, run) in bins[bin]: + batch_lines.extend(["\n " + str(get_run_cli(benchmark, run.cmdline(), os.path.join(tempdir, str(i))))]) + batch_lines.extend(["\n;;"]) batch_lines.extend(["\nesac"]) @@ -123,15 +134,27 @@ def _execute_run_set( stdout=subprocess.PIPE, stderr=subprocess.STDOUT, ) + except KeyboardInterrupt: # If the run was interrupted, we ignore the result and cleanup. 
STOPPED_BY_INTERRUPT = True - + if STOPPED_BY_INTERRUPT: + logging.debug(f"Canceling sbatch job if already started") + if sbatch_result and sbatch_result.stdout: + for line in sbatch_result.stdout.splitlines(): + jobid_match = sbatch_pattern.search(str(line)) + if jobid_match: + jobid = int(jobid_match.group(1)) + logging.debug(f"Canceling sbatch job #{jobid}") + subprocess.run(["scancel", str(jobid)]) for i, run in enumerate(runSet.runs): - run.set_result(get_run_result(benchmark, os.path.join(tempdir, str(i)), run)) - output_handler.output_after_run(run) + try: + run.set_result(get_run_result(os.path.join(tempdir, str(i)), run)) + output_handler.output_after_run(run) + except: + logging.debug(f"Output missing for run #{i}") # get times after runSet walltime_after = time.monotonic() @@ -144,35 +167,14 @@ def _execute_run_set( walltime=usedWallTime, ) -jobid_pattern = re.compile(r"job (\d*) started") - - -def wait_for(func, timeout_sec=None, poll_interval_sec=1): - """ - Waits until the func() returns non-None - :param func: function to call until a value is returned - :param timeout_sec: How much time to give up after - :param poll_interval_sec: How frequently to check the result - """ - start_time = time.monotonic() - - while not STOPPED_BY_INTERRUPT: - ret = func() - if ret is not None: - return ret - - if timeout_sec is not None and time.monotonic() - start_time > timeout_sec: - raise BenchExecException( - "Timeout exceeded for waiting for job to realize it has finished. Scheduler may be failing." 
- ) - - time.sleep(poll_interval_sec) - +def stop(): + global STOPPED_BY_INTERRUPT + STOPPED_BY_INTERRUPT = True def get_resource_limits(benchmark, tempdir): - timelimit = benchmark.rlimits.cputime + timelimit = benchmark.rlimits.cputime*aggregation_factor*2 # safe overapprox cpus = benchmark.rlimits.cpu_cores - memory = benchmark.rlimits.memory + memory = benchmark.rlimits.memory*1.5 # so that runexec catches the OOM, not SLURM os.makedirs(os.path.join(tempdir, "logs"), exist_ok=True) srun_timelimit_h = int(timelimit / 3600) @@ -194,6 +196,19 @@ def get_run_cli(benchmark, args, tempdir): os.makedirs(os.path.join(tempdir, "upper")) os.makedirs(os.path.join(tempdir, "work")) cli = [] + runexec = ["python3", "benchexec/bin/runexec", "--no-container"] + if benchmark.rlimits.cputime_hard: + runexec.extend(["--timelimit", str(benchmark.rlimits.cputime_hard)]) + if benchmark.rlimits.cputime: + runexec.extend(["--softtimelimit", str(benchmark.rlimits.cputime)]) + if benchmark.rlimits.walltime: + runexec.extend(["--walltimelimit", str(benchmark.rlimits.walltime)]) + # if benchmark.rlimits.cpu_cores: + # runexec.extend(["--???", str(benchmark.rlimits.cpu_cores)]) + if benchmark.rlimits.memory: + runexec.extend(["--memlimit", str(benchmark.rlimits.memory)]) + + args = [*runexec, "--", *args] if benchmark.config.singularity: cli.extend( @@ -201,6 +216,8 @@ def get_run_cli(benchmark, args, tempdir): "singularity", "exec", "-B", + "/sys/fs/cgroup:/sys/fs/cgroup", + "-B", "./:/lower", "--no-home", "-B", @@ -214,7 +231,7 @@ def get_run_cli(benchmark, args, tempdir): [ "sh", "-c", - f"echo $SLURM_JOB_ID > jobid; {' '.join(map(util.escape_string_shell, args))} > log 2>&1; echo $? 
> exitcode", + f"{' '.join(map(util.escape_string_shell, args))} > log 2>&1" ] ) @@ -223,253 +240,35 @@ def get_run_cli(benchmark, args, tempdir): ) return " ".join(map(util.escape_string_shell, cli)) -def get_run_result(benchmark, tempdir, run): - exitcode_file = f"{tempdir}/upper/exitcode" - jobid_file = f"{tempdir}/upper/jobid" - tmp_log = f"{tempdir}/upper/log" +def get_run_result(tempdir, run): + runexec_log = f"{tempdir}/upper/log" + tmp_log = f"{tempdir}/upper/output.log" + + data_dict = {} + with open(runexec_log, "r") as file: + for line in file: + line = line.strip() + if line and '=' in line: + key, value = line.split('=', 1) + data_dict[key.strip()] = value.strip() + + ret = {} + if "walltime" in data_dict: + ret["walltime"] = float(data_dict["walltime"][:-1]) # ends in 's' + if "cputime" in data_dict: + ret["cputime"] = float(data_dict["cputime"][:-1]) # ends in 's' + if "memory" in data_dict: + ret["memory"] = int(data_dict["memory"][:-1]) # ends in 'B' + if "returnvalue" in data_dict: + ret["exitcode"] = ProcessExitCode.create(value=int(data_dict["returnvalue"])) + if "exitsignal" in data_dict: + ret["exitcode"] = ProcessExitCode.create(signal=int(data_dict["exitsignal"])) + if "terminationreason" in data_dict: + ret["terminationreason"] = data_dict["terminationreason"] - with open(jobid_file, "r") as f: - jobid = int(f.read()) - - raw_output, slurm_status, exit_code, cpu_time, wall_time, memory_usage = ( - run_seff(jobid) if benchmark.config.seff else run_sacct(jobid) - ) - - def get_returncode(): - if os.path.exists(exitcode_file): - with open(exitcode_file, "r") as f: - returncode = int(f.read()) - logging.debug("Exit code in file %s: %d", exitcode_file, returncode) - return returncode - else: - return None - - - if slurm_status == "COMPLETED": - try: - returncode = wait_for(get_returncode, 30, 2) - except Exception as e: - print(tempdir) - raise e - else: - returncode = 0 - - ret = { - "walltime": wall_time, - "cputime": cpu_time, - "memory": 
memory_usage, - "exitcode": ProcessExitCode.create(value=returncode), - } - - if slurm_status != "COMPLETED": - ret["terminationreason"] = { - "OUT_OF_MEMORY": "memory", - "OUT_OF_ME+": "memory", - "TIMEOUT": "cputime", - "ERROR": "failed", - "FAILED": "killed", - "CANCELLED": "killed", - }.get(slurm_status, slurm_status) - - # Runexec would populate the first 6 lines with metadata with open(run.log_file, "w+") as file: with open(tmp_log, "r") as log_source: content = log_source.read() - file.write(f"{' '.join(map(util.escape_string_shell, run.cmdline()))}") - file.write("\n\n\n" + "-" * 80 + "\n\n\n") file.write(content) - if content == "": - file.write("Original log file did not contain anything.") - - if benchmark.config.debug: - with open(run.log_file + ".debug_info", "w+") as file: - file.write(f"jobid: {jobid}\n") - file.write(f"seff output: {str(raw_output)}\n") - file.write(f"Parsed data: {str(ret)}\n") - - return ret - - -time_pattern = re.compile(r"(?:(\d+):)?(\d+):(\d+)(?:\.(\d+))?") - - -def get_seconds_from_time(time_str): - time_match = time_pattern.search(time_str) - if time_match: - hours, minutes, seconds, millis = time_match.groups() - if hours is None: - hours = 0 - if minutes is None: - minutes = 0 # realistically never None, but doesn't hurt - if seconds is None: - seconds = 0 # realistically never None, but doesn't hurt - if millis is None: - millis = 0 - return int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(millis) / 1000 - - -def run_sacct(jobid): - global STOPPED_BY_INTERRUPT - - sacct_command = [ - "sacct", - "-j", - str(jobid), - "-n", - "--format=State,ExitCode,TotalCpu,Elapsed,MaxVMSize", - ] - logging.debug( - "Command to run: %s", " ".join(map(util.escape_string_shell, sacct_command)) - ) - - def get_checked_sacct_result(): - sacct_result = subprocess.run( - sacct_command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - ) - lines = sacct_result.stdout.splitlines() - if len(lines) < 2: - logging.debug("Sacct output 
not yet ready: %s", lines) - return None # jobs not yet ready - parent_job = lines[-2].split() # State is read from here - child_job = lines[ - -1 - ].split() # ExitCode, TotalCPU, Elapsed and MaxRSS read from here - logging.debug("Sacct data: parent: %s; child: %s", parent_job, child_job) - if parent_job[0].decode() in [ - "RUNNING", - "PENDING", - "REQUEUED", - "RESIZING", - "SUSPENDED", - "R", - "PD", - "RQ", - "RS", - "S", - ]: - logging.debug( - "Sacct output not yet ready due to state: %s", parent_job[0].decode() - ) - return None # not finished - if len(child_job) < 5: - logging.debug( - "Sacct output not yet ready due to memory not available: %s", child_job - ) - return None # not finished - - stdout = sacct_result.stdout - try: - state = parent_job[0].decode() - except Exception as e: - logging.warning( - "Could not get state due to error: %s", e - ) - state = "" - - try: - exitcode = child_job[1].decode().split(":")[0] - except Exception as e: - logging.warning( - "Could not get exitcode due to error: %s", e - ) - exitcode = "-1" - - try: - totalcpu = get_seconds_from_time(child_job[2].decode()) - except Exception as e: - logging.warning( - "Could not get TotalCPU due to error: %s", e - ) - totalcpu = 0 - - try: - elapsed = get_seconds_from_time(child_job[3].decode()) - except Exception as e: - logging.warning( - "Could not get Elapsed due to error: %s", e - ) - elapsed = 0 - - try: - maxvmsize = float(child_job[4].decode()[:-1]) * 1000 - except Exception as e: - logging.warning( - "Could not get MaxVMSize due to error: %s", e - ) - maxvmsize = 0 - - return ( - stdout, - state, - exitcode, - totalcpu, - elapsed, - maxvmsize - ) - - # sometimes `seff` needs a few extra seconds to realize the task has ended - return wait_for(get_checked_sacct_result, 30, 2) - - -def run_seff(jobid): - global STOPPED_BY_INTERRUPT - - seff_command = ["seff", str(jobid)] - logging.debug( - "Command to run: %s", " ".join(map(util.escape_string_shell, seff_command)) - ) - - def 
get_checked_seff_result(): - seff_result = subprocess.run( - seff_command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - ) - if "exit code" in str(seff_result.stdout): - return seff_result - else: - return None - - # sometimes `seff` needs a few extra seconds to realize the task has ended - result = wait_for(get_checked_seff_result, 30, 2) - if STOPPED_BY_INTERRUPT: # job was cancelled - return - - return result.stdout, *parse_seff(str(result.stdout)) - - -exit_code_pattern = re.compile(r"State: ([A-Z-_]*) \(exit code (\d+)\)") -cpu_time_pattern = re.compile(r"CPU Utilized: (\d+):(\d+):(\d+)") -wall_time_pattern = re.compile(r"Job Wall-clock time: (\d+):(\d+):(\d+)") -memory_pattern = re.compile(r"Memory Utilized: (\d+\.\d+) MB") - - -def parse_seff(result): - logging.debug(f"Got output from seff: {result}") - exit_code_match = exit_code_pattern.search(result) - cpu_time_match = cpu_time_pattern.search(result) - wall_time_match = wall_time_pattern.search(result) - memory_match = memory_pattern.search(result) - exit_code = None - if exit_code_match: - slurm_status = str(exit_code_match.group(1)) - exit_code = int(exit_code_match.group(2)) - else: - slurm_status = "ERROR" - cpu_time = None - if cpu_time_match: - hours, minutes, seconds = map(int, cpu_time_match.groups()) - cpu_time = hours * 3600 + minutes * 60 + seconds - wall_time = None - if wall_time_match: - hours, minutes, seconds = map(int, wall_time_match.groups()) - wall_time = hours * 3600 + minutes * 60 + seconds - memory_usage = float(memory_match.group(1)) * 1000000 if memory_match else None - - logging.debug( - f"Exit code: {exit_code}, memory usage: {memory_usage}, walltime: {wall_time}, cpu time: {cpu_time}" - ) - return slurm_status, exit_code, cpu_time, wall_time, memory_usage + return ret \ No newline at end of file From b77be4965a8fe7ff1430b4629821a47e73567190 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 19 Oct 2024 10:55:02 +0200 Subject: [PATCH 020/124] Added back non-array 
executor --- contrib/slurm-benchmark.py | 27 +- contrib/slurm/arrayexecutor.py | 286 ++++++++++++++++ contrib/slurm/slurmexecutor.py | 578 +++++++++++++++++++++++---------- 3 files changed, 724 insertions(+), 167 deletions(-) create mode 100644 contrib/slurm/arrayexecutor.py diff --git a/contrib/slurm-benchmark.py b/contrib/slurm-benchmark.py index 794b40393..d255357fb 100755 --- a/contrib/slurm-benchmark.py +++ b/contrib/slurm-benchmark.py @@ -43,6 +43,12 @@ def create_argument_parser(self): action="store_true", help="Use SLURM to execute benchmarks.", ) + slurm_args.add_argument( + "--slurm-array", + dest="slurm_array", + action="store_true", + help="Use SLURM array jobs to execute benchmarks.", + ) slurm_args.add_argument( "--singularity", dest="singularity", @@ -61,18 +67,35 @@ def create_argument_parser(self): dest="retry", type=int, default="0", - help="Retry killed jobs this many times. Use -1 for unbounded retry attempts.", + help="Retry killed jobs this many times. Use -1 for unbounded retry attempts (cannot be used with --slurm-array)", ) slurm_args.add_argument( "--use-seff", dest="seff", action="store_true", - help="Use seff instead of sacct for resource measurement data.", + help="Use seff instead of sacct for resource measurement data (cannot be used with --slurm-array).", + ) + + slurm_args.add_argument( + "--aggregation-factor", + dest="aggregation_factor", + type=int, + default="10", + help="Aggregation factor for batch jobs (this many tasks will run in a single SLURM job)", + ) + slurm_args.add_argument( + "--batch-size", + dest="batch_size", + type=int, + default="5000", + help="Split run sets into batches of at most this size. 
Helpful in avoiding errors with script sizes.", ) return parser def load_executor(self): + if self.config.slurm_array: + from slurm import arrayexecutor as executor if self.config.slurm: from slurm import slurmexecutor as executor else: diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py new file mode 100644 index 000000000..47a3c28d3 --- /dev/null +++ b/contrib/slurm/arrayexecutor.py @@ -0,0 +1,286 @@ +# This file is part of BenchExec, a framework for reliable benchmarking: +# https://github.com/sosy-lab/benchexec +# +# SPDX-FileCopyrightText: 2007-2020 Dirk Beyer +# SPDX-FileCopyrightText: 2024 Levente Bajczi +# SPDX-FileCopyrightText: Critical Systems Research Group +# SPDX-FileCopyrightText: Budapest University of Technology and Economics +# +# SPDX-License-Identifier: Apache-2.0 +import itertools +import logging +import os +import queue +import re +import subprocess +import sys +import tempfile +import threading +import time + +from benchexec import BenchExecException, tooladapter, util +from benchexec.util import ProcessExitCode + +sys.dont_write_bytecode = True # prevent creation of .pyc files + +WORKER_THREADS = [] +STOPPED_BY_INTERRUPT = False + + +def init(config, benchmark): + tool_locator = tooladapter.create_tool_locator(config) + benchmark.executable = benchmark.tool.executable(tool_locator) + try: + benchmark.tool_version = benchmark.tool.version(benchmark.executable) + except Exception as e: + logging.warning("could not determine version due to error: %s", e) + benchmark.tool_version = None + + +def get_system_info(): + return None + + +def execute_benchmark(benchmark, output_handler): + if benchmark.config.use_hyperthreading: + sys.exit( + "SLURM can only work properly without hyperthreading enabled, by passing the --no-hyperthreading option. See README.md for details." 
+ ) + + for runSet in benchmark.run_sets: + if STOPPED_BY_INTERRUPT: + break + + if not runSet.should_be_executed(): + output_handler.output_for_skipping_run_set(runSet) + + elif not runSet.runs: + output_handler.output_for_skipping_run_set( + runSet, "because it has no files" + ) + + else: + _execute_run_set( + runSet, + benchmark, + output_handler, + ) + + output_handler.output_after_benchmark(STOPPED_BY_INTERRUPT) + +sbatch_pattern = re.compile(r"Submitted batch job (\d+)") + +def _execute_run_set( + runSet, + benchmark, + output_handler, +): + global STOPPED_BY_INTERRUPT + + # get times before runSet + walltime_before = time.monotonic() + + output_handler.output_before_run_set(runSet) + + if not benchmark.config.scratchdir: + sys.exit("No scratchdir present. Please specify using --scratchdir .") + elif not os.path.exists(benchmark.config.scratchdir): + os.makedirs(benchmark.config.scratchdir) + logging.debug(f"Created scratchdir: {benchmark.config.scratchdir}") + elif not os.path.isdir(benchmark.config.scratchdir): + sys.exit( + f"Scratchdir {benchmark.config.scratchdir} not a directory. Please specify using --scratchdir ." 
+ ) + + # get times after runSet + walltime_after = time.monotonic() + usedWallTime = walltime_after - walltime_before + + for i in range(0, len(runSet.runs), benchmark.config.batch_size): + if not STOPPED_BY_INTERRUPT: + chunk = runSet.runs[i:min(i+benchmark.config.batch_size, len(runSet.runs))] + execute_batch(chunk, benchmark, output_handler) + + + if STOPPED_BY_INTERRUPT: + output_handler.set_error("interrupted", runSet) + + + output_handler.output_after_run_set( + runSet, + walltime=usedWallTime, + ) + +def execute_batch( + runs, + benchmark, + output_handler, +): + global STOPPED_BY_INTERRUPT + number_of_bins = int(len(runs) / benchmark.config.aggregation_factor) + 1 + + with tempfile.TemporaryDirectory(dir=benchmark.config.scratchdir) as tempdir: + batch_lines = ["#!/bin/sh"] + + for setting in get_resource_limits(benchmark, tempdir): + batch_lines.extend(["\n#SBATCH " + str(setting)]) + + batch_lines.extend([f"\n#SBATCH --array=0-{number_of_bins - 1}%{benchmark.num_of_threads}"]) + batch_lines.extend(["\n\ncase $SLURM_ARRAY_TASK_ID in"]) + + bins={} + # put all runs into a queue + for i, run in enumerate(runs): + if i % number_of_bins not in bins: + bins[i % number_of_bins] = [] + bins[i % number_of_bins].append((i, run)) + + for bin in bins: + batch_lines.extend(["\n" + str(bin) + ") "]) + for (i, run) in bins[bin]: + batch_lines.extend(["\n " + str(get_run_cli(benchmark, run.cmdline(), os.path.join(tempdir, str(i))))]) + batch_lines.extend(["\n;;"]) + + batch_lines.extend(["\nesac"]) + + batchfile = os.path.join(tempdir, "array.sbatch") + with open(batchfile, "w") as f: + f.writelines(batch_lines) + + try: + sbatch_cmd = ["sbatch", "--wait", str(batchfile)] + logging.debug( + "Command to run: %s", " ".join(map(util.escape_string_shell, sbatch_cmd)) + ) + sbatch_result = subprocess.run( + sbatch_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + + except KeyboardInterrupt: + STOPPED_BY_INTERRUPT = True + + if STOPPED_BY_INTERRUPT: + 
logging.debug(f"Canceling sbatch job if already started") + if sbatch_result and sbatch_result.stdout: + for line in sbatch_result.stdout.splitlines(): + jobid_match = sbatch_pattern.search(str(line)) + if jobid_match: + jobid = int(jobid_match.group(1)) + logging.debug(f"Canceling sbatch job #{jobid}") + subprocess.run(["scancel", str(jobid)]) + + for i, run in enumerate(runs): + try: + run.set_result(get_run_result(os.path.join(tempdir, str(i)), run)) + output_handler.output_after_run(run) + except: + logging.debug(f"Output missing for run #{i}") + + +def stop(): + global STOPPED_BY_INTERRUPT + STOPPED_BY_INTERRUPT = True + +def get_resource_limits(benchmark, tempdir): + timelimit = benchmark.rlimits.cputime*benchmark.config.aggregation_factor*2 # safe overapprox + cpus = benchmark.rlimits.cpu_cores + memory = benchmark.rlimits.memory*1.5 # so that runexec catches the OOM, not SLURM + os.makedirs(os.path.join(tempdir, "logs"), exist_ok=True) + + srun_timelimit_h = int(timelimit / 3600) + srun_timelimit_m = int((timelimit % 3600) / 60) + srun_timelimit_s = int(timelimit % 60) + srun_timelimit = f"{srun_timelimit_h:02d}:{srun_timelimit_m:02d}:{srun_timelimit_s:02d}" + + ret = [f"--output={tempdir}/logs/%A_%a.out", + "--time=" + str(srun_timelimit), + "--cpus-per-task=" + str(cpus), + "--mem=" + str(int(memory / 1000000)) + "M", + "--threads-per-core=1", # --use_hyperthreading=False is always given here + "--mincpus=" + str(cpus), + "--ntasks=1"] + return ret + + +def get_run_cli(benchmark, args, tempdir): + os.makedirs(os.path.join(tempdir, "upper")) + os.makedirs(os.path.join(tempdir, "work")) + cli = [] + runexec = ["python3", "benchexec/bin/runexec", "--no-container"] + if benchmark.rlimits.cputime_hard: + runexec.extend(["--timelimit", str(benchmark.rlimits.cputime_hard)]) + if benchmark.rlimits.cputime: + runexec.extend(["--softtimelimit", str(benchmark.rlimits.cputime)]) + if benchmark.rlimits.walltime: + runexec.extend(["--walltimelimit", 
str(benchmark.rlimits.walltime)]) + # if benchmark.rlimits.cpu_cores: + # runexec.extend(["--???", str(benchmark.rlimits.cpu_cores)]) + if benchmark.rlimits.memory: + runexec.extend(["--memlimit", str(benchmark.rlimits.memory)]) + + args = [*runexec, "--", *args] + + if benchmark.config.singularity: + cli.extend( + [ + "singularity", + "exec", + "-B", + "/sys/fs/cgroup:/sys/fs/cgroup", + "-B", + "./:/lower", + "--no-home", + "-B", + f"{tempdir}:/overlay", + "--fusemount", + f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work /home/{os.getlogin()}", + benchmark.config.singularity, + ] + ) + cli.extend( + [ + "sh", + "-c", + f"{' '.join(map(util.escape_string_shell, args))} > log 2>&1" + ] + ) + + logging.debug( + "Command to run: %s", " ".join(map(util.escape_string_shell, cli)) + ) + return " ".join(map(util.escape_string_shell, cli)) + +def get_run_result(tempdir, run): + runexec_log = f"{tempdir}/upper/log" + tmp_log = f"{tempdir}/upper/output.log" + + data_dict = {} + with open(runexec_log, "r") as file: + for line in file: + line = line.strip() + if line and '=' in line: + key, value = line.split('=', 1) + data_dict[key.strip()] = value.strip() + + ret = {} + if "walltime" in data_dict: + ret["walltime"] = float(data_dict["walltime"][:-1]) # ends in 's' + if "cputime" in data_dict: + ret["cputime"] = float(data_dict["cputime"][:-1]) # ends in 's' + if "memory" in data_dict: + ret["memory"] = int(data_dict["memory"][:-1]) # ends in 'B' + if "returnvalue" in data_dict: + ret["exitcode"] = ProcessExitCode.create(value=int(data_dict["returnvalue"])) + if "exitsignal" in data_dict: + ret["exitcode"] = ProcessExitCode.create(signal=int(data_dict["exitsignal"])) + if "terminationreason" in data_dict: + ret["terminationreason"] = data_dict["terminationreason"] + + with open(run.log_file, "w+") as file: + with open(tmp_log, "r") as log_source: + content = log_source.read() + file.write(content) + + return ret \ No newline at 
end of file diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index b1d1a1769..596685929 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -30,11 +30,7 @@ def init(config, benchmark): tool_locator = tooladapter.create_tool_locator(config) benchmark.executable = benchmark.tool.executable(tool_locator) - try: - benchmark.tool_version = benchmark.tool.version(benchmark.executable) - except Exception as e: - logging.warning("could not determine version due to error: %s", e) - benchmark.tool_version = None + benchmark.tool_version = benchmark.tool.version(benchmark.executable) def get_system_info(): @@ -68,207 +64,459 @@ def execute_benchmark(benchmark, output_handler): output_handler.output_after_benchmark(STOPPED_BY_INTERRUPT) -aggregation_factor = 25 - -sbatch_pattern = re.compile(r"Submitted batch job (\d+)") def _execute_run_set( runSet, benchmark, output_handler, ): - global STOPPED_BY_INTERRUPT - # get times before runSet walltime_before = time.monotonic() output_handler.output_before_run_set(runSet) - if not benchmark.config.scratchdir: - sys.exit("No scratchdir present. Please specify using --scratchdir .") - elif not os.path.exists(benchmark.config.scratchdir): - os.makedirs(benchmark.config.scratchdir) - logging.debug(f"Created scratchdir: {benchmark.config.scratchdir}") - elif not os.path.isdir(benchmark.config.scratchdir): - sys.exit( - f"Scratchdir {benchmark.config.scratchdir} not a directory. Please specify using --scratchdir ." 
- ) + # put all runs into a queue + for run in runSet.runs: + _Worker.working_queue.put(run) - number_of_bins = int(len(runSet.runs) / aggregation_factor) + 1 + # keep a counter of unfinished runs for the below assertion + unfinished_runs = len(runSet.runs) + unfinished_runs_lock = threading.Lock() - with tempfile.TemporaryDirectory(dir=benchmark.config.scratchdir) as tempdir: - batch_lines = ["#!/bin/bash"] + def run_finished(): + nonlocal unfinished_runs + with unfinished_runs_lock: + unfinished_runs -= 1 - for setting in get_resource_limits(benchmark, tempdir): - batch_lines.extend(["\n#SBATCH " + str(setting)]) + # create some workers + for _ in range(min(benchmark.num_of_threads, unfinished_runs)): + if STOPPED_BY_INTERRUPT: + break + WORKER_THREADS.append(_Worker(benchmark, output_handler, run_finished)) + + # wait until workers are finished (all tasks done or STOPPED_BY_INTERRUPT) + for worker in WORKER_THREADS: + worker.join() + assert unfinished_runs == 0 or STOPPED_BY_INTERRUPT + + # get times after runSet + walltime_after = time.monotonic() + usedWallTime = walltime_after - walltime_before + + if STOPPED_BY_INTERRUPT: + output_handler.set_error("interrupted", runSet) + output_handler.output_after_run_set( + runSet, + walltime=usedWallTime, + ) - batch_lines.extend([f"\n#SBATCH --array=0-{number_of_bins - 1}%{benchmark.num_of_threads}"]) - batch_lines.extend(["\n\ncase $SLURM_ARRAY_TASK_ID in"]) - bins={} - # put all runs into a queue - for i, run in enumerate(runSet.runs): - if i % number_of_bins not in bins: - bins[i % number_of_bins] = [] - bins[i % number_of_bins].append((i, run)) +def stop(): + global STOPPED_BY_INTERRUPT + STOPPED_BY_INTERRUPT = True + - for bin in bins: - batch_lines.extend(["\n" + str(bin) + ") "]) - for (i, run) in bins[bin]: - batch_lines.extend(["\n " + str(get_run_cli(benchmark, run.cmdline(), os.path.join(tempdir, str(i))))]) - batch_lines.extend(["\n;;"]) +class _Worker(threading.Thread): + """ + A Worker is a deamonic 
thread, that takes jobs from the working_queue and runs them. + """ - batch_lines.extend(["\nesac"]) + working_queue = queue.Queue() - batchfile = os.path.join(tempdir, "array.sbatch") - with open(batchfile, "w") as f: - f.writelines(batch_lines) + def __init__(self, benchmark, output_handler, run_finished_callback): + threading.Thread.__init__(self) # constuctor of superclass + self.run_finished_callback = run_finished_callback + self.benchmark = benchmark + self.output_handler = output_handler + self.setDaemon(True) + + self.start() + + def run(self): + while not STOPPED_BY_INTERRUPT: + try: + currentRun = _Worker.working_queue.get_nowait() + except queue.Empty: + return + + try: + logging.debug('Executing run "%s"', currentRun.identifier) + self.execute(currentRun) + logging.debug('Finished run "%s"', currentRun.identifier) + except SystemExit as e: + logging.critical(e) + except BenchExecException as e: + logging.critical(e) + except BaseException: + logging.exception("Exception during run execution") + self.run_finished_callback() + _Worker.working_queue.task_done() + + def execute(self, run): + """ + This function executes the tool with a sourcefile with options. + It also calls functions for output before and after the run. 
+ """ + self.output_handler.output_before_run(run) + + args = run.cmdline() + logging.debug("Command line of run is %s", args) try: - sbatch_cmd = ["sbatch", "--wait", str(batchfile)] - logging.debug( - "Command to run: %s", " ".join(map(util.escape_string_shell, sbatch_cmd)) - ) - sbatch_result = subprocess.run( - sbatch_cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - ) + attempts = 0 + while True: + run_result = run_slurm( + self.benchmark, + args, + run.log_file, + ) + if run_result is None: + stop() + else: + if ( + "terminationreason" not in run_result + or not run_result["terminationreason"] == "killed" + or (attempts >= self.benchmark.config.retry >= 0) + or STOPPED_BY_INTERRUPT + ): + break + attempts += 1 + time.sleep(1) # as to not overcrowd a failing scheduler + logging.debug( + "Retrying after %d attempts, limit: %d", + attempts, + self.benchmark.config.retry, + ) except KeyboardInterrupt: # If the run was interrupted, we ignore the result and cleanup. - STOPPED_BY_INTERRUPT = True + stop() if STOPPED_BY_INTERRUPT: - logging.debug(f"Canceling sbatch job if already started") - if sbatch_result and sbatch_result.stdout: - for line in sbatch_result.stdout.splitlines(): - jobid_match = sbatch_pattern.search(str(line)) - if jobid_match: - jobid = int(jobid_match.group(1)) - logging.debug(f"Canceling sbatch job #{jobid}") - subprocess.run(["scancel", str(jobid)]) - - for i, run in enumerate(runSet.runs): try: - run.set_result(get_run_result(os.path.join(tempdir, str(i)), run)) - output_handler.output_after_run(run) - except: - logging.debug(f"Output missing for run #{i}") + if self.benchmark.config.debug: + os.rename(run.log_file, run.log_file + ".killed") + else: + os.remove(run.log_file) + except OSError: + pass + return 1 + + run.set_result(run_result) + self.output_handler.output_after_run(run) + return None + + +jobid_pattern = re.compile(r"job (\d*) started") + + +def wait_for(func, timeout_sec=None, poll_interval_sec=1): + """ + Waits 
until the func() returns non-None + :param func: function to call until a value is returned + :param timeout_sec: How much time to give up after + :param poll_interval_sec: How frequently to check the result + """ + start_time = time.monotonic() + + while not STOPPED_BY_INTERRUPT: + ret = func() + if ret is not None: + return ret + + if timeout_sec is not None and time.monotonic() - start_time > timeout_sec: + raise BenchExecException( + "Timeout exceeded for waiting for job to realize it has finished. Scheduler may be failing." + ) - # get times after runSet - walltime_after = time.monotonic() - usedWallTime = walltime_after - walltime_before + time.sleep(poll_interval_sec) - if STOPPED_BY_INTERRUPT: - output_handler.set_error("interrupted", runSet) - output_handler.output_after_run_set( - runSet, - walltime=usedWallTime, - ) -def stop(): +def run_slurm(benchmark, args, log_file): global STOPPED_BY_INTERRUPT - STOPPED_BY_INTERRUPT = True -def get_resource_limits(benchmark, tempdir): - timelimit = benchmark.rlimits.cputime*aggregation_factor*2 # safe overapprox + timelimit = benchmark.rlimits.cputime cpus = benchmark.rlimits.cpu_cores - memory = benchmark.rlimits.memory*1.5 # so that runexec catches the OOM, not SLURM - os.makedirs(os.path.join(tempdir, "logs"), exist_ok=True) + memory = benchmark.rlimits.memory srun_timelimit_h = int(timelimit / 3600) srun_timelimit_m = int((timelimit % 3600) / 60) srun_timelimit_s = int(timelimit % 60) - srun_timelimit = f"{srun_timelimit_h:02d}:{srun_timelimit_m:02d}:{srun_timelimit_s:02d}" - - ret = [f"--output={tempdir}/logs/%A_%a.out", - "--time=" + str(srun_timelimit), - "--cpus-per-task=" + str(cpus), - "--mem=" + str(int(memory / 1000000)) + "M", - "--threads-per-core=1", # --use_hyperthreading=False is always given here - "--mincpus=" + str(cpus), - "--ntasks=1"] - return ret - - -def get_run_cli(benchmark, args, tempdir): - os.makedirs(os.path.join(tempdir, "upper")) - os.makedirs(os.path.join(tempdir, "work")) - cli = 
[] - runexec = ["python3", "benchexec/bin/runexec", "--no-container"] - if benchmark.rlimits.cputime_hard: - runexec.extend(["--timelimit", str(benchmark.rlimits.cputime_hard)]) - if benchmark.rlimits.cputime: - runexec.extend(["--softtimelimit", str(benchmark.rlimits.cputime)]) - if benchmark.rlimits.walltime: - runexec.extend(["--walltimelimit", str(benchmark.rlimits.walltime)]) - # if benchmark.rlimits.cpu_cores: - # runexec.extend(["--???", str(benchmark.rlimits.cpu_cores)]) - if benchmark.rlimits.memory: - runexec.extend(["--memlimit", str(benchmark.rlimits.memory)]) - - args = [*runexec, "--", *args] - - if benchmark.config.singularity: - cli.extend( - [ - "singularity", - "exec", - "-B", - "/sys/fs/cgroup:/sys/fs/cgroup", - "-B", - "./:/lower", - "--no-home", - "-B", - f"{tempdir}:/overlay", - "--fusemount", - f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work /home/{os.getlogin()}", - benchmark.config.singularity, - ] + srun_timelimit = f"{srun_timelimit_h}:{srun_timelimit_m}:{srun_timelimit_s}" + + if not benchmark.config.scratchdir: + sys.exit("No scratchdir present. Please specify using --scratchdir .") + elif not os.path.exists(benchmark.config.scratchdir): + os.makedirs(benchmark.config.scratchdir) + logging.debug(f"Created scratchdir: {benchmark.config.scratchdir}") + elif not os.path.isdir(benchmark.config.scratchdir): + sys.exit( + f"Scratchdir {benchmark.config.scratchdir} not a directory. Please specify using --scratchdir ." 
) - cli.extend( - [ - "sh", + + with tempfile.TemporaryDirectory(dir=benchmark.config.scratchdir) as tempdir: + tmp_log = os.path.join(tempdir, "log") + + os.makedirs(os.path.join(tempdir, "upper")) + os.makedirs(os.path.join(tempdir, "work")) + + exitcode_file = f"{tempdir}/upper/exitcode" + + srun_command = [ + "srun", + "--quit-on-interrupt", + "-t", + str(srun_timelimit), "-c", - f"{' '.join(map(util.escape_string_shell, args))} > log 2>&1" + str(cpus), + "--mem", + str(int(memory / 1000000)) + "M", + "--threads-per-core=1", # --use_hyperthreading=False is always given here + "--ntasks=1", ] + if benchmark.config.singularity: + srun_command.extend( + [ + "singularity", + "exec", + "-B", + "./:/lower", + "--no-home", + "-B", + f"{tempdir}:/overlay", + "--fusemount", + f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work /home/{os.getlogin()}", + benchmark.config.singularity, + ] + ) + srun_command.extend( + [ + "sh", + "-c", + f"echo job $SLURM_JOB_ID started; {' '.join(map(util.escape_string_shell, args))}; echo $? 
> exitcode", + ] + ) + + logging.debug( + "Command to run: %s", " ".join(map(util.escape_string_shell, srun_command)) + ) + jobid = None + while jobid is None and not STOPPED_BY_INTERRUPT: + with open(tmp_log, "w") as tmp_log_f: + subprocess.run( + srun_command, + stdout=tmp_log_f, + stderr=subprocess.STDOUT, + ) + + if ( + STOPPED_BY_INTERRUPT + ): # job cancelled while srun was running, log not necessarily finalized + return + + # we try to read back the log, in the first three lines, there should be the jobid + with open(tmp_log, "r") as tmp_log_f: + for line in itertools.islice(tmp_log_f, 3): + jobid_match = jobid_pattern.search(line) + if jobid_match: + jobid = int(jobid_match.group(1)) + break + logging.debug("Pattern not found in log line: %s", line) + + if ( + STOPPED_BY_INTERRUPT + ): # job was cancelled during log parsing, no job id present + return + + raw_output, slurm_status, exit_code, cpu_time, wall_time, memory_usage = ( + run_seff(jobid) if benchmark.config.seff else run_sacct(jobid) + ) + + if os.path.exists(exitcode_file): + with open(exitcode_file, "r") as f: + returncode = int(f.read()) + logging.debug("Exit code in file %s: %d", exitcode_file, returncode) + else: + assert ( + slurm_status != "COMPLETED" + ), "Should never happen: exit code not found, but task was reported COMPLETED." 
+ logging.debug("Exit code not found in file: %s", exitcode_file) + returncode = 0 + + ret = { + "walltime": wall_time, + "cputime": cpu_time, + "memory": memory_usage, + "exitcode": ProcessExitCode.create(value=returncode), + } + + if slurm_status != "COMPLETED": + ret["terminationreason"] = { + "OUT_OF_MEMORY": "memory", + "OUT_OF_ME+": "memory", + "TIMEOUT": "cputime", + "ERROR": "failed", + "FAILED": "killed", + "CANCELLED": "killed", + }.get(slurm_status, slurm_status) + + # Runexec would populate the first 6 lines with metadata + with open(log_file, "w+") as file: + with open(tmp_log, "r") as log_source: + content = log_source.read() + file.write(f"{' '.join(map(util.escape_string_shell, args))}") + file.write("\n\n\n" + "-" * 80 + "\n\n\n") + file.write(content) + if content == "": + file.write("Original log file did not contain anything.") + + if benchmark.config.debug: + with open(log_file + ".debug_info", "w+") as file: + file.write(f"jobid: {jobid}\n") + file.write(f"seff output: {str(raw_output)}\n") + file.write(f"Parsed data: {str(ret)}\n") + + return ret + + +time_pattern = re.compile(r"(?:(\d+):)?(\d+):(\d+)(?:\.(\d+))?") + + +def get_seconds_from_time(time_str): + time_match = time_pattern.search(time_str) + if time_match: + hours, minutes, seconds, millis = time_match.groups() + if hours is None: + hours = 0 + if minutes is None: + minutes = 0 # realistically never None, but doesn't hurt + if seconds is None: + seconds = 0 # realistically never None, but doesn't hurt + if millis is None: + millis = 0 + return int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(millis) / 1000 + + +def run_sacct(jobid): + global STOPPED_BY_INTERRUPT + + sacct_command = [ + "sacct", + "-j", + str(jobid), + "-n", + "--format=State,ExitCode,TotalCpu,Elapsed,MaxRSS", + ] + logging.debug( + "Command to run: %s", " ".join(map(util.escape_string_shell, sacct_command)) ) + def get_checked_sacct_result(): + sacct_result = subprocess.run( + sacct_command, + 
stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + lines = sacct_result.stdout.splitlines() + if len(lines) < 2: + logging.debug("Sacct output not yet ready: %s", lines) + return None # jobs not yet ready + parent_job = lines[0].split() # State is read from here + child_job = lines[ + 1 + ].split() # ExitCode, TotalCPU, Elapsed and MaxRSS read from here + logging.debug("Sacct data: parent: %s; child: %s", parent_job, child_job) + if parent_job[0].decode() in [ + "RUNNING", + "PENDING", + "REQUEUED", + "RESIZING", + "SUSPENDED", + "R", + "PD", + "RQ", + "RS", + "S", + ]: + logging.debug( + "Sacct output not yet ready due to state: %s", parent_job[0].decode() + ) + return None # not finished + if len(child_job) < 5: + logging.debug( + "Sacct output not yet ready due to memory not available: %s", child_job + ) + return None # not finished + return ( + sacct_result.stdout, + parent_job[0].decode(), # State + child_job[1].decode().split(":")[0], # ExitCode + get_seconds_from_time(child_job[2].decode()), # TotalCPU in seconds + get_seconds_from_time(child_job[3].decode()), # Elapsed in seconds + float(child_job[4].decode()[:-1]) * 1000, + ) # MaxRSS in K * 1000 -> Bytes + + # sometimes `seff` needs a few extra seconds to realize the task has ended + return wait_for(get_checked_sacct_result, 30, 2) + + +def run_seff(jobid): + global STOPPED_BY_INTERRUPT + + seff_command = ["seff", str(jobid)] + logging.debug( + "Command to run: %s", " ".join(map(util.escape_string_shell, seff_command)) + ) + + def get_checked_seff_result(): + seff_result = subprocess.run( + seff_command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + if "exit code" in str(seff_result.stdout): + return seff_result + else: + return None + + # sometimes `seff` needs a few extra seconds to realize the task has ended + result = wait_for(get_checked_seff_result, 30, 2) + if STOPPED_BY_INTERRUPT: # job was cancelled + return + + return result.stdout, *parse_seff(str(result.stdout)) + + 
+exit_code_pattern = re.compile(r"State: ([A-Z-_]*) \(exit code (\d+)\)") +cpu_time_pattern = re.compile(r"CPU Utilized: (\d+):(\d+):(\d+)") +wall_time_pattern = re.compile(r"Job Wall-clock time: (\d+):(\d+):(\d+)") +memory_pattern = re.compile(r"Memory Utilized: (\d+\.\d+) MB") + + +def parse_seff(result): + logging.debug(f"Got output from seff: {result}") + exit_code_match = exit_code_pattern.search(result) + cpu_time_match = cpu_time_pattern.search(result) + wall_time_match = wall_time_pattern.search(result) + memory_match = memory_pattern.search(result) + exit_code = None + if exit_code_match: + slurm_status = str(exit_code_match.group(1)) + exit_code = int(exit_code_match.group(2)) + else: + slurm_status = "ERROR" + cpu_time = None + if cpu_time_match: + hours, minutes, seconds = map(int, cpu_time_match.groups()) + cpu_time = hours * 3600 + minutes * 60 + seconds + wall_time = None + if wall_time_match: + hours, minutes, seconds = map(int, wall_time_match.groups()) + wall_time = hours * 3600 + minutes * 60 + seconds + memory_usage = float(memory_match.group(1)) * 1000000 if memory_match else None + logging.debug( - "Command to run: %s", " ".join(map(util.escape_string_shell, cli)) + f"Exit code: {exit_code}, memory usage: {memory_usage}, walltime: {wall_time}, cpu time: {cpu_time}" ) - return " ".join(map(util.escape_string_shell, cli)) - -def get_run_result(tempdir, run): - runexec_log = f"{tempdir}/upper/log" - tmp_log = f"{tempdir}/upper/output.log" - - data_dict = {} - with open(runexec_log, "r") as file: - for line in file: - line = line.strip() - if line and '=' in line: - key, value = line.split('=', 1) - data_dict[key.strip()] = value.strip() - - ret = {} - if "walltime" in data_dict: - ret["walltime"] = float(data_dict["walltime"][:-1]) # ends in 's' - if "cputime" in data_dict: - ret["cputime"] = float(data_dict["cputime"][:-1]) # ends in 's' - if "memory" in data_dict: - ret["memory"] = int(data_dict["memory"][:-1]) # ends in 'B' - if "returnvalue" 
in data_dict: - ret["exitcode"] = ProcessExitCode.create(value=int(data_dict["returnvalue"])) - if "exitsignal" in data_dict: - ret["exitcode"] = ProcessExitCode.create(signal=int(data_dict["exitsignal"])) - if "terminationreason" in data_dict: - ret["terminationreason"] = data_dict["terminationreason"] - - with open(run.log_file, "w+") as file: - with open(tmp_log, "r") as log_source: - content = log_source.read() - file.write(content) - - return ret \ No newline at end of file + + return slurm_status, exit_code, cpu_time, wall_time, memory_usage From 2fbcac4c973278a0b7d9306e38064a7c2080364f Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 21 Oct 2024 12:42:24 +0200 Subject: [PATCH 021/124] Finalized arrayexecutor --- contrib/slurm-benchmark.py | 13 +++- contrib/slurm/arrayexecutor.py | 129 +++++++++++++++++++++++---------- 2 files changed, 102 insertions(+), 40 deletions(-) diff --git a/contrib/slurm-benchmark.py b/contrib/slurm-benchmark.py index d255357fb..32daa7e81 100755 --- a/contrib/slurm-benchmark.py +++ b/contrib/slurm-benchmark.py @@ -67,7 +67,7 @@ def create_argument_parser(self): dest="retry", type=int, default="0", - help="Retry killed jobs this many times. Use -1 for unbounded retry attempts (cannot be used with --slurm-array)", + help="Retry killed jobs this many times. Use -1 for unbounded retry attempts (cannot be used with --slurm-array).", ) slurm_args.add_argument( "--use-seff", @@ -81,7 +81,7 @@ def create_argument_parser(self): dest="aggregation_factor", type=int, default="10", - help="Aggregation factor for batch jobs (this many tasks will run in a single SLURM job)", + help="Aggregation factor for batch jobs (this many tasks will run in a single SLURM job).", ) slurm_args.add_argument( "--batch-size", @@ -90,13 +90,20 @@ def create_argument_parser(self): default="5000", help="Split run sets into batches of at most this size. 
Helpful in avoiding errors with script sizes.", ) + slurm_args.add_argument( + "--parallelization", + dest="concurrency_factor", + type=int, + default="4", + help="Run this many tasks at once in one job.", + ) return parser def load_executor(self): if self.config.slurm_array: from slurm import arrayexecutor as executor - if self.config.slurm: + elif self.config.slurm: from slurm import slurmexecutor as executor else: logging.warning( diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 47a3c28d3..11425c816 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -68,12 +68,14 @@ def execute_benchmark(benchmark, output_handler): output_handler.output_after_benchmark(STOPPED_BY_INTERRUPT) + sbatch_pattern = re.compile(r"Submitted batch job (\d+)") + def _execute_run_set( - runSet, - benchmark, - output_handler, + runSet, + benchmark, + output_handler, ): global STOPPED_BY_INTERRUPT @@ -98,48 +100,94 @@ def _execute_run_set( for i in range(0, len(runSet.runs), benchmark.config.batch_size): if not STOPPED_BY_INTERRUPT: - chunk = runSet.runs[i:min(i+benchmark.config.batch_size, len(runSet.runs))] + chunk = runSet.runs[i:min(i + benchmark.config.batch_size, len(runSet.runs))] execute_batch(chunk, benchmark, output_handler) - if STOPPED_BY_INTERRUPT: output_handler.set_error("interrupted", runSet) - output_handler.output_after_run_set( runSet, walltime=usedWallTime, ) + +def get_cpu_cmd(concurrency_factor, cores): + get_cpus = ("cpus=($(scontrol show job -d \"$SLURM_JOB_ID\" | grep -o 'CPU_IDs=[^ ]*' | " + "awk -F= ' { print $2 } ' | head -n1 | " + "awk -F, ' { for (i = 1; i <= NF; i++ ) { if ($i ~ /-/) " + "{ split($i, range, \"-\"); for (j = range[1]; j <= range[2]; j++ ) { print j } } " + "else { print $i } } }'))") + for i in range(concurrency_factor): + get_cpus = get_cpus + f"\nexport cpuset{i}=$(IFS=,; echo \"${{cpus[*]:{i * cores}:{cores}}}\")" + return get_cpus + + +def lock_cpu_cmds(concurrency_factor, 
tempdir, bin): + lock_cpus = "CPUSET=\"\"; while ! {" + for i in range(concurrency_factor): + lock_cpus = lock_cpus + f" {{ mkdir {tempdir}/cpuset_{bin}_{i} 2>/dev/null && cpuset={i} && CPUSET=\"$cpuset{i}\"; }}" + if (i == concurrency_factor - 1): + lock_cpus = lock_cpus + "; }; do sleep 1; done" + else: + lock_cpus = lock_cpus + " ||" + unlock_cpus = f"rm -r {tempdir}/cpuset_{bin}_$cpuset" + return lock_cpus, unlock_cpus + + def execute_batch( - runs, - benchmark, - output_handler, + runs, + benchmark, + output_handler, ): global STOPPED_BY_INTERRUPT number_of_bins = int(len(runs) / benchmark.config.aggregation_factor) + 1 + use_concurrency = benchmark.config.concurrency_factor != 1 + if use_concurrency: + get_cpus = get_cpu_cmd(benchmark.config.concurrency_factor, benchmark.rlimits.cpu_cores) + with tempfile.TemporaryDirectory(dir=benchmark.config.scratchdir) as tempdir: - batch_lines = ["#!/bin/sh"] + batch_lines = ["#!/bin/bash"] for setting in get_resource_limits(benchmark, tempdir): batch_lines.extend(["\n#SBATCH " + str(setting)]) batch_lines.extend([f"\n#SBATCH --array=0-{number_of_bins - 1}%{benchmark.num_of_threads}"]) - batch_lines.extend(["\n\ncase $SLURM_ARRAY_TASK_ID in"]) + batch_lines.extend(["\n\nTMPDIR=$(mktemp -d)"]) - bins={} + bins = {} # put all runs into a queue for i, run in enumerate(runs): if i % number_of_bins not in bins: bins[i % number_of_bins] = [] bins[i % number_of_bins].append((i, run)) - for bin in bins: - batch_lines.extend(["\n" + str(bin) + ") "]) - for (i, run) in bins[bin]: - batch_lines.extend(["\n " + str(get_run_cli(benchmark, run.cmdline(), os.path.join(tempdir, str(i))))]) - batch_lines.extend(["\n;;"]) + if use_concurrency: + batch_lines.extend(["\n\n" + get_cpus]) + batch_lines.extend(["\n\ncase $SLURM_ARRAY_TASK_ID in"]) + for bin in bins: + lock_cpus, unlock_cpus = lock_cpu_cmds(benchmark.config.concurrency_factor, tempdir, bin) + batch_lines.extend(["\n" + str(bin) + ") "]) + taskfile_name = f"bin{str(bin)}.tasks" 
+ taskfile = os.path.join(tempdir, taskfile_name) + with open(taskfile, "w") as f: + task_lines = [] + for (i, run) in bins[bin]: + task_lines.extend( + [lock_cpus + " && " + str(get_run_cli(benchmark, run.cmdline(), os.path.join("$TMPDIR", str(i)), os.path.join(tempdir, str(i)))) + "; " + unlock_cpus + "\n"]) + f.writelines(task_lines) + batch_lines.extend(f"\n while read -r x; do /bin/sh -c \"$x\" & done < {taskfile}") + batch_lines.extend("\n wait") + batch_lines.extend(["\n;;"]) + else: + batch_lines.extend(["\n\ncase $SLURM_ARRAY_TASK_ID in"]) + for bin in bins: + batch_lines.extend(["\n" + str(bin) + ") "]) + for (i, run) in bins[bin]: + batch_lines.extend( + ["\n " + str(get_run_cli(benchmark, run.cmdline(), os.path.join("$TMPDIR", str(i)), os.path.join(tempdir, str(i))))]) + batch_lines.extend(["\n;;"]) batch_lines.extend(["\nesac"]) @@ -183,10 +231,11 @@ def stop(): global STOPPED_BY_INTERRUPT STOPPED_BY_INTERRUPT = True + def get_resource_limits(benchmark, tempdir): - timelimit = benchmark.rlimits.cputime*benchmark.config.aggregation_factor*2 # safe overapprox - cpus = benchmark.rlimits.cpu_cores - memory = benchmark.rlimits.memory*1.5 # so that runexec catches the OOM, not SLURM + timelimit = benchmark.rlimits.cputime * benchmark.config.aggregation_factor * 2 # safe overapprox + cpus = benchmark.rlimits.cpu_cores * benchmark.config.concurrency_factor + memory = benchmark.rlimits.memory * benchmark.config.concurrency_factor * 1.5 # so that runexec catches the OOM, not SLURM os.makedirs(os.path.join(tempdir, "logs"), exist_ok=True) srun_timelimit_h = int(timelimit / 3600) @@ -204,19 +253,18 @@ def get_resource_limits(benchmark, tempdir): return ret -def get_run_cli(benchmark, args, tempdir): - os.makedirs(os.path.join(tempdir, "upper")) - os.makedirs(os.path.join(tempdir, "work")) +def get_run_cli(benchmark, args, tempdir, resultdir): + os.makedirs(resultdir) cli = [] - runexec = ["python3", "benchexec/bin/runexec", "--no-container"] + runexec = 
["python3", "benchexec/bin/runexec", "--no-container", "--debug"] if benchmark.rlimits.cputime_hard: runexec.extend(["--timelimit", str(benchmark.rlimits.cputime_hard)]) if benchmark.rlimits.cputime: runexec.extend(["--softtimelimit", str(benchmark.rlimits.cputime)]) if benchmark.rlimits.walltime: runexec.extend(["--walltimelimit", str(benchmark.rlimits.walltime)]) - # if benchmark.rlimits.cpu_cores: - # runexec.extend(["--???", str(benchmark.rlimits.cpu_cores)]) + if benchmark.config.concurrency_factor != 1: + runexec.extend(["--cores", "$CPUSET"]) if benchmark.rlimits.memory: runexec.extend(["--memlimit", str(benchmark.rlimits.memory)]) @@ -243,18 +291,25 @@ def get_run_cli(benchmark, args, tempdir): [ "sh", "-c", - f"{' '.join(map(util.escape_string_shell, args))} > log 2>&1" + f"touch started; " + f"{' '.join(map(util.escape_string_shell, ['echo', 'Running command: ', *args]))}; " + f"{' '.join(map(util.escape_string_shell, args))} 2>&1 | tee log; " + f"touch ended" ] ) - logging.debug( - "Command to run: %s", " ".join(map(util.escape_string_shell, cli)) - ) - return " ".join(map(util.escape_string_shell, cli)) + cli = " ".join(map(util.escape_string_shell, cli)) + cli = cli.replace("'\"'\"'$CPUSET'\"'\"'", "'$CPUSET'") + cli = cli.replace("'$TMPDIR", "\"$TMPDIR").replace(":/overlay'", ":/overlay\"") + cli = f"mkdir -p {tempdir}/{{upper,work}}; {cli}; mv {tempdir}/upper/* {resultdir}/; rm -r {tempdir}" + logging.debug("Command to run: %s", cli) + + return cli + def get_run_result(tempdir, run): - runexec_log = f"{tempdir}/upper/log" - tmp_log = f"{tempdir}/upper/output.log" + runexec_log = f"{tempdir}/log" + tmp_log = f"{tempdir}/output.log" data_dict = {} with open(runexec_log, "r") as file: @@ -266,11 +321,11 @@ def get_run_result(tempdir, run): ret = {} if "walltime" in data_dict: - ret["walltime"] = float(data_dict["walltime"][:-1]) # ends in 's' + ret["walltime"] = float(data_dict["walltime"][:-1]) # ends in 's' if "cputime" in data_dict: - ret["cputime"] 
= float(data_dict["cputime"][:-1]) # ends in 's' + ret["cputime"] = float(data_dict["cputime"][:-1]) # ends in 's' if "memory" in data_dict: - ret["memory"] = int(data_dict["memory"][:-1]) # ends in 'B' + ret["memory"] = int(data_dict["memory"][:-1]) # ends in 'B' if "returnvalue" in data_dict: ret["exitcode"] = ProcessExitCode.create(value=int(data_dict["returnvalue"])) if "exitsignal" in data_dict: @@ -283,4 +338,4 @@ def get_run_result(tempdir, run): content = log_source.read() file.write(content) - return ret \ No newline at end of file + return ret From 2dee15f516e9f9bb77768bfb831eda4f94d9b2c2 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Nov 2024 15:19:42 +0100 Subject: [PATCH 022/124] runexec no longer relative --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 11425c816..abd78d120 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -256,7 +256,7 @@ def get_resource_limits(benchmark, tempdir): def get_run_cli(benchmark, args, tempdir, resultdir): os.makedirs(resultdir) cli = [] - runexec = ["python3", "benchexec/bin/runexec", "--no-container", "--debug"] + runexec = ["runexec", "--no-container"] if benchmark.rlimits.cputime_hard: runexec.extend(["--timelimit", str(benchmark.rlimits.cputime_hard)]) if benchmark.rlimits.cputime: From 3ad572dddddedc23d31881511a5441c3fd955ffd Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Nov 2024 15:40:06 +0100 Subject: [PATCH 023/124] Changed to shlex.join --- contrib/slurm/arrayexecutor.py | 13 +++++++------ contrib/slurm/slurmexecutor.py | 6 +++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index abd78d120..e60aebedc 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -12,6 +12,7 @@ import os import queue import re +import shlex import 
subprocess import sys import tempfile @@ -198,7 +199,7 @@ def execute_batch( try: sbatch_cmd = ["sbatch", "--wait", str(batchfile)] logging.debug( - "Command to run: %s", " ".join(map(util.escape_string_shell, sbatch_cmd)) + "Command to run: %s", shlex.join(sbatch_cmd) ) sbatch_result = subprocess.run( sbatch_cmd, @@ -278,12 +279,12 @@ def get_run_cli(benchmark, args, tempdir, resultdir): "-B", "/sys/fs/cgroup:/sys/fs/cgroup", "-B", - "./:/lower", + "/home:/lower", "--no-home", "-B", f"{tempdir}:/overlay", "--fusemount", - f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work /home/{os.getlogin()}", + f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work /home/", benchmark.config.singularity, ] ) @@ -292,13 +293,13 @@ def get_run_cli(benchmark, args, tempdir, resultdir): "sh", "-c", f"touch started; " - f"{' '.join(map(util.escape_string_shell, ['echo', 'Running command: ', *args]))}; " - f"{' '.join(map(util.escape_string_shell, args))} 2>&1 | tee log; " + f"{shlex.join(['echo', 'Running command: ', *args])}; " + f"{shlex.join(args)} 2>&1 | tee log; " f"touch ended" ] ) - cli = " ".join(map(util.escape_string_shell, cli)) + cli = shlex.join(cli) cli = cli.replace("'\"'\"'$CPUSET'\"'\"'", "'$CPUSET'") cli = cli.replace("'$TMPDIR", "\"$TMPDIR").replace(":/overlay'", ":/overlay\"") cli = f"mkdir -p {tempdir}/{{upper,work}}; {cli}; mv {tempdir}/upper/* {resultdir}/; rm -r {tempdir}" diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index 6b52a1aff..2c86d8bad 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -300,7 +300,7 @@ def run_slurm(benchmark, args, log_file): ) logging.debug( - "Command to run: %s", " ".join(map(util.escape_string_shell, srun_command)) + "Command to run: %s", shlex.join(srun_command) ) jobid = None while jobid is None and not STOPPED_BY_INTERRUPT: @@ -410,7 +410,7 @@ def run_sacct(jobid): 
"--format=State,ExitCode,TotalCpu,Elapsed,MaxRSS", ] logging.debug( - "Command to run: %s", " ".join(map(util.escape_string_shell, sacct_command)) + "Command to run: %s", shlex.join(sacct_command) ) def get_checked_sacct_result(): @@ -467,7 +467,7 @@ def run_seff(jobid): seff_command = ["seff", str(jobid)] logging.debug( - "Command to run: %s", " ".join(map(util.escape_string_shell, seff_command)) + "Command to run: %s", shlex.join(seff_command) ) def get_checked_seff_result(): From 63132ac2fa2b50e07509358198819cd6ec22ec31 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Nov 2024 16:59:36 +0100 Subject: [PATCH 024/124] determining version in singularity now --- contrib/slurm/arrayexecutor.py | 182 ++++++++++++++++++++++++--------- 1 file changed, 134 insertions(+), 48 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index e60aebedc..2ca812dc5 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -7,19 +7,16 @@ # SPDX-FileCopyrightText: Budapest University of Technology and Economics # # SPDX-License-Identifier: Apache-2.0 -import itertools import logging import os -import queue import re import shlex import subprocess import sys import tempfile -import threading import time -from benchexec import BenchExecException, tooladapter, util +from benchexec import tooladapter from benchexec.util import ProcessExitCode sys.dont_write_bytecode = True # prevent creation of .pyc files @@ -34,8 +31,47 @@ def init(config, benchmark): try: benchmark.tool_version = benchmark.tool.version(benchmark.executable) except Exception as e: - logging.warning("could not determine version due to error: %s", e) - benchmark.tool_version = None + if benchmark.config.singularity: + logging.warning( + "could not determine version due to error: %s, will retry in executor", + e, + ) + try: + version_printer = f"""from benchexec import tooladapter +from benchexec.model import load_tool_info +class Config(): + pass + 
+config = Config() +config.container = False +config.tool_directory = "{config.tool_directory}" +locator = tooladapter.create_tool_locator(config) +tool = load_tool_info("{benchmark.tool_module}", config)[1] +executable = tool.executable(locator) +print(tool.version(executable))""" + with open(".get_version.py", "w") as script: + script.write(version_printer) + result = subprocess.run( + [ + "singularity", + "exec", + benchmark.config.singularity, + "python3", + ".get_version.py", + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + if result.stdout: + for line in result.stdout.splitlines(): + benchmark.tool_version = line + + except Exception as e: + logging.warning("could not determine version due to error: %s", e) + benchmark.tool_version = None + else: + logging.warning("could not determine version due to error: %s", e) + benchmark.tool_version = None def get_system_info(): @@ -74,9 +110,9 @@ def execute_benchmark(benchmark, output_handler): def _execute_run_set( - runSet, - benchmark, - output_handler, + runSet, + benchmark, + output_handler, ): global STOPPED_BY_INTERRUPT @@ -101,7 +137,9 @@ def _execute_run_set( for i in range(0, len(runSet.runs), benchmark.config.batch_size): if not STOPPED_BY_INTERRUPT: - chunk = runSet.runs[i:min(i + benchmark.config.batch_size, len(runSet.runs))] + chunk = runSet.runs[ + i : min(i + benchmark.config.batch_size, len(runSet.runs)) + ] execute_batch(chunk, benchmark, output_handler) if STOPPED_BY_INTERRUPT: @@ -114,21 +152,29 @@ def _execute_run_set( def get_cpu_cmd(concurrency_factor, cores): - get_cpus = ("cpus=($(scontrol show job -d \"$SLURM_JOB_ID\" | grep -o 'CPU_IDs=[^ ]*' | " - "awk -F= ' { print $2 } ' | head -n1 | " - "awk -F, ' { for (i = 1; i <= NF; i++ ) { if ($i ~ /-/) " - "{ split($i, range, \"-\"); for (j = range[1]; j <= range[2]; j++ ) { print j } } " - "else { print $i } } }'))") + get_cpus = ( + "cpus=($(scontrol show job -d \"$SLURM_JOB_ID\" | grep -o 'CPU_IDs=[^ ]*' | " + "awk -F= ' { print 
$2 } ' | head -n1 | " + "awk -F, ' { for (i = 1; i <= NF; i++ ) { if ($i ~ /-/) " + '{ split($i, range, "-"); for (j = range[1]; j <= range[2]; j++ ) { print j } } ' + "else { print $i } } }'))" + ) for i in range(concurrency_factor): - get_cpus = get_cpus + f"\nexport cpuset{i}=$(IFS=,; echo \"${{cpus[*]:{i * cores}:{cores}}}\")" + get_cpus = ( + get_cpus + + f'\nexport cpuset{i}=$(IFS=,; echo "${{cpus[*]:{i * cores}:{cores}}}")' + ) return get_cpus def lock_cpu_cmds(concurrency_factor, tempdir, bin): - lock_cpus = "CPUSET=\"\"; while ! {" + lock_cpus = 'CPUSET=""; while ! {' for i in range(concurrency_factor): - lock_cpus = lock_cpus + f" {{ mkdir {tempdir}/cpuset_{bin}_{i} 2>/dev/null && cpuset={i} && CPUSET=\"$cpuset{i}\"; }}" - if (i == concurrency_factor - 1): + lock_cpus = ( + lock_cpus + + f' {{ mkdir {tempdir}/cpuset_{bin}_{i} 2>/dev/null && cpuset={i} && CPUSET="$cpuset{i}"; }}' + ) + if i == concurrency_factor - 1: lock_cpus = lock_cpus + "; }; do sleep 1; done" else: lock_cpus = lock_cpus + " ||" @@ -137,16 +183,18 @@ def lock_cpu_cmds(concurrency_factor, tempdir, bin): def execute_batch( - runs, - benchmark, - output_handler, + runs, + benchmark, + output_handler, ): global STOPPED_BY_INTERRUPT number_of_bins = int(len(runs) / benchmark.config.aggregation_factor) + 1 use_concurrency = benchmark.config.concurrency_factor != 1 if use_concurrency: - get_cpus = get_cpu_cmd(benchmark.config.concurrency_factor, benchmark.rlimits.cpu_cores) + get_cpus = get_cpu_cmd( + benchmark.config.concurrency_factor, benchmark.rlimits.cpu_cores + ) with tempfile.TemporaryDirectory(dir=benchmark.config.scratchdir) as tempdir: batch_lines = ["#!/bin/bash"] @@ -154,7 +202,9 @@ def execute_batch( for setting in get_resource_limits(benchmark, tempdir): batch_lines.extend(["\n#SBATCH " + str(setting)]) - batch_lines.extend([f"\n#SBATCH --array=0-{number_of_bins - 1}%{benchmark.num_of_threads}"]) + batch_lines.extend( + [f"\n#SBATCH --array=0-{number_of_bins - 
1}%{benchmark.num_of_threads}"] + ) batch_lines.extend(["\n\nTMPDIR=$(mktemp -d)"]) bins = {} @@ -168,26 +218,56 @@ def execute_batch( batch_lines.extend(["\n\n" + get_cpus]) batch_lines.extend(["\n\ncase $SLURM_ARRAY_TASK_ID in"]) for bin in bins: - lock_cpus, unlock_cpus = lock_cpu_cmds(benchmark.config.concurrency_factor, tempdir, bin) + lock_cpus, unlock_cpus = lock_cpu_cmds( + benchmark.config.concurrency_factor, tempdir, bin + ) batch_lines.extend(["\n" + str(bin) + ") "]) taskfile_name = f"bin{str(bin)}.tasks" taskfile = os.path.join(tempdir, taskfile_name) with open(taskfile, "w") as f: task_lines = [] - for (i, run) in bins[bin]: + for i, run in bins[bin]: task_lines.extend( - [lock_cpus + " && " + str(get_run_cli(benchmark, run.cmdline(), os.path.join("$TMPDIR", str(i)), os.path.join(tempdir, str(i)))) + "; " + unlock_cpus + "\n"]) + [ + lock_cpus + + " && " + + str( + get_run_cli( + benchmark, + run.cmdline(), + os.path.join("$TMPDIR", str(i)), + os.path.join(tempdir, str(i)), + ) + ) + + "; " + + unlock_cpus + + "\n" + ] + ) f.writelines(task_lines) - batch_lines.extend(f"\n while read -r x; do /bin/sh -c \"$x\" & done < {taskfile}") + batch_lines.extend( + f'\n while read -r x; do /bin/sh -c "$x" & done < {taskfile}' + ) batch_lines.extend("\n wait") batch_lines.extend(["\n;;"]) else: batch_lines.extend(["\n\ncase $SLURM_ARRAY_TASK_ID in"]) for bin in bins: batch_lines.extend(["\n" + str(bin) + ") "]) - for (i, run) in bins[bin]: + for i, run in bins[bin]: batch_lines.extend( - ["\n " + str(get_run_cli(benchmark, run.cmdline(), os.path.join("$TMPDIR", str(i)), os.path.join(tempdir, str(i))))]) + [ + "\n " + + str( + get_run_cli( + benchmark, + run.cmdline(), + os.path.join("$TMPDIR", str(i)), + os.path.join(tempdir, str(i)), + ) + ) + ] + ) batch_lines.extend(["\n;;"]) batch_lines.extend(["\nesac"]) @@ -198,9 +278,7 @@ def execute_batch( try: sbatch_cmd = ["sbatch", "--wait", str(batchfile)] - logging.debug( - "Command to run: %s", 
shlex.join(sbatch_cmd) - ) + logging.debug("Command to run: %s", shlex.join(sbatch_cmd)) sbatch_result = subprocess.run( sbatch_cmd, stdout=subprocess.PIPE, @@ -234,23 +312,31 @@ def stop(): def get_resource_limits(benchmark, tempdir): - timelimit = benchmark.rlimits.cputime * benchmark.config.aggregation_factor * 2 # safe overapprox + timelimit = ( + benchmark.rlimits.cputime * benchmark.config.aggregation_factor * 2 + ) # safe overapprox cpus = benchmark.rlimits.cpu_cores * benchmark.config.concurrency_factor - memory = benchmark.rlimits.memory * benchmark.config.concurrency_factor * 1.5 # so that runexec catches the OOM, not SLURM + memory = ( + benchmark.rlimits.memory * benchmark.config.concurrency_factor * 1.5 + ) # so that runexec catches the OOM, not SLURM os.makedirs(os.path.join(tempdir, "logs"), exist_ok=True) srun_timelimit_h = int(timelimit / 3600) srun_timelimit_m = int((timelimit % 3600) / 60) srun_timelimit_s = int(timelimit % 60) - srun_timelimit = f"{srun_timelimit_h:02d}:{srun_timelimit_m:02d}:{srun_timelimit_s:02d}" - - ret = [f"--output={tempdir}/logs/%A_%a.out", - "--time=" + str(srun_timelimit), - "--cpus-per-task=" + str(cpus), - "--mem=" + str(int(memory / 1000000)) + "M", - "--threads-per-core=1", # --use_hyperthreading=False is always given here - "--mincpus=" + str(cpus), - "--ntasks=1"] + srun_timelimit = ( + f"{srun_timelimit_h:02d}:{srun_timelimit_m:02d}:{srun_timelimit_s:02d}" + ) + + ret = [ + f"--output={tempdir}/logs/%A_%a.out", + "--time=" + str(srun_timelimit), + "--cpus-per-task=" + str(cpus), + "--mem=" + str(int(memory / 1000000)) + "M", + "--threads-per-core=1", # --use_hyperthreading=False is always given here + "--mincpus=" + str(cpus), + "--ntasks=1", + ] return ret @@ -295,13 +381,13 @@ def get_run_cli(benchmark, args, tempdir, resultdir): f"touch started; " f"{shlex.join(['echo', 'Running command: ', *args])}; " f"{shlex.join(args)} 2>&1 | tee log; " - f"touch ended" + f"touch ended", ] ) cli = shlex.join(cli) cli = 
cli.replace("'\"'\"'$CPUSET'\"'\"'", "'$CPUSET'") - cli = cli.replace("'$TMPDIR", "\"$TMPDIR").replace(":/overlay'", ":/overlay\"") + cli = cli.replace("'$TMPDIR", '"$TMPDIR').replace(":/overlay'", ':/overlay"') cli = f"mkdir -p {tempdir}/{{upper,work}}; {cli}; mv {tempdir}/upper/* {resultdir}/; rm -r {tempdir}" logging.debug("Command to run: %s", cli) @@ -316,8 +402,8 @@ def get_run_result(tempdir, run): with open(runexec_log, "r") as file: for line in file: line = line.strip() - if line and '=' in line: - key, value = line.split('=', 1) + if line and "=" in line: + key, value = line.split("=", 1) data_dict[key.strip()] = value.strip() ret = {} From dd8d67be301a1fff131bb9466af801438b5d1582 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Nov 2024 18:24:43 +0100 Subject: [PATCH 025/124] Fix cwd --- contrib/slurm/arrayexecutor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 2ca812dc5..96a9c3700 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -365,12 +365,14 @@ def get_run_cli(benchmark, args, tempdir, resultdir): "-B", "/sys/fs/cgroup:/sys/fs/cgroup", "-B", - "/home:/lower", + f"{os.getcwd()}:/lower", "--no-home", + "--cwd", + os.getcwd(), "-B", f"{tempdir}:/overlay", "--fusemount", - f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work /home/", + f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work {os.getcwd()}", benchmark.config.singularity, ] ) From 9c13fb524490e1779a8ea0f10ed9b71b82da791a Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Nov 2024 18:32:15 +0100 Subject: [PATCH 026/124] Removed erroneous param --- contrib/slurm/arrayexecutor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 96a9c3700..f2e1b2d01 100644 --- 
a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -367,8 +367,6 @@ def get_run_cli(benchmark, args, tempdir, resultdir): "-B", f"{os.getcwd()}:/lower", "--no-home", - "--cwd", - os.getcwd(), "-B", f"{tempdir}:/overlay", "--fusemount", From fb997a51a47bc4c2fb0af52d86efd51fd7a5b960 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Nov 2024 18:51:48 +0100 Subject: [PATCH 027/124] Fixed paths --- contrib/slurm/arrayexecutor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index f2e1b2d01..33ab1ce28 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -365,12 +365,12 @@ def get_run_cli(benchmark, args, tempdir, resultdir): "-B", "/sys/fs/cgroup:/sys/fs/cgroup", "-B", - f"{os.getcwd()}:/lower", + f"{os.path.abspath(os.path.dirname(benchmark.config.singularity))}:/lower", "--no-home", "-B", f"{tempdir}:/overlay", "--fusemount", - f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work {os.getcwd()}", + f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work {os.path.abspath(os.path.dirname(benchmark.config.singularity))}", benchmark.config.singularity, ] ) From dedea8f2be4766507fcc445348e910eb18301293 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Nov 2024 19:17:47 +0100 Subject: [PATCH 028/124] fixed result file collection --- contrib/slurm/arrayexecutor.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 33ab1ce28..7eb1ff591 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -356,6 +356,8 @@ def get_run_cli(benchmark, args, tempdir, resultdir): runexec.extend(["--memlimit", str(benchmark.rlimits.memory)]) args = [*runexec, "--", *args] + basedir = 
os.path.abspath(os.path.dirname(benchmark.config.singularity)) + prefix = os.path.relpath(os.getcwd(), basedir) if benchmark.config.singularity: cli.extend( @@ -365,12 +367,12 @@ def get_run_cli(benchmark, args, tempdir, resultdir): "-B", "/sys/fs/cgroup:/sys/fs/cgroup", "-B", - f"{os.path.abspath(os.path.dirname(benchmark.config.singularity))}:/lower", + f"{basedir}:/lower", "--no-home", "-B", f"{tempdir}:/overlay", "--fusemount", - f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work {os.path.abspath(os.path.dirname(benchmark.config.singularity))}", + f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work {basedir}", benchmark.config.singularity, ] ) @@ -388,7 +390,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): cli = shlex.join(cli) cli = cli.replace("'\"'\"'$CPUSET'\"'\"'", "'$CPUSET'") cli = cli.replace("'$TMPDIR", '"$TMPDIR').replace(":/overlay'", ':/overlay"') - cli = f"mkdir -p {tempdir}/{{upper,work}}; {cli}; mv {tempdir}/upper/* {resultdir}/; rm -r {tempdir}" + cli = f"mkdir -p {tempdir}/{{upper,work}}; {cli}; mv {tempdir}/upper/{prefix}/* {resultdir}/; rm -r {tempdir}" logging.debug("Command to run: %s", cli) return cli From ad9fa6f99515a59c436a151b143f3587e0a5e9a8 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Nov 2024 21:09:58 +0100 Subject: [PATCH 029/124] Moving files as well --- contrib/slurm-benchmark.py | 2 ++ contrib/slurm/arrayexecutor.py | 16 +++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/contrib/slurm-benchmark.py b/contrib/slurm-benchmark.py index 32daa7e81..9fa895225 100755 --- a/contrib/slurm-benchmark.py +++ b/contrib/slurm-benchmark.py @@ -14,6 +14,8 @@ import os import sys +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + import benchexec.benchexec import benchexec.tools import benchexec.util diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 
7eb1ff591..16dd4187f 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -11,6 +11,7 @@ import os import re import shlex +import shutil import subprocess import sys import tempfile @@ -300,7 +301,7 @@ def execute_batch( for i, run in enumerate(runs): try: - run.set_result(get_run_result(os.path.join(tempdir, str(i)), run)) + run.set_result(get_run_result(benchmark.log_folder, os.path.join(tempdir, str(i)), run)) output_handler.output_after_run(run) except: logging.debug(f"Output missing for run #{i}") @@ -380,10 +381,8 @@ def get_run_cli(benchmark, args, tempdir, resultdir): [ "sh", "-c", - f"touch started; " f"{shlex.join(['echo', 'Running command: ', *args])}; " - f"{shlex.join(args)} 2>&1 | tee log; " - f"touch ended", + f"{shlex.join(args)} 2>&1 | tee log; ", ] ) @@ -396,7 +395,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): return cli -def get_run_result(tempdir, run): +def get_run_result(output_dir, tempdir, run): runexec_log = f"{tempdir}/log" tmp_log = f"{tempdir}/output.log" @@ -427,4 +426,11 @@ def get_run_result(tempdir, run): content = log_source.read() file.write(content) + src_files = os.listdir(tempdir) + for file_name in src_files: + if file_name not in ["log", "output.log"]: + full_file_name = os.path.join(tempdir, file_name) + if os.path.isfile(full_file_name): + shutil.copy(full_file_name, output_dir) + return ret From 2159e5f431ec3f02d9f46f2dad600575e20fa505 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Nov 2024 21:34:58 +0100 Subject: [PATCH 030/124] Fixed source file path --- contrib/slurm/arrayexecutor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 16dd4187f..1d281c577 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -426,9 +426,8 @@ def get_run_result(output_dir, tempdir, run): content = log_source.read() file.write(content) - src_files = os.listdir(tempdir) 
+ src_files = os.listdir(os.path.join(tempdir, "output")) for file_name in src_files: - if file_name not in ["log", "output.log"]: full_file_name = os.path.join(tempdir, file_name) if os.path.isfile(full_file_name): shutil.copy(full_file_name, output_dir) From b14ad92e72692e1bd3bc9e560bbc83f29ffdcddc Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Nov 2024 21:41:38 +0100 Subject: [PATCH 031/124] retrying version in singularity --- contrib/slurm/arrayexecutor.py | 58 ++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 1d281c577..65c720607 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -29,16 +29,21 @@ def init(config, benchmark): tool_locator = tooladapter.create_tool_locator(config) benchmark.executable = benchmark.tool.executable(tool_locator) + has_version = False try: benchmark.tool_version = benchmark.tool.version(benchmark.executable) + if benchmark.tool_version != "": + has_version = True except Exception as e: - if benchmark.config.singularity: - logging.warning( - "could not determine version due to error: %s, will retry in executor", - e, - ) - try: - version_printer = f"""from benchexec import tooladapter + logging.warning( + "could not determine version due to error: %s, will retry in executor if allowed", + e, + ) + has_version = False + + if not has_version and benchmark.config.singularity: + try: + version_printer = f"""from benchexec import tooladapter from benchexec.model import load_tool_info class Config(): pass @@ -50,28 +55,25 @@ class Config(): tool = load_tool_info("{benchmark.tool_module}", config)[1] executable = tool.executable(locator) print(tool.version(executable))""" - with open(".get_version.py", "w") as script: - script.write(version_printer) - result = subprocess.run( - [ - "singularity", - "exec", - benchmark.config.singularity, - "python3", - ".get_version.py", - ], - 
stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - ) - if result.stdout: - for line in result.stdout.splitlines(): - benchmark.tool_version = line + with open(".get_version.py", "w") as script: + script.write(version_printer) + result = subprocess.run( + [ + "singularity", + "exec", + benchmark.config.singularity, + "python3", + ".get_version.py", + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + if result.stdout: + for line in result.stdout.splitlines(): + benchmark.tool_version = line - except Exception as e: - logging.warning("could not determine version due to error: %s", e) - benchmark.tool_version = None - else: - logging.warning("could not determine version due to error: %s", e) + except Exception as e: + logging.warning("could not determine version (in container) due to error: %s", e) benchmark.tool_version = None From 45eced4fd87640358bb552bc655f5a1196d28699 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Nov 2024 21:53:01 +0100 Subject: [PATCH 032/124] fixed copying --- contrib/slurm/arrayexecutor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 65c720607..026a881f6 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -70,7 +70,7 @@ class Config(): ) if result.stdout: for line in result.stdout.splitlines(): - benchmark.tool_version = line + benchmark.tool_version = str(line) except Exception as e: logging.warning("could not determine version (in container) due to error: %s", e) @@ -430,8 +430,8 @@ def get_run_result(output_dir, tempdir, run): src_files = os.listdir(os.path.join(tempdir, "output")) for file_name in src_files: - full_file_name = os.path.join(tempdir, file_name) - if os.path.isfile(full_file_name): - shutil.copy(full_file_name, output_dir) + full_file_name = os.path.join(os.path.join(tempdir, "output"), file_name) + if os.path.isfile(full_file_name): + shutil.copy(full_file_name, 
output_dir) return ret From 0510de7628cbc242fae9447ece5e0b1047e44fb9 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Nov 2024 22:00:02 +0100 Subject: [PATCH 033/124] Fixed dest path --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 026a881f6..78bc399cf 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -303,7 +303,7 @@ def execute_batch( for i, run in enumerate(runs): try: - run.set_result(get_run_result(benchmark.log_folder, os.path.join(tempdir, str(i)), run)) + run.set_result(get_run_result(benchmark.result_files_folder, os.path.join(tempdir, str(i)), run)) output_handler.output_after_run(run) except: logging.debug(f"Output missing for run #{i}") From ddd77e71e8c03aa17cfcf8882100545d9c670723 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Nov 2024 22:29:23 +0100 Subject: [PATCH 034/124] fixed copying; no longer overwriting directory --- contrib/slurm/arrayexecutor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 78bc399cf..794a8d772 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -428,6 +428,7 @@ def get_run_result(output_dir, tempdir, run): content = log_source.read() file.write(content) + os.makedirs(output_dir, exist_ok=True) src_files = os.listdir(os.path.join(tempdir, "output")) for file_name in src_files: full_file_name = os.path.join(os.path.join(tempdir, "output"), file_name) From 6a1d3162b90df3ea27062ac27408fedd6aaa90f8 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 21 Nov 2024 22:39:15 +0100 Subject: [PATCH 035/124] Using task result_files_folder instead of runset result_files_folder --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 
794a8d772..8cf2621a5 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -303,7 +303,7 @@ def execute_batch( for i, run in enumerate(runs): try: - run.set_result(get_run_result(benchmark.result_files_folder, os.path.join(tempdir, str(i)), run)) + run.set_result(get_run_result(run.result_files_folder, os.path.join(tempdir, str(i)), run)) output_handler.output_after_run(run) except: logging.debug(f"Output missing for run #{i}") From 6b174f9c462009add43853d520442e17f9ff273e Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 00:06:46 +0100 Subject: [PATCH 036/124] Enhanced logging --- contrib/slurm/arrayexecutor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 8cf2621a5..5e065b2aa 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -305,8 +305,8 @@ def execute_batch( try: run.set_result(get_run_result(run.result_files_folder, os.path.join(tempdir, str(i)), run)) output_handler.output_after_run(run) - except: - logging.debug(f"Output missing for run #{i}") + except Exception as e: + logging.warning("could not set result due to error: %s", e) def stop(): From 8e2fe4859f40d430ebb06a31f9f6fe25b619aafc Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 00:10:53 +0100 Subject: [PATCH 037/124] Only copying files if folder exists --- contrib/slurm/arrayexecutor.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 5e065b2aa..acc57bf74 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -428,11 +428,12 @@ def get_run_result(output_dir, tempdir, run): content = log_source.read() file.write(content) - os.makedirs(output_dir, exist_ok=True) - src_files = os.listdir(os.path.join(tempdir, "output")) - for file_name in src_files: - full_file_name = 
os.path.join(os.path.join(tempdir, "output"), file_name) - if os.path.isfile(full_file_name): - shutil.copy(full_file_name, output_dir) + if os.path.exists(os.path.join(tempdir, "output")): + os.makedirs(output_dir, exist_ok=True) + src_files = os.listdir(os.path.join(tempdir, "output")) + for file_name in src_files: + full_file_name = os.path.join(os.path.join(tempdir, "output"), file_name) + if os.path.isfile(full_file_name): + shutil.copy(full_file_name, output_dir) return ret From cf975aab38db6141893cea6a5af04e537e3b9324 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 11:19:50 +0100 Subject: [PATCH 038/124] Forcing version query in singularity --- contrib/slurm/arrayexecutor.py | 42 ++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index acc57bf74..f5b55fbe9 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -25,25 +25,8 @@ WORKER_THREADS = [] STOPPED_BY_INTERRUPT = False - def init(config, benchmark): - tool_locator = tooladapter.create_tool_locator(config) - benchmark.executable = benchmark.tool.executable(tool_locator) - has_version = False - try: - benchmark.tool_version = benchmark.tool.version(benchmark.executable) - if benchmark.tool_version != "": - has_version = True - except Exception as e: - logging.warning( - "could not determine version due to error: %s, will retry in executor if allowed", - e, - ) - has_version = False - - if not has_version and benchmark.config.singularity: - try: - version_printer = f"""from benchexec import tooladapter + version_printer = f"""from benchexec import tooladapter from benchexec.model import load_tool_info class Config(): pass @@ -55,6 +38,14 @@ class Config(): tool = load_tool_info("{benchmark.tool_module}", config)[1] executable = tool.executable(locator) print(tool.version(executable))""" + + def version_from_tool_in_container( + executable, + 
arg="--version", + use_stderr=False, + ignore_stderr=False, + line_prefix=None): + try: with open(".get_version.py", "w") as script: script.write(version_printer) result = subprocess.run( @@ -70,11 +61,22 @@ class Config(): ) if result.stdout: for line in result.stdout.splitlines(): - benchmark.tool_version = str(line) + return str(line) except Exception as e: logging.warning("could not determine version (in container) due to error: %s", e) - benchmark.tool_version = None + return "" + + tool_locator = tooladapter.create_tool_locator(config) + benchmark.executable = benchmark.tool.executable(tool_locator) + benchmark.tool._version_from_tool = version_from_tool_in_container + try: + benchmark.tool_version = benchmark.tool.version(benchmark.executable) + except Exception as e: + logging.warning( + "could not determine version due to error: %s", + e, + ) def get_system_info(): From d0b7ee13ae021fe2e58c8f44bc37ecef2dfe8810 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 11:28:19 +0100 Subject: [PATCH 039/124] Fixed version string --- contrib/slurm/arrayexecutor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index f5b55fbe9..ddfc03434 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -33,7 +33,7 @@ class Config(): config = Config() config.container = False -config.tool_directory = "{config.tool_directory}" +config.tool_directory = "." 
locator = tooladapter.create_tool_locator(config) tool = load_tool_info("{benchmark.tool_module}", config)[1] executable = tool.executable(locator) @@ -61,7 +61,7 @@ def version_from_tool_in_container( ) if result.stdout: for line in result.stdout.splitlines(): - return str(line) + return str(line) # first line is OK except Exception as e: logging.warning("could not determine version (in container) due to error: %s", e) From 1108b4c2ced120340ddf301e27944b4ef2c7b51d Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 12:29:23 +0100 Subject: [PATCH 040/124] fix version parsing --- contrib/slurm/arrayexecutor.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index ddfc03434..5d5d1f260 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -48,7 +48,7 @@ def version_from_tool_in_container( try: with open(".get_version.py", "w") as script: script.write(version_printer) - result = subprocess.run( + process = subprocess.run( [ "singularity", "exec", @@ -57,15 +57,16 @@ def version_from_tool_in_container( ".get_version.py", ], stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, + stderr=subprocess.DEVNULL, + stdin=subprocess.DEVNULL, + universal_newlines=True, ) - if result.stdout: - for line in result.stdout.splitlines(): - return str(line) # first line is OK + if process.stdout: + return process.stdout.strip() except Exception as e: logging.warning("could not determine version (in container) due to error: %s", e) - return "" + return "" tool_locator = tooladapter.create_tool_locator(config) benchmark.executable = benchmark.tool.executable(tool_locator) From c7670ea60245db27c05f325690f1a34150d534de Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 13:34:40 +0100 Subject: [PATCH 041/124] Determining system info now --- contrib/slurm/arrayexecutor.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 
insertions(+) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 5d5d1f260..77dfe518c 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -7,6 +7,7 @@ # SPDX-FileCopyrightText: Budapest University of Technology and Economics # # SPDX-License-Identifier: Apache-2.0 +import json import logging import os import re @@ -18,6 +19,7 @@ import time from benchexec import tooladapter +from benchexec.systeminfo import SystemInfo from benchexec.util import ProcessExitCode sys.dont_write_bytecode = True # prevent creation of .pyc files @@ -81,6 +83,37 @@ def version_from_tool_in_container( def get_system_info(): + try: + process = subprocess.run( + [ + "srun", + "singularity", + "exec", + "python3", + "-c", + "import benchexec.systeminfo; " + "import json; " + "print(json.dumps(benchexec.systeminfo.SystemInfo().__dict__)", + ], + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + stdin=subprocess.DEVNULL, + universal_newlines=True, + ) + if process.stdout: + actual_sysinfo = json.loads(process.stdout.strip()) + blank_sysinfo = SystemInfo() + blank_sysinfo.hostname = str(actual_sysinfo["hostname"]) + " (sample)" + blank_sysinfo.os = actual_sysinfo["os"] + blank_sysinfo.cpu_max_frequency = actual_sysinfo["cpu_max_frequency"] + blank_sysinfo.cpu_number_of_cores = actual_sysinfo["cpu_number_of_cores"] + blank_sysinfo.cpu_model = actual_sysinfo["cpu_model"] + blank_sysinfo.cpu_turboboost = actual_sysinfo["cpu_turboboost"] + blank_sysinfo.memory = actual_sysinfo["memory"] + return blank_sysinfo + + except Exception as e: + logging.warning("could not determine system info due to error: %s", e) return None From 49707d9e86784ac2e7aa309165d7638f86c8d74c Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 13:36:12 +0100 Subject: [PATCH 042/124] Black --- contrib/slurm/arrayexecutor.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py 
b/contrib/slurm/arrayexecutor.py index 77dfe518c..c4e9a1bde 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -27,6 +27,7 @@ WORKER_THREADS = [] STOPPED_BY_INTERRUPT = False + def init(config, benchmark): version_printer = f"""from benchexec import tooladapter from benchexec.model import load_tool_info @@ -46,7 +47,8 @@ def version_from_tool_in_container( arg="--version", use_stderr=False, ignore_stderr=False, - line_prefix=None): + line_prefix=None, + ): try: with open(".get_version.py", "w") as script: script.write(version_printer) @@ -67,7 +69,9 @@ def version_from_tool_in_container( return process.stdout.strip() except Exception as e: - logging.warning("could not determine version (in container) due to error: %s", e) + logging.warning( + "could not determine version (in container) due to error: %s", e + ) return "" tool_locator = tooladapter.create_tool_locator(config) @@ -339,7 +343,11 @@ def execute_batch( for i, run in enumerate(runs): try: - run.set_result(get_run_result(run.result_files_folder, os.path.join(tempdir, str(i)), run)) + run.set_result( + get_run_result( + run.result_files_folder, os.path.join(tempdir, str(i)), run + ) + ) output_handler.output_after_run(run) except Exception as e: logging.warning("could not set result due to error: %s", e) From 920632981e8df53799d7a029b6642a2b77b9351b Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 19:35:09 +0100 Subject: [PATCH 043/124] Preserving logfiles if encountering an error --- contrib/slurm/arrayexecutor.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index c4e9a1bde..50ee2c50c 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -7,6 +7,7 @@ # SPDX-FileCopyrightText: Budapest University of Technology and Economics # # SPDX-License-Identifier: Apache-2.0 +import glob import json import logging import os @@ 
-341,16 +342,21 @@ def execute_batch( logging.debug(f"Canceling sbatch job #{jobid}") subprocess.run(["scancel", str(jobid)]) - for i, run in enumerate(runs): - try: - run.set_result( - get_run_result( - run.result_files_folder, os.path.join(tempdir, str(i)), run + for bin in bins: + for i, run in bins[bin]: + try: + run.set_result( + get_run_result( + run.result_files_folder, os.path.join(tempdir, str(i)), run + ) ) - ) - output_handler.output_after_run(run) - except Exception as e: - logging.warning("could not set result due to error: %s", e) + output_handler.output_after_run(run) + except Exception as e: + logging.warning("could not set result due to error: %s", e) + if not STOPPED_BY_INTERRUPT: + logging.debug("preserving log(s) due to error with run") + for file in glob.glob(f"{tempdir}/logs/*{bin}.out"): + shutil.copy(file, os.path.join(benchmark.result_files_folder, os.path.basename(file) + ".error")) def stop(): From c3704758f12a7b45db7dc1750d5c01fffbb63373 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 19:56:22 +0100 Subject: [PATCH 044/124] Added logging --- contrib/slurm/arrayexecutor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 50ee2c50c..e111174bb 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -202,6 +202,7 @@ def get_cpu_cmd(concurrency_factor, cores): "awk -F, ' { for (i = 1; i <= NF; i++ ) { if ($i ~ /-/) " '{ split($i, range, "-"); for (j = range[1]; j <= range[2]; j++ ) { print j } } ' "else { print $i } } }'))" + '"\necho "${cpus[@]}"' ) for i in range(concurrency_factor): get_cpus = ( @@ -355,7 +356,7 @@ def execute_batch( logging.warning("could not set result due to error: %s", e) if not STOPPED_BY_INTERRUPT: logging.debug("preserving log(s) due to error with run") - for file in glob.glob(f"{tempdir}/logs/*{bin}.out"): + for file in glob.glob(f"{tempdir}/logs/*_{bin}.out"): shutil.copy(file, 
os.path.join(benchmark.result_files_folder, os.path.basename(file) + ".error")) From 217431b58f426beb123aca426d33213ce986f853 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 20:33:58 +0100 Subject: [PATCH 045/124] creating folder if doesn't exist --- contrib/slurm/arrayexecutor.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index e111174bb..677914298 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -28,8 +28,13 @@ WORKER_THREADS = [] STOPPED_BY_INTERRUPT = False +singularity = None def init(config, benchmark): + global singularity + if benchmark.config.singularity: + singularity = benchmark.config.singularity + version_printer = f"""from benchexec import tooladapter from benchexec.model import load_tool_info class Config(): @@ -94,11 +99,12 @@ def get_system_info(): "srun", "singularity", "exec", + singularity, "python3", "-c", "import benchexec.systeminfo; " "import json; " - "print(json.dumps(benchexec.systeminfo.SystemInfo().__dict__)", + "print(json.dumps(benchexec.systeminfo.SystemInfo().__dict__))", ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, @@ -357,6 +363,7 @@ def execute_batch( if not STOPPED_BY_INTERRUPT: logging.debug("preserving log(s) due to error with run") for file in glob.glob(f"{tempdir}/logs/*_{bin}.out"): + os.makedirs(benchmark.result_files_folder, exist_ok=True) shutil.copy(file, os.path.join(benchmark.result_files_folder, os.path.basename(file) + ".error")) From 67b184d9fc64963b6d1f76af7b4bfd5de9562df8 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 20:39:03 +0100 Subject: [PATCH 046/124] syntax fix --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 677914298..6c2013314 100644 --- a/contrib/slurm/arrayexecutor.py +++ 
b/contrib/slurm/arrayexecutor.py @@ -208,7 +208,7 @@ def get_cpu_cmd(concurrency_factor, cores): "awk -F, ' { for (i = 1; i <= NF; i++ ) { if ($i ~ /-/) " '{ split($i, range, "-"); for (j = range[1]; j <= range[2]; j++ ) { print j } } ' "else { print $i } } }'))" - '"\necho "${cpus[@]}"' + '\necho "${cpus[@]}"' ) for i in range(concurrency_factor): get_cpus = ( From 0295b5f497647cd5c8a81e8875602ee7bb05ed0f Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 21:49:47 +0100 Subject: [PATCH 047/124] Retrying --- contrib/slurm/arrayexecutor.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 6c2013314..c617bb119 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -237,6 +237,7 @@ def execute_batch( runs, benchmark, output_handler, + first_time = True, ): global STOPPED_BY_INTERRUPT number_of_bins = int(len(runs) / benchmark.config.aggregation_factor) + 1 @@ -360,11 +361,14 @@ def execute_batch( output_handler.output_after_run(run) except Exception as e: logging.warning("could not set result due to error: %s", e) - if not STOPPED_BY_INTERRUPT: - logging.debug("preserving log(s) due to error with run") - for file in glob.glob(f"{tempdir}/logs/*_{bin}.out"): - os.makedirs(benchmark.result_files_folder, exist_ok=True) - shutil.copy(file, os.path.join(benchmark.result_files_folder, os.path.basename(file) + ".error")) + if first_time: + execute_batch(bins[bin], benchmark, output_handler, False) + else: + if not STOPPED_BY_INTERRUPT: + logging.debug("preserving log(s) due to error with run") + for file in glob.glob(f"{tempdir}/logs/*_{bin}.out"): + os.makedirs(benchmark.result_files_folder, exist_ok=True) + shutil.copy(file, os.path.join(benchmark.result_files_folder, os.path.basename(file) + ".error")) def stop(): @@ -374,7 +378,7 @@ def stop(): def get_resource_limits(benchmark, tempdir): timelimit = ( - 
benchmark.rlimits.cputime * benchmark.config.aggregation_factor * 2 + benchmark.rlimits.cputime * benchmark.config.aggregation_factor / benchmark.config.concurrency_factor ) # safe overapprox cpus = benchmark.rlimits.cpu_cores * benchmark.config.concurrency_factor memory = ( From 9706c732d32c07ecc7bd64eccbc2c644da2ddbb2 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 22:02:11 +0100 Subject: [PATCH 048/124] fixed retry --- contrib/slurm/arrayexecutor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index c617bb119..bd82bf772 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -352,6 +352,7 @@ def execute_batch( for bin in bins: for i, run in bins[bin]: + missing_runs = [] try: run.set_result( get_run_result( @@ -362,13 +363,15 @@ def execute_batch( except Exception as e: logging.warning("could not set result due to error: %s", e) if first_time: - execute_batch(bins[bin], benchmark, output_handler, False) + missing_runs.append(run) else: if not STOPPED_BY_INTERRUPT: logging.debug("preserving log(s) due to error with run") for file in glob.glob(f"{tempdir}/logs/*_{bin}.out"): os.makedirs(benchmark.result_files_folder, exist_ok=True) shutil.copy(file, os.path.join(benchmark.result_files_folder, os.path.basename(file) + ".error")) + if len(missing_runs) > 0: + execute_batch(missing_runs, benchmark, output_handler, False) def stop(): From fd4f6ad8577ba855b5e352b88c3f34abbcd1ec0e Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 22:13:56 +0100 Subject: [PATCH 049/124] Fixed aggregation of results --- contrib/slurm/arrayexecutor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index bd82bf772..49e5e0a65 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -350,9 +350,9 @@ def execute_batch( 
logging.debug(f"Canceling sbatch job #{jobid}") subprocess.run(["scancel", str(jobid)]) + missing_runs = [] for bin in bins: for i, run in bins[bin]: - missing_runs = [] try: run.set_result( get_run_result( @@ -370,8 +370,8 @@ def execute_batch( for file in glob.glob(f"{tempdir}/logs/*_{bin}.out"): os.makedirs(benchmark.result_files_folder, exist_ok=True) shutil.copy(file, os.path.join(benchmark.result_files_folder, os.path.basename(file) + ".error")) - if len(missing_runs) > 0: - execute_batch(missing_runs, benchmark, output_handler, False) + if len(missing_runs) > 0: + execute_batch(missing_runs, benchmark, output_handler, False) def stop(): From 8131b767e4d33c5eea475013e8b5fa2c3543c4cd Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 22:19:38 +0100 Subject: [PATCH 050/124] Only using fuse where necessary --- contrib/slurm/arrayexecutor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 49e5e0a65..114548470 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -435,12 +435,14 @@ def get_run_cli(benchmark, args, tempdir, resultdir): "-B", "/sys/fs/cgroup:/sys/fs/cgroup", "-B", - f"{basedir}:/lower", + f"{basedir}", + "-B", + f"{os.getcwd()}:/lower", "--no-home", "-B", f"{tempdir}:/overlay", "--fusemount", - f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work {basedir}", + f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work {os.getcwd()}", benchmark.config.singularity, ] ) From 5fc0bd51f65ec66c86dd19a4ba98a1e6b6424ca6 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 22 Nov 2024 22:23:24 +0100 Subject: [PATCH 051/124] Removed prefix, no longer necessary. 
--- contrib/slurm/arrayexecutor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 114548470..0579cb468 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -425,7 +425,6 @@ def get_run_cli(benchmark, args, tempdir, resultdir): args = [*runexec, "--", *args] basedir = os.path.abspath(os.path.dirname(benchmark.config.singularity)) - prefix = os.path.relpath(os.getcwd(), basedir) if benchmark.config.singularity: cli.extend( @@ -458,7 +457,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): cli = shlex.join(cli) cli = cli.replace("'\"'\"'$CPUSET'\"'\"'", "'$CPUSET'") cli = cli.replace("'$TMPDIR", '"$TMPDIR').replace(":/overlay'", ':/overlay"') - cli = f"mkdir -p {tempdir}/{{upper,work}}; {cli}; mv {tempdir}/upper/{prefix}/* {resultdir}/; rm -r {tempdir}" + cli = f"mkdir -p {tempdir}/{{upper,work}}; {cli}; mv {tempdir}/upper/* {resultdir}/; rm -r {tempdir}" logging.debug("Command to run: %s", cli) return cli From ea9410b8df8679c961810848cebc1cc776574809 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 10:49:24 +0100 Subject: [PATCH 052/124] keeping only witnesses --- contrib/slurm/arrayexecutor.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 0579cb468..af68a0634 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -30,6 +30,7 @@ singularity = None + def init(config, benchmark): global singularity if benchmark.config.singularity: @@ -237,7 +238,7 @@ def execute_batch( runs, benchmark, output_handler, - first_time = True, + first_time=True, ): global STOPPED_BY_INTERRUPT number_of_bins = int(len(runs) / benchmark.config.aggregation_factor) + 1 @@ -368,9 +369,17 @@ def execute_batch( if not STOPPED_BY_INTERRUPT: logging.debug("preserving log(s) due to error with run") for file 
in glob.glob(f"{tempdir}/logs/*_{bin}.out"): - os.makedirs(benchmark.result_files_folder, exist_ok=True) - shutil.copy(file, os.path.join(benchmark.result_files_folder, os.path.basename(file) + ".error")) - if len(missing_runs) > 0: + os.makedirs( + benchmark.result_files_folder, exist_ok=True + ) + shutil.copy( + file, + os.path.join( + benchmark.result_files_folder, + os.path.basename(file) + ".error", + ), + ) + if len(missing_runs) > 0 and not STOPPED_BY_INTERRUPT: execute_batch(missing_runs, benchmark, output_handler, False) @@ -381,7 +390,9 @@ def stop(): def get_resource_limits(benchmark, tempdir): timelimit = ( - benchmark.rlimits.cputime * benchmark.config.aggregation_factor / benchmark.config.concurrency_factor + benchmark.rlimits.cputime + * benchmark.config.aggregation_factor + / benchmark.config.concurrency_factor ) # safe overapprox cpus = benchmark.rlimits.cpu_cores * benchmark.config.concurrency_factor memory = ( @@ -497,7 +508,11 @@ def get_run_result(output_dir, tempdir, run): if os.path.exists(os.path.join(tempdir, "output")): os.makedirs(output_dir, exist_ok=True) src_files = os.listdir(os.path.join(tempdir, "output")) - for file_name in src_files: + for file_name in [ + file + for file in src_files + if os.path.basename(file) in ["witness.graphml", "witness.yml"] + ]: # this should use 'benchmark.resultfiles', but if a tool is not set up correctly, it will produce way too many files. 
full_file_name = os.path.join(os.path.join(tempdir, "output"), file_name) if os.path.isfile(full_file_name): shutil.copy(full_file_name, output_dir) From 9edfe88acecd9f6eb984f11875c7eb78a780e4b2 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 10:52:49 +0100 Subject: [PATCH 053/124] Fixed timelimit for arrays --- contrib/slurm/arrayexecutor.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index af68a0634..098f41d8d 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -10,6 +10,7 @@ import glob import json import logging +import math import os import re import shlex @@ -389,15 +390,16 @@ def stop(): def get_resource_limits(benchmark, tempdir): - timelimit = ( - benchmark.rlimits.cputime - * benchmark.config.aggregation_factor - / benchmark.config.concurrency_factor - ) # safe overapprox + timelimit = int( + benchmark.rlimits.cputime # safe overapprox + * math.ceil( + benchmark.config.aggregation_factor / benchmark.config.concurrency_factor + ) + ) cpus = benchmark.rlimits.cpu_cores * benchmark.config.concurrency_factor memory = ( benchmark.rlimits.memory * benchmark.config.concurrency_factor * 1.5 - ) # so that runexec catches the OOM, not SLURM + ) # so that runexec catches the OOM, not SLURM (other stuff runs in the container as well) os.makedirs(os.path.join(tempdir, "logs"), exist_ok=True) srun_timelimit_h = int(timelimit / 3600) From 81c6c86702a0d2feb898fa75ca9a8b7160c75053 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 10:59:08 +0100 Subject: [PATCH 054/124] Overwriting version() instead of _version_from_tool() --- contrib/slurm/arrayexecutor.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 098f41d8d..2071483eb 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py 
@@ -50,13 +50,7 @@ class Config(): executable = tool.executable(locator) print(tool.version(executable))""" - def version_from_tool_in_container( - executable, - arg="--version", - use_stderr=False, - ignore_stderr=False, - line_prefix=None, - ): + def version_from_tool_in_container(executable): try: with open(".get_version.py", "w") as script: script.write(version_printer) @@ -84,7 +78,7 @@ def version_from_tool_in_container( tool_locator = tooladapter.create_tool_locator(config) benchmark.executable = benchmark.tool.executable(tool_locator) - benchmark.tool._version_from_tool = version_from_tool_in_container + benchmark.tool.version = version_from_tool_in_container try: benchmark.tool_version = benchmark.tool.version(benchmark.executable) except Exception as e: From f07c037e5d85548b4d1ea694b238ad5d722cfa65 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 13:17:41 +0100 Subject: [PATCH 055/124] Fix false positive timeout --- contrib/slurm/arrayexecutor.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 2071483eb..42b975681 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -385,10 +385,15 @@ def stop(): def get_resource_limits(benchmark, tempdir): timelimit = int( - benchmark.rlimits.cputime # safe overapprox + max( + benchmark.rlimits.cputime, + benchmark.rlimits.walltime, + benchmark.rlimits.cputime_hard, + ) # safe overapprox * math.ceil( benchmark.config.aggregation_factor / benchmark.config.concurrency_factor ) + * 1.1 ) cpus = benchmark.rlimits.cpu_cores * benchmark.config.concurrency_factor memory = ( From 05c888dda66d96b602e897a9ed7cb439cfd8dbc9 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 13:19:14 +0100 Subject: [PATCH 056/124] int() --- contrib/slurm/arrayexecutor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py 
b/contrib/slurm/arrayexecutor.py index 42b975681..cd612cd8a 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -386,9 +386,9 @@ def stop(): def get_resource_limits(benchmark, tempdir): timelimit = int( max( - benchmark.rlimits.cputime, - benchmark.rlimits.walltime, - benchmark.rlimits.cputime_hard, + int(benchmark.rlimits.cputime), + int(benchmark.rlimits.walltime), + int(benchmark.rlimits.cputime_hard), ) # safe overapprox * math.ceil( benchmark.config.aggregation_factor / benchmark.config.concurrency_factor From c42de148780060109933a4f358c0f7dfe960ea0d Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 13:23:12 +0100 Subject: [PATCH 057/124] Handling Nones --- contrib/slurm/arrayexecutor.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index cd612cd8a..c7517f155 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -386,15 +386,18 @@ def stop(): def get_resource_limits(benchmark, tempdir): timelimit = int( max( - int(benchmark.rlimits.cputime), - int(benchmark.rlimits.walltime), - int(benchmark.rlimits.cputime_hard), + int(benchmark.rlimits.cputime if benchmark.rlimits.cputime else -1), + int(benchmark.rlimits.walltime if benchmark.rlimits.walltime else -1), + int( + benchmark.rlimits.cputime_hard if benchmark.rlimits.cputime_hard else -1 + ), ) # safe overapprox * math.ceil( benchmark.config.aggregation_factor / benchmark.config.concurrency_factor ) * 1.1 ) + assert timelimit > 0, "Either cputime, cputime_hard, or walltime should be given." 
cpus = benchmark.rlimits.cpu_cores * benchmark.config.concurrency_factor memory = ( benchmark.rlimits.memory * benchmark.config.concurrency_factor * 1.5 From 93ea05a20776b46bae1022239dc242a530d358b3 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 17:42:05 +0100 Subject: [PATCH 058/124] Format, prepare for MR --- contrib/slurm-benchmark.py | 7 +- contrib/slurm/README-old.md | 91 ++++++++++++++++ contrib/slurm/README.md | 72 ++++++------ contrib/slurm/arrayexecutor.py | 137 ++++------------------- contrib/slurm/slurmexecutor.py | 194 +++++++-------------------------- contrib/slurm/utils.py | 116 ++++++++++++++++++++ 6 files changed, 304 insertions(+), 313 deletions(-) create mode 100644 contrib/slurm/README-old.md create mode 100644 contrib/slurm/utils.py diff --git a/contrib/slurm-benchmark.py b/contrib/slurm-benchmark.py index 9fa895225..05a22b61e 100755 --- a/contrib/slurm-benchmark.py +++ b/contrib/slurm-benchmark.py @@ -71,12 +71,6 @@ def create_argument_parser(self): default="0", help="Retry killed jobs this many times. Use -1 for unbounded retry attempts (cannot be used with --slurm-array).", ) - slurm_args.add_argument( - "--use-seff", - dest="seff", - action="store_true", - help="Use seff instead of sacct for resource measurement data (cannot be used with --slurm-array).", - ) slurm_args.add_argument( "--aggregation-factor", @@ -106,6 +100,7 @@ def load_executor(self): if self.config.slurm_array: from slurm import arrayexecutor as executor elif self.config.slurm: + logging.error("Single-job-based SLURM-integration is no longer supported. 
Use --slurm-array instead.") from slurm import slurmexecutor as executor else: logging.warning( diff --git a/contrib/slurm/README-old.md b/contrib/slurm/README-old.md new file mode 100644 index 000000000..dff49d3d1 --- /dev/null +++ b/contrib/slurm/README-old.md @@ -0,0 +1,91 @@ + +# BenchExec Extension for Benchmarking via SLURM + +This Python script extends BenchExec, a benchmarking framework, to facilitate benchmarking via SLURM, optionally using a Singularity container. + +In case of problems, please tag in an [issue](https://github.com/sosy-lab/benchexec/issues/new/choose): [Levente Bajczi](https://github.com/leventeBajczi) (@leventeBajczi). + +## Preliminaries + +* [SLURM](https://slurm.schedmd.com/documentation.html) is an open-source job scheduling and workload management system used primarily in high-performance computing (HPC) environments. +* [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) is a containerization platform designed for scientific and high-performance computing (HPC) workloads, providing users with a reproducible and portable environment for running applications and workflows. + +## Requirements + +* SLURM, tested with `slurm 22.05.7`, should work within `22.x.x` +* Singularity (optional), tested with `singularity-ce version 4.0.1`, should work within `4.x.x` + +## Usage +1. Run the script with Python 3: + ``` + python3 $BENCHEXEC_FOLDER/contrib/slurm-benchmark.py [options] + ``` + Options: + - `--slurm`: Use SLURM to execute benchmarks. Will revert to regular (local) benchexec if not given. + - `--singularity `: Specify the path to the Singularity .sif file to use. See usage later. + - `--scratchdir `: Specify the directory for temporary files. The script will use this parameter to create temporary directories for file storage per-run, which get discarded later. By default, this is the CWD, which might result in temporary files being generated by the thousands in the working directory. 
On some systems, this must be on the same mount, or even under the same hierarchy as the current directory. Must exist, be writable, and be a directory. + - `--retry-killed `: Retry killed jobs (e.g., due to SLURM errors) this many times. Use -1 for unbounded retry attempts. + - `-N `: Specify the factor of parallelism, i.e., how many instances to start at a time. Tested with up to `1000`, probably works with much higher values as well. + +## Overview of the Workflow + +This works similarly to BenchExec, however, instead of delegating each run to `runexec`, it delegates to `srun` from SLURM. + +1. If the `--singularity` option is given, the script wraps the command to run in a container. This is useful for dependency management (in most HPC environments, arbitrary package installations are frowned upon). For a simple container, use the following: + + ```singularity + BootStrap: docker + From: ubuntu:22.04 + + %post + apt -y update + apt -y install openjdk-17-jre-headless libgomp1 libmpfr-dev fuse-overlayfs + ``` + + Use `singularity build [--remote / --fakeroot] --fix-perms .sif .def` to build the container. + + Notice the `fuse-overlayfs` package. That is mandatory for the overlay filesystem to work properly. + + The script parameterizes `singularity exec` with the following params: + * `-B $PWD:/lower`: Bind the working directory to `/lower` (could be read-only) + * `--no-home`: Do not bind the home directory + * `-B {tempdir}:/overlay`: Bind the temporary directory to `/overlay` (must be writeable) + * `--fusemount "container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work $HOME"`: mount an overlay filesystem at $HOME, where modifications go in the temp dir but files can be read from the current dir + + We also wrap this command inside the container using `bash -c "{command} && echo 0 > exitcode || echo $? > exitcode` to save the exitcode of the process, _and_ always have 0 as the exitcode of a completed run. 
Otherwise, we cannot differentiate between a FAILURE happening due to SLURM-issues (e.g., transport failures), or a simply failing command. Otherwise, retrying would not work. + +2. Currently, the following parameters are passed to `srun` (calculated from the benchmark's parameters): + * `-t ` CPU timelimit (generally, SLURM will round up to nearest minute) + * `-c ` number of cpus + * `--threads-per-core=1` only use one thread per core + * `--mem-per-cpu ` memory allocation in MBs per cpu + * `--ntasks=1` number of tasks per node + +3. The script parses the resulting job ID, and after the job finishes, runs `seff` to gather resource usage data: + * Exit code + * Status + * CPU time [s] + * Wall time [s] + * Memory [MB] + +## Limitations + +Currently, there are the following limitations compared to local benchexec: + +1. No advanced resource constraining / monitoring: only CPU time, CPU core and memory limits are handled, and only CPU time, wall time, and memory usage are monitored. +2. No exotic paths in the command are handled: only the current working directory and its children are visible in the container +3. The user on the host and the container should not differ (due to using $HOME in the commands). +4. Without singularity, no constraint is placed on the resulting files of the runs: this will populate the current directory with all the output files of all the runs. +5. For timed-out runs, where SLURM terminated the run, no CPU time values are available. +6. The executor only works with hyperthreading disabled, due to the inability to query nodes about the number of threads per core. Assuming it's always 2 is risky, as it may not hold true universally. Consequently, because we can only request whole cores from SLURM instead of threads, we must divide the requested number of threads by the threads-per-core value, which is unknown if hyperthreading could be enabled. +7.
Cancelling a benchmark run (by sending SIGINT) could be delayed up to a few minutes depending on the SLURM configuration. \ No newline at end of file diff --git a/contrib/slurm/README.md b/contrib/slurm/README.md index dff49d3d1..2da3ee31d 100644 --- a/contrib/slurm/README.md +++ b/contrib/slurm/README.md @@ -11,7 +11,10 @@ SPDX-License-Identifier: Apache-2.0 --> # BenchExec Extension for Benchmarking via SLURM -This Python script extends BenchExec, a benchmarking framework, to facilitate benchmarking via SLURM, optionally using a Singularity container. +> [!IMPORTANT] +> The previous, single-job-based SLURM integration is no longer maintained. For its documentation, see [README-old.md](./README-old.md) + +This Python script extends BenchExec, a benchmarking framework, to facilitate benchmarking via SLURM array jobs using Singularity containers. In case of problems, please tag in an [issue](https://github.com/sosy-lab/benchexec/issues/new/choose): [Levente Bajczi](https://github.com/leventeBajczi) (@leventeBajczi). @@ -23,7 +26,8 @@ In case of problems, please tag in an [issue](https://github.com/sosy-lab/benche ## Requirements * SLURM, tested with `slurm 22.05.7`, should work within `22.x.x` -* Singularity (optional), tested with `singularity-ce version 4.0.1`, should work within `4.x.x` +* Singularity, tested with `singularity-ce version 4.0.1`, should work within `4.x.x` +* cgroup support is required ## Usage 1. Run the script with Python 3: @@ -31,61 +35,55 @@ In case of problems, please tag in an [issue](https://github.com/sosy-lab/benche python3 $BENCHEXEC_FOLDER/contrib/slurm-benchmark.py [options] ``` Options: - - `--slurm`: Use SLURM to execute benchmarks. Will revert to regular (local) benchexec if not given. + - `--slurm-array`: Use SLURM array jobs to execute benchmarks. Will revert to regular (local) benchexec if not given. - `--singularity `: Specify the path to the Singularity .sif file to use. See usage later. 
- `--scratchdir `: Specify the directory for temporary files. The script will use this parameter to create temporary directories for file storage per-run, which get discarded later. By default, this is the CWD, which might result in temporary files being generated by the thousands in the working directory. On some systems, this must be on the same mount, or even under the same hierarchy as the current directory. Must exist, be writable, and be a directory. - `--retry-killed `: Retry killed jobs (e.g., due to SLURM errors) this many times. Use -1 for unbounded retry attempts. - - `-N `: Specify the factor of parallelism, i.e., how many instances to start at a time. Tested with up to `1000`, probably works with much higher values as well. + - `-N `: Specify the factor of parallelism, i.e., how many jobs to submit at a time. Tested with up to `1000`, probably works with much higher values as well. + - `--aggregation-factor`: Put this many jobs into a single job of the array. + - `--batch-size`: Allow this many runs inside a runcollection to be submitted. Lower values might hurt responsiveness, higher values might cause problems with script sizes. Suggested size is around a few thousand. + - `--parallelization`: Execute this many jobs in parallel inside a job of the array. ## Overview of the Workflow -This works similarly to BenchExec, however, instead of delegating each run to `runexec`, it delegates to `srun` from SLURM. +This works similarly to BenchExec, however, instead of delegating each run directly to `runexec`, it creates a hierarchy of run infos and an array job description for SLURM, which is then executed using `sbatch`. -1. If the `--singularity` option is given, the script wraps the command to run in a container. This is useful for dependency management (in most HPC environments, arbitrary package installations are frowned upon). For a simple container, use the following: +1. The script wraps the command to run in a container. 
This is useful for dependency management (in most HPC environments, arbitrary package installations are frowned upon). For a simple container, use the following: ```singularity - BootStrap: docker - From: ubuntu:22.04 - - %post - apt -y update - apt -y install openjdk-17-jre-headless libgomp1 libmpfr-dev fuse-overlayfs + BootStrap: docker + From: ubuntu:24.04 + + %post + apt -y update + apt -y install + apt -y install software-properties-common + add-apt-repository ppa:sosy-lab/benchmarking + apt -y install benchexec fuse-overlayfs + mkdir /work + mkdir /upper ``` - Use `singularity build [--remote / --fakeroot] --fix-perms .sif .def` to build the container. + Use `singularity build [--remote / --fakeroot] --fix-perms .sif .def` to build the container. A remote service (e.g., [sylabs](https://cloud.sylabs.io/builder)) may be used if root permissions are missing. - Notice the `fuse-overlayfs` package. That is mandatory for the overlay filesystem to work properly. + Notice the `fuse-overlayfs` and `benchexec` packages. That is mandatory for the overlay filesystem to work properly and for `runexec` to exist in the container. 
The script parameterizes `singularity exec` with the following params: - * `-B $PWD:/lower`: Bind the working directory to `/lower` (could be read-only) + * `-B "/sys/fs/cgroup:/sys/fs/cgroup"`: Bind the cgroup hierarchy for use inside the container + * `-B {basedir}`: Bind the "base directory" (directory of the .sif file) (can be read-only) + * `-B {workdir}:/lower`: Bind the current directory to `/lower` (can be read-only) * `--no-home`: Do not bind the home directory * `-B {tempdir}:/overlay`: Bind the temporary directory to `/overlay` (must be writeable) - * `--fusemount "container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work $HOME"`: mount an overlay filesystem at $HOME, where modifications go in the temp dir but files can be read from the current dir - - We also wrap this command inside the container using `bash -c "{command} && echo 0 > exitcode || echo $? > exitcode` to save the exitcode of the process, _and_ always have 0 as the exitcode of a completed run. Otherwise, we cannot differentiate between a FAILURE happening due to SLURM-issues (e.g., transport failures), or a simply failing command. Otherwise, retrying would not work. + * `--fusemount "container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work {workdir}"`: mount an overlay filesystem at {workdir} under {basedir}, where modifications go in the temp dir -2. Currently, the following parameters are passed to `srun` (calculated from the benchmark's parameters): - * `-t ` CPU timelimit (generally, SLURM will round up to nearest minute) - * `-c ` number of cpus - * `--threads-per-core=1` only use one thread per core - * `--mem-per-cpu ` memory allocaiton in MBs per cpu - * `--ntasks=1` number of tasks per node +2. A `--batch-size`-sized portion of the runs is organized into bins of size `--aggregation-factor`. Each bin will correspond to a job in the array. 
Inside each bin, `--parallelization`-many `runexec` instances can be started with exact resource allocations and usage reporting. Output files and output log are stored inside the temp dir. If an error is encountered (most commonly this is due to `fuse` locking up and causing a TIMEOUT without any logs being ready) the run is put into a second-chance queue to be run again, at most `--retry-killed` times. -3. The script parses the resulting job ID, and after the job finishes, runs `seff` to gather resource usage data: - * Exit code - * Status - * CPU time [s] - * Wall time [s] - * Memory [MB] +3. The script parses the resource usage and status of each run, as it would with regular `runexec`. ## Limitations Currently, there are the following limitations compared to local benchexec: -1. No advanced resource constraining / monitoring: only CPU time, CPU core and memory limits are handled, and only CPU time, wall time, and memory usage are monitored. -2. No exotic paths in the command are handled: only the current working directory and its children are visible in the container -3. The user on the host and the container should not differ (due to using $HOME in the commands). -4. Without singularity, no constraint is placed on the resulting files of the runs: this will populate the current directory with all the output files of all the runs. -5. For timed-out runs, where SLURM terminated the run, no CPU time values are available. -6. The executor only works with hyperthreading disabled, due to the inability to query nodes about the number of threads per core. Assuming it's always 2 is risky, as it may not hold true universally. Consequently, because we can only request whole cores from SLURM instead of threads, we must divide the requested number of threads by the threads-per-core value, which is unknown if hyperthreading could be enabled. -7. Cancelling a benchmark run (by sending SIGINT) could be delayed up to a few minutes depending on the SLURM configuration. 
\ No newline at end of file +1. No exotic paths in the command are handled: only the directory of the `.sif` file and its children are visible in the container. +1. The executor only works with hyperthreading disabled, due to the inability to query nodes about the number of threads per core. Assuming it's always 2 is risky, as it may not hold true universally. Consequently, because we can only request whole cores from SLURM instead of threads, we must divide the requested number of threads by the threads-per-core value, which is unknown if hyperthreading could be enabled. +1. `fuse` sometimes locks up (more precisely: is in an uninterruptible state) for the entire duration of a job. My guess is the underlying lustre file system does not like it when the same path is overlayed from hundreds of nodes at the same time. As a mitigation, we re-run timed out jobs (not runs!). \ No newline at end of file diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index c7517f155..fb8216b20 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -8,7 +8,6 @@ # # SPDX-License-Identifier: Apache-2.0 import glob -import json import logging import math import os @@ -21,64 +20,30 @@ import time from benchexec import tooladapter -from benchexec.systeminfo import SystemInfo from benchexec.util import ProcessExitCode +from contrib.slurm.utils import ( + version_in_container, + get_system_info_srun, + get_cpu_cmd, + lock_cpu_cmds, +) sys.dont_write_bytecode = True # prevent creation of .pyc files -WORKER_THREADS = [] STOPPED_BY_INTERRUPT = False - singularity = None def init(config, benchmark): global singularity - if benchmark.config.singularity: - singularity = benchmark.config.singularity - - version_printer = f"""from benchexec import tooladapter -from benchexec.model import load_tool_info -class Config(): - pass - -config = Config() -config.container = False -config.tool_directory = "." 
-locator = tooladapter.create_tool_locator(config) -tool = load_tool_info("{benchmark.tool_module}", config)[1] -executable = tool.executable(locator) -print(tool.version(executable))""" - - def version_from_tool_in_container(executable): - try: - with open(".get_version.py", "w") as script: - script.write(version_printer) - process = subprocess.run( - [ - "singularity", - "exec", - benchmark.config.singularity, - "python3", - ".get_version.py", - ], - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL, - stdin=subprocess.DEVNULL, - universal_newlines=True, - ) - if process.stdout: - return process.stdout.strip() - - except Exception as e: - logging.warning( - "could not determine version (in container) due to error: %s", e - ) - return "" + assert ( + benchmark.config.singularity + ), "Singularity is required for array-based SLURM jobs." + singularity = benchmark.config.singularity tool_locator = tooladapter.create_tool_locator(config) benchmark.executable = benchmark.tool.executable(tool_locator) - benchmark.tool.version = version_from_tool_in_container + benchmark.tool.version = version_in_container(singularity, benchmark.tool_module) try: benchmark.tool_version = benchmark.tool.version(benchmark.executable) except Exception as e: @@ -89,39 +54,7 @@ def version_from_tool_in_container(executable): def get_system_info(): - try: - process = subprocess.run( - [ - "srun", - "singularity", - "exec", - singularity, - "python3", - "-c", - "import benchexec.systeminfo; " - "import json; " - "print(json.dumps(benchexec.systeminfo.SystemInfo().__dict__))", - ], - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL, - stdin=subprocess.DEVNULL, - universal_newlines=True, - ) - if process.stdout: - actual_sysinfo = json.loads(process.stdout.strip()) - blank_sysinfo = SystemInfo() - blank_sysinfo.hostname = str(actual_sysinfo["hostname"]) + " (sample)" - blank_sysinfo.os = actual_sysinfo["os"] - blank_sysinfo.cpu_max_frequency = actual_sysinfo["cpu_max_frequency"] - 
blank_sysinfo.cpu_number_of_cores = actual_sysinfo["cpu_number_of_cores"] - blank_sysinfo.cpu_model = actual_sysinfo["cpu_model"] - blank_sysinfo.cpu_turboboost = actual_sysinfo["cpu_turboboost"] - blank_sysinfo.memory = actual_sysinfo["memory"] - return blank_sysinfo - - except Exception as e: - logging.warning("could not determine system info due to error: %s", e) - return None + return get_system_info_srun(singularity) def execute_benchmark(benchmark, output_handler): @@ -197,43 +130,11 @@ def _execute_run_set( ) -def get_cpu_cmd(concurrency_factor, cores): - get_cpus = ( - "cpus=($(scontrol show job -d \"$SLURM_JOB_ID\" | grep -o 'CPU_IDs=[^ ]*' | " - "awk -F= ' { print $2 } ' | head -n1 | " - "awk -F, ' { for (i = 1; i <= NF; i++ ) { if ($i ~ /-/) " - '{ split($i, range, "-"); for (j = range[1]; j <= range[2]; j++ ) { print j } } ' - "else { print $i } } }'))" - '\necho "${cpus[@]}"' - ) - for i in range(concurrency_factor): - get_cpus = ( - get_cpus - + f'\nexport cpuset{i}=$(IFS=,; echo "${{cpus[*]:{i * cores}:{cores}}}")' - ) - return get_cpus - - -def lock_cpu_cmds(concurrency_factor, tempdir, bin): - lock_cpus = 'CPUSET=""; while ! 
{' - for i in range(concurrency_factor): - lock_cpus = ( - lock_cpus - + f' {{ mkdir {tempdir}/cpuset_{bin}_{i} 2>/dev/null && cpuset={i} && CPUSET="$cpuset{i}"; }}' - ) - if i == concurrency_factor - 1: - lock_cpus = lock_cpus + "; }; do sleep 1; done" - else: - lock_cpus = lock_cpus + " ||" - unlock_cpus = f"rm -r {tempdir}/cpuset_{bin}_$cpuset" - return lock_cpus, unlock_cpus - - def execute_batch( runs, benchmark, output_handler, - first_time=True, + counter=0, ): global STOPPED_BY_INTERRUPT number_of_bins = int(len(runs) / benchmark.config.aggregation_factor) + 1 @@ -358,7 +259,7 @@ def execute_batch( output_handler.output_after_run(run) except Exception as e: logging.warning("could not set result due to error: %s", e) - if first_time: + if counter < benchmark.config.retry or benchmark.config.retry < 0: missing_runs.append(run) else: if not STOPPED_BY_INTERRUPT: @@ -375,7 +276,7 @@ def execute_batch( ), ) if len(missing_runs) > 0 and not STOPPED_BY_INTERRUPT: - execute_batch(missing_runs, benchmark, output_handler, False) + execute_batch(missing_runs, benchmark, output_handler, counter + 1) def stop(): @@ -395,7 +296,7 @@ def get_resource_limits(benchmark, tempdir): * math.ceil( benchmark.config.aggregation_factor / benchmark.config.concurrency_factor ) - * 1.1 + * 1.5 # to let all processes finish, we add 50% ) assert timelimit > 0, "Either cputime, cputime_hard, or walltime should be given." 
cpus = benchmark.rlimits.cpu_cores * benchmark.config.concurrency_factor @@ -439,9 +340,9 @@ def get_run_cli(benchmark, args, tempdir, resultdir): runexec.extend(["--memlimit", str(benchmark.rlimits.memory)]) args = [*runexec, "--", *args] - basedir = os.path.abspath(os.path.dirname(benchmark.config.singularity)) + basedir = os.path.abspath(os.path.dirname(singularity)) - if benchmark.config.singularity: + if singularity: cli.extend( [ "singularity", @@ -457,7 +358,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): f"{tempdir}:/overlay", "--fusemount", f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work {os.getcwd()}", - benchmark.config.singularity, + singularity, ] ) cli.extend( diff --git a/contrib/slurm/slurmexecutor.py b/contrib/slurm/slurmexecutor.py index 2c86d8bad..3238a3d38 100644 --- a/contrib/slurm/slurmexecutor.py +++ b/contrib/slurm/slurmexecutor.py @@ -19,7 +19,7 @@ import threading import time -from benchexec import BenchExecException, tooladapter, util +from benchexec import BenchExecException, tooladapter from benchexec.util import ProcessExitCode sys.dont_write_bytecode = True # prevent creation of .pyc files @@ -171,23 +171,20 @@ def execute(self, run): args, run.log_file, ) - if run_result is None: - stop() - else: - if ( - "terminationreason" not in run_result - or not run_result["terminationreason"] == "killed" - or (attempts >= self.benchmark.config.retry >= 0) - or STOPPED_BY_INTERRUPT - ): - break - attempts += 1 - time.sleep(1) # as to not overcrowd a failing scheduler - logging.debug( - "Retrying after %d attempts, limit: %d", - attempts, - self.benchmark.config.retry, - ) + if ( + "terminationreason" not in run_result + or not run_result["terminationreason"] == "killed" + or (attempts >= self.benchmark.config.retry >= 0) + or STOPPED_BY_INTERRUPT + ): + break + attempts += 1 + time.sleep(1) # as to not overcrowd a failing scheduler + logging.debug( + "Retrying after %d attempts, 
limit: %d", + attempts, + self.benchmark.config.retry, + ) except KeyboardInterrupt: # If the run was interrupted, we ignore the result and cleanup. @@ -208,7 +205,7 @@ def execute(self, run): return None -jobid_pattern = re.compile(r"job (\d*) started") +jobid_pattern = re.compile(r"job (\d*) queued") def wait_for(func, timeout_sec=None, poll_interval_sec=1): @@ -220,7 +217,7 @@ def wait_for(func, timeout_sec=None, poll_interval_sec=1): """ start_time = time.monotonic() - while not STOPPED_BY_INTERRUPT: + while True: ret = func() if ret is not None: return ret @@ -234,8 +231,6 @@ def wait_for(func, timeout_sec=None, poll_interval_sec=1): def run_slurm(benchmark, args, log_file): - global STOPPED_BY_INTERRUPT - timelimit = benchmark.rlimits.cputime cpus = benchmark.rlimits.cpu_cores memory = benchmark.rlimits.memory @@ -256,7 +251,6 @@ def run_slurm(benchmark, args, log_file): ) with tempfile.TemporaryDirectory(dir=benchmark.config.scratchdir) as tempdir: - tempdir="tmp" tmp_log = os.path.join(tempdir, "log") os.makedirs(os.path.join(tempdir, "upper")) @@ -295,15 +289,13 @@ def run_slurm(benchmark, args, log_file): [ "sh", "-c", - f"echo job $SLURM_JOB_ID started; {shlex.join(args)}; echo $? > exitcode", + f"{shlex.join(args)}; echo $? 
> exitcode", ] ) - logging.debug( - "Command to run: %s", shlex.join(srun_command) - ) + logging.debug("Command to run: %s", shlex.join(srun_command)) jobid = None - while jobid is None and not STOPPED_BY_INTERRUPT: + while jobid is None: with open(tmp_log, "w") as tmp_log_f: subprocess.run( srun_command, @@ -311,27 +303,33 @@ def run_slurm(benchmark, args, log_file): stderr=subprocess.STDOUT, ) - if ( - STOPPED_BY_INTERRUPT - ): # job cancelled while srun was running, log not necessarily finalized - return - - # we try to read back the log, in the first three lines, there should be the jobid + # we try to read back the log, in the first two lines there should be the jobid with open(tmp_log, "r") as tmp_log_f: - for line in itertools.islice(tmp_log_f, 3): + for line in itertools.islice(tmp_log_f, 2): jobid_match = jobid_pattern.search(line) if jobid_match: jobid = int(jobid_match.group(1)) break - logging.debug("Pattern not found in log line: %s", line) - if ( - STOPPED_BY_INTERRUPT - ): # job was cancelled during log parsing, no job id present - return + seff_command = ["seff", str(jobid)] + logging.debug("Command to run: %s", shlex.join(seff_command)) + + def get_checked_seff_result(): + seff_result = subprocess.run( + seff_command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + if "exit code" in str(seff_result.stdout): + return seff_result + else: + return None - raw_output, slurm_status, exit_code, cpu_time, wall_time, memory_usage = ( - run_seff(jobid) if benchmark.config.seff else run_sacct(jobid) + # sometimes `seff` needs a few extra seconds to realize the task has ended + result = wait_for(get_checked_seff_result, 30, 2) + + slurm_status, exit_code, cpu_time, wall_time, memory_usage = parse_seff( + str(result.stdout) ) if os.path.exists(exitcode_file): @@ -366,7 +364,7 @@ def run_slurm(benchmark, args, log_file): with open(log_file, "w+") as file: with open(tmp_log, "r") as log_source: content = log_source.read() - 
file.write(f"{shlex.join(args)}") + file.write(shlex.join(args)) file.write("\n\n\n" + "-" * 80 + "\n\n\n") file.write(content) if content == "": @@ -375,120 +373,12 @@ def run_slurm(benchmark, args, log_file): if benchmark.config.debug: with open(log_file + ".debug_info", "w+") as file: file.write(f"jobid: {jobid}\n") - file.write(f"seff output: {str(raw_output)}\n") + file.write(f"seff output: {str(result.stdout)}\n") file.write(f"Parsed data: {str(ret)}\n") return ret -time_pattern = re.compile(r"(?:(\d+):)?(\d+):(\d+)(?:\.(\d+))?") - - -def get_seconds_from_time(time_str): - time_match = time_pattern.search(time_str) - if time_match: - hours, minutes, seconds, millis = time_match.groups() - if hours is None: - hours = 0 - if minutes is None: - minutes = 0 # realistically never None, but doesn't hurt - if seconds is None: - seconds = 0 # realistically never None, but doesn't hurt - if millis is None: - millis = 0 - return int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(millis) / 1000 - - -def run_sacct(jobid): - global STOPPED_BY_INTERRUPT - - sacct_command = [ - "sacct", - "-j", - str(jobid), - "-n", - "--format=State,ExitCode,TotalCpu,Elapsed,MaxRSS", - ] - logging.debug( - "Command to run: %s", shlex.join(sacct_command) - ) - - def get_checked_sacct_result(): - sacct_result = subprocess.run( - sacct_command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - ) - lines = sacct_result.stdout.splitlines() - if len(lines) < 2: - logging.debug("Sacct output not yet ready: %s", lines) - return None # jobs not yet ready - parent_job = lines[0].split() # State is read from here - child_job = lines[ - 1 - ].split() # ExitCode, TotalCPU, Elapsed and MaxRSS read from here - logging.debug("Sacct data: parent: %s; child: %s", parent_job, child_job) - if parent_job[0].decode() in [ - "RUNNING", - "PENDING", - "REQUEUED", - "RESIZING", - "SUSPENDED", - "R", - "PD", - "RQ", - "RS", - "S", - ]: - logging.debug( - "Sacct output not yet ready due to state: %s", 
parent_job[0].decode() - ) - return None # not finished - if len(child_job) < 5: - logging.debug( - "Sacct output not yet ready due to memory not available: %s", child_job - ) - return None # not finished - return ( - sacct_result.stdout, - parent_job[0].decode(), # State - child_job[1].decode().split(":")[0], # ExitCode - get_seconds_from_time(child_job[2].decode()), # TotalCPU in seconds - get_seconds_from_time(child_job[3].decode()), # Elapsed in seconds - float(child_job[4].decode()[:-1]) * 1000, - ) # MaxRSS in K * 1000 -> Bytes - - # sometimes `seff` needs a few extra seconds to realize the task has ended - return wait_for(get_checked_sacct_result, 30, 2) - - -def run_seff(jobid): - global STOPPED_BY_INTERRUPT - - seff_command = ["seff", str(jobid)] - logging.debug( - "Command to run: %s", shlex.join(seff_command) - ) - - def get_checked_seff_result(): - seff_result = subprocess.run( - seff_command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - ) - if "exit code" in str(seff_result.stdout): - return seff_result - else: - return None - - # sometimes `seff` needs a few extra seconds to realize the task has ended - result = wait_for(get_checked_seff_result, 30, 2) - if STOPPED_BY_INTERRUPT: # job was cancelled - return - - return result.stdout, *parse_seff(str(result.stdout)) - - exit_code_pattern = re.compile(r"State: ([A-Z-_]*) \(exit code (\d+)\)") cpu_time_pattern = re.compile(r"CPU Utilized: (\d+):(\d+):(\d+)") wall_time_pattern = re.compile(r"Job Wall-clock time: (\d+):(\d+):(\d+)") diff --git a/contrib/slurm/utils.py b/contrib/slurm/utils.py new file mode 100644 index 000000000..d22cd854f --- /dev/null +++ b/contrib/slurm/utils.py @@ -0,0 +1,116 @@ +import json +import logging +import subprocess + +from benchexec.systeminfo import SystemInfo + + +def version_in_container(singularity, tool_module): + version_printer = f"""from benchexec import tooladapter +from benchexec.model import load_tool_info +class Config(): + pass + +config = Config() 
+config.container = False +config.tool_directory = "." +locator = tooladapter.create_tool_locator(config) +tool = load_tool_info("{tool_module}", config)[1] +executable = tool.executable(locator) +print(tool.version(executable))""" + + def version_from_tool_in_container(executable): + try: + with open(".get_version.py", "w") as script: + script.write(version_printer) + process = subprocess.run( + [ + "singularity", + "exec", + singularity, + "python3", + ".get_version.py", + ], + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + stdin=subprocess.DEVNULL, + universal_newlines=True, + ) + if process.stdout: + return process.stdout.strip() + + except Exception as e: + logging.warning( + "could not determine version (in container) due to error: %s", e + ) + return "" + + return version_from_tool_in_container + + +def get_system_info_srun(singularity): + try: + process = subprocess.run( + [ + "srun", + "singularity", + "exec", + singularity, + "python3", + "-c", + "import benchexec.systeminfo; " + "import json; " + "print(json.dumps(benchexec.systeminfo.SystemInfo().__dict__))", + ], + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + stdin=subprocess.DEVNULL, + universal_newlines=True, + ) + if process.stdout: + actual_sysinfo = json.loads(process.stdout.strip()) + blank_sysinfo = SystemInfo() + blank_sysinfo.hostname = str(actual_sysinfo["hostname"]) + " (sample)" + blank_sysinfo.os = actual_sysinfo["os"] + blank_sysinfo.cpu_max_frequency = actual_sysinfo["cpu_max_frequency"] + blank_sysinfo.cpu_number_of_cores = actual_sysinfo["cpu_number_of_cores"] + blank_sysinfo.cpu_model = actual_sysinfo["cpu_model"] + blank_sysinfo.cpu_turboboost = actual_sysinfo["cpu_turboboost"] + blank_sysinfo.memory = actual_sysinfo["memory"] + return blank_sysinfo + + except Exception as e: + logging.warning("could not determine system info due to error: %s", e) + return None + + +def get_cpu_cmd(concurrency_factor, cores): + get_cpus = ( + "cpus=($(scontrol show job -d 
\"$SLURM_JOB_ID\" | grep -o 'CPU_IDs=[^ ]*' | " + "awk -F= ' { print $2 } ' | head -n1 | " + "awk -F, ' { for (i = 1; i <= NF; i++ ) { if ($i ~ /-/) " + '{ split($i, range, "-"); for (j = range[1]; j <= range[2]; j++ ) { print j } } ' + "else { print $i } } }'))" + '\necho "${cpus[@]}"' + ) + for i in range(concurrency_factor): + get_cpus = ( + get_cpus + + f'\nexport cpuset{i}=$(IFS=,; echo "${{cpus[*]:{i * cores}:{cores}}}")' + ) + return get_cpus + + +def lock_cpu_cmds(concurrency_factor, tempdir, bin): + lock_cpus = 'CPUSET=""; while ! {' + for i in range(concurrency_factor): + lock_cpus = ( + lock_cpus + + f' {{ mkdir {tempdir}/cpuset_{bin}_{i} 2>/dev/null && cpuset={i} && CPUSET="$cpuset{i}"; }}' + ) + if i == concurrency_factor - 1: + lock_cpus = lock_cpus + "; }; do sleep 1; done" + else: + lock_cpus = lock_cpus + " ||" + unlock_cpus = f"rm -r {tempdir}/cpuset_{bin}_$cpuset" + return lock_cpus, unlock_cpus From dad8bd5b4a2f48625ac58fa956f0e50dab8a7d35 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 17:46:06 +0100 Subject: [PATCH 059/124] enhance documentation --- contrib/slurm/README-old.md | 3 +++ contrib/slurm/README.md | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/contrib/slurm/README-old.md b/contrib/slurm/README-old.md index dff49d3d1..035c8ee44 100644 --- a/contrib/slurm/README-old.md +++ b/contrib/slurm/README-old.md @@ -11,6 +11,9 @@ SPDX-License-Identifier: Apache-2.0 --> # BenchExec Extension for Benchmarking via SLURM +> [!CAUTION] +> This, single-job-based SLURM integration is no longer maintained. For the maintained, array-based version's documentation, see [README.md](./README.md) + This Python script extends BenchExec, a benchmarking framework, to facilitate benchmarking via SLURM, optionally using a Singularity container. In case of problems, please tag in an [issue](https://github.com/sosy-lab/benchexec/issues/new/choose): [Levente Bajczi](https://github.com/leventeBajczi) (@leventeBajczi). 
diff --git a/contrib/slurm/README.md b/contrib/slurm/README.md index 2da3ee31d..69e51b513 100644 --- a/contrib/slurm/README.md +++ b/contrib/slurm/README.md @@ -46,7 +46,7 @@ In case of problems, please tag in an [issue](https://github.com/sosy-lab/benche ## Overview of the Workflow -This works similarly to BenchExec, however, instead of delegating each run directly to `runexec`, it creates a hierarchy of run infos and an array job description for SLURM, which is then executed using `sbatch`. +This works similarly to BenchExec, however, instead of delegating each run directly to `runexec`, it creates a hierarchy of run infos and an array job description for SLURM, which is then executed using `sbatch`. `runexec` is still used to measure and limit resources. 1. The script wraps the command to run in a container. This is useful for dependency management (in most HPC environments, arbitrary package installations are frowned upon). For a simple container, use the following: From 704161a4047d92165f77e3666e68fe3151d0e6e7 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 17:46:37 +0100 Subject: [PATCH 060/124] Black --- contrib/slurm-benchmark.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/contrib/slurm-benchmark.py b/contrib/slurm-benchmark.py index 05a22b61e..1f94d2aa3 100755 --- a/contrib/slurm-benchmark.py +++ b/contrib/slurm-benchmark.py @@ -100,7 +100,9 @@ def load_executor(self): if self.config.slurm_array: from slurm import arrayexecutor as executor elif self.config.slurm: - logging.error("Single-job-based SLURM-integration is no longer supported. Use --slurm-array instead.") + logging.error( + "Single-job-based SLURM-integration is no longer supported. Use --slurm-array instead." 
+ ) from slurm import slurmexecutor as executor else: logging.warning( From 8a2babe437a6451858f586a67f8df8bf9775b75f Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 17:53:53 +0100 Subject: [PATCH 061/124] Add missing REUSE --- contrib/slurm/utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/contrib/slurm/utils.py b/contrib/slurm/utils.py index d22cd854f..b0ed8a124 100644 --- a/contrib/slurm/utils.py +++ b/contrib/slurm/utils.py @@ -1,3 +1,12 @@ +# This file is part of BenchExec, a framework for reliable benchmarking: +# https://github.com/sosy-lab/benchexec +# +# SPDX-FileCopyrightText: 2024 Levente Bajczi +# SPDX-FileCopyrightText: Critical Systems Research Group +# SPDX-FileCopyrightText: Budapest University of Technology and Economics +# +# SPDX-License-Identifier: Apache-2.0 + import json import logging import subprocess From 5d526dff4596d951adddbccd138a6fe2e4df8881 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 18:06:32 +0100 Subject: [PATCH 062/124] singularity is not optional --- contrib/slurm/arrayexecutor.py | 37 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index fb8216b20..f124848fd 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -342,25 +342,24 @@ def get_run_cli(benchmark, args, tempdir, resultdir): args = [*runexec, "--", *args] basedir = os.path.abspath(os.path.dirname(singularity)) - if singularity: - cli.extend( - [ - "singularity", - "exec", - "-B", - "/sys/fs/cgroup:/sys/fs/cgroup", - "-B", - f"{basedir}", - "-B", - f"{os.getcwd()}:/lower", - "--no-home", - "-B", - f"{tempdir}:/overlay", - "--fusemount", - f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work {os.getcwd()}", - singularity, - ] - ) + cli.extend( + [ + "singularity", + "exec", + "-B", + "/sys/fs/cgroup:/sys/fs/cgroup", + "-B", + 
f"{basedir}", + "-B", + f"{os.getcwd()}:/lower", + "--no-home", + "-B", + f"{tempdir}:/overlay", + "--fusemount", + f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work {os.getcwd()}", + singularity, + ] + ) cli.extend( [ "sh", From d633afcc817e3ea92cc4b5d715f765f93b88d24d Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 18:09:17 +0100 Subject: [PATCH 063/124] sleeping before running array --- contrib/slurm/arrayexecutor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index f124848fd..02db33f54 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -225,6 +225,9 @@ def execute_batch( with open(batchfile, "w") as f: f.writelines(batch_lines) + logging.info("Waiting for 10s for the newly created files to settle (NFS)") + time.sleep(10) + try: sbatch_cmd = ["sbatch", "--wait", str(batchfile)] logging.debug("Command to run: %s", shlex.join(sbatch_cmd)) From bc8435a1254e5e5e365612e270b81d66f6d2b43f Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 18:13:01 +0100 Subject: [PATCH 064/124] Specify ro and rw for binds --- contrib/slurm/arrayexecutor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 02db33f54..cd4f075a4 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -350,14 +350,14 @@ def get_run_cli(benchmark, args, tempdir, resultdir): "singularity", "exec", "-B", - "/sys/fs/cgroup:/sys/fs/cgroup", + "/sys/fs/cgroup:/sys/fs/cgroup:rw", "-B", - f"{basedir}", + f"{basedir}:{basedir}:ro", "-B", - f"{os.getcwd()}:/lower", + f"{os.getcwd()}:/lower:ro", "--no-home", "-B", - f"{tempdir}:/overlay", + f"{tempdir}:/overlay:rw", "--fusemount", f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work {os.getcwd()}", singularity, 
From 23923d302faa35d215880d96136579c03c8d351b Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 18:26:12 +0100 Subject: [PATCH 065/124] fix quotes --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index cd4f075a4..bb4b378d3 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -374,7 +374,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): cli = shlex.join(cli) cli = cli.replace("'\"'\"'$CPUSET'\"'\"'", "'$CPUSET'") - cli = cli.replace("'$TMPDIR", '"$TMPDIR').replace(":/overlay'", ':/overlay"') + cli = cli.replace("'$TMPDIR", '"$TMPDIR').replace(":/overlay:rw'", ':/overlay:rw"') cli = f"mkdir -p {tempdir}/{{upper,work}}; {cli}; mv {tempdir}/upper/* {resultdir}/; rm -r {tempdir}" logging.debug("Command to run: %s", cli) From adeb41d0ff185c50f7207c1ba730d9c36ff92b21 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 18:30:04 +0100 Subject: [PATCH 066/124] Ruff fix --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index bb4b378d3..6d2c69d5c 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -241,7 +241,7 @@ def execute_batch( STOPPED_BY_INTERRUPT = True if STOPPED_BY_INTERRUPT: - logging.debug(f"Canceling sbatch job if already started") + logging.debug("Canceling sbatch job if already started") if sbatch_result and sbatch_result.stdout: for line in sbatch_result.stdout.splitlines(): jobid_match = sbatch_pattern.search(str(line)) From ea14f51e0806183857b307344b91eee1b3c2c705 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 19:41:45 +0100 Subject: [PATCH 067/124] Moved to resultfiles-based approach --- contrib/slurm/arrayexecutor.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git 
a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 6d2c69d5c..fd209e3a0 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -256,7 +256,10 @@ def execute_batch( try: run.set_result( get_run_result( - run.result_files_folder, os.path.join(tempdir, str(i)), run + run.result_files_folder, + os.path.join(tempdir, str(i)), + run, + benchmark.result_files_patterns, ) ) output_handler.output_after_run(run) @@ -381,7 +384,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): return cli -def get_run_result(output_dir, tempdir, run): +def get_run_result(output_dir, tempdir, run, result_files_patterns): runexec_log = f"{tempdir}/log" tmp_log = f"{tempdir}/output.log" @@ -414,14 +417,9 @@ def get_run_result(output_dir, tempdir, run): if os.path.exists(os.path.join(tempdir, "output")): os.makedirs(output_dir, exist_ok=True) - src_files = os.listdir(os.path.join(tempdir, "output")) - for file_name in [ - file - for file in src_files - if os.path.basename(file) in ["witness.graphml", "witness.yml"] - ]: # this should use 'benchmark.resultfiles', but if a tool is not set up correctly, it will produce way too many files. 
- full_file_name = os.path.join(os.path.join(tempdir, "output"), file_name) - if os.path.isfile(full_file_name): - shutil.copy(full_file_name, output_dir) + for result_files_pattern in result_files_patterns: + for file_name in glob.glob(f"{tempdir}/output/{result_files_pattern}"): + if os.path.isfile(file_name): + shutil.copy(file_name, output_dir) return ret From 39256a0371679fd5cd6b470fdac1cf43cb8c6863 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 20:54:18 +0100 Subject: [PATCH 068/124] Fixed copying result files --- contrib/slurm/arrayexecutor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index fd209e3a0..a873f730e 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -415,10 +415,10 @@ def get_run_result(output_dir, tempdir, run, result_files_patterns): content = log_source.read() file.write(content) - if os.path.exists(os.path.join(tempdir, "output")): + if os.path.exists(tempdir): os.makedirs(output_dir, exist_ok=True) for result_files_pattern in result_files_patterns: - for file_name in glob.glob(f"{tempdir}/output/{result_files_pattern}"): + for file_name in glob.glob(f"{tempdir}/{result_files_pattern}"): if os.path.isfile(file_name): shutil.copy(file_name, output_dir) From 222b3064944fad6af1cf4cd64b334f7eae0848f5 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 21:10:50 +0100 Subject: [PATCH 069/124] Added catch-all witness pattern --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index a873f730e..fb941a1dc 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -259,7 +259,7 @@ def execute_batch( run.result_files_folder, os.path.join(tempdir, str(i)), run, - benchmark.result_files_patterns, + benchmark.result_files_patterns + "*witness*", # e.g., deagle 
uses mismatched naming ) ) output_handler.output_after_run(run) From 65c9a195c418215111cf5485c0c49a26a70c10c7 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 21:13:03 +0100 Subject: [PATCH 070/124] fixed concat --- contrib/slurm/arrayexecutor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index fb941a1dc..3d5995ec5 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -259,7 +259,8 @@ def execute_batch( run.result_files_folder, os.path.join(tempdir, str(i)), run, - benchmark.result_files_patterns + "*witness*", # e.g., deagle uses mismatched naming + benchmark.result_files_patterns + + ["*witness*"], # e.g., deagle uses mismatched naming ) ) output_handler.output_after_run(run) From 5067d69663527c508024c2544d38f58616497f6e Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 23 Nov 2024 23:40:52 +0100 Subject: [PATCH 071/124] added cache to func --- contrib/slurm/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/slurm/utils.py b/contrib/slurm/utils.py index b0ed8a124..364c6cb0c 100644 --- a/contrib/slurm/utils.py +++ b/contrib/slurm/utils.py @@ -6,7 +6,7 @@ # SPDX-FileCopyrightText: Budapest University of Technology and Economics # # SPDX-License-Identifier: Apache-2.0 - +import functools import json import logging import subprocess @@ -28,6 +28,7 @@ class Config(): executable = tool.executable(locator) print(tool.version(executable))""" + @functools.lru_cache() def version_from_tool_in_container(executable): try: with open(".get_version.py", "w") as script: From cda2d982f631fec1376789cdfb1b06cfa213071c Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sun, 24 Nov 2024 13:50:48 +0100 Subject: [PATCH 072/124] Fixed moving files, only moving those that are necessary --- contrib/slurm/arrayexecutor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 3d5995ec5..fbae7d757 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -256,7 +256,6 @@ def execute_batch( try: run.set_result( get_run_result( - run.result_files_folder, os.path.join(tempdir, str(i)), run, benchmark.result_files_patterns @@ -379,13 +378,13 @@ def get_run_cli(benchmark, args, tempdir, resultdir): cli = shlex.join(cli) cli = cli.replace("'\"'\"'$CPUSET'\"'\"'", "'$CPUSET'") cli = cli.replace("'$TMPDIR", '"$TMPDIR').replace(":/overlay:rw'", ':/overlay:rw"') - cli = f"mkdir -p {tempdir}/{{upper,work}}; {cli}; mv {tempdir}/upper/* {resultdir}/; rm -r {tempdir}" + cli = f"mkdir -p {tempdir}/{{upper,work}}; {cli}; mv {tempdir}/upper/{{log,output.log,*witness*,{",".join(benchmark.result_files_patterns)}}} {resultdir}/; rm -r {tempdir}" logging.debug("Command to run: %s", cli) return cli -def get_run_result(output_dir, tempdir, run, result_files_patterns): +def get_run_result(tempdir, run, result_files_patterns): runexec_log = f"{tempdir}/log" tmp_log = f"{tempdir}/output.log" @@ -417,10 +416,11 @@ def get_run_result(output_dir, tempdir, run, result_files_patterns): file.write(content) if os.path.exists(tempdir): - os.makedirs(output_dir, exist_ok=True) + os.makedirs(run.result_files_folder, exist_ok=True) for result_files_pattern in result_files_patterns: for file_name in glob.glob(f"{tempdir}/{result_files_pattern}"): if os.path.isfile(file_name): - shutil.copy(file_name, output_dir) + shutil.copy(file_name, run.result_files_folder) + shutil.rmtree(tempdir) return ret From 54eddc2525a4c65c29b182d8e6711191e9fa4684 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sun, 24 Nov 2024 13:54:36 +0100 Subject: [PATCH 073/124] Syntax fix --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index fbae7d757..61ebd1444 100644 --- 
a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -378,7 +378,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): cli = shlex.join(cli) cli = cli.replace("'\"'\"'$CPUSET'\"'\"'", "'$CPUSET'") cli = cli.replace("'$TMPDIR", '"$TMPDIR').replace(":/overlay:rw'", ':/overlay:rw"') - cli = f"mkdir -p {tempdir}/{{upper,work}}; {cli}; mv {tempdir}/upper/{{log,output.log,*witness*,{",".join(benchmark.result_files_patterns)}}} {resultdir}/; rm -r {tempdir}" + cli = f"mkdir -p {tempdir}/{{upper,work}}; {cli}; mv {tempdir}/upper/{{log,output.log,*witness*,{','.join(benchmark.result_files_patterns)}}} {resultdir}/; rm -r {tempdir}" logging.debug("Command to run: %s", cli) return cli From 4ab571ee8fdf70378f1b44d59168e9f5b929a202 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 25 Nov 2024 13:35:36 +0100 Subject: [PATCH 074/124] added logging for retrying --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 61ebd1444..39b2f05cf 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -282,9 +282,9 @@ def execute_batch( ), ) if len(missing_runs) > 0 and not STOPPED_BY_INTERRUPT: + logging.info(f"Retrying {len(missing_runs)} runs due to errors. 
Current retry count for this batch: {counter}") execute_batch(missing_runs, benchmark, output_handler, counter + 1) - def stop(): global STOPPED_BY_INTERRUPT STOPPED_BY_INTERRUPT = True From dc6f5f229f6019bc050c575ea1358cd0056dd684 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 25 Nov 2024 16:12:02 +0100 Subject: [PATCH 075/124] Added logic to re-run experiments --- contrib/slurm-benchmark.py | 7 ++ contrib/slurm/arrayexecutor.py | 149 +++++++++++++++++++++++++++++++-- 2 files changed, 150 insertions(+), 6 deletions(-) diff --git a/contrib/slurm-benchmark.py b/contrib/slurm-benchmark.py index 1f94d2aa3..687fc7618 100755 --- a/contrib/slurm-benchmark.py +++ b/contrib/slurm-benchmark.py @@ -93,6 +93,13 @@ def create_argument_parser(self): default="4", help="Run this many tasks at once in one job.", ) + slurm_args.add_argument( + "--continue-interrupted", + dest="continue_interrupted", + type=str, + default=None, + help="Continue a previously interrupted job.", + ) return parser diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 39b2f05cf..ea2e36520 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -18,9 +18,11 @@ import sys import tempfile import time +import zipfile from benchexec import tooladapter -from benchexec.util import ProcessExitCode +from benchexec.tablegenerator import parse_results_file, handle_union_tag +from benchexec.util import ProcessExitCode, relative_path from contrib.slurm.utils import ( version_in_container, get_system_info_srun, @@ -114,11 +116,14 @@ def _execute_run_set( walltime_after = time.monotonic() usedWallTime = walltime_after - walltime_before - for i in range(0, len(runSet.runs), benchmark.config.batch_size): + if benchmark.config.continue_interrupted: + runs = filter_previous_results(runSet, benchmark) + else: + runs = runSet.runs + + for i in range(0, len(runs), benchmark.config.batch_size): if not STOPPED_BY_INTERRUPT: - chunk = runSet.runs[ - i : min(i + 
benchmark.config.batch_size, len(runSet.runs)) - ] + chunk = runs[i : min(i + benchmark.config.batch_size, len(runs))] execute_batch(chunk, benchmark, output_handler) if STOPPED_BY_INTERRUPT: @@ -130,6 +135,135 @@ def _execute_run_set( ) +def filter_previous_results(run_set, benchmark): + prefix_base = f"{benchmark.config.output_path}{benchmark.name}." + files = glob.glob(f"{prefix_base}*.logfiles.zip") + if files: + prefix = str(max(files, key=os.path.getmtime))[0 : -(len(".logfiles.zip"))] + else: + logging.warning("No logfile zip found. Giving up recovery.") + return run_set.runs + logging.info(f"Logfile zip found with prefix {prefix}. Attempting recovery.") + + logfile_zip = prefix + ".logfiles.zip" + file_zip = prefix + ".files.zip" + logfile_folder = prefix + ".logfiles" + files_folder = prefix + ".files" + + with zipfile.ZipFile(logfile_zip, "r") as zip_ref: + zip_ref.extractall( + benchmark.config.output_path + ) # we must clean this directory up on every exit point + + if not os.path.isdir(logfile_folder): + logging.warning( + f"Logfiles were extracted, but could not be found under {logfile_folder}." + ) + return run_set.runs + + if not os.path.isfile(file_zip): + logging.warning(f"No {file_zip} found. Giving up recovery.") + shutil.rmtree(logfile_folder) + return run_set.runs + + with zipfile.ZipFile(file_zip, "r") as zip_ref: + zip_ref.extractall( + benchmark.config.output_path + ) # we must clean this directory up on every exit point + + if not os.path.isdir(files_folder): + logging.warning( + f"Files were extracted, but could not be found under {files_folder}." + ) + shutil.rmtree(logfile_folder) + return run_set.runs + + xml_filename_base = prefix + ".results." + run_set.name + xml = xml_filename_base + ".xml" + xml_bz2 = xml_filename_base + ".xml.bz2" + if os.path.exists(xml): + result_file = xml + elif os.path.exists(xml_bz2): + result_file = xml_bz2 + else: + logging.warning( + ".xml or .xml.bz2 must exist for previous run. Giving up recovery." 
+ ) + shutil.rmtree(logfile_folder) + shutil.rmtree(files_folder) + return run_set.runs + + previous_results = parse_results_file(result_file) + previous_runs = {} + for elem in previous_results: + if elem.tag == "run": + values = {} + for col in elem: + if col.tag == "column": + if "walltime" == col.get("title"): + values["walltime"] = float( + str(col.get("value"))[:-1] + ) # ends in 's' + elif "cputime" == col.get("title"): + values["cputime"] = float( + str(col.get("value"))[:-1] + ) # ends in 's' + elif "memory" == col.get("title"): + values["memory"] = int( + str(col.get("value"))[:-1] + ) # ends in 'B' + elif "returnvalue" == col.get("title"): + values["exitcode"] = ProcessExitCode.create( + value=int(col.get("value")) + ) + elif "exitsignal" == col.get("title"): + values["exitcode"] = ProcessExitCode.create( + signal=int(col.get("value")) + ) + elif "terminationreason" == col.get("title"): + values["terminationreason"] = col.get("value") + else: + values[col.get("title")] = col.get("value") + # I think 'name' and 'properties' are enough to uniquely identify runs, but this should probably be more extensible + if values != {}: + previous_runs[(elem.get("name"), elem.get("properties"))] = values + + missing_runs = [] + for run in run_set.runs: + props = " ".join(sorted([prop.name for prop in run.properties])) + key = (run.name, props) + if key in previous_runs: + old_log = str( + os.path.join(logfile_folder, os.path.basename(run.identifier) + ".log") + ) + if os.path.exists(old_log) and os.path.isfile(old_log): + shutil.copy(old_log, run.log_file) + + old_files = str( + os.path.join(logfile_folder, os.path.basename(run.identifier)) + ) + if os.path.exists(old_files) and os.path.isdir(old_files): + os.makedirs(run.result_files_folder, exist_ok=True) + for file in os.listdir(old_files): + shutil.copy(file, run.result_files_folder) + + run.set_result(previous_runs[key]) + else: + missing_runs.append(run) + else: + missing_runs.append(run) + else: + 
missing_runs.append(run) + + shutil.rmtree(logfile_folder) + shutil.rmtree(files_folder) + + logging.info( + f"Successfully recovered {len(run_set.runs) - len(missing_runs)} runs, still missing {len(missing_runs)} more." + ) + return missing_runs + + def execute_batch( runs, benchmark, @@ -282,9 +416,12 @@ def execute_batch( ), ) if len(missing_runs) > 0 and not STOPPED_BY_INTERRUPT: - logging.info(f"Retrying {len(missing_runs)} runs due to errors. Current retry count for this batch: {counter}") + logging.info( + f"Retrying {len(missing_runs)} runs due to errors. Current retry count for this batch: {counter}" + ) execute_batch(missing_runs, benchmark, output_handler, counter + 1) + def stop(): global STOPPED_BY_INTERRUPT STOPPED_BY_INTERRUPT = True From 46b792592fa8484894625b917f56bc3656552c0a Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 25 Nov 2024 16:14:01 +0100 Subject: [PATCH 076/124] Fixed argument --- contrib/slurm-benchmark.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/contrib/slurm-benchmark.py b/contrib/slurm-benchmark.py index 687fc7618..cc6d88985 100755 --- a/contrib/slurm-benchmark.py +++ b/contrib/slurm-benchmark.py @@ -96,8 +96,7 @@ def create_argument_parser(self): slurm_args.add_argument( "--continue-interrupted", dest="continue_interrupted", - type=str, - default=None, + action="store_true", help="Continue a previously interrupted job.", ) From a70784287404c287f9e54792c54d1eb9940b0517 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 25 Nov 2024 16:22:02 +0100 Subject: [PATCH 077/124] Added filter --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index ea2e36520..2ad6f43d4 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -139,7 +139,7 @@ def filter_previous_results(run_set, benchmark): prefix_base = f"{benchmark.config.output_path}{benchmark.name}." 
files = glob.glob(f"{prefix_base}*.logfiles.zip") if files: - prefix = str(max(files, key=os.path.getmtime))[0 : -(len(".logfiles.zip"))] + prefix = str(max(filter(lambda file: file != benchmark.log_zip, files), key=os.path.getmtime))[0 : -(len(".logfiles.zip"))] else: logging.warning("No logfile zip found. Giving up recovery.") return run_set.runs From 52a207e03ff3869f8782a4c950b27a4f6b3a9c1e Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 25 Nov 2024 16:28:56 +0100 Subject: [PATCH 078/124] Added logging, fixed name --- contrib/slurm/arrayexecutor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 2ad6f43d4..5d7e07a60 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -231,7 +231,8 @@ def filter_previous_results(run_set, benchmark): missing_runs = [] for run in run_set.runs: props = " ".join(sorted([prop.name for prop in run.properties])) - key = (run.name, props) + name = relative_path(run.identifier, result_file) + key = (name, props) if key in previous_runs: old_log = str( os.path.join(logfile_folder, os.path.basename(run.identifier) + ".log") @@ -249,10 +250,13 @@ def filter_previous_results(run_set, benchmark): run.set_result(previous_runs[key]) else: + logging.warning(f"Old files directory {old_files} does not exist. Skipping run {name}.") missing_runs.append(run) else: + logging.warning(f"Old log {old_log} does not exist. Skipping run {name}.") missing_runs.append(run) else: + logging.warning(f"Run with key {key} not found in results. 
Skipping run {name}.") missing_runs.append(run) shutil.rmtree(logfile_folder) From dfbf57e9606119eee5ac80e364c063f63d5bc0bc Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 25 Nov 2024 16:43:25 +0100 Subject: [PATCH 079/124] using proper logfile name now --- contrib/slurm/arrayexecutor.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 5d7e07a60..54f8ba5d4 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -226,7 +226,7 @@ def filter_previous_results(run_set, benchmark): values[col.get("title")] = col.get("value") # I think 'name' and 'properties' are enough to uniquely identify runs, but this should probably be more extensible if values != {}: - previous_runs[(elem.get("name"), elem.get("properties"))] = values + previous_runs[(elem.get("name"), elem.get("properties"))] = (values, elem.get("logfile")) missing_runs = [] for run in run_set.runs: @@ -234,9 +234,7 @@ def filter_previous_results(run_set, benchmark): name = relative_path(run.identifier, result_file) key = (name, props) if key in previous_runs: - old_log = str( - os.path.join(logfile_folder, os.path.basename(run.identifier) + ".log") - ) + values, old_log = previous_runs[key] if os.path.exists(old_log) and os.path.isfile(old_log): shutil.copy(old_log, run.log_file) @@ -248,7 +246,7 @@ def filter_previous_results(run_set, benchmark): for file in os.listdir(old_files): shutil.copy(file, run.result_files_folder) - run.set_result(previous_runs[key]) + run.set_result(values) else: logging.warning(f"Old files directory {old_files} does not exist. 
Skipping run {name}.") missing_runs.append(run) From d5bfa928eff5dd9d0ae9933f21946287b75f7a8b Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 25 Nov 2024 16:46:26 +0100 Subject: [PATCH 080/124] Fixed logfile and resultfile paths --- contrib/slurm/arrayexecutor.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 54f8ba5d4..4d0c273aa 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -226,7 +226,7 @@ def filter_previous_results(run_set, benchmark): values[col.get("title")] = col.get("value") # I think 'name' and 'properties' are enough to uniquely identify runs, but this should probably be more extensible if values != {}: - previous_runs[(elem.get("name"), elem.get("properties"))] = (values, elem.get("logfile")) + previous_runs[(elem.get("name"), elem.get("properties"))] = values missing_runs = [] for run in run_set.runs: @@ -234,19 +234,21 @@ def filter_previous_results(run_set, benchmark): name = relative_path(run.identifier, result_file) key = (name, props) if key in previous_runs: - values, old_log = previous_runs[key] + old_log = str( + os.path.join(logfile_folder, run_set.real_name + "." + os.path.basename(run.identifier) + ".log") + ) if os.path.exists(old_log) and os.path.isfile(old_log): shutil.copy(old_log, run.log_file) old_files = str( - os.path.join(logfile_folder, os.path.basename(run.identifier)) + os.path.join(logfile_folder, run_set.real_name, os.path.basename(run.identifier)) ) if os.path.exists(old_files) and os.path.isdir(old_files): os.makedirs(run.result_files_folder, exist_ok=True) for file in os.listdir(old_files): shutil.copy(file, run.result_files_folder) - run.set_result(values) + run.set_result(previous_runs[key]) else: logging.warning(f"Old files directory {old_files} does not exist. 
Skipping run {name}.") missing_runs.append(run) From 7ddf826a8eb9ab17a4a9360b2fbc4dcea75911d7 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 25 Nov 2024 16:50:50 +0100 Subject: [PATCH 081/124] Fixed resultfile paths --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 4d0c273aa..01bc7d02e 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -241,7 +241,7 @@ def filter_previous_results(run_set, benchmark): shutil.copy(old_log, run.log_file) old_files = str( - os.path.join(logfile_folder, run_set.real_name, os.path.basename(run.identifier)) + os.path.join(files_folder, run_set.real_name, os.path.basename(run.identifier)) ) if os.path.exists(old_files) and os.path.isdir(old_files): os.makedirs(run.result_files_folder, exist_ok=True) From ed7cf2352c17f7f9e54cfd3521e9908960f98fc4 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 25 Nov 2024 16:58:16 +0100 Subject: [PATCH 082/124] Added logging, and no longer adding status back --- contrib/slurm/arrayexecutor.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 01bc7d02e..28a95e152 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -135,7 +135,7 @@ def _execute_run_set( ) -def filter_previous_results(run_set, benchmark): +def filter_previous_results(run_set, benchmark, output_handler): prefix_base = f"{benchmark.config.output_path}{benchmark.name}." 
files = glob.glob(f"{prefix_base}*.logfiles.zip") if files: @@ -222,8 +222,6 @@ def filter_previous_results(run_set, benchmark): ) elif "terminationreason" == col.get("title"): values["terminationreason"] = col.get("value") - else: - values[col.get("title")] = col.get("value") # I think 'name' and 'properties' are enough to uniquely identify runs, but this should probably be more extensible if values != {}: previous_runs[(elem.get("name"), elem.get("properties"))] = values @@ -249,6 +247,7 @@ def filter_previous_results(run_set, benchmark): shutil.copy(file, run.result_files_folder) run.set_result(previous_runs[key]) + output_handler.output_after_run(run) else: logging.warning(f"Old files directory {old_files} does not exist. Skipping run {name}.") missing_runs.append(run) From 6e94524a964c6d675b4885b03e694dff68f1a2ab Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 25 Nov 2024 17:01:29 +0100 Subject: [PATCH 083/124] Added missing param --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 28a95e152..a813ebdfa 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -117,7 +117,7 @@ def _execute_run_set( usedWallTime = walltime_after - walltime_before if benchmark.config.continue_interrupted: - runs = filter_previous_results(runSet, benchmark) + runs = filter_previous_results(runSet, benchmark, output_handler) else: runs = runSet.runs From 92fa12468404df4db0ecc16e98303d5e39f74c0d Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 25 Nov 2024 17:05:27 +0100 Subject: [PATCH 084/124] Added .cmdline() --- contrib/slurm/arrayexecutor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index a813ebdfa..bad69fa4c 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -246,6 +246,7 @@ def filter_previous_results(run_set, 
benchmark, output_handler): for file in os.listdir(old_files): shutil.copy(file, run.result_files_folder) + run.cmdline() # we need to call this, because it sets the _cmdline value run.set_result(previous_runs[key]) output_handler.output_after_run(run) else: From 2288a774bce36fd37aaa53133bff9047aefd3586 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 25 Nov 2024 17:09:47 +0100 Subject: [PATCH 085/124] Fixed copying --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index bad69fa4c..8cba55f3b 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -244,7 +244,7 @@ def filter_previous_results(run_set, benchmark, output_handler): if os.path.exists(old_files) and os.path.isdir(old_files): os.makedirs(run.result_files_folder, exist_ok=True) for file in os.listdir(old_files): - shutil.copy(file, run.result_files_folder) + shutil.copy(os.path.join(old_files, file), run.result_files_folder) run.cmdline() # we need to call this, because it sets the _cmdline value run.set_result(previous_runs[key]) From 8d0e7d5af3098faa6ef01b0ea01ddcc5560d01d1 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 25 Nov 2024 22:52:13 +0100 Subject: [PATCH 086/124] fixed max, reformat --- contrib/slurm/arrayexecutor.py | 36 +++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 8cba55f3b..a89cea8e1 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -137,9 +137,12 @@ def _execute_run_set( def filter_previous_results(run_set, benchmark, output_handler): prefix_base = f"{benchmark.config.output_path}{benchmark.name}." 
- files = glob.glob(f"{prefix_base}*.logfiles.zip") + files = filter( + lambda file: file != benchmark.log_zip, + glob.glob(f"{prefix_base}*.logfiles.zip"), + ) if files: - prefix = str(max(filter(lambda file: file != benchmark.log_zip, files), key=os.path.getmtime))[0 : -(len(".logfiles.zip"))] + prefix = str(max(files, key=os.path.getmtime))[0 : -(len(".logfiles.zip"))] else: logging.warning("No logfile zip found. Giving up recovery.") return run_set.runs @@ -233,30 +236,45 @@ def filter_previous_results(run_set, benchmark, output_handler): key = (name, props) if key in previous_runs: old_log = str( - os.path.join(logfile_folder, run_set.real_name + "." + os.path.basename(run.identifier) + ".log") + os.path.join( + logfile_folder, + run_set.real_name + "." + os.path.basename(run.identifier) + ".log", + ) ) if os.path.exists(old_log) and os.path.isfile(old_log): shutil.copy(old_log, run.log_file) old_files = str( - os.path.join(files_folder, run_set.real_name, os.path.basename(run.identifier)) + os.path.join( + files_folder, + run_set.real_name, + os.path.basename(run.identifier), + ) ) if os.path.exists(old_files) and os.path.isdir(old_files): os.makedirs(run.result_files_folder, exist_ok=True) for file in os.listdir(old_files): - shutil.copy(os.path.join(old_files, file), run.result_files_folder) + shutil.copy( + os.path.join(old_files, file), run.result_files_folder + ) - run.cmdline() # we need to call this, because it sets the _cmdline value + run.cmdline() # we need to call this, because it sets the _cmdline value run.set_result(previous_runs[key]) output_handler.output_after_run(run) else: - logging.warning(f"Old files directory {old_files} does not exist. Skipping run {name}.") + logging.warning( + f"Old files directory {old_files} does not exist. Skipping run {name}." + ) missing_runs.append(run) else: - logging.warning(f"Old log {old_log} does not exist. Skipping run {name}.") + logging.warning( + f"Old log {old_log} does not exist. Skipping run {name}." 
+ ) missing_runs.append(run) else: - logging.warning(f"Run with key {key} not found in results. Skipping run {name}.") + logging.warning( + f"Run with key {key} not found in results. Skipping run {name}." + ) missing_runs.append(run) shutil.rmtree(logfile_folder) From 4691c1e54fb176dca3a8b261b5421b1bd3d75455 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Tue, 26 Nov 2024 10:13:29 +0100 Subject: [PATCH 087/124] No attempting recovery if there is no logfiles.zip --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index a89cea8e1..3e0abf321 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -141,7 +141,7 @@ def filter_previous_results(run_set, benchmark, output_handler): lambda file: file != benchmark.log_zip, glob.glob(f"{prefix_base}*.logfiles.zip"), ) - if files: + if files and len(files) > 0: prefix = str(max(files, key=os.path.getmtime))[0 : -(len(".logfiles.zip"))] else: logging.warning("No logfile zip found. 
Giving up recovery.") From 00f17d07838fd4ff537b923bca46a2c11ffe66b4 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Tue, 26 Nov 2024 10:53:20 +0100 Subject: [PATCH 088/124] copying instead of reading-writing --- contrib/slurm/arrayexecutor.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 3e0abf321..0c3f96c37 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -569,10 +569,7 @@ def get_run_result(tempdir, run, result_files_patterns): if "terminationreason" in data_dict: ret["terminationreason"] = data_dict["terminationreason"] - with open(run.log_file, "w+") as file: - with open(tmp_log, "r") as log_source: - content = log_source.read() - file.write(content) + shutil.copy(tmp_log, run.log_file) if os.path.exists(tempdir): os.makedirs(run.result_files_folder, exist_ok=True) From 2ca8afcc3c963f6e11a4d7101c44cc97e2de3dfb Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Tue, 26 Nov 2024 11:40:24 +0100 Subject: [PATCH 089/124] Fixed len() for filter() --- contrib/slurm/arrayexecutor.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 0c3f96c37..2cdfaa985 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -137,9 +137,11 @@ def _execute_run_set( def filter_previous_results(run_set, benchmark, output_handler): prefix_base = f"{benchmark.config.output_path}{benchmark.name}." 
- files = filter( - lambda file: file != benchmark.log_zip, - glob.glob(f"{prefix_base}*.logfiles.zip"), + files = list( + filter( + lambda file: file != benchmark.log_zip, + glob.glob(f"{prefix_base}*.logfiles.zip"), + ) ) if files and len(files) > 0: prefix = str(max(files, key=os.path.getmtime))[0 : -(len(".logfiles.zip"))] From e11e8abce9ee688b6a2ae6bf561d001683067498 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 28 Nov 2024 12:35:31 +0100 Subject: [PATCH 090/124] Added further checks to recovery --- contrib/slurm/arrayexecutor.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 2cdfaa985..d606b15f7 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -199,6 +199,18 @@ def filter_previous_results(run_set, benchmark, output_handler): return run_set.runs previous_results = parse_results_file(result_file) + if previous_results.get("version") != benchmark.tool_version: + logging.warning(f"Mismatch in tool version: old version={previous_results.get("version")}, current version: {benchmark.tool_version}") + shutil.rmtree(logfile_folder) + shutil.rmtree(files_folder) + return run_set.runs + + if previous_results.get("options") != benchmark.options: + logging.warning(f"Mismatch in tool options: old options='{previous_results.get("options")}', current options: '{benchmark.options}'") + shutil.rmtree(logfile_folder) + shutil.rmtree(files_folder) + return run_set.runs + previous_runs = {} for elem in previous_results: if elem.tag == "run": From b11f215dda281892daa0c582e5db25f958f421e2 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 28 Nov 2024 12:36:19 +0100 Subject: [PATCH 091/124] reformat --- contrib/slurm/arrayexecutor.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index d606b15f7..f689ea576 100644 --- a/contrib/slurm/arrayexecutor.py +++ 
b/contrib/slurm/arrayexecutor.py @@ -200,13 +200,17 @@ def filter_previous_results(run_set, benchmark, output_handler): previous_results = parse_results_file(result_file) if previous_results.get("version") != benchmark.tool_version: - logging.warning(f"Mismatch in tool version: old version={previous_results.get("version")}, current version: {benchmark.tool_version}") + logging.warning( + f"Mismatch in tool version: old version={previous_results.get("version")}, current version: {benchmark.tool_version}" + ) shutil.rmtree(logfile_folder) shutil.rmtree(files_folder) return run_set.runs if previous_results.get("options") != benchmark.options: - logging.warning(f"Mismatch in tool options: old options='{previous_results.get("options")}', current options: '{benchmark.options}'") + logging.warning( + f"Mismatch in tool options: old options='{previous_results.get("options")}', current options: '{benchmark.options}'" + ) shutil.rmtree(logfile_folder) shutil.rmtree(files_folder) return run_set.runs From 187925cfc86b088a368fc9eef22ac8802971d8d7 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 28 Nov 2024 12:41:08 +0100 Subject: [PATCH 092/124] Fixed f-string --- contrib/slurm/arrayexecutor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index f689ea576..a28dd29aa 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -201,7 +201,7 @@ def filter_previous_results(run_set, benchmark, output_handler): previous_results = parse_results_file(result_file) if previous_results.get("version") != benchmark.tool_version: logging.warning( - f"Mismatch in tool version: old version={previous_results.get("version")}, current version: {benchmark.tool_version}" + f"Mismatch in tool version: old version={previous_results.get('version')}, current version: {benchmark.tool_version}" ) shutil.rmtree(logfile_folder) shutil.rmtree(files_folder) @@ -209,7 +209,7 @@ def 
filter_previous_results(run_set, benchmark, output_handler): if previous_results.get("options") != benchmark.options: logging.warning( - f"Mismatch in tool options: old options='{previous_results.get("options")}', current options: '{benchmark.options}'" + f"Mismatch in tool options: old options='{previous_results.get('options')}', current options: '{benchmark.options}'" ) shutil.rmtree(logfile_folder) shutil.rmtree(files_folder) From 74b2a737c069e492b238a2cd123cda9c0b097609 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 28 Nov 2024 13:09:34 +0100 Subject: [PATCH 093/124] Fixed options comparison --- contrib/slurm/arrayexecutor.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index a28dd29aa..6f6a391af 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -199,17 +199,22 @@ def filter_previous_results(run_set, benchmark, output_handler): return run_set.runs previous_results = parse_results_file(result_file) - if previous_results.get("version") != benchmark.tool_version: + + old_version = previous_results.get("version") + new_version = benchmark.tool_version + if old_version != new_version: logging.warning( - f"Mismatch in tool version: old version={previous_results.get('version')}, current version: {benchmark.tool_version}" + f"Mismatch in tool version: old version={old_version}, current version: {new_version}" ) shutil.rmtree(logfile_folder) shutil.rmtree(files_folder) return run_set.runs - if previous_results.get("options") != benchmark.options: + old_options = previous_results.get("options") + new_options = " ".join(benchmark.options) + if old_options != new_options: logging.warning( - f"Mismatch in tool options: old options='{previous_results.get('options')}', current options: '{benchmark.options}'" + f"Mismatch in tool options: old options='{old_options}', current options: '{new_options}'" ) shutil.rmtree(logfile_folder) 
shutil.rmtree(files_folder) From 19d70d8f37434182a5ca096e1b28053b0ea25fef Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 28 Nov 2024 13:16:37 +0100 Subject: [PATCH 094/124] Added srun timeout --- contrib/slurm/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/slurm/utils.py b/contrib/slurm/utils.py index 364c6cb0c..8c54ec13f 100644 --- a/contrib/slurm/utils.py +++ b/contrib/slurm/utils.py @@ -63,6 +63,8 @@ def get_system_info_srun(singularity): process = subprocess.run( [ "srun", + "-t", + "1" "singularity", "exec", singularity, From eed7c19cd0ecadb2f98a6e03a273ab2dc2ee5542 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 28 Nov 2024 14:23:32 +0100 Subject: [PATCH 095/124] Not unzipping old files and logfiles any more --- contrib/slurm/arrayexecutor.py | 262 ++++++++++++++++----------------- 1 file changed, 126 insertions(+), 136 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 6f6a391af..28834d01a 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -152,161 +152,151 @@ def filter_previous_results(run_set, benchmark, output_handler): logfile_zip = prefix + ".logfiles.zip" file_zip = prefix + ".files.zip" - logfile_folder = prefix + ".logfiles" - files_folder = prefix + ".files" - - with zipfile.ZipFile(logfile_zip, "r") as zip_ref: - zip_ref.extractall( - benchmark.config.output_path - ) # we must clean this directory up on every exit point - - if not os.path.isdir(logfile_folder): - logging.warning( - f"Logfiles were extracted, but could not be found under {logfile_folder}." - ) - return run_set.runs if not os.path.isfile(file_zip): logging.warning(f"No {file_zip} found. 
Giving up recovery.") - shutil.rmtree(logfile_folder) return run_set.runs - with zipfile.ZipFile(file_zip, "r") as zip_ref: - zip_ref.extractall( - benchmark.config.output_path - ) # we must clean this directory up on every exit point + with zipfile.ZipFile(logfile_zip, "r") as logfile_zip_ref: - if not os.path.isdir(files_folder): - logging.warning( - f"Files were extracted, but could not be found under {files_folder}." - ) - shutil.rmtree(logfile_folder) - return run_set.runs - - xml_filename_base = prefix + ".results." + run_set.name - xml = xml_filename_base + ".xml" - xml_bz2 = xml_filename_base + ".xml.bz2" - if os.path.exists(xml): - result_file = xml - elif os.path.exists(xml_bz2): - result_file = xml_bz2 - else: - logging.warning( - ".xml or .xml.bz2 must exist for previous run. Giving up recovery." - ) - shutil.rmtree(logfile_folder) - shutil.rmtree(files_folder) - return run_set.runs + with zipfile.ZipFile(file_zip, "r") as file_zip_ref: - previous_results = parse_results_file(result_file) + xml_filename_base = prefix + ".results." + run_set.name + xml = xml_filename_base + ".xml" + xml_bz2 = xml_filename_base + ".xml.bz2" + if os.path.exists(xml): + result_file = xml + elif os.path.exists(xml_bz2): + result_file = xml_bz2 + else: + logging.warning( + ".xml or .xml.bz2 must exist for previous run. Giving up recovery." 
+ ) + return run_set.runs - old_version = previous_results.get("version") - new_version = benchmark.tool_version - if old_version != new_version: - logging.warning( - f"Mismatch in tool version: old version={old_version}, current version: {new_version}" - ) - shutil.rmtree(logfile_folder) - shutil.rmtree(files_folder) - return run_set.runs + previous_results = parse_results_file(result_file) - old_options = previous_results.get("options") - new_options = " ".join(benchmark.options) - if old_options != new_options: - logging.warning( - f"Mismatch in tool options: old options='{old_options}', current options: '{new_options}'" - ) - shutil.rmtree(logfile_folder) - shutil.rmtree(files_folder) - return run_set.runs + old_version = previous_results.get("version") + new_version = benchmark.tool_version + if old_version != new_version: + logging.warning( + f"Mismatch in tool version: old version={old_version}, current version: {new_version}" + ) + return run_set.runs - previous_runs = {} - for elem in previous_results: - if elem.tag == "run": - values = {} - for col in elem: - if col.tag == "column": - if "walltime" == col.get("title"): - values["walltime"] = float( - str(col.get("value"))[:-1] - ) # ends in 's' - elif "cputime" == col.get("title"): - values["cputime"] = float( - str(col.get("value"))[:-1] - ) # ends in 's' - elif "memory" == col.get("title"): - values["memory"] = int( - str(col.get("value"))[:-1] - ) # ends in 'B' - elif "returnvalue" == col.get("title"): - values["exitcode"] = ProcessExitCode.create( - value=int(col.get("value")) - ) - elif "exitsignal" == col.get("title"): - values["exitcode"] = ProcessExitCode.create( - signal=int(col.get("value")) - ) - elif "terminationreason" == col.get("title"): - values["terminationreason"] = col.get("value") - # I think 'name' and 'properties' are enough to uniquely identify runs, but this should probably be more extensible - if values != {}: - previous_runs[(elem.get("name"), elem.get("properties"))] = values - 
- missing_runs = [] - for run in run_set.runs: - props = " ".join(sorted([prop.name for prop in run.properties])) - name = relative_path(run.identifier, result_file) - key = (name, props) - if key in previous_runs: - old_log = str( - os.path.join( - logfile_folder, - run_set.real_name + "." + os.path.basename(run.identifier) + ".log", + old_options = previous_results.get("options") + new_options = " ".join(benchmark.options) + if old_options != new_options: + logging.warning( + f"Mismatch in tool options: old options='{old_options}', current options: '{new_options}'" ) - ) - if os.path.exists(old_log) and os.path.isfile(old_log): - shutil.copy(old_log, run.log_file) - - old_files = str( - os.path.join( - files_folder, - run_set.real_name, - os.path.basename(run.identifier), + return run_set.runs + + previous_runs = {} + for elem in previous_results: + if elem.tag == "run": + values = {} + for col in elem: + if col.tag == "column": + if "walltime" == col.get("title"): + values["walltime"] = float( + str(col.get("value"))[:-1] + ) # ends in 's' + elif "cputime" == col.get("title"): + values["cputime"] = float( + str(col.get("value"))[:-1] + ) # ends in 's' + elif "memory" == col.get("title"): + values["memory"] = int( + str(col.get("value"))[:-1] + ) # ends in 'B' + elif "returnvalue" == col.get("title"): + values["exitcode"] = ProcessExitCode.create( + value=int(col.get("value")) + ) + elif "exitsignal" == col.get("title"): + values["exitcode"] = ProcessExitCode.create( + signal=int(col.get("value")) + ) + elif "terminationreason" == col.get("title"): + values["terminationreason"] = col.get("value") + # I think 'name' and 'properties' are enough to uniquely identify runs, but this should probably be more extensible + if values != {}: + previous_runs[(elem.get("name"), elem.get("properties"))] = ( + values + ) + + missing_runs = [] + for run in run_set.runs: + props = " ".join(sorted([prop.name for prop in run.properties])) + name = relative_path(run.identifier, 
result_file) + key = (name, props) + if key in previous_runs: + old_log = str( + run_set.real_name + + "." + + os.path.basename(run.identifier) + + ".log" ) - ) - if os.path.exists(old_files) and os.path.isdir(old_files): - os.makedirs(run.result_files_folder, exist_ok=True) - for file in os.listdir(old_files): - shutil.copy( - os.path.join(old_files, file), run.result_files_folder + if old_log in logfile_zip_ref.namelist(): + with logfile_zip_ref.open(old_log) as zipped_log, open( + run.log_file, "wb" + ) as target_log: + shutil.copyfileobj(zipped_log, target_log) + + old_files_prefix = ( + str( + os.path.join( + run_set.real_name, + os.path.basename(run.identifier), + ) + ) + + "/" ) - run.cmdline() # we need to call this, because it sets the _cmdline value - run.set_result(previous_runs[key]) - output_handler.output_after_run(run) + files_in_zip = [ + f + for f in file_zip_ref.namelist() + if f.startswith(old_files_prefix) + ] + if files_in_zip and len(files_in_zip) > 0: + os.makedirs(run.result_files_folder, exist_ok=True) + for file_in_zip in files_in_zip: + if not file_in_zip.endswith("/"): + with file_zip_ref.open( + file_in_zip + ) as source_file, open( + os.path.join( + run.result_files_folder, + os.path.basename(file_in_zip), + ), + "wb", + ) as target_file: + shutil.copyfileobj(source_file, target_file) + + run.cmdline() # we need to call this, because it sets the _cmdline value + run.set_result(previous_runs[key]) + output_handler.output_after_run(run) + else: + logging.warning( + f"Old files directory {old_files_prefix} does not exist. Skipping run {name}." + ) + missing_runs.append(run) + else: + logging.warning( + f"Old log {old_log} does not exist. Skipping run {name}." + ) + missing_runs.append(run) else: logging.warning( - f"Old files directory {old_files} does not exist. Skipping run {name}." + f"Run with key {key} not found in results. Skipping run {name}." 
) missing_runs.append(run) - else: - logging.warning( - f"Old log {old_log} does not exist. Skipping run {name}." - ) - missing_runs.append(run) - else: - logging.warning( - f"Run with key {key} not found in results. Skipping run {name}." - ) - missing_runs.append(run) - - shutil.rmtree(logfile_folder) - shutil.rmtree(files_folder) - logging.info( - f"Successfully recovered {len(run_set.runs) - len(missing_runs)} runs, still missing {len(missing_runs)} more." - ) - return missing_runs + logging.info( + f"Successfully recovered {len(run_set.runs) - len(missing_runs)} runs, still missing {len(missing_runs)} more." + ) + return missing_runs def execute_batch( From c9e24bd8e9ba5ec6d193c3a848177d21ade13c82 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 28 Nov 2024 14:40:10 +0100 Subject: [PATCH 096/124] Added prefix to names --- contrib/slurm/arrayexecutor.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 28834d01a..4c79c2ec2 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -233,10 +233,13 @@ def filter_previous_results(run_set, benchmark, output_handler): key = (name, props) if key in previous_runs: old_log = str( - run_set.real_name - + "." - + os.path.basename(run.identifier) - + ".log" + os.path.join( + str(os.path.basename(logfile_zip))[0 : -(len(".zip"))], + run_set.real_name + + "." 
+ + os.path.basename(run.identifier) + + ".log", + ) ) if old_log in logfile_zip_ref.namelist(): with logfile_zip_ref.open(old_log) as zipped_log, open( @@ -247,6 +250,7 @@ def filter_previous_results(run_set, benchmark, output_handler): old_files_prefix = ( str( os.path.join( + str(os.path.basename(file_zip))[0 : -(len(".zip"))], run_set.real_name, os.path.basename(run.identifier), ) From 57b3c4b2e4b312bbbbeba0abf4842c533c282b00 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 30 Nov 2024 14:09:09 +0100 Subject: [PATCH 097/124] Fixed which params to check --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 4c79c2ec2..075eb8bc9 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -185,7 +185,7 @@ def filter_previous_results(run_set, benchmark, output_handler): return run_set.runs old_options = previous_results.get("options") - new_options = " ".join(benchmark.options) + new_options = " ".join(run_set.options) if old_options != new_options: logging.warning( f"Mismatch in tool options: old options='{old_options}', current options: '{new_options}'" From 61baa52b665c8f6aa2b417a8e4ecc87cd72a8e36 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 30 Nov 2024 15:03:04 +0100 Subject: [PATCH 098/124] Moved output printing outside of try block --- contrib/slurm/arrayexecutor.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 075eb8bc9..1fc5fc3be 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -19,6 +19,7 @@ import tempfile import time import zipfile +from csv import excel from benchexec import tooladapter from benchexec.tablegenerator import parse_results_file, handle_union_tag @@ -426,6 +427,7 @@ def execute_batch( missing_runs = [] for bin in bins: for i, run in bins[bin]: + 
success = False try: run.set_result( get_run_result( @@ -435,7 +437,7 @@ def execute_batch( + ["*witness*"], # e.g., deagle uses mismatched naming ) ) - output_handler.output_after_run(run) + success = True except Exception as e: logging.warning("could not set result due to error: %s", e) if counter < benchmark.config.retry or benchmark.config.retry < 0: @@ -454,6 +456,12 @@ def execute_batch( os.path.basename(file) + ".error", ), ) + if success: + try: + output_handler.output_after_run(run) + except Exception as e: + logging.warning("could not print result due to error: %s", e) + if len(missing_runs) > 0 and not STOPPED_BY_INTERRUPT: logging.info( f"Retrying {len(missing_runs)} runs due to errors. Current retry count for this batch: {counter}" From 76fc58c44b30d7f2fa492c96aa263e85405ad490 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 2 Dec 2024 16:42:01 +0100 Subject: [PATCH 099/124] Removed run result setting to outside of try --- contrib/slurm/arrayexecutor.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 1fc5fc3be..293296910 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -429,13 +429,11 @@ def execute_batch( for i, run in bins[bin]: success = False try: - run.set_result( - get_run_result( - os.path.join(tempdir, str(i)), - run, - benchmark.result_files_patterns - + ["*witness*"], # e.g., deagle uses mismatched naming - ) + result = get_run_result( + os.path.join(tempdir, str(i)), + run, + benchmark.result_files_patterns + + ["*witness*"], # e.g., deagle uses mismatched naming ) success = True except Exception as e: @@ -458,9 +456,10 @@ def execute_batch( ) if success: try: + run.set_result(result) output_handler.output_after_run(run) except Exception as e: - logging.warning("could not print result due to error: %s", e) + logging.warning("could not set result due to error: %s", e) if len(missing_runs) > 0 and 
not STOPPED_BY_INTERRUPT: logging.info( From 5d24b42166f3bb125bc88debd37c2c8410df6eea Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 13 Mar 2025 16:39:59 +0100 Subject: [PATCH 100/124] Merging rundefs if smaller than batchsize --- contrib/slurm/arrayexecutor.py | 84 +++++++++++++++------------------- 1 file changed, 38 insertions(+), 46 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 293296910..d99e976f0 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -66,6 +66,18 @@ def execute_benchmark(benchmark, output_handler): "SLURM can only work properly without hyperthreading enabled, by passing the --no-hyperthreading option. See README.md for details." ) + if not benchmark.config.scratchdir: + sys.exit("No scratchdir present. Please specify using --scratchdir .") + elif not os.path.exists(benchmark.config.scratchdir): + os.makedirs(benchmark.config.scratchdir) + logging.debug(f"Created scratchdir: {benchmark.config.scratchdir}") + elif not os.path.isdir(benchmark.config.scratchdir): + sys.exit( + f"Scratchdir {benchmark.config.scratchdir} not a directory. Please specify using --scratchdir ." 
+ ) + + # First we execute the tests + runs = [] for runSet in benchmark.run_sets: if STOPPED_BY_INTERRUPT: break @@ -79,62 +91,42 @@ def execute_benchmark(benchmark, output_handler): ) else: - _execute_run_set( - runSet, - benchmark, - output_handler, - ) - - output_handler.output_after_benchmark(STOPPED_BY_INTERRUPT) - - -sbatch_pattern = re.compile(r"Submitted batch job (\d+)") + output_handler.output_before_run_set(runSet) + if benchmark.config.continue_interrupted: + runs.extend(filter_previous_results(runSet, benchmark, output_handler)) + else: + runs.extend(runSet.runs) -def _execute_run_set( - runSet, - benchmark, - output_handler, -): - global STOPPED_BY_INTERRUPT - - # get times before runSet - walltime_before = time.monotonic() + for i in range(0, len(runs), benchmark.config.batch_size): + if not STOPPED_BY_INTERRUPT: + chunk = runs[i : min(i + benchmark.config.batch_size, len(runs))] + execute_batch(chunk, benchmark, output_handler) - output_handler.output_before_run_set(runSet) - if not benchmark.config.scratchdir: - sys.exit("No scratchdir present. Please specify using --scratchdir .") - elif not os.path.exists(benchmark.config.scratchdir): - os.makedirs(benchmark.config.scratchdir) - logging.debug(f"Created scratchdir: {benchmark.config.scratchdir}") - elif not os.path.isdir(benchmark.config.scratchdir): - sys.exit( - f"Scratchdir {benchmark.config.scratchdir} not a directory. Please specify using --scratchdir ." 
- ) + # Second we set the outputs + for runSet in benchmark.run_sets: + if STOPPED_BY_INTERRUPT: + break - # get times after runSet - walltime_after = time.monotonic() - usedWallTime = walltime_after - walltime_before + if not runSet.should_be_executed(): + output_handler.output_for_skipping_run_set(runSet) - if benchmark.config.continue_interrupted: - runs = filter_previous_results(runSet, benchmark, output_handler) - else: - runs = runSet.runs + elif not runSet.runs: + output_handler.output_for_skipping_run_set( + runSet, "because it has no files" + ) - for i in range(0, len(runs), benchmark.config.batch_size): - if not STOPPED_BY_INTERRUPT: - chunk = runs[i : min(i + benchmark.config.batch_size, len(runs))] - execute_batch(chunk, benchmark, output_handler) + else: + output_handler.output_after_run_set( + runSet + ) + - if STOPPED_BY_INTERRUPT: - output_handler.set_error("interrupted", runSet) + output_handler.output_after_benchmark(STOPPED_BY_INTERRUPT) - output_handler.output_after_run_set( - runSet, - walltime=usedWallTime, - ) +sbatch_pattern = re.compile(r"Submitted batch job (\d+)") def filter_previous_results(run_set, benchmark, output_handler): prefix_base = f"{benchmark.config.output_path}{benchmark.name}." 
From 72a1af0e7d1c98e8f812d91095cdc5d422e3f1b1 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 13 Mar 2025 16:49:20 +0100 Subject: [PATCH 101/124] added --fakeroot and --contain to singularity command list --- contrib/slurm/arrayexecutor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index d99e976f0..2c49388b5 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -508,7 +508,7 @@ def get_resource_limits(benchmark, tempdir): def get_run_cli(benchmark, args, tempdir, resultdir): os.makedirs(resultdir) cli = [] - runexec = ["runexec", "--no-container"] + runexec = ["runexec", "--full-access-dir", "/sys/fs/cgroup"] if benchmark.rlimits.cputime_hard: runexec.extend(["--timelimit", str(benchmark.rlimits.cputime_hard)]) if benchmark.rlimits.cputime: @@ -527,6 +527,8 @@ def get_run_cli(benchmark, args, tempdir, resultdir): [ "singularity", "exec", + "--fakeroot", + "--contain" "-B", "/sys/fs/cgroup:/sys/fs/cgroup:rw", "-B", From 5facff38bf078fd858d6269ca88cc0f877802934 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 13 Mar 2025 22:59:51 +0100 Subject: [PATCH 102/124] Added workaround to container --- contrib/slurm/arrayexecutor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 2c49388b5..1a520165e 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -528,7 +528,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): "singularity", "exec", "--fakeroot", - "--contain" + "--contain", "-B", "/sys/fs/cgroup:/sys/fs/cgroup:rw", "-B", @@ -547,6 +547,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): [ "sh", "-c", + "pwd; cd {os.getcwd()}; pwd; ls; " f"{shlex.join(['echo', 'Running command: ', *args])}; " f"{shlex.join(args)} 2>&1 | tee log; ", ] @@ -555,7 +556,7 @@ def get_run_cli(benchmark, args, tempdir, 
resultdir): cli = shlex.join(cli) cli = cli.replace("'\"'\"'$CPUSET'\"'\"'", "'$CPUSET'") cli = cli.replace("'$TMPDIR", '"$TMPDIR').replace(":/overlay:rw'", ':/overlay:rw"') - cli = f"mkdir -p {tempdir}/{{upper,work}}; {cli}; mv {tempdir}/upper/{{log,output.log,*witness*,{','.join(benchmark.result_files_patterns)}}} {resultdir}/; rm -r {tempdir}" + cli = f"mkdir -p {tempdir}/{{upper,work}}; chmod 777 {tempdir}/{{upper,work}}; {cli}; mv {tempdir}/upper/{{log,output.log,*witness*,{','.join(benchmark.result_files_patterns)}}} {resultdir}/; rm -r {tempdir}" logging.debug("Command to run: %s", cli) return cli From 021c42fa3a8605d0517908b8945ffff35ded5614 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 13 Mar 2025 23:01:46 +0100 Subject: [PATCH 103/124] Added workaround to container, now working --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 1a520165e..3fc3e14e3 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -547,7 +547,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): [ "sh", "-c", - "pwd; cd {os.getcwd()}; pwd; ls; " + f"pwd; cd {os.getcwd()}; pwd; ls; " f"{shlex.join(['echo', 'Running command: ', *args])}; " f"{shlex.join(args)} 2>&1 | tee log; ", ] From 69c8a977da390e6ac3881f71f1430670a26c5df1 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 13 Mar 2025 23:03:58 +0100 Subject: [PATCH 104/124] chmod TMPDIR as well --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 3fc3e14e3..cc627cf7b 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -320,7 +320,7 @@ def execute_batch( batch_lines.extend( [f"\n#SBATCH --array=0-{number_of_bins - 1}%{benchmark.num_of_threads}"] ) - batch_lines.extend(["\n\nTMPDIR=$(mktemp -d)"]) + 
batch_lines.extend(["\n\nTMPDIR=$(mktemp -d)\nchmod 777 $TMPDIR"]) bins = {} # put all runs into a queue From 346e602bb4b20db1e2230b561b6e6124e240cbaa Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 13 Mar 2025 23:07:45 +0100 Subject: [PATCH 105/124] Added readonlydir / --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index cc627cf7b..2c5471475 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -508,7 +508,7 @@ def get_resource_limits(benchmark, tempdir): def get_run_cli(benchmark, args, tempdir, resultdir): os.makedirs(resultdir) cli = [] - runexec = ["runexec", "--full-access-dir", "/sys/fs/cgroup"] + runexec = ["runexec", "--full-access-dir", "/sys/fs/cgroup", "--read-only-dir", "/"] if benchmark.rlimits.cputime_hard: runexec.extend(["--timelimit", str(benchmark.rlimits.cputime_hard)]) if benchmark.rlimits.cputime: From 170583671f5996407f2ac9c5449eac8da82effec Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Thu, 13 Mar 2025 23:15:26 +0100 Subject: [PATCH 106/124] Added fullaccessdir --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 2c5471475..724f8f45a 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -508,7 +508,7 @@ def get_resource_limits(benchmark, tempdir): def get_run_cli(benchmark, args, tempdir, resultdir): os.makedirs(resultdir) cli = [] - runexec = ["runexec", "--full-access-dir", "/sys/fs/cgroup", "--read-only-dir", "/"] + runexec = ["runexec", "--full-access-dir", "/sys/fs/cgroup", "--read-only-dir", "/", "--full-access-dir", os.getcwd()] if benchmark.rlimits.cputime_hard: runexec.extend(["--timelimit", str(benchmark.rlimits.cputime_hard)]) if benchmark.rlimits.cputime: From 06a944f722a0e1fd6847f440b273b5eb9f11d148 Mon Sep 17 00:00:00 2001 
From: Levente Bajczi Date: Sat, 15 Mar 2025 21:09:13 +0100 Subject: [PATCH 107/124] Added missing overlay dir --- contrib/slurm/arrayexecutor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 724f8f45a..597d3387e 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -508,7 +508,9 @@ def get_resource_limits(benchmark, tempdir): def get_run_cli(benchmark, args, tempdir, resultdir): os.makedirs(resultdir) cli = [] - runexec = ["runexec", "--full-access-dir", "/sys/fs/cgroup", "--read-only-dir", "/", "--full-access-dir", os.getcwd()] + basedir = os.path.abspath(os.path.dirname(singularity)) + + runexec = ["runexec", "--full-access-dir", "/sys/fs/cgroup", "--read-only-dir", "/", "--overlay-dir", basedir, "--full-access-dir", os.getcwd()] if benchmark.rlimits.cputime_hard: runexec.extend(["--timelimit", str(benchmark.rlimits.cputime_hard)]) if benchmark.rlimits.cputime: @@ -521,7 +523,6 @@ def get_run_cli(benchmark, args, tempdir, resultdir): runexec.extend(["--memlimit", str(benchmark.rlimits.memory)]) args = [*runexec, "--", *args] - basedir = os.path.abspath(os.path.dirname(singularity)) cli.extend( [ From 5b7130b7b51e245fe7d66cccd94609494a28df31 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sat, 15 Mar 2025 21:32:32 +0100 Subject: [PATCH 108/124] readonly basedir --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 597d3387e..f67a01a6a 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -510,7 +510,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): cli = [] basedir = os.path.abspath(os.path.dirname(singularity)) - runexec = ["runexec", "--full-access-dir", "/sys/fs/cgroup", "--read-only-dir", "/", "--overlay-dir", basedir, "--full-access-dir", os.getcwd()] + runexec = ["runexec", 
"--full-access-dir", "/sys/fs/cgroup", "--read-only-dir", "/", "--read-only-dir", basedir, "--full-access-dir", os.getcwd()] if benchmark.rlimits.cputime_hard: runexec.extend(["--timelimit", str(benchmark.rlimits.cputime_hard)]) if benchmark.rlimits.cputime: From 0118b47b1f9db591bad8a98a9d5d04d8a87a8378 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Tue, 8 Apr 2025 19:10:24 +0200 Subject: [PATCH 109/124] simplified filesystem --- contrib/slurm/arrayexecutor.py | 45 +++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index f67a01a6a..72556d140 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -97,14 +97,12 @@ def execute_benchmark(benchmark, output_handler): else: runs.extend(runSet.runs) - for i in range(0, len(runs), benchmark.config.batch_size): if not STOPPED_BY_INTERRUPT: chunk = runs[i : min(i + benchmark.config.batch_size, len(runs))] execute_batch(chunk, benchmark, output_handler) - - # Second we set the outputs + # Second we set the outputs for runSet in benchmark.run_sets: if STOPPED_BY_INTERRUPT: break @@ -118,16 +116,14 @@ def execute_benchmark(benchmark, output_handler): ) else: - output_handler.output_after_run_set( - runSet - ) - + output_handler.output_after_run_set(runSet) output_handler.output_after_benchmark(STOPPED_BY_INTERRUPT) sbatch_pattern = re.compile(r"Submitted batch job (\d+)") + def filter_previous_results(run_set, benchmark, output_handler): prefix_base = f"{benchmark.config.output_path}{benchmark.name}." 
files = list( @@ -391,9 +387,6 @@ def execute_batch( with open(batchfile, "w") as f: f.writelines(batch_lines) - logging.info("Waiting for 10s for the newly created files to settle (NFS)") - time.sleep(10) - try: sbatch_cmd = ["sbatch", "--wait", str(batchfile)] logging.debug("Command to run: %s", shlex.join(sbatch_cmd)) @@ -510,7 +503,23 @@ def get_run_cli(benchmark, args, tempdir, resultdir): cli = [] basedir = os.path.abspath(os.path.dirname(singularity)) - runexec = ["runexec", "--full-access-dir", "/sys/fs/cgroup", "--read-only-dir", "/", "--read-only-dir", basedir, "--full-access-dir", os.getcwd()] + runexec = [ + "runexec", + "--full-access-dir", + "/sys/fs/cgroup", + "--read-only-dir", + "/", + "--overlay-dir", + os.getcwd(), + "--hidden-dir", + "/home", + "--output-directory", + "/results", + "--output", + "/results/output.log", + "--result-files", + "**/*witness*", + ] if benchmark.rlimits.cputime_hard: runexec.extend(["--timelimit", str(benchmark.rlimits.cputime_hard)]) if benchmark.rlimits.cputime: @@ -533,14 +542,12 @@ def get_run_cli(benchmark, args, tempdir, resultdir): "-B", "/sys/fs/cgroup:/sys/fs/cgroup:rw", "-B", - f"{basedir}:{basedir}:ro", + f"{basedir}:/lower:ro", "-B", - f"{os.getcwd()}:/lower:ro", + f"{resultdir}:/results:rw", "--no-home", - "-B", - f"{tempdir}:/overlay:rw", "--fusemount", - f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/overlay/upper -o workdir=/overlay/work {os.getcwd()}", + f"container:fuse-overlayfs -o lowerdir=/lower {basedir}", singularity, ] ) @@ -548,16 +555,14 @@ def get_run_cli(benchmark, args, tempdir, resultdir): [ "sh", "-c", - f"pwd; cd {os.getcwd()}; pwd; ls; " + f"cd {os.getcwd()}; " f"{shlex.join(['echo', 'Running command: ', *args])}; " - f"{shlex.join(args)} 2>&1 | tee log; ", + f"{shlex.join(args)} 2>&1 | tee /results/log; ", ] ) cli = shlex.join(cli) cli = cli.replace("'\"'\"'$CPUSET'\"'\"'", "'$CPUSET'") - cli = cli.replace("'$TMPDIR", '"$TMPDIR').replace(":/overlay:rw'", ':/overlay:rw"') - 
cli = f"mkdir -p {tempdir}/{{upper,work}}; chmod 777 {tempdir}/{{upper,work}}; {cli}; mv {tempdir}/upper/{{log,output.log,*witness*,{','.join(benchmark.result_files_patterns)}}} {resultdir}/; rm -r {tempdir}" logging.debug("Command to run: %s", cli) return cli From 2255b672e231f9ebd1cf2139b6d8d8f135173c41 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 11 Apr 2025 14:58:55 +0200 Subject: [PATCH 110/124] Added --copy-tool --- contrib/slurm-benchmark.py | 6 ++++++ contrib/slurm/arrayexecutor.py | 15 ++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/contrib/slurm-benchmark.py b/contrib/slurm-benchmark.py index cc6d88985..3bd516720 100755 --- a/contrib/slurm-benchmark.py +++ b/contrib/slurm-benchmark.py @@ -99,6 +99,12 @@ def create_argument_parser(self): action="store_true", help="Continue a previously interrupted job.", ) + slurm_args.add_argument( + "--copy-tool", + dest="copy_tool", + action="store_true", + help="Make a copy of the tool folder in the container.", + ) return parser diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 72556d140..bd9ca6631 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -510,7 +510,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): "--read-only-dir", "/", "--overlay-dir", - os.getcwd(), + "$tooldir" if benchmark.config.copy_tool else os.getcwd(), "--hidden-dir", "/home", "--output-directory", @@ -547,7 +547,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): f"{resultdir}:/results:rw", "--no-home", "--fusemount", - f"container:fuse-overlayfs -o lowerdir=/lower {basedir}", + f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/tmp -o workdir=/tmp {basedir}", singularity, ] ) @@ -555,14 +555,19 @@ def get_run_cli(benchmark, args, tempdir, resultdir): [ "sh", "-c", - f"cd {os.getcwd()}; " - f"{shlex.join(['echo', 'Running command: ', *args])}; " - f"{shlex.join(args)} 2>&1 | tee /results/log; ", + ( + 
f"tooldir=$(mktemp -d -p {os.path.dirname(os.getcwd())}); cp -r {os.getcwd()}/. $tooldir/; cd $tooldir; " + if benchmark.config.copy_tool + else f"cd {os.getcwd()}; " + f"{shlex.join(['echo', 'Running command: ', *args])}; " + f"{shlex.join(args)} 2>&1 | tee /results/log; " + ), ] ) cli = shlex.join(cli) cli = cli.replace("'\"'\"'$CPUSET'\"'\"'", "'$CPUSET'") + cli = cli.replace("'\"'\"'$tooldir'\"'\"'", "$tooldir") logging.debug("Command to run: %s", cli) return cli From ecb68ca6a8081583a09e41d683ff00dbe23d3e7f Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 11 Apr 2025 15:08:00 +0200 Subject: [PATCH 111/124] added parens --- contrib/slurm/arrayexecutor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index bd9ca6631..1965c1072 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -559,9 +559,9 @@ def get_run_cli(benchmark, args, tempdir, resultdir): f"tooldir=$(mktemp -d -p {os.path.dirname(os.getcwd())}); cp -r {os.getcwd()}/. 
$tooldir/; cd $tooldir; " if benchmark.config.copy_tool else f"cd {os.getcwd()}; " - f"{shlex.join(['echo', 'Running command: ', *args])}; " - f"{shlex.join(args)} 2>&1 | tee /results/log; " - ), + ) + + f"{shlex.join(['echo', 'Running command: ', *args])}; " + f"{shlex.join(args)} 2>&1 | tee /results/log; ", ] ) From 907c21f95ec55261760947f84115cc14250c3e1c Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 11 Apr 2025 15:55:24 +0200 Subject: [PATCH 112/124] Unsetting TMPDIR --- contrib/slurm/arrayexecutor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 1965c1072..2d7009562 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -555,7 +555,8 @@ def get_run_cli(benchmark, args, tempdir, resultdir): [ "sh", "-c", - ( + "unset TMPDIR; " + + ( f"tooldir=$(mktemp -d -p {os.path.dirname(os.getcwd())}); cp -r {os.getcwd()}/. $tooldir/; cd $tooldir; " if benchmark.config.copy_tool else f"cd {os.getcwd()}; " From 367eebef4ca9dd61c1c9f626766e5feed64e3db3 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 11 Apr 2025 23:41:43 +0200 Subject: [PATCH 113/124] Added input file copying as well --- contrib/slurm/arrayexecutor.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 2d7009562..8e3d286f2 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -531,6 +531,18 @@ def get_run_cli(benchmark, args, tempdir, resultdir): if benchmark.rlimits.memory: runexec.extend(["--memlimit", str(benchmark.rlimits.memory)]) + need_copy = [] + if benchmark.config.copy_tool: + def map_arg(arg): + if os.path.exists(arg) and os.path.dirname(arg) != os.getcwd(): + new_arg = os.path.join("/tmp", os.path.basename(arg)) + need_copy.append(arg) + return new_arg + else: + return arg + args = [arg for arg in args] + + args = [*runexec, "--", 
*args] cli.extend( @@ -547,7 +559,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): f"{resultdir}:/results:rw", "--no-home", "--fusemount", - f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/tmp -o workdir=/tmp {basedir}", + f"container:fuse-overlayfs -o lowerdir=/lower {basedir}", singularity, ] ) @@ -557,7 +569,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): "-c", "unset TMPDIR; " + ( - f"tooldir=$(mktemp -d -p {os.path.dirname(os.getcwd())}); cp -r {os.getcwd()}/. $tooldir/; cd $tooldir; " + f"tooldir=$(mktemp -d -p {os.path.dirname(os.getcwd())}); cp -vr {os.getcwd()}/. $tooldir/; cd $tooldir; cp -vr {" ".join(need_copy)} /tmp/; " if benchmark.config.copy_tool else f"cd {os.getcwd()}; " ) From a7db875e7c9eceb6ff256032ed89de8772231034 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 11 Apr 2025 23:43:26 +0200 Subject: [PATCH 114/124] Added input file copying as well (fixed) --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 8e3d286f2..3648ff24b 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -534,7 +534,7 @@ def get_run_cli(benchmark, args, tempdir, resultdir): need_copy = [] if benchmark.config.copy_tool: def map_arg(arg): - if os.path.exists(arg) and os.path.dirname(arg) != os.getcwd(): + if os.path.exists(arg) and not str(os.path.abspath(arg)).startswith(str(os.path.abspath(os.getcwd()))): new_arg = os.path.join("/tmp", os.path.basename(arg)) need_copy.append(arg) return new_arg From 79bb0067b36c0da824401190fa2889c626299c84 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 11 Apr 2025 23:44:40 +0200 Subject: [PATCH 115/124] Added input file copying as well (fixed) --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 3648ff24b..48633c559 100644 
--- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -569,7 +569,7 @@ def map_arg(arg): "-c", "unset TMPDIR; " + ( - f"tooldir=$(mktemp -d -p {os.path.dirname(os.getcwd())}); cp -vr {os.getcwd()}/. $tooldir/; cd $tooldir; cp -vr {" ".join(need_copy)} /tmp/; " + f"tooldir=$(mktemp -d -p {os.path.dirname(os.getcwd())}); cp -vr {os.getcwd()}/. $tooldir/; cp -vr {' '.join(need_copy)} /tmp/; cd $tooldir; " if benchmark.config.copy_tool else f"cd {os.getcwd()}; " ) From e4a4006cbda07285992da07ce536012f9d8ec86b Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 11 Apr 2025 23:47:58 +0200 Subject: [PATCH 116/124] put upper and workdir back --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 48633c559..b47909158 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -559,7 +559,7 @@ def map_arg(arg): f"{resultdir}:/results:rw", "--no-home", "--fusemount", - f"container:fuse-overlayfs -o lowerdir=/lower {basedir}", + f"container:fuse-overlayfs -o lowerdir=/lower -o upperdir=/tmp -o workdir=/tmp {basedir}", singularity, ] ) From ca39e011acec869b305daf21cc49dcdd13b542c5 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 11 Apr 2025 23:49:44 +0200 Subject: [PATCH 117/124] Added missing mapping --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index b47909158..04e3fed65 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -540,7 +540,7 @@ def map_arg(arg): return new_arg else: return arg - args = [arg for arg in args] + args = [map_arg(arg) for arg in args] args = [*runexec, "--", *args] From acf3454cf59567025f4e9d226c69b6ecda3bf9c9 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 11 Apr 2025 23:51:07 +0200 Subject: [PATCH 118/124] Switched cd and 
cp --- contrib/slurm/arrayexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 04e3fed65..87e7ffffb 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -569,7 +569,7 @@ def map_arg(arg): "-c", "unset TMPDIR; " + ( - f"tooldir=$(mktemp -d -p {os.path.dirname(os.getcwd())}); cp -vr {os.getcwd()}/. $tooldir/; cp -vr {' '.join(need_copy)} /tmp/; cd $tooldir; " + f"tooldir=$(mktemp -d -p {os.path.dirname(os.getcwd())}); cp -vr {os.getcwd()}/. $tooldir/; cd $tooldir; cp -vr {' '.join(need_copy)} /tmp/; " if benchmark.config.copy_tool else f"cd {os.getcwd()}; " ) From bc91d1db32d6fb3c142901243c1a84ff80c5adff Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Fri, 11 Apr 2025 23:53:01 +0200 Subject: [PATCH 119/124] Added /tmp --- contrib/slurm/arrayexecutor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 87e7ffffb..a2a150a9a 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -520,6 +520,8 @@ def get_run_cli(benchmark, args, tempdir, resultdir): "--result-files", "**/*witness*", ] + if benchmark.config.copy_tool: + runexec.extend(["--full-access-dir", "/tmp"]) if benchmark.rlimits.cputime_hard: runexec.extend(["--timelimit", str(benchmark.rlimits.cputime_hard)]) if benchmark.rlimits.cputime: From 438d7a323889aedc87158e60271067e66ad4e055 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sun, 20 Apr 2025 21:56:41 +0200 Subject: [PATCH 120/124] Added sleeping and fsync --- contrib/slurm/arrayexecutor.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index a2a150a9a..7c2f56579 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -118,6 +118,8 @@ def execute_benchmark(benchmark, output_handler): else: 
output_handler.output_after_run_set(runSet) + time.sleep(5) + output_handler.output_after_benchmark(STOPPED_BY_INTERRUPT) @@ -409,6 +411,8 @@ def execute_batch( logging.debug(f"Canceling sbatch job #{jobid}") subprocess.run(["scancel", str(jobid)]) + time.sleep(5) + missing_runs = [] for bin in bins: for i, run in bins[bin]: @@ -616,6 +620,11 @@ def get_run_result(tempdir, run, result_files_patterns): shutil.copy(tmp_log, run.log_file) + # 1. fsync the file itself + fd = os.open(run.log_file, os.O_RDONLY) + os.fsync(fd) # ensure data+metadata on dst are committed from client + os.close(fd) + if os.path.exists(tempdir): os.makedirs(run.result_files_folder, exist_ok=True) for result_files_pattern in result_files_patterns: From 7ad0d6ede9e10d59bc3235e12f3475d6c19eb593 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Sun, 20 Apr 2025 23:29:34 +0200 Subject: [PATCH 121/124] Added better log --- contrib/slurm/arrayexecutor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 7c2f56579..13a235866 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -45,8 +45,8 @@ def init(config, benchmark): singularity = benchmark.config.singularity tool_locator = tooladapter.create_tool_locator(config) - benchmark.executable = benchmark.tool.executable(tool_locator) benchmark.tool.version = version_in_container(singularity, benchmark.tool_module) + benchmark.executable = benchmark.tool.executable(tool_locator) try: benchmark.tool_version = benchmark.tool.version(benchmark.executable) except Exception as e: @@ -448,7 +448,7 @@ def execute_batch( run.set_result(result) output_handler.output_after_run(run) except Exception as e: - logging.warning("could not set result due to error: %s", e) + logging.warning("could not set result due to error, and won't retry: %s", e) if len(missing_runs) > 0 and not STOPPED_BY_INTERRUPT: logging.info( From 
d817a58b93aa6ead90b73cf2a57b520d31522347 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 21 Apr 2025 20:35:39 +0200 Subject: [PATCH 122/124] Instead of fsync, using delay --- contrib/slurm/arrayexecutor.py | 38 ++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 13a235866..031a8fba3 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -19,7 +19,6 @@ import tempfile import time import zipfile -from csv import excel from benchexec import tooladapter from benchexec.tablegenerator import parse_results_file, handle_union_tag @@ -414,9 +413,9 @@ def execute_batch( time.sleep(5) missing_runs = [] + success_runs = [] for bin in bins: for i, run in bins[bin]: - success = False try: result = get_run_result( os.path.join(tempdir, str(i)), @@ -424,7 +423,7 @@ def execute_batch( benchmark.result_files_patterns + ["*witness*"], # e.g., deagle uses mismatched naming ) - success = True + success_runs.append((run, result)) except Exception as e: logging.warning("could not set result due to error: %s", e) if counter < benchmark.config.retry or benchmark.config.retry < 0: @@ -443,12 +442,17 @@ def execute_batch( os.path.basename(file) + ".error", ), ) - if success: - try: - run.set_result(result) - output_handler.output_after_run(run) - except Exception as e: - logging.warning("could not set result due to error, and won't retry: %s", e) + + time.sleep(10) + + for run, result in success_runs: + try: + run.set_result(result) + output_handler.output_after_run(run) + except Exception as e: + logging.warning( + "could not set result due to error, and won't retry: %s", e + ) if len(missing_runs) > 0 and not STOPPED_BY_INTERRUPT: logging.info( @@ -539,15 +543,22 @@ def get_run_cli(benchmark, args, tempdir, resultdir): need_copy = [] if benchmark.config.copy_tool: + def map_arg(arg): - if os.path.exists(arg) and not 
str(os.path.abspath(arg)).startswith(str(os.path.abspath(os.getcwd()))): + if ( + os.path.exists(arg) + and os.path.isfile(arg) + and not str(os.path.abspath(arg)).startswith( + str(os.path.abspath(os.getcwd())) + ) + ): new_arg = os.path.join("/tmp", os.path.basename(arg)) need_copy.append(arg) return new_arg else: return arg - args = [map_arg(arg) for arg in args] + args = [map_arg(arg) for arg in args] args = [*runexec, "--", *args] @@ -620,11 +631,6 @@ def get_run_result(tempdir, run, result_files_patterns): shutil.copy(tmp_log, run.log_file) - # 1. fsync the file itself - fd = os.open(run.log_file, os.O_RDONLY) - os.fsync(fd) # ensure data+metadata on dst are committed from client - os.close(fd) - if os.path.exists(tempdir): os.makedirs(run.result_files_folder, exist_ok=True) for result_files_pattern in result_files_patterns: From d4d5ed058fe2827a4e4e9e375f7e47a3d0ec974f Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 20 Oct 2025 22:08:26 +0200 Subject: [PATCH 123/124] removed dependency on runexec --- contrib/slurm-benchmark.py | 15 +++ contrib/slurm/arrayexecutor.py | 192 +++++++++------------------------ contrib/slurm/utils.py | 34 +----- 3 files changed, 67 insertions(+), 174 deletions(-) diff --git a/contrib/slurm-benchmark.py b/contrib/slurm-benchmark.py index 3bd516720..be88e4fcc 100755 --- a/contrib/slurm-benchmark.py +++ b/contrib/slurm-benchmark.py @@ -14,6 +14,8 @@ import os import sys +from esphome.cpp_types import double + sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) import benchexec.benchexec @@ -93,6 +95,13 @@ def create_argument_parser(self): default="4", help="Run this many tasks at once in one job.", ) + slurm_args.add_argument( + "--overtime-factor", + dest="overtime_factor", + type=float, + default="1.1", + help="Factor which by to scale timelimits to overapproximate CPU time limit with walltime limit.", + ) slurm_args.add_argument( "--continue-interrupted", dest="continue_interrupted", @@ -105,6 +114,12 @@ def 
create_argument_parser(self): action="store_true", help="Make a copy of the tool folder in the container.", ) + slurm_args.add_argument( + "--generate-only", + dest="generate_only", + action="store_true", + help="Only generate the SLURM array description, don't run it.", + ) return parser diff --git a/contrib/slurm/arrayexecutor.py b/contrib/slurm/arrayexecutor.py index 031a8fba3..736dba2ab 100644 --- a/contrib/slurm/arrayexecutor.py +++ b/contrib/slurm/arrayexecutor.py @@ -21,13 +21,11 @@ import zipfile from benchexec import tooladapter -from benchexec.tablegenerator import parse_results_file, handle_union_tag +from benchexec.tablegenerator import parse_results_file from benchexec.util import ProcessExitCode, relative_path from contrib.slurm.utils import ( version_in_container, get_system_info_srun, - get_cpu_cmd, - lock_cpu_cmds, ) sys.dont_write_bytecode = True # prevent creation of .pyc files @@ -302,16 +300,10 @@ def execute_batch( global STOPPED_BY_INTERRUPT number_of_bins = int(len(runs) / benchmark.config.aggregation_factor) + 1 - use_concurrency = benchmark.config.concurrency_factor != 1 - if use_concurrency: - get_cpus = get_cpu_cmd( - benchmark.config.concurrency_factor, benchmark.rlimits.cpu_cores - ) - - with tempfile.TemporaryDirectory(dir=benchmark.config.scratchdir) as tempdir: + with tempfile.TemporaryDirectory(dir=benchmark.config.scratchdir, delete=not benchmark.config.generate_only) as tempdir: batch_lines = ["#!/bin/bash"] - for setting in get_resource_limits(benchmark, tempdir): + for setting in get_resource_limits(benchmark, benchmark.config.concurrency_factor, math.ceil(benchmark.config.aggregation_factor * 1.0 / benchmark.config.concurrency_factor)): batch_lines.extend(["\n#SBATCH " + str(setting)]) batch_lines.extend( @@ -326,61 +318,30 @@ def execute_batch( bins[i % number_of_bins] = [] bins[i % number_of_bins].append((i, run)) - if use_concurrency: - batch_lines.extend(["\n\n" + get_cpus]) - batch_lines.extend(["\n\ncase 
$SLURM_ARRAY_TASK_ID in"]) - for bin in bins: - lock_cpus, unlock_cpus = lock_cpu_cmds( - benchmark.config.concurrency_factor, tempdir, bin - ) - batch_lines.extend(["\n" + str(bin) + ") "]) - taskfile_name = f"bin{str(bin)}.tasks" - taskfile = os.path.join(tempdir, taskfile_name) - with open(taskfile, "w") as f: - task_lines = [] - for i, run in bins[bin]: - task_lines.extend( - [ - lock_cpus - + " && " - + str( - get_run_cli( - benchmark, - run.cmdline(), - os.path.join("$TMPDIR", str(i)), - os.path.join(tempdir, str(i)), - ) - ) - + "; " - + unlock_cpus - + "\n" - ] - ) - f.writelines(task_lines) - batch_lines.extend( - f'\n while read -r x; do /bin/sh -c "$x" & done < {taskfile}' - ) - batch_lines.extend("\n wait") - batch_lines.extend(["\n;;"]) - else: - batch_lines.extend(["\n\ncase $SLURM_ARRAY_TASK_ID in"]) - for bin in bins: - batch_lines.extend(["\n" + str(bin) + ") "]) + batch_lines.extend(["\n\ncase $SLURM_ARRAY_TASK_ID in"]) + for bin in bins: + batch_lines.extend(["\n" + str(bin) + ") "]) + taskfile_name = f"bin{str(bin)}.tasks" + taskfile = os.path.join(tempdir, taskfile_name) + with open(taskfile, "w") as f: + task_lines = [] for i, run in bins[bin]: - batch_lines.extend( - [ - "\n " - + str( + task_lines.extend( + [str( get_run_cli( benchmark, run.cmdline(), - os.path.join("$TMPDIR", str(i)), os.path.join(tempdir, str(i)), ) - ) + )+ "\n" ] ) - batch_lines.extend(["\n;;"]) + f.writelines(task_lines) + batch_lines.extend( + f'\n while read -r x; do /bin/sh -c "$x" & done < {taskfile}' + ) + batch_lines.extend("\n wait") + batch_lines.extend(["\n;;"]) batch_lines.extend(["\nesac"]) @@ -388,6 +349,9 @@ def execute_batch( with open(batchfile, "w") as f: f.writelines(batch_lines) + if benchmark.config.generate_only: + return + try: sbatch_cmd = ["sbatch", "--wait", str(batchfile)] logging.debug("Command to run: %s", shlex.join(sbatch_cmd)) @@ -396,7 +360,6 @@ def execute_batch( stdout=subprocess.PIPE, stderr=subprocess.STDOUT, ) - except 
KeyboardInterrupt: STOPPED_BY_INTERRUPT = True @@ -466,26 +429,21 @@ def stop(): STOPPED_BY_INTERRUPT = True -def get_resource_limits(benchmark, tempdir): +def get_resource_limits(benchmark, parallel_factor=1, sequential_factor=1): timelimit = int( max( - int(benchmark.rlimits.cputime if benchmark.rlimits.cputime else -1), - int(benchmark.rlimits.walltime if benchmark.rlimits.walltime else -1), + int(benchmark.rlimits.cputime if benchmark.rlimits.cputime else 0), + int(benchmark.rlimits.walltime if benchmark.rlimits.walltime else 0), int( - benchmark.rlimits.cputime_hard if benchmark.rlimits.cputime_hard else -1 + benchmark.rlimits.cputime_hard if benchmark.rlimits.cputime_hard else 0 ), - ) # safe overapprox - * math.ceil( - benchmark.config.aggregation_factor / benchmark.config.concurrency_factor - ) - * 1.5 # to let all processes finish, we add 50% + ) * benchmark.config.overtime_factor * sequential_factor ) assert timelimit > 0, "Either cputime, cputime_hard, or walltime should be given." 
- cpus = benchmark.rlimits.cpu_cores * benchmark.config.concurrency_factor + cpus = benchmark.rlimits.cpu_cores * parallel_factor memory = ( - benchmark.rlimits.memory * benchmark.config.concurrency_factor * 1.5 - ) # so that runexec catches the OOM, not SLURM (other stuff runs in the container as well) - os.makedirs(os.path.join(tempdir, "logs"), exist_ok=True) + benchmark.rlimits.memory * parallel_factor + ) srun_timelimit_h = int(timelimit / 3600) srun_timelimit_m = int((timelimit % 3600) / 60) @@ -495,7 +453,6 @@ def get_resource_limits(benchmark, tempdir): ) ret = [ - f"--output={tempdir}/logs/%A_%a.out", "--time=" + str(srun_timelimit), "--cpus-per-task=" + str(cpus), "--mem=" + str(int(memory / 1000000)) + "M", @@ -506,70 +463,19 @@ def get_resource_limits(benchmark, tempdir): return ret -def get_run_cli(benchmark, args, tempdir, resultdir): +def get_run_cli(benchmark, args, resultdir): os.makedirs(resultdir) cli = [] basedir = os.path.abspath(os.path.dirname(singularity)) - runexec = [ - "runexec", - "--full-access-dir", - "/sys/fs/cgroup", - "--read-only-dir", - "/", - "--overlay-dir", - "$tooldir" if benchmark.config.copy_tool else os.getcwd(), - "--hidden-dir", - "/home", - "--output-directory", - "/results", - "--output", - "/results/output.log", - "--result-files", - "**/*witness*", - ] - if benchmark.config.copy_tool: - runexec.extend(["--full-access-dir", "/tmp"]) - if benchmark.rlimits.cputime_hard: - runexec.extend(["--timelimit", str(benchmark.rlimits.cputime_hard)]) - if benchmark.rlimits.cputime: - runexec.extend(["--softtimelimit", str(benchmark.rlimits.cputime)]) - if benchmark.rlimits.walltime: - runexec.extend(["--walltimelimit", str(benchmark.rlimits.walltime)]) - if benchmark.config.concurrency_factor != 1: - runexec.extend(["--cores", "$CPUSET"]) - if benchmark.rlimits.memory: - runexec.extend(["--memlimit", str(benchmark.rlimits.memory)]) - - need_copy = [] - if benchmark.config.copy_tool: - - def map_arg(arg): - if ( - 
os.path.exists(arg) - and os.path.isfile(arg) - and not str(os.path.abspath(arg)).startswith( - str(os.path.abspath(os.getcwd())) - ) - ): - new_arg = os.path.join("/tmp", os.path.basename(arg)) - need_copy.append(arg) - return new_arg - else: - return arg - - args = [map_arg(arg) for arg in args] + base_cmd = ["srun", "--exclusive", *get_resource_limits(benchmark, 1, 1)] - args = [*runexec, "--", *args] - - cli.extend( + base_cmd.extend( [ "singularity", "exec", - "--fakeroot", - "--contain", "-B", - "/sys/fs/cgroup:/sys/fs/cgroup:rw", + "/sys/fs/cgroup:/sys/fs/cgroup:ro", "-B", f"{basedir}:/lower:ro", "-B", @@ -580,24 +486,28 @@ def map_arg(arg): singularity, ] ) - cli.extend( + base_cmd.extend( [ - "sh", + "bash", # bash is needed for ${PIPESTATUS[0]} "-c", - "unset TMPDIR; " - + ( - f"tooldir=$(mktemp -d -p {os.path.dirname(os.getcwd())}); cp -vr {os.getcwd()}/. $tooldir/; cd $tooldir; cp -vr {' '.join(need_copy)} /tmp/; " - if benchmark.config.copy_tool - else f"cd {os.getcwd()}; " - ) - + f"{shlex.join(['echo', 'Running command: ', *args])}; " - f"{shlex.join(args)} 2>&1 | tee /results/log; ", + f"cd {os.getcwd()}; " + "start=$(date +%s.%N); " + "CG_CPU=$(awk -F: \"$2 ~ /cpu/ {print $3;exit}\" /proc/self/cgroup); " + "CG_MEM=$(awk -F: \"$2 ~ /memory/ {print $3;exit}\" /proc/self/cgroup); " + "BASE_CPU=\"/sys/fs/cgroup/cpu$CG_CPU\"; " + "BASE_MEM=\"/sys/fs/cgroup/memory$CG_MEM\"; " + "before_cpu=$(cat $BASE_CPU/cpuacct.usage 2>/dev/null); " + f"{shlex.join(['echo', 'Running command: ', *args])}; " + f"{shlex.join(args)} 2>&1 | tee /results/log; " + "rv=${PIPESTATUS[0]}; " + "end=$(date +%s.%N); " + "after_cpu=$(cat $BASE_CPU/cpuacct.usage 2>/dev/null); " + "mem=$(cat $BASE_MEM/memory.max_usage_in_bytes 2>/dev/null || cat $BASE_MEM/memory.usage_in_bytes 2>/dev/null); " + "awk -v start=\"$start\" -v end=\"$end\" -v before_cpu=\"$before_cpu\" -v after_cpu=\"$after_cpu\" -v mem=\"$mem\" -v rv=\"$rv\" \"BEGIN { walltime=end-start;cputime=(after_cpu-before_cpu)/1e9; 
printf \\\"walltime=%.3fs\\ncputime=%.3fs\\nmemory=%dB\\nreturnvalue=%d\\n\\\", walltime, cputime, mem, rv }\"' >/results/output.log", ] ) - cli = shlex.join(cli) - cli = cli.replace("'\"'\"'$CPUSET'\"'\"'", "'$CPUSET'") - cli = cli.replace("'\"'\"'$tooldir'\"'\"'", "$tooldir") + cli = shlex.join(base_cmd) logging.debug("Command to run: %s", cli) return cli diff --git a/contrib/slurm/utils.py b/contrib/slurm/utils.py index 8c54ec13f..ffd897506 100644 --- a/contrib/slurm/utils.py +++ b/contrib/slurm/utils.py @@ -93,36 +93,4 @@ def get_system_info_srun(singularity): except Exception as e: logging.warning("could not determine system info due to error: %s", e) - return None - - -def get_cpu_cmd(concurrency_factor, cores): - get_cpus = ( - "cpus=($(scontrol show job -d \"$SLURM_JOB_ID\" | grep -o 'CPU_IDs=[^ ]*' | " - "awk -F= ' { print $2 } ' | head -n1 | " - "awk -F, ' { for (i = 1; i <= NF; i++ ) { if ($i ~ /-/) " - '{ split($i, range, "-"); for (j = range[1]; j <= range[2]; j++ ) { print j } } ' - "else { print $i } } }'))" - '\necho "${cpus[@]}"' - ) - for i in range(concurrency_factor): - get_cpus = ( - get_cpus - + f'\nexport cpuset{i}=$(IFS=,; echo "${{cpus[*]:{i * cores}:{cores}}}")' - ) - return get_cpus - - -def lock_cpu_cmds(concurrency_factor, tempdir, bin): - lock_cpus = 'CPUSET=""; while ! 
{' - for i in range(concurrency_factor): - lock_cpus = ( - lock_cpus - + f' {{ mkdir {tempdir}/cpuset_{bin}_{i} 2>/dev/null && cpuset={i} && CPUSET="$cpuset{i}"; }}' - ) - if i == concurrency_factor - 1: - lock_cpus = lock_cpus + "; }; do sleep 1; done" - else: - lock_cpus = lock_cpus + " ||" - unlock_cpus = f"rm -r {tempdir}/cpuset_{bin}_$cpuset" - return lock_cpus, unlock_cpus + return None \ No newline at end of file From fdabfb5c054fca15ad5ae1b6b7987a74291f2840 Mon Sep 17 00:00:00 2001 From: Levente Bajczi Date: Mon, 20 Oct 2025 22:11:09 +0200 Subject: [PATCH 124/124] removed nonsense import --- contrib/slurm-benchmark.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/contrib/slurm-benchmark.py b/contrib/slurm-benchmark.py index be88e4fcc..ae724665a 100755 --- a/contrib/slurm-benchmark.py +++ b/contrib/slurm-benchmark.py @@ -14,8 +14,6 @@ import os import sys -from esphome.cpp_types import double - sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) import benchexec.benchexec