cms-sw · smuzaffar · Dec 15, 2025 · Dec 12, 2025
diff --git a/comparisons/compare-maxmem-summary.py b/comparisons/compare-maxmem-summary.py
@@ -8,9 +8,9 @@
 import json
 import glob
 import re
+import sys
 
-MAXMEM_WARN_THRESHOLD = 1.0
-MAXMEM_ERROR_THRESHOLD = 10.0
+import maxmem_threshold
 
 
 def KILL(message):
@@ -50,9 +50,9 @@ def compare_maxmem_summary(**kwargs):
                 nalloc_pr = max_memory_pr_dict[step].get("# allocations calls")
                 ndalloc_pr = max_memory_pr_dict[step].get("# deallocations calls")
                 nlalloc_pr = nalloc_pr - ndalloc_pr if (nalloc_pr and ndalloc_pr) else 0
-                max_memory_pr = max_mem_pr / 1000000 if max_mem_pr else 0.0
-                req_memory_pr = req_mem_pr / 1000000 if req_mem_pr else 0.0
-                leak_memory_pr = leak_mem_pr / 1000000 if leak_mem_pr else 0.0
+                max_memory_pr = max_mem_pr / (1024 * 1024) if max_mem_pr else 0.0
+                req_memory_pr = req_mem_pr / (1024 * 1024) if req_mem_pr else 0.0
+                leak_memory_pr = leak_mem_pr / (1024 * 1024) if leak_mem_pr else 0.0
                 nallocated_pr = nalloc_pr if nalloc_pr else 0
 
                 max_mem_base = max_memory_base_dict[step].get("max memory used")
@@ -61,9 +61,9 @@ def compare_maxmem_summary(**kwargs):
                 nalloc_base = max_memory_base_dict[step].get("# allocations calls")
                 ndalloc_base = max_memory_base_dict[step].get("# deallocations calls")
                 nlalloc_base = nalloc_base - ndalloc_base if (nalloc_base and ndalloc_base) else 0
-                max_memory_base = max_mem_base / 1000000 if max_mem_base else 0.0
-                req_memory_base = req_mem_base / 1000000 if req_mem_base else 0.0
-                leak_memory_base = leak_mem_base / 1000000 if leak_mem_base else 0.0
+                max_memory_base = max_mem_base / (1024 * 1024) if max_mem_base else 0.0
+                req_memory_base = req_mem_base / (1024 * 1024) if req_mem_base else 0.0
+                leak_memory_base = leak_mem_base / (1024 * 1024) if leak_mem_base else 0.0
                 nallocated_base = nalloc_base if nalloc_base else 0
 
                 max_mem_pdiff = max_memory_pdiff_dict[step].get("max memory used")
@@ -199,36 +199,40 @@ def stepfn(step):
         summaryLine += [
             '<tr><td style="border-bottom-style:hidden;border-top-style:hidden;">&lt;PR - baseline (MB)&gt;</td>'
         ]
-        for step in sorted(workflows[workflow].keys(), key=stepfn):
-            summaryLine += [
-                '<td style="border-bottom-style:hidden;border-top-style:hidden;">',
-                "{:,.2f}".format(workflows[workflow][step]["max memory adiff"]),
-                "</td>",
-            ]
-        summaryLine += [
-            "</tr>",
-        ]
-        summaryLine += [
-            '<tr><td style="border-top-style:hidden">&lt;100 * (PR - baseline)/baseline &gt;</td>'
-        ]
         for step in sorted(workflows[workflow].keys(), key=stepfn):
             threshold = workflows[workflow][step]["threshold"]
             if not threshold:
-                threshold = 1.0
+                threshold = maxmem_threshold.WARN_THRESHOLD
             error_threshold = workflows[workflow][step].get("error_threshold")
             if not error_threshold:
-                error_threshold = 10.0
-            cellString = '<td style="border-top-style:hidden" '
+                error_threshold = maxmem_threshold.ERROR_THRESHOLD
+            cellString = '<td style="border-bottom-style:hidden;border-top-style:hidden;" '
             color = ""
-            if abs(workflows[workflow][step]["max memory pdiff"]) > MAXMEM_WARN_THRESHOLD:
+            if workflows[workflow][step]["max memory adiff"] > threshold:
                 color = 'bgcolor="orange"'
-            if abs(workflows[workflow][step]["max memory pdiff"]) > MAXMEM_ERROR_THRESHOLD:
+            if workflows[workflow][step]["max memory adiff"] > error_threshold:
                 color = 'bgcolor="red"'
+            if workflows[workflow][step]["max memory adiff"] < -1 * threshold:
+                color = 'bgcolor="yellow"'
+            if workflows[workflow][step]["max memory adiff"] < -1 * error_threshold:
+                color = 'bgcolor="green"'
             cellString += color
             cellString += ">"
             summaryLine += [
                 cellString,
-                "{:,.3f}".format(workflows[workflow][step]["max memory pdiff"]),
+                "{:,.3f}".format(workflows[workflow][step]["max memory adiff"]),
+                "</td>",
+            ]
+        summaryLine += [
+            "</tr>",
+        ]
+        summaryLine += [
+            '<tr><td style="border-top-style:hidden">&lt;100 * (PR - baseline)/baseline &gt;</td>'
+        ]
+        for step in sorted(workflows[workflow].keys(), key=stepfn):
+            summaryLine += [
+                '<td style="border-top-style:hidden;">',
+                "{:,.2f}".format(workflows[workflow][step]["max memory pdiff"]),
                 "%</td>",
             ]
         summaryLine += [
@@ -391,9 +395,6 @@ def stepfn(step):
                 "{:,}".format(workflows[workflow][step]["nallocated base"]),
                 "</td>",
             ]
-        summaryLine += [
-            "</tr>",
-        ]
         summaryLine += [
             '<tr><td style="border-bottom-style:hidden;border-top-style:hidden;">&lt;pull request &gt;</td>'
         ]
@@ -435,10 +436,17 @@ def stepfn(step):
     if summaryFormat == "html":
         summaryLines += [
             '</table><table><tr><td bgcolor="orange">'
-            + "maximum memory used warn threshold %0.3f" % MAXMEM_WARN_THRESHOLD
-            + '%</td></tr><tr><td bgcolor="red">'
-            + "maximum memory used error threshold %0.3f" % MAXMEM_ERROR_THRESHOLD
-            + "%</td></tr>",
+            + "default maximum memory used warn threshold %0.0f" % maxmem_threshold.WARN_THRESHOLD
+            + ' MB</td></tr><tr><td bgcolor="red">'
+            + "default maximum memory used error threshold %0.0f"
+            % maxmem_threshold.ERROR_THRESHOLD
+            + ' MB</td></tr><tr><td bgcolor="yellow">'
+            + "default maximum memory used warn threshold -1 * %0.0f"
+            % maxmem_threshold.WARN_THRESHOLD
+            + ' MB</td></tr><tr><td bgcolor="green">'
+            + "default maximum memory used error threshold -1 * %0.0f"
+            % maxmem_threshold.ERROR_THRESHOLD
+            + " MB</td></tr></table><table>",
         ]
         summaryLines += ["</table></body></html>"]
 

diff --git a/comparisons/compare-maxmem.py b/comparisons/compare-maxmem.py
@@ -1,8 +1,11 @@
 #!/usr/bin/env python3
+import os
 import sys
 import json
 from collections import defaultdict
 
+import maxmem_threshold
+
 
 def create_memory_report_dict(filename):
     memory_reports = dict(dict())
@@ -25,45 +28,54 @@ def create_memory_report_dict(filename):
 mem_prof_base_dicts = create_memory_report_dict(sys.argv[2])
 
 mem_prof_pdiffs_dicts = dict(dict())
+mem_prof_diffs_dicts = dict(dict())
 
 for k in mem_prof_pr_dicts.keys():
     mem_prof_pdiffs_dict = dict()
+    mem_prof_diffs_dict = dict()
     mem_prof_pr_subdict = mem_prof_pr_dicts[k]
     for j, v in mem_prof_pr_subdict.items():
         if j == "step":
             mem_prof_pdiffs_dict[j] = v
+            mem_prof_diffs_dict[j] = v
         else:
             mem_prof_pdiffs_dict[j] = (
                 100
                 * (mem_prof_pr_dicts[k][j] - mem_prof_base_dicts[k][j])
                 / mem_prof_base_dicts[k][j]
             )
+            mem_prof_diffs_dict[j] = mem_prof_pr_dicts[k][j] - mem_prof_base_dicts[k][j]
     mem_prof_pdiffs_dicts[k] = mem_prof_pdiffs_dict
+    mem_prof_diffs_dicts[k] = mem_prof_diffs_dict
 
 mem_prof = {}
 
 mem_prof["max memory pr"] = mem_prof_pr_dicts
 mem_prof["max memory base"] = mem_prof_base_dicts
 mem_prof["max memory pdiffs"] = mem_prof_pdiffs_dicts
-WARN_THRESHOLD = 1.0
-ERROR_THRESHOLD = 10.0
-mem_prof["threshold"] = WARN_THRESHOLD
-mem_prof["error_threshold"] = ERROR_THRESHOLD
+mem_prof["max memory diffs"] = mem_prof_diffs_dicts
+mem_prof["threshold"] = maxmem_threshold.WARN_THRESHOLD
+mem_prof["error_threshold"] = maxmem_threshold.ERROR_THRESHOLD
 mem_prof["workflow"] = sys.argv[1].split("/")[-2]
 sys.stdout.write(json.dumps(mem_prof))
 sys.stdout.write("\n")
 
 errs = 0
-for k in sorted(mem_prof_pdiffs_dicts.keys()):
-    mmu = mem_prof_pdiffs_dicts[k].get("max memory used")
+for k in sorted(mem_prof_diffs_dicts.keys()):
+    mmu = mem_prof_diffs_dicts[k].get("max memory used")
     if mmu:
-        if abs(mmu) > ERROR_THRESHOLD:
+        mmus = mmu / (1024 * 1024)
+        if mmus > maxmem_threshold.WARN_THRESHOLD or mmus < -1 * maxmem_threshold.WARN_THRESHOLD:
+            sys.stderr.write(
+                "Warning: Workflow %s %s max memory diff %.1f exceeds +/- %.1f MiB\n"
+                % (mem_prof["workflow"], k, mmus, maxmem_threshold.WARN_THRESHOLD)
+            )
+        if mmus > maxmem_threshold.ERROR_THRESHOLD or mmus < -1 * maxmem_threshold.ERROR_THRESHOLD:
             errs = errs + 1
             sys.stderr.write(
-                "Workflow %s %s max memory used percentage diff %2f%% exceeds error threshold %2f%%"
-                % (mem_prof["workflow"], k, abs(mmu), ERROR_THRESHOLD)
+                "Error: Workflow %s %s max memory diff %.1f exceeds +/- %.1f MiB\n"
+                % (mem_prof["workflow"], k, mmus, maxmem_threshold.ERROR_THRESHOLD)
             )
-            sys.stderr.write("\n")
 
 if errs > 0:
     exit(10)
diff --git a/comparisons/maxmem_threshold.py b/comparisons/maxmem_threshold.py
@@ -0,0 +1,2 @@
+WARN_THRESHOLD = 10.0  # warn when |PR - baseline| max memory exceeds this many MiB (byte diff / 1024 / 1024)
+ERROR_THRESHOLD = 80.0  # error when |PR - baseline| max memory exceeds this many MiB (byte diff / 1024 / 1024)
diff --git a/pr_testing/_helper_functions.sh b/pr_testing/_helper_functions.sh
@@ -189,6 +189,14 @@ function get_result_file_name () {
       echo "21-${TEST_FLAVOR}-comparison-report.res"
       return 0
       ;;
+    maxmem)
+      if [ "$TEST_FLAVOR" != "" ]; then
+        echo "23-${TEST_FLAVOR}-maxmem-report.res"
+      else
+        echo "23-maxmem-report.res"
+      fi
+      return 0
+      ;;
   esac
   return 1
 }

diff --git a/pr_testing/run-pr-comparisons b/pr_testing/run-pr-comparisons
@@ -294,9 +294,9 @@ set +x
 # maxmem-profile comparison
 # --------------------------------------------------------------------------
 echo "Started maxmem-profile comparison at `date`"
-OUTPUT_DIR=$WORKSPACE/results/maxmem-comparison
+MAXMEM_COMPARISON_OUTPUT_DIR=$WORKSPACE/results/maxmem-comparison
 #create the output dir
-mkdir -p $OUTPUT_DIR
+mkdir -p $MAXMEM_COMPARISON_OUTPUT_DIR
 for maxmem in $(find $WORKSPACE/data/PR-${PR_NUM} -follow -name 'maxmem_profile_*.txt' -type f | sed "s|$WORKSPACE/data/PR-${PR_NUM}/||") ; do
   echo "Maxmem Profile> Working on ${maxmem}"
   if [ ! -e "$WORKSPACE/data/$COMPARISON_RELEASE/${maxmem}" ] ; then
@@ -305,13 +305,18 @@ for maxmem in $(find $WORKSPACE/data/PR-${PR_NUM} -follow -name 'maxmem_profile_
   fi
   WF_NUMBER=$(echo ${maxmem} | sed 's|_.*||')
   $CMS_BOT_DIR/comparisons/compare-maxmem.py $WORKSPACE/data/PR-${PR_NUM}/${maxmem} \
-    $WORKSPACE/data/$COMPARISON_RELEASE/${maxmem} > $OUTPUT_DIR/${WF_NUMBER}.json 2> $OUTPUT_DIR/${WF_NUMBER}.err || true
+    $WORKSPACE/data/$COMPARISON_RELEASE/${maxmem} > $MAXMEM_COMPARISON_OUTPUT_DIR/${WF_NUMBER}.json 2>> $MAXMEM_COMPARISON_OUTPUT_DIR/${WF_NUMBER}.err || true
 done
-$CMS_BOT_DIR/comparisons/compare-maxmem-summary.py -i $OUTPUT_DIR -f '*.json' -F html -o $OUTPUT_DIR/index.html -u $JENKINS_ARTIFACTS_URL/$PR_BASELINE_DIR || true
-if grep "exceeds threshold" $OUTPUT_DIR/*.err 2>/dev/null; then
-  echo "MAXMEM_COMPARISON${TEST_FLAVOR_STR};OK,max memory used ${UC_TEST_FLAVOR} comparison,See results,/SDT/jenkins-artifacts/$COMP_UPLOAD_DIR/maxmem-comparison" >> ${RESULTS_FILE}
+mkdir -p $WORKSPACE/testsResults
+$CMS_BOT_DIR/comparisons/compare-maxmem-summary.py -i $MAXMEM_COMPARISON_OUTPUT_DIR -f '*.json' -F html -o $MAXMEM_COMPARISON_OUTPUT_DIR/maxmem_summary.html -u $JENKINS_ARTIFACTS_URL/$PR_BASELINE_DIR >$MAXMEM_COMPARISON_OUTPUT_DIR/maxmem_summary.log 2>&1 || true
+if grep -h "Error:" $MAXMEM_COMPARISON_OUTPUT_DIR/*.err >$MAXMEM_COMPARISON_OUTPUT_DIR/maxmem_errors.log 2>/dev/null; then # -h drops grep's "<file>:" prefix; a separate file keeps maxmem_summary.log (summary script output) intact
+  echo "MAXMEM_COMPARISON${TEST_FLAVOR_STR};OK,${UC_TEST_FLAVOR} max memory used comparison failed,See failed results,/SDT/jenkins-artifacts/$COMP_UPLOAD_DIR/maxmem-comparison/maxmem_summary.html" >> ${RESULTS_FILE}
+  REPORT_FILE=$WORKSPACE/testsResults/$(get_result_file_name "maxmem" "${TEST_FLAVOR}" "")
+  touch $REPORT_FILE
+  ${CMS_BOT_DIR}/report-pull-request-results PARSE_MAXMEM_FAIL --no-post --unit-tests-file $MAXMEM_COMPARISON_OUTPUT_DIR/maxmem_errors.log --report-file ${REPORT_FILE} --report-url ${PR_RESULT_URL} || true
+  ${CMS_BOT_DIR}/report-pull-request-results PARSE_MAXMEM_FAIL --unit-tests-file $MAXMEM_COMPARISON_OUTPUT_DIR/maxmem_errors.log --report-file ${REPORT_FILE} --report-url ${PR_RESULT_URL} || true
 else
-  echo "MAXMEM_COMPARISON${TEST_FLAVOR_STR};OK,max memory used ${UC_TEST_FLAVOR} comparison,See results,/SDT/jenkins-artifacts/$COMP_UPLOAD_DIR/maxmem-comparison" >> ${RESULTS_FILE}
+  echo "MAXMEM_COMPARISON${TEST_FLAVOR_STR};OK,max memory used ${UC_TEST_FLAVOR} comparison,See results,/SDT/jenkins-artifacts/$COMP_UPLOAD_DIR/maxmem-comparison/maxmem_summary.html" >> ${RESULTS_FILE}
 fi
 
 # --------------------------------------------------------------------------

diff --git a/report-pull-request-results.py b/report-pull-request-results.py
@@ -26,7 +26,7 @@
     usage="usage: %prog ACTION [options] \n ACTION = PARSE_UNIT_TESTS_FAIL | PARSE_BUILD_FAIL "
     "| PARSE_MATRIX_FAIL | COMPARISON_READY | GET_BASE_MESSAGE | PARSE_EXTERNAL_BUILD_FAIL "
     "| PARSE_ADDON_FAIL | PARSE_CRAB_FAIL | PARSE_CLANG_BUILD_FAIL | MATERIAL_BUDGET "
-    "| PYTHON3_FAIL | PARSE_GPU_UNIT_TESTS_FAIL | MERGE_COMMITS"
+    "| PYTHON3_FAIL | PARSE_GPU_UNIT_TESTS_FAIL | MERGE_COMMITS | PARSE_MAXMEM_FAIL "
 )
 
 parser.add_option(
@@ -294,6 +294,26 @@ def read_material_budget_log_file(unit_tests_file):
     send_message_pr(message)
 
 
+# Reads the maxmem comparison error log and hands every line mentioning "exceeds" to
+# send_message_pr. Only the text after "Error:" is kept: splitting on the first ":"
+# returns just "Error" when the line carries a grep "<file>:" prefix.
+def read_maxmem_comparison_file(unit_tests_file):
+    errors_found = ""
+    err_cnt = 0
+    for line in openlog(unit_tests_file):
+        if "exceeds" in line.lower():
+            err_cnt += 1
+            errors_found += " - " + line.split("Error:", 1)[-1].strip() + "\n"
+
+    if err_cnt > 0:
+        message = (
+            "\n## Max Memory Comparisons exceeding threshold\n\n"
+            "@cms-sw/core-l2 , I found %s workflow step(s) with memory usage exceeding the error threshold:\n\n%s"
+            % (err_cnt, errors_found)
+        )
+        send_message_pr(message)
+
+
 def get_recent_merges_message():
     message = ""
     if options.recent_merges_file:
@@ -643,6 +663,8 @@ def complain_missing_param(param_name):
     read_python3_file(options.unit_tests_file)
 elif ACTION == "MATERIAL_BUDGET":
     read_material_budget_log_file(options.unit_tests_file)
+elif ACTION == "PARSE_MAXMEM_FAIL":
+    read_maxmem_comparison_file(options.unit_tests_file)
 elif ACTION == "MERGE_COMMITS":
     add_to_report(get_recent_merges_message())
 elif ACTION == "PARSE_CUDA_UNIT_TESTS_FAIL":