diff --git a/DQMServices/Components/scripts/dqm-plot b/DQMServices/Components/scripts/dqm-plot new file mode 100755 index 0000000000000..61631ba41551f --- /dev/null +++ b/DQMServices/Components/scripts/dqm-plot @@ -0,0 +1,1332 @@ +#!/usr/bin/env python3 + +""" +Simple DQM comparison plotter using mplhep with CMS styling. + +Usage: dqm-plot -s "DQMData/Run 1/HLT/Run summary/Muon/*" [options] $DQM_FILES +See all available options with `dqm-plot -h`. +""" + +import ROOT +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec +import mplhep as hep +import argparse +import os +import re +import multiprocessing +import warnings +import urllib.request +import urllib.error +from pathlib import Path +from matplotlib import colors as _mcolors + +# Suppress numpy warnings about subnormal float values +warnings.filterwarnings("ignore", category=UserWarning, module="numpy") + +# Set CMS style globally +plt.style.use(hep.style.CMS) + +# Use matplotlib's mathtext +plt.rcParams["mathtext.default"] = "regular" + + +class DQMPlotter: + def __init__(self, figsize=(12, 9), ratio_height=0.3, processors=None): + """ + Initialize the plotter. + + Args: + figsize: Figure size (width, height) in inches + ratio_height: Fraction of figure height for ratio plot + processors: Number of processors to use for multiprocessing (None for auto-detect) + """ + self.figsize = figsize + self.ratio_height = ratio_height + # Color palette from https://cms-analysis.docs.cern.ch/guidelines/plotting/colors/ + self.colors = [ + "#3f8fda", + "#ffa90e", + "#bd1f01", + "#94a4a2", + "#832db6", + "#a96b59", + "#e76300", + "#b9ac70", + "#717581", + "#92dadd", + ] + self.markers = [ + "o", # circle + "s", # square + "^", # triangle up + "D", # diamond + "v", # triangle down + "p", # pentagon + "*", # star + "h", # hexagon + "<", # triangle left + ">", # triangle right + ] + self.processors = ( + processors if processors is not None else multiprocessing.cpu_count() + ) + + def _extend_color_palette(self, needed: int): + """Ensure self.colors has at least 'needed' distinct entries.""" + if needed <= len(self.colors): + return + + extra_needed = needed - len(self.colors) + new_colors = [] + + cmap = plt.colormaps["hsv"] + for i in range(max(extra_needed, 1)): + rgba = cmap(i / max(extra_needed, 1)) + hexcol = _mcolors.to_hex(rgba, keep_alpha=False) + if hexcol not in self.colors and hexcol not in new_colors: + new_colors.append(hexcol) + if len(new_colors) >= extra_needed: + break + + self.colors.extend(new_colors) + + def root_to_numpy(self, hist): + """ + Convert ROOT histogram to numpy arrays. + + Args: + hist: ROOT histogram + + Returns: + tuple: (bin_centers, bin_contents, bin_errors, bin_edges, bin_labels, has_labels) + """ + n_bins = hist.GetNbinsX() + bin_edges = np.array([hist.GetBinLowEdge(i) for i in range(1, n_bins + 2)]) + bin_centers = np.array([hist.GetBinCenter(i) for i in range(1, n_bins + 1)]) + + bin_contents = np.array([hist.GetBinContent(i) for i in range(1, n_bins + 1)]) + bin_errors = np.array([hist.GetBinError(i) for i in range(1, n_bins + 1)]) + + bin_labels = [] + for i in range(1, n_bins + 1): + label = hist.GetXaxis().GetBinLabel(i) + bin_labels.append(self._clean_bin_label(label) if label else "") + + # Only use extracted labels if meaningful + has_labels = any(label and not label.isdigit() for label in bin_labels) + + return bin_centers, bin_contents, bin_errors, bin_edges, bin_labels, has_labels + + def _clean_bin_label(self, label): + """Clean up bin label for better readability.""" + return label.replace("TrackSelectionHighPurity", "HP") + + def _apply_axis_scaling(self, ax, xlabel, ylabel, title, logy=False, has_negative_values=False): + """Apply log scaling to axes based on content and user settings.""" + if logy: + if has_negative_values: + output_file = getattr(self, '_current_output_path', 'unknown') + print(f"\nWarning: Cannot set log scale for plot '{title}' (output: {output_file}) - plot contains negative y-values") + else: + ax.set_yscale("log") + + # Automatic log scale for specific variable types + if "p_{\mathrm{T}}" in xlabel and "pull" not in xlabel.lower(): + ax.set_xscale("log") + + if ("p_{\mathrm{T}}" in ylabel and "pull" not in ylabel.lower()) or "Sigma" in title: + if not has_negative_values: + ax.set_yscale("log") + + if "vertpos" in xlabel: + ax.set_xscale("log") + xlabel = "vert r [cm]" + + return xlabel, ylabel + + def _plot_histogram_data(self, ax, bin_centers, bin_contents, bin_errors, bin_edges, + label, color_idx, plot_histograms): + """Plot histogram data either as histogram or error bars.""" + if plot_histograms: + hep.histplot( + bin_contents, + bins=bin_edges, + yerr=bin_errors, + label=label, + color=self.colors[color_idx % len(self.colors)], + histtype="step", + linewidth=2, + ax=ax, + ) + else: + ax.errorbar( + bin_centers, + bin_contents, + yerr=bin_errors, + label=label, + color=self.colors[color_idx % len(self.colors)], + fmt=self.markers[color_idx % len(self.markers)], + markersize=5, + capsize=2, + linewidth=1.5, + ) + + def _calculate_and_plot_ratio(self, ax_ratio, bin_centers, bin_contents, bin_errors, + ref_centers, ref_contents, ref_errors, color_idx, plot_histograms): + """Calculate and plot ratio between current and reference histogram.""" + if ax_ratio is None: + return [] + + tolerance = 1e-6 + matching_indices = [] + + for idx, center in enumerate(bin_centers): + ref_idx = np.argmin(np.abs(ref_centers - center)) + if np.abs(ref_centers[ref_idx] - center) < tolerance: + matching_indices.append((idx, ref_idx)) + + if not matching_indices: + return [] + + curr_idxs, ref_idxs = zip(*matching_indices) + matching_centers = bin_centers[list(curr_idxs)] + + matching_ref_contents = ref_contents[list(ref_idxs)] + matching_contents = bin_contents[list(curr_idxs)] + matching_ref_errors = ref_errors[list(ref_idxs)] + matching_errors = bin_errors[list(curr_idxs)] + + ratio = np.divide( + matching_contents, + matching_ref_contents, + out=np.zeros_like(matching_contents), + where=matching_ref_contents != 0, + ) + + ratio_errors = np.zeros_like(ratio) + mask = (matching_ref_contents != 0) & (matching_contents != 0) + ratio_errors[mask] = np.abs(ratio[mask]) * np.sqrt( + np.power(matching_errors[mask] / np.maximum(matching_contents[mask], 1e-10), 2) + + np.power(matching_ref_errors[mask] / np.maximum(matching_ref_contents[mask], 1e-10), 2) + ) + ratio_errors = np.nan_to_num(ratio_errors, nan=0.0, posinf=0.0, neginf=0.0) + + if plot_histograms and len(matching_centers) > 1: + bin_widths = np.diff(np.sort(matching_centers)) + avg_width = np.mean(bin_widths) + matching_edges = np.concatenate([ + [matching_centers[0] - avg_width / 2], + matching_centers + avg_width / 2, + ]) + hep.histplot( + ratio, bins=matching_edges, yerr=ratio_errors, + color=self.colors[color_idx % len(self.colors)], + histtype="step", linewidth=2, ax=ax_ratio, + ) + else: + ax_ratio.errorbar( + matching_centers, ratio, yerr=ratio_errors, + color=self.colors[color_idx % len(self.colors)], + fmt=self.markers[color_idx % len(self.markers)], + markersize=5, capsize=2, linewidth=1.5, + ) + + return ratio[(ratio > 0) & np.isfinite(ratio)] + + def _wrap_legend_labels(self, labels, width=25): + """Soft-wrap legend labels at natural break points to reduce horizontal size.""" + wrapped = [] + for lab in labels: + # Explicit new lines + if "\\n" in lab: + explicit_lines = lab.split("\\n") + wrapped.append("\n".join(explicit_lines)) + continue + + # Preserve " - " separator (for overlay labels like "File - Collection") + if " - " in lab: + parts = lab.split(" - ", 1) # Split only on first occurrence + file_part = parts[0] + collection_part = parts[1] if len(parts) > 1 else "" + + # If the combined length is too long, put on separate lines + if len(lab) > width: + wrapped.append(f"{file_part}\n- {collection_part}") + else: + wrapped.append(lab) + continue + + # Automatic split using / or _ + parts = re.split(r'(/|_)+', lab) + tokens = [] + buffer = "" + for p in parts: + if not p: + continue + candidate = (buffer + p) if buffer else p + if len(candidate) > width and buffer: + tokens.append(buffer.rstrip("_/")) + buffer = p + else: + buffer = candidate + if buffer: + tokens.append(buffer.rstrip("_/")) + + # CamelCase and digit splitting + final_tokens = [] + for tok in tokens: + if len(tok) > width: + subtoks = re.findall(r'[A-Z]?[a-z]+|[A-Z]+(?![a-z])|\d+', tok) + line = "" + for st in subtoks: + if len(line) + len(st) + 1 > width and line: + final_tokens.append(line) + line = st + else: + line = (line + st) if not line else (line + st) + if line: + final_tokens.append(line) + else: + final_tokens.append(tok) + wrapped_label = "\n".join(final_tokens) if final_tokens else lab + wrapped.append(wrapped_label) + return wrapped + + def _configure_legend(self, ax, labels, legend_title, logy=False, force_outside=False): + """Configure legend; wrap long entries and move outside if needed.""" + if not labels: + return + + wrapped_labels = self._wrap_legend_labels(labels) + max_label_length = max(len(l.replace("\n", "")) for l in wrapped_labels) + + place_outside = ( + force_outside + or max_label_length > 30 + or (len(wrapped_labels) > 8 and max_label_length > 20) + ) + + legend_columns = 1 if len(wrapped_labels) <= 2 else 2 + legend_fontsize = "22" + if len(wrapped_labels) > 6: + legend_fontsize = "20" + if len(wrapped_labels) > 10: + legend_fontsize = "18" + + legend_title_fontsize = "24" + if legend_title and len(legend_title) > 30: + legend_title_fontsize = "20" + + if place_outside: + y_min, y_max = ax.get_ylim() + y_range = y_max - y_min + ax.set_ylim(y_min, y_max + y_range * 0.1) + ax.figure.subplots_adjust(right=0.9) + ax.legend( + wrapped_labels, + loc="upper left", + bbox_to_anchor=(1.01, 1.0), + borderaxespad=0.0, + title=legend_title, + fontsize=legend_fontsize, + title_fontsize=legend_title_fontsize, + frameon=False, + ) + return + + legend_padding = 0.5 if len(wrapped_labels) <= 2 else np.ceil(len(wrapped_labels) / 2) * 0.25 + legend_padding = min(legend_padding, 2.0) + if logy or ax.get_yscale() == "log": + legend_padding *= 50 + if any("\n" in l for l in wrapped_labels): + legend_padding *= 2 + y_min, y_max = ax.get_ylim() + y_range = y_max - y_min + ax.set_ylim(y_min, y_max + y_range * legend_padding) + + ax.legend( + wrapped_labels, + loc="upper right", + ncols=legend_columns, + title=legend_title, + fontsize=legend_fontsize, + title_fontsize=legend_title_fontsize, + columnspacing=1.0, + ) + + def _apply_custom_formatter(self, ax): + """Apply custom scientific notation formatter to y-axis.""" + if ax.get_yscale() != "log": + from matplotlib.ticker import ScalarFormatter + + class CustomScalarFormatter(ScalarFormatter): + def format_data_short(self, value): + if self.orderOfMagnitude != 0: + return f"×10$^{{{self.orderOfMagnitude}}}$" + return "" + + formatter = CustomScalarFormatter(useOffset=True, useMathText=True) + ax.yaxis.set_major_formatter(formatter) + + # Move y-axis scientific notation to avoid overlap with CMS label + ax.yaxis.get_offset_text().set_position((-0.01, 1.02)) + ax.yaxis.get_offset_text().set_horizontalalignment("right") + ax.yaxis.get_offset_text().set_verticalalignment("bottom") + + def _load_histograms(self, file_paths, hist_path, labels): + """Load histograms from files, returning list of histograms and valid labels.""" + histograms = [] + valid_labels = [] + + for file_path, label in zip(file_paths, labels): + try: + root_file = ROOT.TFile.Open(file_path, "READ") + if not root_file or root_file.IsZombie(): + print(f"Warning: Could not open {file_path}") + continue + + hist = root_file.Get(hist_path) + if not hist: + root_file.Close() + continue + + # Clone histogram to avoid issues when file is closed + hist_clone = hist.Clone(f"hist_{len(histograms)}") + hist_clone.SetDirectory(0) + histograms.append(hist_clone) + valid_labels.append(label) + + root_file.Close() + + except Exception as e: + print(f"Error processing {file_path}: {e}") + continue + + return histograms, valid_labels + + def extract_labels_from_hist(self, hist): + """ + Extract title and axis labels from ROOT histogram. + + Args: + hist: ROOT histogram + + Returns: + tuple: (title, xlabel, ylabel) + """ + title = hist.GetTitle() + xlabel = title + ylabel = "Occurrences" + + # Match "vs", "vs.", and flexible spacing/periods between v and s; allow underscores or spaces as delimiters + vs_regex = re.compile(r'[_\s]+v\s*\.?\s*s\s*\.?[_\s]+', re.IGNORECASE) + if vs_regex.search(title): + # Detect explicit underscore-delimited form even with optional dots/spaces + used_underscore_delim = bool(re.search(r'_v\s*\.?\s*s\s*\.?_', title.lower())) + parts = vs_regex.split(title, maxsplit=1) + left, right = parts[0].strip(), parts[1].strip() + + if used_underscore_delim: + left = left.replace("_", " ") + right = right.replace("_", " ") + + if "#sigma(" in title.lower(): + core = left[left.find("(") + 1 : left.rfind(")")] + ylabel = r"$\delta$" + core + "/" + core + right_clean = right + if "Mean" in title: + ylabel = "<" + ylabel + ">" + right_clean = right_clean.replace("Mean", "") + elif "Sigma" in title: + ylabel = r"$\sigma$(" + ylabel + ")" + right_clean = right_clean.replace("Sigma", "") + xlabel = right_clean.strip() + elif "Mean" in title: + right_clean = right.replace("Mean", "").strip() + ylabel = "<" + left + ">" + xlabel = right_clean + elif "Sigma" in title: + right_clean = right.replace("Sigma", "").strip() + ylabel = r"$\sigma$<" + left + ">" + xlabel = right_clean + else: + # Default: "ylabel vs xlabel" + ylabel = left + xlabel = right + if hist.InheritsFrom("TProfile") and "mean " in ylabel: + ylabel = ylabel.replace("mean ", "<") + ">" + else: + # Pull plots + if "pull" not in title.lower(): + if "eta" in title.lower(): + xlabel = r"$\eta$" + elif "pt2" in title.lower(): + xlabel = r"$p_{\mathrm{T}}^2$" + elif "pt" in title.lower(): + xlabel = r"$p_{\mathrm{T}}$" + elif "phi" in title.lower(): + xlabel = r"$\phi$" + # Efficiency and turn-on plots + if "eff" in title.lower(): + ylabel = "Efficiency" + elif "turn-on" in title.lower(): + ylabel = "Turn-On" + + return ( + self.convert_root_to_latex(title), + self.convert_root_to_latex(xlabel), + self.convert_root_to_latex(ylabel), + ) + + def convert_root_to_latex(self, text): + """ + Convert ROOT-style notation to matplotlib mathtext format. + + Args: + text: String with ROOT notation + + Returns: + String with mathtext notation + """ + conversions = [ + ("#eta", r"$\eta$"), + ("#phi", r"$\phi$"), + ("#theta", r"$\theta$"), + ("#alpha", r"$\alpha$"), + ("#beta", r"$\beta$"), + ("#gamma", r"$\gamma$"), + ("#delta", r"$\delta$"), + ("#Delta", r"$\Delta$"), + ("#sigma", r"$\sigma$"), + ("d_{xy}", r"$d_{xy}$"), + ("d_{z}", r"$d_{z}$"), + ("#chi", r"$\chi$"), + ("#nu", r"$\nu$"), + ("#tau", r"$\tau$"), + ("pt2", r"$p_{\mathrm{T}}^2$"), + ("Pt2", r"$p_{\mathrm{T}}^2$"), + ("pT2", r"$p_{\mathrm{T}}^2$"), + ("p_{T}", r"$p_{\mathrm{T}}$"), + ("p_{t}", r"$p_{\mathrm{T}}$"), + ("pT", r"$p_{\mathrm{T}}$"), + ("pt", r"$p_{\mathrm{T}}$"), + ("GeV/c^{2}", r"GeV/$c^2$"), + ("GeV/c", r"GeV/$c$"), + ("^{2}", r"$^2$"), + ("number of", "#"), + ("Number of", "#"), + ] + + result = text + for root_notation, mathtext_notation in conversions: + result = result.replace(root_notation, mathtext_notation) + + # Replace standalone tokens without # + token_map = { + "eta": r"$\eta$", + "phi": r"$\phi$", + "theta": r"$\theta$", + "alpha": r"$\alpha$", + "beta": r"$\beta$", + "gamma": r"$\gamma$", + "delta": r"$\delta$", + "sigma": r"$\sigma$", + "chi": r"$\chi$" + } + # Tokens not preceded by letter/digit/_ or backslash and not followed by letter/digit/_. + pattern = re.compile( + r'(? 1: + groups.append(patterns) + + return groups + + def _normalize_path_for_overlay(self, hist_path, overlay_patterns): + """ + Normalize histogram path by removing overlay pattern components. + + Args: + hist_path: Full histogram path + overlay_patterns: List of patterns that should be removed for grouping + + Returns: + Normalized path (or None if path doesn't match any pattern) + """ + for pattern in overlay_patterns: + if re.search(pattern, hist_path): + # Remove the matching pattern part + normalized = re.sub(pattern, "OVERLAY_PLACEHOLDER", hist_path) + return normalized, pattern + + return None, None + + def _group_histograms_by_overlay(self, all_hists, overlay_groups): + """ + Group histograms that should be overlaid together. + + Args: + all_hists: Set of all histogram paths + overlay_groups: List of overlay pattern groups + + Returns: + Dictionary mapping normalized paths to lists of (hist_path, pattern) tuples + """ + grouped = {} + used_hists = set() + + for overlay_patterns in overlay_groups: + # Find all histograms matching any pattern in this overlay group + for hist_path in all_hists: + normalized, matched_pattern = self._normalize_path_for_overlay( + hist_path, overlay_patterns + ) + + if normalized: + if normalized not in grouped: + grouped[normalized] = [] + grouped[normalized].append((hist_path, matched_pattern)) + used_hists.add(hist_path) + + # Filter groups to only include those with multiple histograms + filtered_groups = { + norm_path: hists + for norm_path, hists in grouped.items() + if len(hists) > 1 + } + + return filtered_groups, used_hists + + def _generate_overlay_output_path(self, normalized_path, matched_patterns, + pattern, output_dir): + """ + Generate output path for overlaid histograms. + + Args: + normalized_path: Normalized path with placeholder + matched_patterns: List of patterns that were matched + pattern: Original search pattern + output_dir: Base output directory + + Returns: + Output file path + """ + # Create combined collection name from patterns + collection_names = [] + for pat in matched_patterns: + # Extract meaningful name from pattern (remove regex special chars) + name = re.sub(r'[^a-zA-Z0-9_]', '', pat) + if name: + collection_names.append(name) + + combined_name = "+".join(sorted(set(collection_names))) + + # Replace placeholder with combined name + overlay_path = normalized_path.replace("OVERLAY_PLACEHOLDER", combined_name) + + return self._generate_output_path(overlay_path, pattern, output_dir) + + def plot_comparison( + self, + histograms, + labels, + output_path, + logy=False, + normalize=False, + cms_text="Preliminary", + energy_text=None, + show_energy=False, + data=False, + grid=False, + plot_histograms=False, + do_ratio=True, + save_as_pdf=False, + legend_outside=False, + overlay_groups=None, + ): + """ + Create comparison plot with ratio panel. + + Args: + histograms: List of ROOT histograms to compare + labels: List of labels for each histogram + output_path: Output file path + logy: Use log scale for y-axis + normalize: Normalize histograms to unit area + cms_text: CMS label text + energy_text: Custom energy text (if None, uses default) + show_energy: Whether to show energy text + grid: Whether to show grid on both main and ratio plots + plot_histograms: Whether to plot histograms instead of individual bin points with errors + pdf: Whether to save as PDF in addition to PNG + legend_outside: Force legend to be placed outside the plot area + overlay_groups: List of pattern groups to overlay + """ + if len(histograms) != len(labels): + raise ValueError("Number of histograms must match number of labels") + + title, xlabel, ylabel = self.extract_labels_from_hist(histograms[0]) + + legend_title = None + if hasattr(self, "_current_output_path"): + output_parts = self._current_output_path.replace("\\", "/").split("/") + if len(output_parts) > 1: + # Get the parent directory name (second to last part) + legend_title = output_parts[-2] if len(output_parts) >= 2 else None + # Ignore legend name for summary plots + if "global" in output_parts[-1].lower() or "coll" in output_parts[-1].lower(): + legend_title = None + # Split long overlay titles at + marker + elif legend_title and "+" in legend_title and len(legend_title) > 30: + legend_title = legend_title.replace("+", "\n+") + + fig = plt.figure(figsize=self.figsize) + gs = gridspec.GridSpec( + 2, 1, height_ratios=[1 - self.ratio_height, self.ratio_height], hspace=0.10 + ) + + ax_main = fig.add_subplot(gs[0]) + + # CMS styling + if show_energy: + if energy_text is None: + # Default energy text (13 TeV) + hep.cms.label(cms_text, data=data, ax=ax_main) + else: + hep.cms.label(cms_text, data=data, ax=ax_main, rlabel=energy_text) + else: + hep.cms.label(cms_text, data=data, ax=ax_main, rlabel="") + + ax_ratio = None + if do_ratio and len(histograms) > 1: + ax_ratio = fig.add_subplot(gs[1], sharex=ax_main) + + ref_centers = None + ref_contents = None + ref_errors = None + has_labels = False + bin_labels = None + has_negative_values = False + + all_ratios = [] + ref_centers = ref_contents = ref_errors = None + + for i, (hist, label) in enumerate(zip(histograms, labels)): + bin_centers, bin_contents, bin_errors, bin_edges, bin_labels, has_labels = ( + self.root_to_numpy(hist) + ) + + if normalize and np.sum(bin_contents) > 0: + norm_factor = 1.0 / np.sum(bin_contents) + bin_contents *= norm_factor + bin_errors *= norm_factor + + # Use first histogram as reference + if i == 0: + ref_centers, ref_contents, ref_errors = bin_centers, bin_contents, bin_errors + + self._plot_histogram_data(ax_main, bin_centers, bin_contents, bin_errors, + bin_edges, label, i, plot_histograms) + + if i > 0: + valid_ratios = self._calculate_and_plot_ratio( + ax_ratio, bin_centers, bin_contents, bin_errors, + ref_centers, ref_contents, ref_errors, i, plot_histograms + ) + all_ratios.extend(valid_ratios) + + # Styling for main plot + # Calculate visible label length by removing LaTeX notation for sizing + visible_ylabel = re.sub(r"\$[^$]*\$", "X", ylabel) + label_size = "medium" if len(visible_ylabel) < 20 else "small" + if title: + ax_main.set_title(title, pad=50) + has_negative_values = np.any(bin_contents < 0) + xlabel, ylabel = self._apply_axis_scaling(ax_main, xlabel, ylabel, title, logy, has_negative_values) + + # Set custom labels if available + if has_labels: + ax_main.set_xticks( + ref_centers, + bin_labels, + size="small" if len(bin_labels) < 10 else "xx-small", + rotation=45, + ha="right", + va="top", + ) + ax_main.tick_params(axis="x", which="minor", bottom=False) + else: + ax_main.set_xlabel(xlabel) + + ax_main.set_ylabel(ylabel, fontsize=label_size) + + self._configure_legend(ax_main, labels, legend_title, logy, force_outside=legend_outside) + + if grid: + ax_main.grid(True, alpha=0.75, linestyle="dashdot", linewidth=0.75) + + self._apply_custom_formatter(ax_main) + + # Ratio plot styling + if ax_ratio is not None: + # Only show label in ratio and keep the same ticks as main plot + ax_main.set_xlabel("") + ax_main.tick_params(axis="x", labelbottom=False) + + # Set ratio plot limits + if len(all_ratios) > 0: + # Use percentiles to avoid extreme outliers + ratio_min = np.percentile(all_ratios, 5) + ratio_max = np.percentile(all_ratios, 95) + + # Set range around 1 in case of small fluctuations + ratio_min = min(ratio_min, 0.95) + ratio_max = max(ratio_max, 1.05) + + # Add at least 15% padding + ratio_range = ratio_max - ratio_min + padding = max(0.15, ratio_range * 0.1) + + # Clamp limits between 0.1 and 5.0 + final_min = max(0.1, ratio_min - padding) + final_max = min(5.0, ratio_max + padding) + + ax_ratio.set_ylim(final_min, final_max) + else: + ax_ratio.set_ylim(0.5, 1.5) + + if has_labels: + ax_ratio.set_xticks( + ref_centers, + bin_labels, + size="small" if len(bin_labels) < 10 else "xx-small", + rotation=45, + ha="right", + va="top", + ) + ax_ratio.tick_params(axis="x", which="minor", bottom=False) + else: + ax_ratio.set_xlabel(xlabel) + + ax_ratio.set_ylabel("Ratio") + ax_ratio.axhline(y=1, color="black", linestyle="--", alpha=0.7) + + if grid: + ax_ratio.grid(True, alpha=0.75, linestyle="dashdot", linewidth=0.75) + + os.makedirs(os.path.dirname(output_path), exist_ok=True) + plt.savefig(output_path, dpi=300, bbox_inches="tight") + + if save_as_pdf: + pdf_path = output_path.rsplit(".", 1)[0] + ".pdf" + plt.savefig(pdf_path, dpi=300, bbox_inches="tight") + plt.close() + + def compare_files( + self, + file_paths, + hist_pattern, + labels=None, + output_dir="plots", + more_files=False, + create_web_index=False, + overlay_groups=None, + overlay_individual=True, + **plot_kwargs, + ): + """ + Compare histograms matching a pattern from multiple files. + + Args: + file_paths: List of ROOT file paths + hist_pattern: Regex pattern to match histogram paths + labels: Labels for each file (uses filename if None) + output_dir: Output directory for plots + more_files: Color palette will be extended to support more than 10 files if True + create_web_index: Whether to create index.php files for web viewing + overlay_groups: List of pattern groups to overlay + overlay_individual: Whether to also create individual plots for overlaid collections + **plot_kwargs: Additional arguments for plot_comparison + """ + if more_files: + self._extend_color_palette(len(file_paths)) + else: + if len(file_paths) > 10: + raise ValueError( + f"Default color palette supports a maximum of 10 files, got {len(file_paths)}. " + "Please reduce the number of input files or use the option --more-files if you are really sure of what you are doing." + ) + + # Normalize patterns to a list + patterns = ( + list(hist_pattern) + if isinstance(hist_pattern, (list, tuple, set)) + else [hist_pattern] + ) + + if labels is None: + labels = [] + for f in file_paths: + legend_label = Path(f).stem + if legend_label.startswith("DQM_"): + legend_label = legend_label.replace("DQM_", "") + labels.append(legend_label) + elif isinstance(labels, str): + labels = [label.strip() for label in labels.split(",")] + + if len(labels) != len(file_paths): + raise ValueError( + f"Number of legend entries ({len(labels)}) must match number of files ({len(file_paths)})" + ) + + all_matching_hists = set() + hist_to_pattern = {} + + print("Scanning all files for histograms matching the pattern(s)...") + for i, file_path in enumerate(file_paths): + try: + root_file = ROOT.TFile.Open(file_path, "READ") + if not root_file or root_file.IsZombie(): + print(f"Error: Could not open {file_path}") + continue + + total_for_file = 0 + for pat in patterns: + file_hists = self.find_histograms_in_file(root_file, pat) + total_for_file += len(file_hists) + all_matching_hists.update(file_hists) + for hp in file_hists: + hist_to_pattern.setdefault(hp, pat) + + root_file.Close() + print(f"Found {total_for_file} matching histograms in {file_path}") + + except Exception as e: + print(f"Error scanning {file_path}: {e}") + + if not all_matching_hists: + joined = ", ".join(map(str, patterns)) + print(f"No histograms matching pattern(s) '{joined}' found in any file") + return + + print( + f"Total unique histograms found across all files: {len(all_matching_hists)}" + ) + + plot_tasks = [] + + # Handle overlay groups if specified + if overlay_groups: + grouped_hists, used_hists = self._group_histograms_by_overlay( + all_matching_hists, overlay_groups + ) + + if grouped_hists: + print(f"Found {len(grouped_hists)} histogram groups to overlay") + + # Create tasks for overlaid histograms + for normalized_path, hist_pattern_pairs in grouped_hists.items(): + # Sort by pattern to ensure consistent ordering + hist_pattern_pairs = sorted(hist_pattern_pairs, key=lambda x: x[1]) + hist_paths = [hp for hp, _ in hist_pattern_pairs] + matched_patterns = [pat for _, pat in hist_pattern_pairs] + + # Use first pattern for output path generation + base_pattern = hist_to_pattern.get(hist_paths[0], patterns[0]) + output_path = self._generate_overlay_output_path( + normalized_path, matched_patterns, base_pattern, output_dir + ) + + # For single file: overlay collections within the file + if len(file_paths) == 1: + # Load all histograms from different collections + overlay_labels = [] + for hist_path, pattern in hist_pattern_pairs: + # Extract collection name from path + collection = re.search(pattern, hist_path) + collection_name = collection.group(0) if collection else pattern + overlay_labels.append(f"{labels[0]} - {collection_name}") + + plot_tasks.append({ + "file_paths": file_paths * len(hist_paths), + "hist_path": hist_paths, + "labels": overlay_labels, + "colors": self.colors, + "output_path": output_path, + "plot_kwargs": plot_kwargs, + "is_overlay": True, + }) + else: + # For multiple files: overlay collections across files + # Create consistent ordering: all collections for file1, then all for file2, etc. + overlay_labels = [] + extended_file_paths = [] + extended_hist_paths = [] + + for file_idx, (file_path, file_label) in enumerate(zip(file_paths, labels)): + for hist_path, pattern in hist_pattern_pairs: + collection = re.search(pattern, hist_path) + collection_name = collection.group(0) if collection else pattern + overlay_labels.append(f"{file_label} - {collection_name}") + extended_file_paths.append(file_path) + extended_hist_paths.append(hist_path) + + plot_tasks.append({ + "file_paths": extended_file_paths, + "hist_path": extended_hist_paths, + "labels": overlay_labels, + "colors": self.colors, + "output_path": output_path, + "plot_kwargs": plot_kwargs, + "is_overlay": True, + }) + + # Process non-overlaid histograms separately + if overlay_individual: + # Include individual plots for overlaid collections + remaining_hists = all_matching_hists + if remaining_hists: + print(f"Processing {len(remaining_hists)} histograms (including individual overlay collections)") + else: + # Exclude overlaid histograms from individual processing + remaining_hists = all_matching_hists - used_hists + if remaining_hists: + print(f"Processing {len(remaining_hists)} non-overlaid histograms separately") + else: + remaining_hists = all_matching_hists + + # Create tasks for non-overlaid histograms (standard behavior) + for hist_path in sorted(remaining_hists): + matched_pat = hist_to_pattern.get(hist_path, patterns[0]) + output_path = self._generate_output_path(hist_path, matched_pat, output_dir) + + plot_tasks.append({ + "file_paths": file_paths, + "hist_path": hist_path, + "labels": labels, + "colors": self.colors, + "output_path": output_path, + "plot_kwargs": plot_kwargs, + "is_overlay": False, + }) + + self._process_files(plot_tasks) + + if create_web_index: + print("Adding php index files for web viewing...") + self._create_web_index_files(output_dir) + + def _generate_output_path(self, hist_path, pattern, output_dir): + """ + Generate output path preserving folder structure from regex match. + + Args: + hist_path: Full histogram path from ROOT file + pattern: Regex pattern used to find the histogram + output_dir: Base output directory + + Returns: + Output file path with preserved folder structure + """ + match = re.search(pattern, hist_path) + if match: + match_end = match.end() + remaining_path = hist_path[match_end:].lstrip("/") + + if remaining_path: + path_parts = remaining_path.split("/") + + if len(path_parts) > 1: + # Create subdirectories and filename + subdirs = "/".join(path_parts[:-1]) + filename = path_parts[-1] + output_path = os.path.join(output_dir, subdirs, f"{filename}") + else: + # Single file, no subdirectories + filename = path_parts[0] + output_path = os.path.join(output_dir, f"{filename}") + else: + # If no remaining path, use the last part of the matched path + matched_part = match.group(0) + filename = matched_part.split("/")[-1] + output_path = os.path.join(output_dir, f"{filename}") + else: + hist_name = hist_path.replace("/", "_").replace("\\", "_") + output_path = os.path.join(output_dir, f"{hist_name}") + + return output_path + + def _create_web_index_files(self, output_dir): + """Create index.php files in each subdirectory for web viewing.""" + index_php_url = "https://cernbox.cern.ch/remote.php/dav/public-files/XCmC5GCFnF7Aqfd/index.php" + + try: + with urllib.request.urlopen(index_php_url) as response: + index_php_content = response.read().decode("utf-8") + except urllib.error.URLError as e: + print(f"Warning: Could not download index.php from CERNBox: {e}") + print("Skipping web index creation.") + return + + n_index = 0 + for root, dirs, files in os.walk(output_dir): + index_php_path = os.path.join(root, "index.php") + with open(index_php_path, "w", encoding="utf-8") as f: + f.write(index_php_content) + n_index += 1 + print(f"Created index.php files in {n_index} directories for web viewing") + + def _process_files(self, plot_tasks): + """Process plotting tasks using multiprocessing.""" + print(f"Using {self.processors} parallel processes") + + with multiprocessing.Pool(self.processors) as pool: + results = [] + for i, task in enumerate(plot_tasks): + task_copy = task.copy() + task_copy.pop("progress_counter", None) + task_copy.pop("total_tasks", None) + result = pool.apply_async(process_histogram_task, (task_copy,)) + results.append((i, result)) + + completed = 0 + failed = 0 + for i, result in results: + try: + result.get(timeout=300) + completed += 1 + if completed % 10 == 0 or completed == len(plot_tasks): + print( + f"\rProgress: {completed}/{len(plot_tasks)} plots completed", + end="", + flush=True, + ) + except multiprocessing.TimeoutError: + print(f"\nTask {i} timed out") + failed += 1 + except Exception as e: + print(f"\nError in task {i}: {e}") + failed += 1 + + pool.close() + pool.join() + + if failed > 0: + print(f"\n{failed} tasks failed out of {len(plot_tasks)} total") + print() + +def process_histogram_task(task): + """Process a single histogram task in a separate process.""" + try: + file_paths = task["file_paths"] + hist_path = task["hist_path"] + labels = task["labels"] + output_path = task["output_path"] + plot_kwargs = task["plot_kwargs"] + is_overlay = task["is_overlay"] + + plotter = DQMPlotter() + plotter._current_output_path = output_path + plotter.colors = task["colors"] + + if is_overlay: + # hist_path is a list of paths for overlay mode + histograms = [] + valid_labels = [] + + if isinstance(hist_path, list): + # Process in order to maintain consistent color/marker assignment + for fp, hp, lab in zip(file_paths, hist_path, labels): + hists, vlabs = plotter._load_histograms([fp], hp, [lab]) + histograms.extend(hists) + valid_labels.extend(vlabs) + else: + histograms, valid_labels = plotter._load_histograms( + file_paths, hist_path, labels + ) + else: + # Keep user-specified order for files + histograms, valid_labels = plotter._load_histograms( + file_paths, hist_path, labels + ) + + if len(histograms) > 0: + plotter.plot_comparison(histograms, valid_labels, output_path, **plot_kwargs) + + except Exception as e: + print(f"Error in process_histogram_task: {e}") + raise + +if __name__ == "__main__": + """Command line interface for DQM plotter.""" + parser = argparse.ArgumentParser( + description="Compare DQM histograms with CMS styling" + ) + parser.add_argument("files", nargs="+", help="ROOT files to compare") + parser.add_argument( + "-s", + "--source", + required=True, + nargs="+", + action="append", + help="Histogram path or regex pattern(s) to match. " + "Use multiple -s or space-separated values to provide several patterns.", + ) + parser.add_argument( + "-l", "--legend", help="Comma-separated list of legend entries for each file. Use \\n for explicit line breaks." + ) + parser.add_argument("-o", "--output-dir", default="plots", help="Output directory") + parser.add_argument("--logy", action="store_true", help="Use log scale for y-axis") + parser.add_argument("--normalize", action="store_true", help="Normalize histograms") + parser.add_argument( + "--cms-text", + default="Preliminary", + help="CMS label text (e.g. Preliminary, Work in progress)", + ) + parser.add_argument( + "--energy-text", + default=None, + help="Custom energy text in the top right corner (e.g., '14 TeV', 'Run 3')", + ) + parser.add_argument( + "--data", action="store_true", help="Use CMS datastyle for plots" + ) + parser.add_argument( + "--no-energy", action="store_true", help="Don't show energy text" + ) + parser.add_argument("--no-ratio", action="store_true", help="Disable ratio plots") + parser.add_argument( + "--no-grid", action="store_true", help="Remove grid from all plots" + ) + parser.add_argument( + "--histograms", + action="store_true", + help="Plot histograms instead of points (default: points with error bars)", + ) + parser.add_argument( + "--pdf", action="store_true", help="Also save plots in PDF format" + ) + parser.add_argument( + "--more-files", + action="store_true", + help="Allow more than 10 input files (legend might overlap with the plots, color palette only supports 10 files)", + ) + parser.add_argument( + "-n", + "--nProcessors", + type=int, + default=None, + help="Number of parallel processes to use (default: auto-detect)", + ) + parser.add_argument( + "--web", + action="store_true", + help="Create index.php files for web viewing (downloads from CernBox)", + ) + parser.add_argument( + "--legend-outside", + action="store_true", + help="Force legend to be placed outside the plot area", + ) + parser.add_argument( + "--overlay", + action="append", + help="Overlay histograms from different collections. Specify colon-separated patterns " + "(e.g., 'hltPhase2PixelTracks:hltPhase2PixelTracksCAExtension'). " + "Can be used multiple times for different overlay groups.", + ) + parser.add_argument( + "--no-overlay-individual", + action="store_true", + help="Disable creation of individual plots for collections that are overlaid (default: create both overlay and individual plots)", + ) + + args = parser.parse_args() + + plotter = DQMPlotter(processors=args.nProcessors) + + # Flatten list of patterns + source_patterns = [p for group in args.source for p in group] if isinstance(args.source, list) else [args.source] + + # Parse overlay groups + overlay_groups = plotter._parse_overlay_groups(args.overlay) if args.overlay else None + + plotter.compare_files( + file_paths=args.files, + hist_pattern=source_patterns, + labels=args.legend, + output_dir=args.output_dir, + logy=args.logy, + normalize=args.normalize, + cms_text=args.cms_text, + energy_text=args.energy_text, + show_energy=not args.no_energy or args.energy_text is not None, + data=args.data, + do_ratio=not args.no_ratio, + grid=not args.no_grid, + plot_histograms=args.histograms, + save_as_pdf=args.pdf, + more_files=args.more_files, + create_web_index=args.web, + legend_outside=args.legend_outside, + overlay_groups=overlay_groups, + overlay_individual=not args.no_overlay_individual, + ) diff --git a/DQMServices/Components/test/BuildFile.xml b/DQMServices/Components/test/BuildFile.xml index 8ecf362568cf6..dd1f06da85a70 100644 --- a/DQMServices/Components/test/BuildFile.xml +++ b/DQMServices/Components/test/BuildFile.xml @@ -2,4 +2,5 @@ + diff --git a/DQMServices/Components/test/test_dqm-plot.sh b/DQMServices/Components/test/test_dqm-plot.sh new file mode 100755 index 0000000000000..7d998b97fafee --- /dev/null +++ b/DQMServices/Components/test/test_dqm-plot.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +function die { echo $1: status $2; exit $2; } + +REMOTE="/store/group/phys_tracking/cmssw_unittests/" +DQMFILE="DQM_V0001_R000000001__RelValTTbar_14TeV__CMSSW_12_1_0_pre5-121X_mcRun3_2021_realistic_v15-v1__DQMIO.root" +COMMMAND=`xrdfs cms-xrd-global.cern.ch locate ${REMOTE}${DQMFILE}` +STATUS=$? +echo "xrdfs command status = "$STATUS + +if [ $STATUS -eq 0 ]; then + echo "Using file ${DQMFILE}. Running in ${LOCAL_TEST_DIR}." + xrdcp root://cms-xrd-global.cern.ch/${REMOTE}${DQMFILE} . + + (dqm-plot \ + -n 4 \ + --web \ + --pdf \ + -s "DQMData/Run 1/HLT/Run summary/Tracking/ValidationWRTOffline/*" \ + --overlay "hltMergedWrtHighPurity:hltMergedWrtHighPurityPV" \ + --energy-text "TEST" \ + -l "File 1, File 2" \ + ./${DQMFILE} ./${DQMFILE}) || die 'failed running dqm-plot $DQMFILE' $? + rm -fr ./${DQMFILE} +else + die "SKIPPING test, file ${DQMFILE} not found" 0 +fi \ No newline at end of file