3 changes: 3 additions & 0 deletions .gitignore
@@ -34,6 +34,9 @@ fastp
*.out
*.app

# Python
__pycache__/

# Editor Config
.vscode

114 changes: 114 additions & 0 deletions benchmark/README.md
@@ -0,0 +1,114 @@
# fastp Benchmark Suite

An end-to-end benchmark for fastp: it measures throughput and peak memory, and verifies output correctness across I/O format combinations and feature modes.

## Quick Start

```bash
# Run all I/O format modes with generated data (10M pairs)
python3 -m benchmark --opt ./fastp

# Compare optimized build against baseline
python3 -m benchmark --opt ./fastp_opt --orig ./fastp_baseline

# Use your own FASTQ files
python3 -m benchmark --r1 /data/sample_R1.fq.gz --r2 /data/sample_R2.fq.gz --opt ./fastp
```

## Modes

### I/O Format Modes (default: `all`)

Test every input/output compression combination:

| Mode | Type | Input | Output |
|------------|------|--------|--------|
| fq-fq | PE | .fq | .fq |
| fq-gz | PE | .fq | .fq.gz |
| gz-fq | PE | .fq.gz | .fq |
| gz-gz | PE | .fq.gz | .fq.gz |
| se-fq-fq | SE | .fq | .fq |
| se-fq-gz | SE | .fq | .fq.gz |
| se-gz-fq | SE | .fq.gz | .fq |
| se-gz-gz | SE | .fq.gz | .fq.gz |
| stdin-stdout | SE | stdin | stdout |
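An I/O mode name decodes mechanically into a (read type, input format, output format) triple; a minimal sketch of the decoding rule, mirroring `parse_mode` in `benchmark/modes.py` (the function name here is illustrative, and feature modes are omitted):

```python
def decode_mode(mode: str) -> tuple[str, str, str]:
    """Decode an I/O mode name into (read type, input format, output format)."""
    if mode == "stdin-stdout":
        return ("stdin", "", "")
    if mode.startswith("se-"):
        in_fmt, out_fmt = mode[3:].split("-")
        return ("se", in_fmt, out_fmt)
    # Bare "fq-gz" style names are paired-end
    in_fmt, out_fmt = mode.split("-")
    return ("pe", in_fmt, out_fmt)

print(decode_mode("gz-fq"))         # → ('pe', 'gz', 'fq')
print(decode_mode("se-fq-gz"))      # → ('se', 'fq', 'gz')
print(decode_mode("stdin-stdout"))  # → ('stdin', '', '')
```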

### Feature Modes (`all-feat`)

Exercise specific fastp processing paths:

| Mode | fastp Flags | Data | Tests |
|------------|-------------------------------------------|-------------|------------------------------------|
| merge | `--merge --correction` | overlapping | PE merge + base correction (#676) |
| correction | `--correction` | overlapping | overlap analysis + base correction |
| dedup-pe | `--dedup` | random | duplicate detection (PE) |
| dedup-se | `--dedup` | random | duplicate detection (SE) |
| qualcut-pe | `--cut_front --cut_tail -W 4 -M 20` | random | sliding window quality cut (PE) |
| qualcut-se | `--cut_front --cut_tail -W 4 -M 20` | random | sliding window quality cut (SE) |
| ht-pe | (default) `-w 32` | random | high-thread concurrency stress |

### Mode Groups

| Alias | Expands To |
|------------|------------------------------------------------|
| all | all 9 I/O format modes (default) |
| all-pe | PE format modes only |
| all-se | SE format modes + stdin-stdout |
| all-feat | all 7 feature modes |
| everything | all I/O + all feature modes (16 total) |

```bash
python3 -m benchmark --mode all-feat --opt ./fastp
python3 -m benchmark --mode merge,correction --opt ./fastp --orig ./fastp_v110
python3 -m benchmark --mode everything --opt ./fastp
```

## Options

```
--opt PATH Optimized binary path (default: /tmp/fastp_opt)
--orig PATH Baseline binary for comparison (optional)
--r1 PATH Custom R1 input (.fq or .fq.gz), skips data generation
--r2 PATH Custom R2 input (.fq or .fq.gz), required for PE modes
--mode MODE Comma-separated mode list (default: all)
--pairs N Read pairs to generate (default: 10,000,000)
--threads N Worker threads, 0 = auto (default: 0)
--runs N Repeat count, reports median (default: 3)
--seed N RNG seed for data generation (default: 2026)
--json PATH Load saved JSON results and print summary table
--merge A B Merge two opt-only result JSONs into comparison table
```

## Test Data

When `--r1`/`--r2` are not provided, the benchmark generates synthetic data:

- **Random PE data** (10M pairs x 150bp): independent R1/R2 with realistic quality profiles. Used for I/O format modes, dedup, qualcut, and high-thread tests.
- **Overlapping PE data** (10M pairs x 150bp): R1 and R2 derived from the same fragment (insert ~220bp, overlap ~80bp). R2 has injected mismatches at low quality (Q10-14) to trigger base correction. Used for merge and correction modes.
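With a fixed read length, the R1/R2 overlap follows directly from the insert size (overlap = 2 × read_len − insert, the same arithmetic `datagen.py` uses); a quick check of the numbers quoted above:

```python
READ_LEN = 150

def overlap_len(insert: int, read_len: int = READ_LEN) -> int:
    # R1 covers fragment[0:read_len], R2 covers fragment[insert-read_len:insert],
    # so the two reads share 2*read_len - insert bases (when that is positive).
    return max(0, 2 * read_len - insert)

print(overlap_len(220))  # → 80: the ~220bp mean insert gives the ~80bp overlap
print(overlap_len(300))  # → 0: reads just touch end-to-end, no overlap
```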

Generated data is cached in `/tmp/fastp_benchmark/data/` and reused across runs. Delete the directory to regenerate.

When custom files are provided:
- Format conversion is automatic (decompress .gz to .fq or compress .fq to .gz as needed)
- Merge/correction modes use the custom data directly (real sequencing data has natural overlap)
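The conversion step above needs nothing beyond the standard library; a minimal sketch of the kind of streaming conversion involved (function names are illustrative, not the suite's API):

```python
import gzip
import shutil
from pathlib import Path

def gz_to_fq(src: Path, dst: Path) -> None:
    """Decompress .fq.gz to plain .fq by streaming, keeping memory use flat."""
    with gzip.open(src, "rb") as fin, open(dst, "wb") as fout:
        shutil.copyfileobj(fin, fout)

def fq_to_gz(src: Path, dst: Path) -> None:
    """Compress .fq to .fq.gz; compresslevel=1 matches what datagen.py uses."""
    with open(src, "rb") as fin, gzip.open(dst, "wb", compresslevel=1) as fout:
        shutil.copyfileobj(fin, fout)
```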

## Output

- Per-run timing and peak RSS printed live
- Markdown summary table at the end
- JSON results saved to `/tmp/fastp_benchmark/results.json`
- MD5 verification: when baseline is provided, checks that optimized output matches baseline byte-for-byte
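A byte-for-byte check reduces to comparing streaming MD5 digests; a sketch of the idea behind `verify.py` (names are illustrative, not the module's actual API):

```python
import hashlib
from pathlib import Path

def md5_of(path: Path, chunk: int = 1 << 20) -> str:
    """Hash in 1 MiB chunks so multi-GB FASTQ outputs never sit in memory."""
    h = hashlib.md5()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

def outputs_match(opt_out: Path, baseline_out: Path) -> bool:
    """True when the two output files are byte-for-byte identical."""
    return md5_of(opt_out) == md5_of(baseline_out)
```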

## Module Structure

```
benchmark/
├── __main__.py # python3 -m benchmark entry point
├── run.py # CLI argument parsing and orchestration
├── modes.py # mode definitions, constants, path config
├── datagen.py # synthetic FASTQ data generation
├── sysinfo.py # hardware/OS info collection
├── runner.py # benchmark execution (build_cmd, run, measure)
├── verify.py # output MD5 verification
└── report.py # summary table formatting
```
1 change: 1 addition & 0 deletions benchmark/__init__.py
@@ -0,0 +1 @@
"""fastp end-to-end benchmark suite."""
4 changes: 4 additions & 0 deletions benchmark/__main__.py
@@ -0,0 +1,4 @@
"""Allow running as: python3 -m benchmark [OPTIONS]"""
from .run import main

main()
134 changes: 134 additions & 0 deletions benchmark/datagen.py
@@ -0,0 +1,134 @@
"""FASTQ test data generation for benchmarking."""
import gzip
import random
import time
from pathlib import Path


def generate_data(r1_gz: Path, r2_gz: Path, num_pairs: int, seed: int = 2026):
"""Generate random paired-end FASTQ reads (independent R1/R2)."""
READ_LEN = 150
BATCH = 100_000
BASES = b"ACGT"
QUAL_POOL = 512
rng = random.Random(seed)

pool = []
for _ in range(QUAL_POOL):
q = bytearray(READ_LEN)
for j in range(READ_LEN):
if j < 5 or j > READ_LEN - 10:
q[j] = rng.randint(20, 35) + 33
else:
q[j] = rng.randint(30, 40) + 33
pool.append(bytes(q))

t0 = time.time()
written = 0
with gzip.open(r1_gz, "wb", compresslevel=1) as f1, \
gzip.open(r2_gz, "wb", compresslevel=1) as f2:
while written < num_pairs:
n = min(BATCH, num_pairs - written)
b1 = bytearray()
b2 = bytearray()
for i in range(n):
rid = written + i + 1
name = f"@SIM:BENCH:1:{1101 + rid // 10000000}:{rid % 50000}:{(rid * 7) % 50000}".encode()
s1 = bytes(rng.choices(BASES, k=READ_LEN))
s2 = bytes(rng.choices(BASES, k=READ_LEN))
q1 = pool[rng.randint(0, QUAL_POOL - 1)]
q2 = pool[rng.randint(0, QUAL_POOL - 1)]
b1 += name + b" 1:N:0:ATCG\n" + s1 + b"\n+\n" + q1 + b"\n"
b2 += name + b" 2:N:0:ATCG\n" + s2 + b"\n+\n" + q2 + b"\n"
f1.write(bytes(b1))
f2.write(bytes(b2))
written += n
if written % 1_000_000 == 0:
e = time.time() - t0
eta = (num_pairs - written) / (written / e)
print(f" {100 * written / num_pairs:5.1f}% {written / 1e6:.0f}M pairs"
f" {written / e / 1e6:.2f}M/s ETA {eta:.0f}s", flush=True)

print(f" Generated in {time.time() - t0:.1f}s")


def generate_merge_data(r1_gz: Path, r2_gz: Path, num_pairs: int, seed: int = 2026):
"""Generate overlapping PE reads for merge/correction testing.

Each pair derives from a random fragment with insert size ~220bp (sd=30).
R2 is the reverse complement of the fragment's 3' end. Mismatches are
injected in the overlap region with low quality on R2 to trigger base
correction (R1 Q30+ vs R2 Q10-14 at mismatch positions).
"""
READ_LEN = 150
BATCH = 100_000
BASES = b"ACGT"
COMP = bytes.maketrans(b"ACGT", b"TGCA")
QUAL_POOL = 512
INSERT_MEAN = 220
INSERT_SD = 30
    MIN_INSERT = READ_LEN + 30  # insert longer than a read: no adapter read-through
    MAX_INSERT = 2 * READ_LEN - 30  # keep >= 30bp overlap (fastp's default minimum)
MISMATCH_RATE = 0.015

rng = random.Random(seed)

pool = []
for _ in range(QUAL_POOL):
q = bytearray(READ_LEN)
for j in range(READ_LEN):
if j < 5 or j > READ_LEN - 10:
q[j] = rng.randint(25, 35) + 33
else:
q[j] = rng.randint(30, 40) + 33
pool.append(bytes(q))

def revcomp(seq: bytes) -> bytes:
return seq.translate(COMP)[::-1]

t0 = time.time()
written = 0
with gzip.open(r1_gz, "wb", compresslevel=1) as f1, \
gzip.open(r2_gz, "wb", compresslevel=1) as f2:
while written < num_pairs:
n = min(BATCH, num_pairs - written)
b1 = bytearray()
b2 = bytearray()
for i in range(n):
rid = written + i + 1
name = f"@SIM:MERGE:1:{1101 + rid // 10000000}:{rid % 50000}:{(rid * 7) % 50000}".encode()

# Sample insert size from truncated normal distribution
insert = int(rng.gauss(INSERT_MEAN, INSERT_SD))
insert = max(MIN_INSERT, min(MAX_INSERT, insert))

# Generate fragment and derive reads
frag = bytearray(rng.choices(BASES, k=insert))
s1 = bytes(frag[:READ_LEN])
s2 = bytearray(revcomp(bytes(frag[insert - READ_LEN:])))

q1 = bytearray(pool[rng.randint(0, QUAL_POOL - 1)])
q2 = bytearray(pool[rng.randint(0, QUAL_POOL - 1)])

# Inject mismatches in R2 overlap with low quality to trigger correction
overlap_len = 2 * READ_LEN - insert
n_mm = max(1, int(overlap_len * MISMATCH_RATE))
n_mm = min(n_mm, min(4, int(overlap_len * 0.19)))
overlap_start = READ_LEN - overlap_len
for p in rng.sample(range(overlap_start, READ_LEN), n_mm):
alts = [b for b in BASES if b != s2[p]]
s2[p] = rng.choice(alts)
q2[p] = rng.randint(10 + 33, 14 + 33) # Q10-14

b1 += name + b" 1:N:0:ATCG\n" + s1 + b"\n+\n" + bytes(q1) + b"\n"
b2 += name + b" 2:N:0:ATCG\n" + bytes(s2) + b"\n+\n" + bytes(q2) + b"\n"
f1.write(bytes(b1))
f2.write(bytes(b2))
written += n
if written % 1_000_000 == 0:
e = time.time() - t0
eta = (num_pairs - written) / (written / e)
print(f" {100 * written / num_pairs:5.1f}% {written / 1e6:.0f}M pairs"
f" {written / e / 1e6:.2f}M/s ETA {eta:.0f}s", flush=True)

print(f" Generated in {time.time() - t0:.1f}s")
82 changes: 82 additions & 0 deletions benchmark/modes.py
@@ -0,0 +1,82 @@
"""Mode definitions, constants, and path configuration."""
import os
from pathlib import Path

# ── Paths ────────────────────────────────────────────────────────────────────

BENCH_DIR = Path("/tmp/fastp_benchmark")
DATA_DIR = BENCH_DIR / "data"
OUT_DIR = BENCH_DIR / "output"
RESULTS_JSON = BENCH_DIR / "results.json"

CPU_COUNT = os.cpu_count() or 4

# ── Mode definitions ─────────────────────────────────────────────────────────

PE_MODES = ["fq-fq", "fq-gz", "gz-fq", "gz-gz"]
SE_MODES = ["se-fq-fq", "se-fq-gz", "se-gz-fq", "se-gz-gz"]
ALL_MODES = PE_MODES + SE_MODES + ["stdin-stdout"]

# Feature test modes: exercise specific fastp features beyond default filtering
FEAT_MODES = [
"merge", "correction", "dedup-pe", "dedup-se",
"qualcut-pe", "qualcut-se", "ht-pe",
]

FEAT_DEFS = {
"merge": {"base": "pe", "in_fmt": "gz", "out_fmt": "gz", "data": "merge",
"extra": ["--merge", "--correction"]},
"correction": {"base": "pe", "in_fmt": "gz", "out_fmt": "gz", "data": "merge",
"extra": ["--correction"]},
"dedup-pe": {"base": "pe", "in_fmt": "gz", "out_fmt": "gz",
"extra": ["--dedup"]},
"dedup-se": {"base": "se", "in_fmt": "gz", "out_fmt": "gz",
"extra": ["--dedup"]},
"qualcut-pe": {"base": "pe", "in_fmt": "gz", "out_fmt": "gz",
"extra": ["--cut_front", "--cut_tail", "-W", "4", "-M", "20"]},
"qualcut-se": {"base": "se", "in_fmt": "gz", "out_fmt": "gz",
"extra": ["--cut_front", "--cut_tail", "-W", "4", "-M", "20"]},
"ht-pe": {"base": "pe", "in_fmt": "gz", "out_fmt": "gz",
"extra": [], "threads": 32},
}

MODE_ALIASES = {
"all-pe": PE_MODES,
"all-se": SE_MODES + ["stdin-stdout"],
"all-feat": FEAT_MODES,
"everything": ALL_MODES + FEAT_MODES,
}


def parse_mode(mode: str) -> tuple[str, str, str]:
"""Parse mode string -> (type, in_fmt, out_fmt).

Returns ("stdin", "", "") for stdin-stdout,
("se", "fq"|"gz", "fq"|"gz") for se-* modes,
("pe", "fq"|"gz", "fq"|"gz") for bare fq-fq etc.
Feature modes (merge, correction, etc.) resolve via FEAT_DEFS.
"""
if mode in FEAT_DEFS:
d = FEAT_DEFS[mode]
return (d["base"], d["in_fmt"], d["out_fmt"])
if mode == "stdin-stdout":
return ("stdin", "", "")
if mode.startswith("se-"):
parts = mode[3:].split("-")
return ("se", parts[0], parts[1])
parts = mode.split("-")
return ("pe", parts[0], parts[1])


def auto_threads(mode: str) -> int:
"""Calculate worker threads, reserving reader/writer for each mode.

PE: 2 readers (L+R) + 2 writers (L+R) = reserve 4
SE: 1 reader + 1 writer = reserve 2
Feature modes may override thread count (e.g. ht-pe forces 32).
"""
if mode in FEAT_DEFS and "threads" in FEAT_DEFS[mode]:
return FEAT_DEFS[mode]["threads"]
mtype = parse_mode(mode)[0]
reserved = 4 if mtype == "pe" else 2
return max(1, CPU_COUNT - reserved)