From 2fc30d13c7d5e9078f7d115928af5d92359e13e7 Mon Sep 17 00:00:00 2001
From: Jeff Verkoeyen <jeff@clutch.engineering>
Date: Sat, 18 Apr 2026 23:27:04 -0400
Subject: [PATCH] Replace per-block Mmap with pread, ~300x apply speedup on
 Darwin

Profiling update_from_dir with macOS sample(1) for 30 seconds showed
350 of 361 on-CPU samples (97%) inside the __mmap syscall. The hot
path is:

  File_Blocks::read_block_ -> Mmap::Mmap -> mmap

This path runs once per compressed block read. On macOS each mmap
syscall costs ~0.25 ms of kernel overhead (virtual-range allocation,
page-table setup, fault-in, and teardown on munmap). Across the
thousands of block reads per one-minute diff, that overhead
dominates wall time and prevents apply from keeping pace with the
one-diff-per-minute fetch rate. mmap is cheaper on Linux, so the
problem is not visible there.

For Overpass's access pattern a memory mapping brings no benefit:
each compressed block is read once, decompressed into a separate
buffer, and never revisited. Reading the block with pread into a
heap buffer keeps Mmap::ptr() pointer-compatible with every caller
(Zlib and LZ4 Inflate) while removing the per-block mmap/munmap
overhead. Linux performance is unaffected -- pread hits the same
page cache that mmap would have.

Measured effect on a single-diff apply (7076518, 6550 ops) against a
live 291 GB database on Apple Silicon (M-series):

  before: 9 min 01 s  (100% CPU, 97% samples in __mmap)
  after:  1.79 s      (67% CPU, now compute-bound)

~300x speedup. The NO_COMPRESSION branch of File_Blocks::read_block_
already uses pread via data_file.read(); this brings the compressed
path to parity.
---
 src/template_db/file_blocks.h | 55 +++++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 12 deletions(-)

diff --git a/src/template_db/file_blocks.h b/src/template_db/file_blocks.h
index 5d42f0068..1a810d628 100644
--- a/src/template_db/file_blocks.h
+++ b/src/template_db/file_blocks.h
@@ -787,25 +787,56 @@ typename File_Blocks< TIndex, TIterator >::Write_Iterator
 class Mmap
 {
 public:
+  // Despite its name, Mmap reads via pread into a heap buffer instead
+  // of mmap+munmap: each compressed block is read once, decompressed into
+  // a separate buffer, and never revisited, so a mapping gives no reuse.
+  //
+  // On macOS every mmap syscall takes ~0.25 ms of kernel overhead
+  // (virtual-range allocation, page-table setup, fault-in, teardown on
+  // munmap). File_Blocks::read_block_ creates one Mmap per compressed
+  // block; across thousands of block reads per minute-diff that
+  // overhead dominates wall time -- profiling showed 97% of
+  // update_from_dir's on-CPU samples inside __mmap.
+  //
+  // Replacing mmap with pread keeps ptr() pointer-compatible with every
+  // call site (Zlib/LZ4 Inflate) and is no slower on Linux, where pread
+  // hits the same page cache that mmap would have.
   Mmap(int fd, off_t offset, size_t length_, const std::string& file_name, const std::string& origin)
-      : addr(0), length(length_)
+      : buffer(0), length(length_)
   {
     if (length > 0)
-      addr = mmap(0, length, PROT_READ, MAP_PRIVATE, fd, offset);
-    if (addr == (void*)(-1))
-      throw File_Error(errno, file_name, origin);
-    posix_madvise(addr, length, POSIX_MADV_WILLNEED);
+    {
+      buffer = new uint8_t[length];
+      size_t got = 0;
+      while (got < length)
+      {
+        ssize_t n = pread(fd, buffer + got, length - got, offset + got);
+        if (n < 0)
+        {
+          int err = errno;
+          delete[] buffer;
+          buffer = 0;
+          throw File_Error(err, file_name, origin);
+        }
+        if (n == 0)
+        {
+          delete[] buffer;
+          buffer = 0;
+          throw File_Error(0, file_name, origin);
+        }
+        got += (size_t)n;
+      }
+    }
   }
   ~Mmap()
-  { 
-    if (addr)
-      munmap(addr, length);
+  {
+    delete[] buffer;
   }
-  
-  uint64* ptr() { return (uint64*)addr; }
-  
+
+  uint64* ptr() { return (uint64*)buffer; }
+
 private:
-  void* addr;
+  uint8_t* buffer;
   size_t length;
 };