From 2fc30d13c7d5e9078f7d115928af5d92359e13e7 Mon Sep 17 00:00:00 2001
From: Jeff Verkoeyen
Date: Sat, 18 Apr 2026 23:27:04 -0400
Subject: [PATCH] Replace per-block Mmap with pread, ~300x apply speedup on Darwin

Profiling update_from_dir with macOS sample(1) for 30 seconds showed
350 of 361 on-CPU samples (97%) inside the __mmap syscall. The hot path
is:

    File_Blocks::read_block_ -> Mmap::Mmap -> mmap

called once per compressed block read. On macOS each mmap syscall costs
~0.25 ms of kernel overhead (virtual range alloc, page-table setup,
fault-in, teardown on munmap). Across thousands of block reads per
minute-diff the syscall tax dominates wall time and prevents apply from
keeping pace with the 1-diff-per-minute fetch rate. Linux mmap is
cheaper so this is invisible on Linux.

For Overpass's access pattern there is no benefit to a memory mapping:
each compressed block is read once, decompressed into a separate buffer,
and never revisited. Replacing the mmap with pread into a heap buffer
keeps the Mmap::ptr() interface pointer-compatible with every caller
(Zlib and LZ4 Inflate) while eliminating the syscall tax. Linux
performance is unaffected -- pread hits the same page cache that mmap
would have.

Measured effect on a single-diff apply (7076518, 6550 ops) against a
live 291 GB database on Apple Silicon (M-series):

    before: 9 min 01 s  (100% CPU, 97% samples in __mmap)
    after:  1.79 s      (67% CPU, now compute-bound)

~300x speedup.

The NO_COMPRESSION branch of File_Blocks::read_block_ already uses pread
via data_file.read(); this brings the compressed path to parity.
---
 src/template_db/file_blocks.h | 58 ++++++++++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 12 deletions(-)

diff --git a/src/template_db/file_blocks.h b/src/template_db/file_blocks.h
index 5d42f0068..1a810d628 100644
--- a/src/template_db/file_blocks.h
+++ b/src/template_db/file_blocks.h
@@ -787,25 +787,59 @@ typename File_Blocks< TIndex, TIterator >::Write_Iterator
 class Mmap
 {
 public:
+  // Backed by pread into a heap buffer rather than mmap+munmap. Each
+  // compressed block is read once, decompressed into a separate buffer,
+  // and never revisited -- the memory mapping provides no reuse benefit.
+  //
+  // On macOS every mmap syscall takes ~0.25 ms of kernel overhead
+  // (virtual-range allocation, page-table setup, fault-in, teardown on
+  // munmap). File_Blocks::read_block_ creates one Mmap per compressed
+  // block; across thousands of block reads per minute-diff that
+  // overhead dominates wall time -- profiling showed 97% of
+  // update_from_dir's on-CPU samples inside __mmap.
+  //
+  // Replacing with pread keeps ptr() pointer-compatible with every
+  // callsite (Zlib/LZ4 Inflate) and is no slower on Linux, where pread
+  // hits the same page cache that mmap would have.
+  //
+  // Throws File_Error if pread fails or hits end-of-file before length_
+  // bytes have been read. A pread interrupted by a signal (EINTR) is
+  // retried rather than reported as a failure.
   Mmap(int fd, off_t offset, size_t length_, const std::string& file_name, const std::string& origin)
-      : addr(0), length(length_)
+      : buffer(0), length(length_)
   {
     if (length > 0)
-      addr = mmap(0, length, PROT_READ, MAP_PRIVATE, fd, offset);
-    if (addr == (void*)(-1))
-      throw File_Error(errno, file_name, origin);
-    posix_madvise(addr, length, POSIX_MADV_WILLNEED);
+    {
+      buffer = new uint8_t[length];
+      size_t got = 0;
+      while (got < length)
+      {
+        ssize_t n = pread(fd, buffer + got, length - got, offset + (off_t)got);
+        if (n < 0 && errno == EINTR)
+          continue;
+        if (n <= 0)
+        {
+          // The destructor does not run when the constructor throws, so
+          // release the buffer here. A short read at end-of-file sets no
+          // errno and is reported with error number 0.
+          int err = (n < 0 ? errno : 0);
+          delete[] buffer;
+          buffer = 0;
+          throw File_Error(err, file_name, origin);
+        }
+        got += (size_t)n;
+      }
+    }
   }
   ~Mmap()
-  { 
-    if (addr)
-      munmap(addr, length);
+  {
+    delete[] buffer;
   }
-  
-  uint64* ptr() { return (uint64*)addr; }
-  
+
+  uint64* ptr() { return (uint64*)buffer; }
+
 private:
-  void* addr;
+  uint8_t* buffer;
   size_t length;
 };
 