diff --git a/bgzf.c b/bgzf.c
index 307c534d1..3e472bb01 100644
--- a/bgzf.c
+++ b/bgzf.c
@@ -2257,6 +2257,34 @@ int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
     return bgzf_seek_common(fp, pos >> 16, pos & 0xFFFF);
 }
 
+int64_t bgzf_seek_limit(BGZF* fp, int64_t pos, int where, int64_t limit)
+{
+    if (fp->is_write || where != SEEK_SET || fp->is_gzip) {
+        fp->errcode |= BGZF_ERR_MISUSE;
+        return -1;
+    }
+
+    fp->seeked = pos;
+
+    // Perform the seek first - bgzf_seek_common calls hseek which clears readahead_limit
+    int64_t ret = bgzf_seek_common(fp, pos >> 16, pos & 0xFFFF);
+    if (ret < 0)
+        return ret;
+
+    // Set readahead limit hint AFTER seek (hseek clears it, so must be set after)
+    // This enables bounded Range requests for remote backends
+    // NOTE(review): with a multi-threaded reader (fp->mt) the underlying hseek
+    // happens asynchronously and may clear this hint after it is set - confirm ordering.
+    if (limit > 0 && fp->fp) {
+        off_t compressed_limit = limit >> 16;
+        // Add some buffer for BGZF block overhead (~64KB worst case)
+        compressed_limit += 65536;
+        hfile_set_readahead_limit(fp->fp, compressed_limit);
+    }
+
+    return ret;
+}
+
 int bgzf_is_bgzf(const char *fn)
 {
     uint8_t buf[16];
diff --git a/hfile.c b/hfile.c
index 044f0d4bc..a557e1e24 100644
--- a/hfile.c
+++ b/hfile.c
@@ -126,6 +126,7 @@ hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity)
     fp->limit = &fp->buffer[capacity];
 
     fp->offset = 0;
+    fp->readahead_limit = 0;
     fp->at_eof = 0;
     fp->mobile = 1;
     fp->readonly = (strchr(mode, 'r') && ! strchr(mode, '+'));
@@ -149,6 +150,7 @@ hFILE *hfile_init_fixed(size_t struct_size, const char *mode,
     fp->limit = &fp->buffer[buf_size];
 
     fp->offset = 0;
+    fp->readahead_limit = 0;
     fp->at_eof = 1;
     fp->mobile = 0;
     fp->readonly = (strchr(mode, 'r') && ! strchr(mode, '+'));
@@ -484,9 +486,15 @@ off_t hseek(hFILE *fp, off_t offset, int whence)
 
     fp->at_eof = 0;
     fp->offset = pos;
+    fp->readahead_limit = 0; // Clear hint after seek
     return pos;
 }
 
+void hfile_set_readahead_limit(hFILE *fp, off_t limit)
+{
+    fp->readahead_limit = limit;
+}
+
 int hclose(hFILE *fp)
 {
     int err = fp->has_errno;
diff --git a/hfile_libcurl.c b/hfile_libcurl.c
index 0ca15d0e9..52c057f3d 100644
--- a/hfile_libcurl.c
+++ b/hfile_libcurl.c
@@ -1138,9 +1138,8 @@ static int restart_from_position(hFILE_libcurl *fp, off_t pos) {
     int update_headers = 0;
     int save_errno = 0;
 
-    // TODO If we seem to be doing random access, use CURLOPT_RANGE to do
-    // limited reads (e.g. about a BAM block!) so seeking can reuse the
-    // existing connection more often.
+    // When readahead_limit is set (via hfile_set_readahead_limit from BAM index),
+    // use CURLOPT_RANGE for bounded reads instead of reading to EOF.
 
     // Get new headers from the callback (if defined). This changes the
     // headers in fp before it gets duplicated, but they should be have been
@@ -1182,7 +1181,24 @@ static int restart_from_position(hFILE_libcurl *fp, off_t pos) {
     if (!temp_fp.easy)
         goto early_error;
 
-    err = curl_easy_setopt(temp_fp.easy, CURLOPT_RESUME_FROM_LARGE,(curl_off_t)pos);
+    // Use bounded Range request if readahead_limit is set (from BAM index chunk info)
+    if (fp->base.readahead_limit > 0 && fp->base.readahead_limit > pos) {
+        char range[80];
+        off_t end = fp->base.readahead_limit - 1;
+        if (fp->file_size > 0 && end >= fp->file_size)
+            end = fp->file_size - 1;
+        snprintf(range, sizeof(range), "%lld-%lld", (long long)pos, (long long)end);
+        // Reset any resume offset inherited from the duplicated handle; the
+        // Range option alone determines the bytes requested.
+        err = curl_easy_setopt(temp_fp.easy, CURLOPT_RESUME_FROM_LARGE, (curl_off_t)0);
+        err |= curl_easy_setopt(temp_fp.easy, CURLOPT_RANGE, range);
+    } else {
+        // No limit known - read from pos to EOF.  Clear any Range string
+        // inherited from a previous bounded request on the duplicated handle,
+        // as CURLOPT_RANGE takes precedence over CURLOPT_RESUME_FROM.
+        err = curl_easy_setopt(temp_fp.easy, CURLOPT_RANGE, NULL);
+        err |= curl_easy_setopt(temp_fp.easy, CURLOPT_RESUME_FROM_LARGE, (curl_off_t)pos);
+    }
     err |= curl_easy_setopt(temp_fp.easy, CURLOPT_PRIVATE, &temp_fp);
     err |= curl_easy_setopt(temp_fp.easy, CURLOPT_WRITEDATA, &temp_fp);
     if (err != CURLE_OK)
diff --git a/hts.c b/hts.c
index 14134a01f..d37c05089 100644
--- a/hts.c
+++ b/hts.c
@@ -4291,7 +4291,8 @@ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
         if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
             if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
             if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
-                if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) {
+                // Use bgzf_seek_limit to enable bounded HTTP Range requests for remote files
+                if (bgzf_seek_limit(fp, iter->off[iter->i+1].u, SEEK_SET, iter->off[iter->i+1].v) < 0) {
                     hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
                                   iter->off[iter->i+1].u,
                                   errno ? ": " : "", strerror(errno));
diff --git a/htslib/bgzf.h b/htslib/bgzf.h
index 36a4ff77a..acfad84bc 100644
--- a/htslib/bgzf.h
+++ b/htslib/bgzf.h
@@ -274,6 +274,22 @@ ssize_t bgzf_write_small(BGZF *fp, const void *data, size_t length) {
     HTSLIB_EXPORT
     int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence) HTS_RESULT_USED;
 
+    /**
+     * Set the virtual file pointer, like bgzf_seek, with a readahead limit hint.
+     *
+     * The limit is the virtual file offset up to which data will be read.
+     * For remote files, this enables bounded HTTP Range requests instead of
+     * reading to EOF. Use when the read extent is known (e.g., from BAM index).
+     *
+     * @param fp     BGZF file handler
+     * @param pos    virtual file offset
+     * @param whence must be SEEK_SET
+     * @param limit  virtual file offset limit (0 = no limit)
+     * @return       0 on success and negative on error
+     */
+    HTSLIB_EXPORT
+    int64_t bgzf_seek_limit(BGZF *fp, int64_t pos, int whence, int64_t limit) HTS_RESULT_USED;
+
     /**
      * Check if the BGZF end-of-file (EOF) marker is present
      *
diff --git a/htslib/hfile.h b/htslib/hfile.h
index e851faf43..dacf4fde9 100644
--- a/htslib/hfile.h
+++ b/htslib/hfile.h
@@ -57,6 +57,7 @@ typedef struct hFILE {
     char *buffer, *begin, *end, *limit;
     const struct hFILE_backend *backend;
     off_t offset;
+    off_t readahead_limit; // Hint: upper bound for next read (0 = no limit)
     unsigned at_eof:1, mobile:1, readonly:1, preserve:1;
     int has_errno;
     // @endcond
@@ -149,6 +150,17 @@ static inline void hclearerr(hFILE *fp)
 HTSLIB_EXPORT
 off_t hseek(hFILE *fp, off_t offset, int whence) HTS_RESULT_USED;
 
+/// Set a readahead limit hint for remote backends
+/** @param limit Upper bound file offset for the next reads (0 = no limit)
+
+For remote file backends (HTTP, S3, etc.), this hint enables bounded
+range requests instead of reading to EOF.  Note that hseek() clears the
+hint, so callers must set it after seeking to the start of a known
+chunk (e.g., from a BAM index) to get efficient partial fetches.
+*/
+HTSLIB_EXPORT
+void hfile_set_readahead_limit(hFILE *fp, off_t limit);
+
 /// Report the current stream offset
 /** @return The offset within the stream, starting from zero.
 */