Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions bgzf.c
Original file line number Diff line number Diff line change
Expand Up @@ -2257,6 +2257,32 @@ int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
return bgzf_seek_common(fp, pos >> 16, pos & 0xFFFF);
}

int64_t bgzf_seek_limit(BGZF* fp, int64_t pos, int where, int64_t limit)
{
    // Only absolute seeks on plain BGZF read handles are supported.
    if (fp->is_write || where != SEEK_SET || fp->is_gzip) {
        fp->errcode |= BGZF_ERR_MISUSE;
        return -1;
    }

    fp->seeked = pos;

    // Seek first: bgzf_seek_common ends up in hseek(), which resets any
    // previously-set readahead limit, so the hint must be applied afterwards.
    int64_t ret = bgzf_seek_common(fp, pos >> 16, pos & 0xFFFF);
    if (ret < 0)
        return ret;

    // Translate the virtual-offset limit into a compressed-file offset and
    // pass it down as a hint, enabling bounded Range requests for remote
    // backends.  Pad by one maximal BGZF block (64 KiB) so the block that
    // starts at (limit >> 16) can still be read in full.
    if (limit > 0 && fp->fp) {
        off_t bound = (off_t)(limit >> 16) + 65536;
        hfile_set_readahead_limit(fp->fp, bound);
    }

    return ret;
}

int bgzf_is_bgzf(const char *fn)
{
uint8_t buf[16];
Expand Down
8 changes: 8 additions & 0 deletions hfile.c
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity)
fp->limit = &fp->buffer[capacity];

fp->offset = 0;
fp->readahead_limit = 0;
fp->at_eof = 0;
fp->mobile = 1;
fp->readonly = (strchr(mode, 'r') && ! strchr(mode, '+'));
Expand All @@ -149,6 +150,7 @@ hFILE *hfile_init_fixed(size_t struct_size, const char *mode,
fp->limit = &fp->buffer[buf_size];

fp->offset = 0;
fp->readahead_limit = 0;
fp->at_eof = 1;
fp->mobile = 0;
fp->readonly = (strchr(mode, 'r') && ! strchr(mode, '+'));
Expand Down Expand Up @@ -484,9 +486,15 @@ off_t hseek(hFILE *fp, off_t offset, int whence)
fp->at_eof = 0;

fp->offset = pos;
fp->readahead_limit = 0; // Clear hint after seek
return pos;
}

/* Record an upper-bound file offset for the next read sequence.  This is
 * purely a hint for backends that can issue bounded range requests; a
 * limit of 0 means "no limit".  The hint is cleared again by hseek(). */
void hfile_set_readahead_limit(hFILE *fp, off_t limit)
{
    fp->readahead_limit = limit;
}

int hclose(hFILE *fp)
{
int err = fp->has_errno;
Expand Down
18 changes: 14 additions & 4 deletions hfile_libcurl.c
Original file line number Diff line number Diff line change
Expand Up @@ -1138,9 +1138,8 @@ static int restart_from_position(hFILE_libcurl *fp, off_t pos) {
int update_headers = 0;
int save_errno = 0;

// TODO If we seem to be doing random access, use CURLOPT_RANGE to do
// limited reads (e.g. about a BAM block!) so seeking can reuse the
// existing connection more often.
// When readahead_limit is set (via hfile_set_readahead_limit from BAM index),
// use CURLOPT_RANGE for bounded reads instead of reading to EOF.

// Get new headers from the callback (if defined). This changes the
// headers in fp before it gets duplicated, but they should have been
Expand Down Expand Up @@ -1182,7 +1181,18 @@ static int restart_from_position(hFILE_libcurl *fp, off_t pos) {
if (!temp_fp.easy)
goto early_error;

err = curl_easy_setopt(temp_fp.easy, CURLOPT_RESUME_FROM_LARGE,(curl_off_t)pos);
// Use bounded Range request if readahead_limit is set (from BAM index chunk info)
if (fp->base.readahead_limit > 0 && fp->base.readahead_limit > pos) {
char range[80];
off_t end = fp->base.readahead_limit - 1;
if (fp->file_size > 0 && end >= fp->file_size)
end = fp->file_size - 1;
snprintf(range, sizeof(range), "%lld-%lld", (long long)pos, (long long)end);
err = curl_easy_setopt(temp_fp.easy, CURLOPT_RANGE, range);
} else {
// No limit known - read from pos to EOF
err = curl_easy_setopt(temp_fp.easy, CURLOPT_RESUME_FROM_LARGE, (curl_off_t)pos);
}
err |= curl_easy_setopt(temp_fp.easy, CURLOPT_PRIVATE, &temp_fp);
err |= curl_easy_setopt(temp_fp.easy, CURLOPT_WRITEDATA, &temp_fp);
if (err != CURLE_OK) {
Expand Down
3 changes: 2 additions & 1 deletion hts.c
Original file line number Diff line number Diff line change
Expand Up @@ -4291,7 +4291,8 @@ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) {
// Use bgzf_seek_limit to enable bounded HTTP Range requests for remote files
if (bgzf_seek_limit(fp, iter->off[iter->i+1].u, SEEK_SET, iter->off[iter->i+1].v) < 0) {
hts_log_error("Failed to seek to offset %"PRIu64"%s%s",
iter->off[iter->i+1].u,
errno ? ": " : "", strerror(errno));
Expand Down
16 changes: 16 additions & 0 deletions htslib/bgzf.h
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,22 @@ ssize_t bgzf_write_small(BGZF *fp, const void *data, size_t length) {
HTSLIB_EXPORT
int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence) HTS_RESULT_USED;

/**
* Set the virtual file pointer, like bgzf_seek, with a readahead limit hint.
*
* The limit is the virtual file offset up to which data will be read.
* For remote files, this enables bounded HTTP Range requests instead of
* reading to EOF. Use when the read extent is known (e.g., from BAM index).
*
* @param fp BGZF file handler
* @param pos virtual file offset
* @param whence must be SEEK_SET
* @param limit virtual file offset limit (0 = no limit)
* @return non-negative virtual offset on success; -1 on error
*/
HTSLIB_EXPORT
int64_t bgzf_seek_limit(BGZF *fp, int64_t pos, int whence, int64_t limit) HTS_RESULT_USED;

/**
* Check if the BGZF end-of-file (EOF) marker is present
*
Expand Down
12 changes: 12 additions & 0 deletions htslib/hfile.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ typedef struct hFILE {
char *buffer, *begin, *end, *limit;
const struct hFILE_backend *backend;
off_t offset;
off_t readahead_limit; // Hint: upper bound for next read (0 = no limit)
unsigned at_eof:1, mobile:1, readonly:1, preserve:1;
int has_errno;
// @endcond
Expand Down Expand Up @@ -149,6 +150,17 @@ static inline void hclearerr(hFILE *fp)
HTSLIB_EXPORT
off_t hseek(hFILE *fp, off_t offset, int whence) HTS_RESULT_USED;

/// Set a readahead limit hint for remote backends
/** @param limit Upper bound file offset for next read sequence (0 = no limit)

For remote file backends (HTTP, S3, etc.), this hint enables bounded
range requests instead of reading to EOF. Set before seeking to a known
chunk boundary (e.g., from BAM index) to enable efficient partial fetches.
The limit is automatically cleared after the next seek.
*/
HTSLIB_EXPORT
void hfile_set_readahead_limit(hFILE *fp, off_t limit);

/// Report the current stream offset
/** @return The offset within the stream, starting from zero.
*/
Expand Down