Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 27 additions & 16 deletions cram/cram_encode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1502,7 +1502,10 @@ int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos,

// Ensure ref and hist are large enough.
static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
hts_pos_t ref_start, hts_pos_t *ref_end) {
hts_pos_t ref_start, hts_pos_t *ref_end,
hts_pos_t *max_pos) {
Comment thread
daviesrob marked this conversation as resolved.
Outdated
if (*max_pos < pos)
*max_pos = pos;
if (pos < ref_start)
return -1;
if (pos < *ref_end)
Expand Down Expand Up @@ -1546,7 +1549,7 @@ static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
// Returns 1 on success, <0 on failure
static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
hts_pos_t ref_start, hts_pos_t *ref_end,
const uint8_t *MD) {
hts_pos_t *max_pos, const uint8_t *MD) {
uint8_t *seq = bam_get_seq(b);
uint32_t *cigar = bam_get_cigar(b);
uint32_t ncigar = b->core.n_cigar;
Expand All @@ -1558,7 +1561,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],

// No sequence means extend based on CIGAR instead
if (!b->core.l_qseq && extend_ref(ref, hist, rseq_end,
ref_start, ref_end) < 0)
ref_start, ref_end, max_pos) < 0)
return -1;

int iseq = 0, next_op;
Expand All @@ -1573,7 +1576,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow);
if (overflow ||
extend_ref(ref, hist, iref+ref_start + len,
ref_start, ref_end) < 0)
ref_start, ref_end, max_pos) < 0)
return -1;
while (iseq < b->core.l_qseq && len) {
// rewrite to have internal loops?
Expand Down Expand Up @@ -1605,7 +1608,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
MD++;
while (isalpha(*MD)) {
if (extend_ref(ref, hist, iref+ref_start, ref_start,
ref_end) < 0)
ref_end, max_pos) < 0)
return -1;
if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
&iseq, &cig_ind, &cig_op,
Expand All @@ -1621,7 +1624,8 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
}
} else {
// substitution
if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0)
if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end,
max_pos) < 0)
return -1;
if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
&iseq, &cig_ind, &cig_op,
Expand Down Expand Up @@ -1650,12 +1654,14 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
// Returns >=0 on success,
// -1 on failure (eg inconsistent data)
static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5],
hts_pos_t ref_start, hts_pos_t *ref_end) {
hts_pos_t ref_start, hts_pos_t *ref_end,
hts_pos_t *max_pos) {
const uint8_t *MD = bam_aux_get(b, "MD");
int ret = 0;
if (MD && *MD == 'Z') {
// We can use MD to directly compute the reference
int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1);
int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end,
max_pos, MD+1);

if (ret > 0)
return ret;
Expand Down Expand Up @@ -1683,7 +1689,7 @@ static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5],
static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4};

if (extend_ref(ref, hist, iref+ref_start + len,
ref_start, ref_end) < 0)
ref_start, ref_end, max_pos) < 0)
return -1;
if (iseq + len <= b->core.l_qseq) {
// Nullify failed MD:Z if appropriate
Expand Down Expand Up @@ -1726,15 +1732,15 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
// user told us to do embed_ref=2.
char *ref = NULL;
uint32_t (*hist)[5] = NULL;
hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0;
hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0, max_pos = 0;
if (ref_start < 0)
return -1; // cannot build consensus from unmapped data

// initial allocation
if (extend_ref(&ref, &hist,
c->bams[r1 + s->hdr->num_records-1]->core.pos +
c->bams[r1 + s->hdr->num_records-1]->core.l_qseq,
ref_start, &ref_end) < 0)
ref_start, &ref_end, &max_pos) < 0)
return -1;

// Add each bam file to the reference/consensus arrays
Expand All @@ -1746,7 +1752,8 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
goto err;
}
last_pos = c->bams[r1]->core.pos;
if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0)
if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end,
&max_pos) < 0)
goto err;
}

Expand All @@ -1768,8 +1775,9 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
// ref file.
c->ref = ref;
c->ref_start = ref_start+1;
c->ref_end = ref_end+1;
c->ref_end = max_pos+1;
c->ref_free = 1;

return 0;

err:
Expand Down Expand Up @@ -2012,8 +2020,10 @@ int cram_encode_container(cram_fd *fd, cram_container *c) {
pthread_mutex_unlock(&fd->ref_lock);
}

if (c->ref_end > fd->refs->ref_id[c->ref_id]->LN_length)
c->ref_end = fd->refs->ref_id[c->ref_id]->LN_length;
hts_pos_t rlen = MAX(fd->refs->ref_id[c->ref_id]->LN_length,
fd->refs->ref_id[c->ref_id]->length);
if (c->ref_end > rlen && rlen)
c->ref_end = rlen;
}

// Iterate through records creating the cram blocks for some
Expand Down Expand Up @@ -3927,7 +3937,8 @@ static int process_one_read(cram_fd *fd, cram_container *c,
if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
cram_stats_del(c->stats[DS_NP], p->mate_pos);
cram_stats_del(c->stats[DS_MF], p->mate_flags);
if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN))
if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN)
&& !explicit_tlen)
cram_stats_del(c->stats[DS_TS], p->tlen);
cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
}
Expand Down
10 changes: 10 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -894,6 +894,16 @@ sub test_view
testv $opts, "./test_view $tv_args -p $ersam2 $ercram";
testv $opts, "./compare_sam.pl $ersam $ersam2";

# Embed_ref=2 with CRAM v4 uses explicit_len if it has to instead of
# breaking pairs with detached mode.
# Oddly this bug was only triggered when also specifying a reference.
$ersam = "xx#pair.sam";
$ercram = "xx#pair.tmp.cram";
$ersam2 = "${ercram}.sam";
testv $opts, "./test_view $tv_args -o version=4.0 -o embed_ref=2 -t xx.fa -C -p $ercram $ersam";
testv $opts, "./test_view $tv_args -p $ersam2 $ercram";
testv $opts, "./compare_sam.pl $ersam $ersam2";

if ($test_view_failures == 0) {
passed($opts, "embed_ref=2 tests");
} else {
Expand Down
Loading