Skip to content

Commit

Permalink
r718: retrieve sequence from the index
Browse files Browse the repository at this point in the history
  • Loading branch information
lh3 committed Feb 23, 2018
1 parent 29ed675 commit 24a4808
Show file tree
Hide file tree
Showing 9 changed files with 115 additions and 20 deletions.
40 changes: 31 additions & 9 deletions index.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq)
typedef khash_t(idx) idxhash_t;

KHASH_MAP_INIT_STR(str, uint32_t)

#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))

typedef struct mm_idx_bucket_s {
Expand All @@ -29,15 +31,6 @@ typedef struct mm_idx_bucket_s {
void *h; // hash table indexing _p_ and minimizers appearing once
} mm_idx_bucket_t;

void mm_idxopt_init(mm_idxopt_t *opt)
{
memset(opt, 0, sizeof(mm_idxopt_t));
opt->k = 15, opt->w = 10, opt->flag = 0;
opt->bucket_bits = 14;
opt->mini_batch_size = 50000000;
opt->batch_size = 4000000000ULL;
}

mm_idx_t *mm_idx_init(int w, int k, int b, int flag)
{
mm_idx_t *mi;
Expand All @@ -54,6 +47,7 @@ void mm_idx_destroy(mm_idx_t *mi)
{
int i;
if (mi == 0) return;
if (mi->h) kh_destroy(str, (khash_t(str)*)mi->h);
for (i = 0; i < 1<<mi->b; ++i) {
free(mi->B[i].p);
free(mi->B[i].a.a);
Expand Down Expand Up @@ -109,6 +103,34 @@ void mm_idx_stat(const mm_idx_t *mi)
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), n, 100.0*n1/n, (double)sum / n, (double)len / sum);
}

int mm_idx_index_name(mm_idx_t *mi)
{
khash_t(str) *h;
uint32_t i;
int has_dup = 0, absent;
if (mi->h) return 0;
h = kh_init(str);
for (i = 0; i < mi->n_seq; ++i) {
khint_t k;
k = kh_put(str, h, mi->seq[i].name, &absent);
if (absent) kh_val(h, k) = i;
else has_dup = 1;
}
mi->h = h;
if (has_dup && mm_verbose >= 2)
fprintf(stderr, "[WARNING] some database sequences have identical sequence names\n");
return has_dup;
}

int mm_idx_name2id(const mm_idx_t *mi, const char *name)
{
khash_t(str) *h = (khash_t(str)*)mi->h;
khint_t k;
if (h == 0) return -2;
k = kh_get(str, h, name);
return k == kh_end(h)? -1 : kh_val(h, k);
}

int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
{
uint64_t i, st1, en1;
Expand Down
2 changes: 1 addition & 1 deletion main.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include "mmpriv.h"
#include "getopt.h"

#define MM_VERSION "2.8-r711-dirty"
#define MM_VERSION "2.8-r718-dirty"

#ifdef __linux__
#include <sys/resource.h>
Expand Down
7 changes: 6 additions & 1 deletion minimap.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ typedef struct {
mm_idx_seq_t *seq; // sequence name, length and offset
uint32_t *S; // 4-bit packed sequence
struct mm_idx_bucket_s *B; // index (hidden)
void *km;
void *km, *h;
} mm_idx_t;

// minimap2 alignment
Expand Down Expand Up @@ -297,6 +297,11 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int

int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads);

// query sequence name and sequence in the minimap2 index
int mm_idx_index_name(mm_idx_t *mi);
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);

// deprecated APIs for backward compatibility
void mm_mapopt_init(mm_mapopt_t *opt);
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
Expand Down
1 change: 0 additions & 1 deletion mmpriv.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se

void mm_idxopt_init(mm_idxopt_t *opt);
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);
Expand Down
9 changes: 9 additions & 0 deletions options.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
#include <stdio.h>
#include "mmpriv.h"

void mm_idxopt_init(mm_idxopt_t *opt)
{
memset(opt, 0, sizeof(mm_idxopt_t));
opt->k = 15, opt->w = 10, opt->flag = 0;
opt->bucket_bits = 14;
opt->mini_batch_size = 50000000;
opt->batch_size = 4000000000ULL;
}

void mm_mapopt_init(mm_mapopt_t *opt)
{
memset(opt, 0, sizeof(mm_mapopt_t));
Expand Down
17 changes: 14 additions & 3 deletions python/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ The following Python script demonstrates the key functionality of mappy:
import mappy as mp
a = mp.Aligner("test/MT-human.fa") # load or build index
if not a: raise Exception("ERROR: failed to load/build index")
s = a.seq("MT_human", 100, 200) # retrieve a subsequence from the index
print(mp.revcomp(s)) # reverse complement
for name, seq, qual in mp.fastx_read("test/MT-orang.fa"): # read a fasta/q sequence
for hit in a.map(seq): # traverse alignments
print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
Expand Down Expand Up @@ -87,7 +89,15 @@ This method aligns :code:`seq` against the index. It is a generator, *yielding*
a series of :code:`mappy.Alignment` objects. If :code:`seq2` is present, mappy
performs paired-end alignment, assuming the two ends are in the FR orientation.
Alignments of the two ends can be distinguished by the :code:`read_num` field
(see below).
(see Class mappy.Alignment below).

.. code:: python
mappy.Aligner.seq(name, start=0, end=0x7fffffff)
This method retrieves a (sub)sequence from the index and returns it as a Python
string. :code:`None` is returned if :code:`name` is not present in the index or
the start/end coordinates are invalid.

Class mappy.Alignment
~~~~~~~~~~~~~~~~~~~~~
Expand Down Expand Up @@ -144,11 +154,12 @@ Miscellaneous Functions

.. code:: python
mappy.fastx_read(fn)
mappy.fastx_read(fn, read_comment=False)
This generator function opens a FASTA/FASTQ file and *yields* a
:code:`(name,seq,qual)` tuple for each sequence entry. The input file may be
optionally gzip'd.
optionally gzip'd. If :code:`read_comment` is True, this generator yields
a :code:`(name,seq,qual,comment)` tuple instead.

.. code:: python
Expand Down
18 changes: 18 additions & 0 deletions python/cmappy.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,4 +112,22 @@ static inline char *mappy_revcomp(int len, const uint8_t *seq)
return rev;
}

static char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *len)
{
int i, rid;
char *s;
*len = 0;
rid = mm_idx_name2id(mi, name);
if (rid < 0) return 0;
if (st >= mi->seq[i].len || st >= en) return 0;
if (en < 0 || en > mi->seq[i].len)
en = mi->seq[i].len;
s = (char*)malloc(en - st + 1);
*len = mm_idx_getseq(mi, rid, st, en, s);
for (i = 0; i < *len; ++i)
s[i] = "ACGTN"[(uint8_t)s[i]];
s[*len] = 0;
return s;
}

#endif
5 changes: 4 additions & 1 deletion python/cmappy.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ cdef extern from "minimap.h":
mm_idx_seq_t *seq
uint32_t *S
mm_idx_bucket_t *B
void *km
void *km, *h

ctypedef struct mm_idx_reader_t:
pass
Expand All @@ -69,6 +69,8 @@ cdef extern from "minimap.h":
void mm_idx_destroy(mm_idx_t *mi)
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)

int mm_idx_index_name(mm_idx_t *mi)

#
# Mapping (key struct defined in cmappy.h below)
#
Expand Down Expand Up @@ -99,6 +101,7 @@ cdef extern from "cmappy.h":
void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
void mm_free_reg1(mm_reg1_t *r)
mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *l)

ctypedef struct kstring_t:
unsigned l, m
Expand Down
36 changes: 32 additions & 4 deletions python/mappy.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ cdef class Aligner:
self._idx = cmappy.mm_idx_reader_read(r, n_threads) # NB: ONLY read the first part
cmappy.mm_idx_reader_close(r)
cmappy.mm_mapopt_update(&self.map_opt, self._idx)
cmappy.mm_idx_index_name(self._idx)

def __dealloc__(self):
if self._idx is not NULL:
Expand All @@ -140,8 +141,13 @@ cdef class Aligner:
if self._idx is NULL: return None
if buf is None: b = ThreadBuffer()
else: b = buf
if seq2 is None: regs = cmappy.mm_map_aux(self._idx, str.encode(seq), NULL, &n_regs, b._b, &self.map_opt)
else: regs = cmappy.mm_map_aux(self._idx, str.encode(seq), str.encode(seq2), &n_regs, b._b, &self.map_opt)

_seq = seq if isinstance(seq, bytes) else seq.encode()
if seq2 is None:
regs = cmappy.mm_map_aux(self._idx, _seq, NULL, &n_regs, b._b, &self.map_opt)
else:
_seq2 = seq2 if isinstance(seq2, bytes) else seq2.encode()
regs = cmappy.mm_map_aux(self._idx, _seq, _seq2, &n_regs, b._b, &self.map_opt)

for i in range(n_regs):
cmappy.mm_reg2hitpy(self._idx, &regs[i], &h)
Expand All @@ -153,7 +159,24 @@ cdef class Aligner:
cmappy.mm_free_reg1(&regs[i])
free(regs)

def fastx_read(fn):
def seq(self, str name, int start=0, int end=0x7fffffff):
cdef int l
cdef char *s = cmappy.mappy_fetch_seq(self._idx, name.encode(), start, end, &l)
if l == 0: return None
r = s[:l] if isinstance(s, str) else s[:l].decode()
free(s)
return r

@property
def k(self): return self._idx.k

@property
def w(self): return self._idx.w

@property
def n_seq(self): return self._idx.n_seq

def fastx_read(fn, read_comment=False):
cdef cmappy.kseq_t *ks
ks = cmappy.mm_fastx_open(str.encode(fn))
if ks is NULL: return None
Expand All @@ -162,7 +185,12 @@ def fastx_read(fn):
else: qual = None
name = ks.name.s if isinstance(ks.name.s, str) else ks.name.s.decode()
seq = ks.seq.s if isinstance(ks.seq.s, str) else ks.seq.s.decode()
yield name, seq, qual
if read_comment:
if ks.comment.l > 0: comment = ks.comment.s if isinstance(ks.comment.s, str) else ks.comment.s.decode()
else: comment = None
yield name, seq, qual, comment
else:
yield name, seq, qual
cmappy.mm_fastx_close(ks)

def revcomp(seq):
Expand Down

0 comments on commit 24a4808

Please sign in to comment.