forked from sourmash-bio/sourmash
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompute-dna-mh-another-way.py
executable file
·73 lines (56 loc) · 1.8 KB
/
compute-dna-mh-another-way.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#! /usr/bin/env python
"""
Use the MurmurHash library mmh3 and separate Python code to calculate
a MinHash signature for input DNA sequence, as a way to do an
external check on our C++ implementation.
The output of this is used in test_sourmash.py to verify our C++ code.
"""
__complementTranslation = { "A": "T", "C": "G", "G": "C", "T": "A", "N": "N" }
def complement(s):
"""
Return complement of 's'.
"""
c = "".join(__complementTranslation[n] for n in s)
return c
def reverse(s):
"""
Return reverse of 's'.
"""
r = "".join(reversed(s))
return r
def kmers(seq, k):
for start in range(len(seq) - k + 1):
yield seq[start:start + k]
###
K = 21
import sys, screed
import mmh3
import sourmash
print('imported sourmash:', sourmash, file=sys.stderr)
import sourmash.signature
record = next(iter(screed.open(sys.argv[1])))
print('loaded', record.name, file=sys.stderr)
revcomp = reverse(complement((record.sequence)))
mh = sourmash.MinHash(ksize=K, n=500, is_protein=False)
#
# compute the actual hashes to insert by breaking down the sequence
# into k-mers and applying MurmurHash to each one; here, the only
# interesting thing that is done by add_hash is to keep only the
# (numerically) lowest n=500 hashes.
#
# this method of hash computation is exactly how sourmash does it
# internally, and should be approximately the same as what mash does.
#
for fwd_kmer in kmers(record.sequence, K):
rev_kmer = reverse(complement(fwd_kmer))
if fwd_kmer < rev_kmer:
kmer = fwd_kmer
else:
kmer = rev_kmer
hash = mmh3.hash64(kmer, seed=42)[0]
# convert to unsigned int if negative
if hash < 0:
hash += 2**64
mh.add_hash(hash)
s = sourmash.signature.SourmashSignature('', mh, name=record.name)
print(sourmash.signature.save_signatures([s]))