Skip to content

Commit

Permalink
prefix of X for filler taxonomies; changed default fraction cutoff
Browse files Browse the repository at this point in the history
  • Loading branch information
Nicholas Youngblut committed Jan 26, 2021
1 parent c391758 commit 7fb35bd
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 5 deletions.
10 changes: 5 additions & 5 deletions ncbi-gtdb_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@
parser.add_argument('-q', '--query-taxonomy', type=str, default='ncbi_taxonomy',
choices=['ncbi_taxonomy', 'gtdb_taxonomy'],
help='Taxonomy of the query list (Default: %(default)s)')
parser.add_argument('-f', '--fraction', type=float, default=0.51,
parser.add_argument('-f', '--fraction', type=float, default=0.90,
help='Homogeneity of LCA (fraction) in order to be used (Default: %(default)s)')
parser.add_argument('-m', '--max-tips', type=int, default=100,
help='Max no. of tips used for LCA determination. If more, subsampling w/out replacement (Default: %(default)s)')
Expand Down Expand Up @@ -205,7 +205,7 @@ def format_taxonomy(T, hierarchy, acc):
Tx = ['' for i in range(len(hierarchy))]
for i,x in enumerate(hierarchy[:-1]):
if len(T) < i + 1 or T[i] == '' or T[i] == 'unclassified' or regex.search(T[i]):
Tx[i] = '__'.join([x[0], acc])
Tx[i] = '__'.join(['X' + x[0], acc])
else:
Tx[i] = T[i]
Tx[-1] = acc
Expand Down Expand Up @@ -294,7 +294,7 @@ def load_gtdb_metadata(infile, G, completeness, contamination):
raise KeyError('Cannot find "ncbi_taxonomy"')
if X == 'none':
stats['no ncbi tax'] += 1
continue
continue
# filtering by checkM stats
try:
X = line[header['checkm_completeness']]
Expand Down Expand Up @@ -351,7 +351,7 @@ def lca_frac_pass(D, lca_frac):
mc = D.most_common(1)
except IndexError:
return [None,None]
if re.search(r'^[pcofgs]__$', mc[0][0]):
if re.search(r'^[Xx][pcofgs]__', mc[0][0]):
return [None,None]
try:
frac = mc[0][1] / float(sum(D.values()))
Expand Down Expand Up @@ -401,7 +401,7 @@ def _query_tax(tax_queries, G, qtax, ttax, lca_frac=1.0, max_tips=100, verbose=F
# iterating queries
for Q in tax_queries:
tips = []
try:
try:
# getting descendents of the node
tips = [desc for desc in descendants(G[qtax], Q[0]) if \
G[qtax].nodes[desc]['taxonomy'] == 'strain']
Expand Down
2 changes: 2 additions & 0 deletions tests/data/ncbi-gtdb/gtdb_tax_queries.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
f__Gearchaeaceae
g__Aquabacter
s__Nitrosopumilus sp000746785
g__Escherichia
Expand All @@ -6,5 +7,6 @@ s__Xanthomonas oryzae
c__Gammaproteobacteria
o__Burkholderiales
c__Bacteroidia
f__BM003
s__Homo sapiens
Blank

0 comments on commit 7fb35bd

Please sign in to comment.