forked from DerrickWood/kraken2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkraken2lib.pm
90 lines (80 loc) · 2.74 KB
/
kraken2lib.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
package kraken2lib;
# Copyright 2013-2019, Derrick Wood <[email protected]>
#
# This file is part of the Kraken 2 taxonomic sequence classification system.
# Common subroutines for other Kraken scripts
use strict;
use warnings;
# Input: the argument for a --db option (possibly undefined)
# Returns: the DB to use, taking KRAKEN2_DEFAULT_DB and KRAKEN2_DB_PATH
# into account.
sub find_db {
my $supplied_db_prefix = shift;
my $db_prefix;
if (! defined $supplied_db_prefix) {
if (! exists $ENV{"KRAKEN2_DEFAULT_DB"}) {
die "Must specify DB with either --db or \$KRAKEN2_DEFAULT_DB\n";
}
$supplied_db_prefix = $ENV{"KRAKEN2_DEFAULT_DB"};
}
my @db_path = (".");
if (exists $ENV{"KRAKEN2_DB_PATH"}) {
my $path_str = $ENV{"KRAKEN2_DB_PATH"};
# Allow zero-length path to be current dir
$path_str =~ s/^:/.:/;
$path_str =~ s/:$/:./;
$path_str =~ s/::/:.:/;
@db_path = split /:/, $path_str;
}
# Use supplied DB if abs. or rel. path is given
if ($supplied_db_prefix =~ m|/|) {
$db_prefix = $supplied_db_prefix;
}
else {
# Check all dirs in KRAKEN2_DB_PATH
for my $dir (@db_path) {
my $checked_db = "$dir/$supplied_db_prefix";
if (-e $checked_db && -d _) {
$db_prefix = $checked_db;
last;
}
}
if (! defined $db_prefix) {
my $printed_path = exists $ENV{"KRAKEN2_DB_PATH"} ? qq|"$ENV{'KRAKEN2_DB_PATH'}"| : "undefined";
die "unable to find $supplied_db_prefix in \$KRAKEN2_DB_PATH ($printed_path)\n";
}
}
for my $file (qw/taxo.k2d hash.k2d opts.k2d/) {
if (! -e "$db_prefix/$file") {
die "database (\"$db_prefix\") does not contain necessary file $file\n";
}
}
return $db_prefix;
}
# Input: a FASTA sequence ID
# Output: either (a) a taxonomy ID number found in the sequence ID,
# (b) an NCBI accession number found in the sequence ID, or undef
sub check_seqid {
my $seqid = shift;
my $taxid = undef;
# Note all regexes here use ?: to avoid capturing the ^ or | character
if ($seqid =~ /(?:^|\|)kraken:taxid\|(\d+)/) {
$taxid = $1; # OK, has explicit taxid
}
elsif ($seqid =~ /^(\d+)$/) {
$taxid = $1; # OK, has explicit taxid (w/o token)
}
# Accession number check
elsif ($seqid =~ /(?:^|\|) # Begins seqid or immediately follows pipe
([A-Z]+ # Starts with one or more UC alphas
_? # Might have an underscore next
[A-Z0-9]+) # Ends with UC alphas or digits
(?:\||\b|\.)/x # Followed by pipe, word boundary, or period
)
{
$taxid = $1; # A bit misleading - failure to pass /^\d+$/ means this is
# OK, but requires accession -> taxid mapping
}
return $taxid;
}
1;