Skip to content

Commit

Permalink
amplicon: slightly improve the seed. shenwei356#439
Browse files Browse the repository at this point in the history
  • Loading branch information
shenwei356 committed Feb 19, 2024
1 parent 36e9af7 commit 91edabf
Show file tree
Hide file tree
Showing 3 changed files with 119 additions and 83 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
- Removing the flag `-V/--validate-seq-length`. Now the whole sequence will be checked if `-v/--validate-seq` is given.
- `seqkit amplicon`:
- Fix the speed problem, introduced in v2.7.0. [#439](https://github.com/shenwei356/seqkit/issues/439).
- Slightly faster by reusing objects.
- [SeqKit v2.7.0](https://github.com/shenwei356/seqkit/releases/tag/v2.7.0) - 2024-01-31
[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/seqkit/v2.7.0/total.svg)](https://github.com/shenwei356/seqkit/releases/tag/v2.7.0)
- `seqkit`:
Expand Down
150 changes: 73 additions & 77 deletions doc/docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -315,32 +315,31 @@ Usage:
seqkit seq [flags]
Flags:
-k, --color colorize sequences - to be piped into "less -R"
-p, --complement complement sequence, flag '-v' is recommended to switch on
--dna2rna DNA to RNA
-G, --gap-letters string gap letters to be removed with -g/--remove-gaps (default "- \t.")
-h, --help help for seq
-l, --lower-case print sequences in lower case
-M, --max-len int only print sequences shorter than or equal to the maximum length (-1
for no limit) (default -1)
-R, --max-qual float only print sequences with average quality less than this limit (-1 for
no limit) (default -1)
-m, --min-len int only print sequences longer than or equal to the minimum length (-1
for no limit) (default -1)
-Q, --min-qual float only print sequences with average quality greater or equal than this
limit (-1 for no limit) (default -1)
-n, --name only print names/sequence headers
-i, --only-id print IDs instead of full headers
-q, --qual only print qualities
-b, --qual-ascii-base int ASCII BASE, 33 for Phred+33 (default 33)
-g, --remove-gaps remove gaps letters set by -G/--gap-letters, e.g., spaces, tabs, and
dashes (gaps "-" in aligned sequences)
-r, --reverse reverse sequence
--rna2dna RNA to DNA
-s, --seq only print sequences
-u, --upper-case print sequences in upper case
-v, --validate-seq validate bases according to the alphabet
-V, --validate-seq-length int length of sequence to validate (0 for whole seq) (default 10000)
-k, --color colorize sequences - to be piped into "less -R"
-p, --complement complement sequence, flag '-v' is recommended to switch on
--dna2rna DNA to RNA
-G, --gap-letters string gap letters to be removed with -g/--remove-gaps (default "- \t.")
-h, --help help for seq
-l, --lower-case print sequences in lower case
-M, --max-len int only print sequences shorter than or equal to the maximum length (-1 for
no limit) (default -1)
-R, --max-qual float only print sequences with average quality less than this limit (-1 for no
limit) (default -1)
-m, --min-len int only print sequences longer than or equal to the minimum length (-1 for no
limit) (default -1)
-Q, --min-qual float only print sequences with average quality greater or equal than this limit
(-1 for no limit) (default -1)
-n, --name only print names/sequence headers
-i, --only-id print IDs instead of full headers
-q, --qual only print qualities
-b, --qual-ascii-base int ASCII BASE, 33 for Phred+33 (default 33)
-g, --remove-gaps remove gaps letters set by -G/--gap-letters, e.g., spaces, tabs, and
dashes (gaps "-" in aligned sequences)
-r, --reverse reverse sequence
--rna2dna RNA to DNA
-s, --seq only print sequences
-u, --upper-case print sequences in upper case
-v, --validate-seq validate bases according to the alphabet
```

Expand Down Expand Up @@ -1051,22 +1050,21 @@ Usage:
seqkit watch [flags]
Flags:
-B, --bins int number of histogram bins (default -1)
-W, --delay int sleep this many seconds after online plotting (default 1)
-y, --dump print histogram data to stderr instead of plotting
-f, --fields string target fields, available values: ReadLen, MeanQual, GC, GCSkew
(default "ReadLen")
-h, --help help for watch
-O, --img string save histogram to this PDF/image file
-H, --list-fields print out a list of available fields
-L, --log log10(x+1) transform numeric values
-x, --pass pass through mode (write input to stdout)
-p, --print-freq int print/report after this many records (-1 for print after EOF) (default -1)
-b, --qual-ascii-base int ASCII BASE, 33 for Phred+33 (default 33)
-Q, --quiet-mode supress all plotting to stderr
-R, --reset reset histogram after every report
-v, --validate-seq validate bases according to the alphabet
-V, --validate-seq-length int length of sequence to validate (0 for whole seq) (default 10000)
-B, --bins int number of histogram bins (default -1)
-W, --delay int sleep this many seconds after online plotting (default 1)
-y, --dump print histogram data to stderr instead of plotting
-f, --fields string target fields, available values: ReadLen, MeanQual, GC, GCSkew (default
"ReadLen")
-h, --help help for watch
-O, --img string save histogram to this PDF/image file
-H, --list-fields print out a list of available fields
-L, --log log10(x+1) transform numeric values
-x, --pass pass through mode (write input to stdout)
-p, --print-freq int print/report after this many records (-1 for print after EOF) (default -1)
-b, --qual-ascii-base int ASCII BASE, 33 for Phred+33 (default 33)
-Q, --quiet-mode supress all plotting to stderr
-R, --reset reset histogram after every report
-v, --validate-seq validate bases according to the alphabet
```

Expand Down Expand Up @@ -1846,25 +1844,24 @@ Usage:
seqkit locate [flags]
Flags:
--bed output in BED6 format
-c, --circular circular genome. type "seqkit locate -h" for details
-d, --degenerate pattern/motif contains degenerate base
--gtf output in GTF format
-h, --help help for locate
-M, --hide-matched do not show matched sequences
-i, --ignore-case ignore case
-I, --immediate-output print output immediately, do not use write buffer
-s, --max-len-to-show int show at most X characters for the search pattern or matched sequences
-m, --max-mismatch int max mismatch when matching by seq. For large genomes like human
genome, using mapping/alignment tools would be faster
-G, --non-greedy non-greedy mode, faster but may miss motifs overlapping with others
-P, --only-positive-strand only search on positive strand
-p, --pattern strings pattern/motif (multiple values supported. Attention: use double
quotation marks for patterns containing comma, e.g., -p '"A{2,}"')
-f, --pattern-file string pattern/motif file (FASTA format)
-F, --use-fmi use FM-index for much faster search of lots of sequence patterns
-r, --use-regexp patterns/motifs are regular expression
-V, --validate-seq-length int length of sequence to validate (0 for whole seq) (default 10000)
--bed output in BED6 format
-c, --circular circular genome. type "seqkit locate -h" for details
-d, --degenerate pattern/motif contains degenerate base
--gtf output in GTF format
-h, --help help for locate
-M, --hide-matched do not show matched sequences
-i, --ignore-case ignore case
-I, --immediate-output print output immediately, do not use write buffer
-s, --max-len-to-show int show at most X characters for the search pattern or matched sequences
-m, --max-mismatch int max mismatch when matching by seq. For large genomes like human genome,
using mapping/alignment tools would be faster
-G, --non-greedy non-greedy mode, faster but may miss motifs overlapping with others
-P, --only-positive-strand only search on positive strand
-p, --pattern strings pattern/motif (multiple values supported. Attention: use double quotation
marks for patterns containing comma, e.g., -p '"A{2,}"')
-f, --pattern-file string pattern/motif file (FASTA format)
-F, --use-fmi use FM-index for much faster search of lots of sequence patterns
-r, --use-regexp patterns/motifs are regular expression
```

Expand Down Expand Up @@ -2006,22 +2003,21 @@ Usage:
seqkit fish [flags]
Flags:
-a, --all search all
-p, --aln-params string alignment parameters in format
"<match>,<mismatch>,<gap_open>,<gap_extend>" (default "4,-4,-2,-1")
-h, --help help for fish
-i, --invert print out references not matching with any query
-q, --min-qual float minimum mapping quality (default 5)
-b, --out-bam string save aligmnets to this BAM file (memory intensive)
-x, --pass pass through mode (write input to stdout)
-g, --print-aln print sequence alignments
-D, --print-desc print full sequence header
-f, --query-fastx string query fasta
-F, --query-sequences string query sequences
-r, --ranges string target ranges, for example: ":10,30:40,-20:"
-s, --stranded search + strand only
-v, --validate-seq validate bases according to the alphabet
-V, --validate-seq-length int length of sequence to validate (0 for whole seq) (default 10000)
-a, --all search all
-p, --aln-params string alignment parameters in format
"<match>,<mismatch>,<gap_open>,<gap_extend>" (default "4,-4,-2,-1")
-h, --help help for fish
-i, --invert print out references not matching with any query
-q, --min-qual float minimum mapping quality (default 5)
-b, --out-bam string save aligmnets to this BAM file (memory intensive)
-x, --pass pass through mode (write input to stdout)
-g, --print-aln print sequence alignments
-D, --print-desc print full sequence header
-f, --query-fastx string query fasta
-F, --query-sequences string query sequences
-r, --ranges string target ranges, for example: ":10,30:40,-20:"
-s, --stranded search + strand only
-v, --validate-seq validate bases according to the alphabet
```

Expand Down
51 changes: 45 additions & 6 deletions seqkit/cmd/amplicon.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,16 @@ Examples:

strands := []string{"+", "-"}

// -----------
// finder pools
finderPools := make([]*sync.Pool, len(primers))
for i, primer := range primers {
finderPools[i] = &sync.Pool{New: func() interface{} {
finder, _ := NewAmpliconFinder([]byte{'A'}, primer[1], primer[2], maxMismatch)
return finder
}}
}

// -------------------------------------------------------------------
// only for m > 0, where FMI is slow

Expand Down Expand Up @@ -305,6 +315,7 @@ Examples:
<-tokens
}()

var j int
var finder *AmpliconFinder
var loc []int
var mis []int
Expand All @@ -325,9 +336,11 @@ Examples:
record.Seq.RevComInplace()
}

for _, primer = range primers {
finder, err = NewAmpliconFinder(record.Seq.Seq, primer[1], primer[2], maxMismatch)
checkError(err)
for j, primer = range primers {
// finder, err = NewAmpliconFinder(record.Seq.Seq, primer[1], primer[2], maxMismatch)
// checkError(err)
finder = finderPools[j].Get().(*AmpliconFinder)
finder.Reset(record.Seq.Seq, maxMismatch)

if usingRegion {
loc, mis, err = finder.LocateRange(begin, end, fregion, strict)
Expand Down Expand Up @@ -381,6 +394,8 @@ Examples:
results = append(results, string(record.Format(config.LineWidth)))

record.Seq = tmpSeq

finderPools[j].Put(finder)
}
}

Expand Down Expand Up @@ -414,6 +429,7 @@ Examples:

var record *fastx.Record

var j int
var finder *AmpliconFinder
var loc []int

Expand Down Expand Up @@ -453,9 +469,11 @@ Examples:
record.Seq.RevComInplace()
}

for _, primer = range primers {
finder, err = NewAmpliconFinder(record.Seq.Seq, primer[1], primer[2], maxMismatch)
checkError(err)
for j, primer = range primers {
// finder, err = NewAmpliconFinder(record.Seq.Seq, primer[1], primer[2], maxMismatch)
// checkError(err)
finder = finderPools[j].Get().(*AmpliconFinder)
finder.Reset(record.Seq.Seq, maxMismatch)

if usingRegion {
loc, _, err = finder.LocateRange(begin, end, fregion, strict)
Expand Down Expand Up @@ -512,6 +530,8 @@ Examples:
record.FormatToWriter(outfh, config.LineWidth)

record.Seq = tmpSeq

finderPools[j].Put(finder)
}
}

Expand Down Expand Up @@ -632,6 +652,25 @@ type AmpliconFinder struct {
rF, rR *regexp.Regexp
}

func (finder *AmpliconFinder) Reset(sequence []byte, maxMismatch int) error {
if len(sequence) == 0 {
return fmt.Errorf("non-blank sequence needed")
}
finder.Seq = bytes.ToUpper(sequence)
if maxMismatch > 0 { // using FM-index
index := fmi.NewFMIndex()
_, err := index.Transform(finder.Seq)
if err != nil {
return err
}
finder.MaxMismatch = maxMismatch
finder.FMindex = index
}

finder.searched, finder.found = false, false
return nil
}

// NewAmpliconFinder returns a AmpliconFinder struct.
func NewAmpliconFinder(sequence, forwardPrimer, reversePrimerRC []byte, maxMismatch int) (*AmpliconFinder, error) {
if len(sequence) == 0 {
Expand Down

0 comments on commit 91edabf

Please sign in to comment.