diff --git a/configure.ac b/configure.ac index a12c22ca..4bb9f8f6 100644 --- a/configure.ac +++ b/configure.ac @@ -71,7 +71,7 @@ m4_include([m4/ax_pthread.m4]) ################################################################ # 2. AC_INIT ################################################################ -AC_INIT(Easel, 0.44rc1, sean@eddylab.org, easel) +AC_INIT(Easel, 0.44, sean@eddylab.org, easel) AC_MSG_NOTICE([Configuring the Easel library for your system.]) # remember if the user is overriding CFLAGS @@ -108,7 +108,7 @@ fi # EASEL_VERSION e.g. "1.9a" # -EASEL_DATE="March 2018" +EASEL_DATE="April 2018" EASEL_COPYRIGHT="Copyright (C) 2018 Howard Hughes Medical Institute" EASEL_LICENSE="Freely distributed under the BSD open source license." EASEL_VERSION=$PACKAGE_VERSION diff --git a/devkit/rmanprocess.py b/devkit/rmanprocess.py new file mode 100755 index 00000000..7ab08dc5 --- /dev/null +++ b/devkit/rmanprocess.py @@ -0,0 +1,86 @@ +#! /usr/bin/env python + +# rmanprocess.py +# Massages output of PolyGlotMan's `rman -f latex2e` to fit Tufteian userguide style. +# Example: +# rman -f latex2e hmmbuild.man | rmanprocess.py > manpage.tex +# + + +import sys +import re + +in_synopsis = False + +if len(sys.argv) == 1: + f = sys.stdin +else: + f = open(sys.argv[1]) + +for line in f: + line = line.rstrip('\n') + + # State flags (where are we in the document) + if re.match(r'\\section{Synopsis}', line): + in_synopsis = True + elif re.match(r'\\section{', line): + in_synopsis = False + + # + # Linewise substitutions: replace certain entire lines with something else. + # + # Remove \documentclass, and changes to \parindent and \parskip + if re.match(r'\\documentclass', line): continue + if re.match(r'\\setlength{\\parindent}', line): continue + if re.match(r'\\setlength{\\parskip}', line): continue + if re.match(r'\\begin\{document\}', line): continue + + # Replace \section{Name} with \section{progname - description}, using next line too. 
+ if re.match(r'\\section\{Name\}', line): + for line in f: + if not re.fullmatch(r'\s*', line): + break + m = re.match(r'(\S+)\s*\\?-\s*(.+)$', line) + if m: + print(r'\section{{\monob{{{0}}} - {1}}}'.format(m.group(1), m.group(2))) + else: + print("Error: no progname/description line found"); + sys.exit(1) + continue + + # Remove everything after \section{See Also), and finish. + if re.match(r'\\section\{See', line) or re.match(r'\\end\{document', line): + print("\\newpage"); + break + + + # + # Extra directives: preface certain lines with something extra, but still + # process the line. + # + + # In synopsis, put \noindent in front of each commandline, and preserve the .B's as bold. + if in_synopsis and re.match(r'\s*\\textbf{', line): + line = re.sub(r'\\textbf\{', r'\\monob{', line) + print("\\noindent") + + # + # Substitutions within a line. + # The order of these replacements is important. (More specific ones first.) + # + line = re.sub(r'\\begin\{itemize\}', r'\\begin{wideitem}', line) + line = re.sub(r'\\end\{itemize\}', r'\\end{wideitem}', line) + line = re.sub(r'\\section\{', r'\\subsection*{', line) # \subsection* suppresses inclusion in TOC + line = re.sub(r'--', r'{-}{-}', line) + line = re.sub(r'\\item\s*\[\\textbf', r'\\item [\\monob', line) # option names in .TP are emphasized bold + line = re.sub(r'\\textbf\{\\% ', r'\\user{\\% ', line) # example command lines are bold, on their own line + line = re.sub(r'\\textit\{', r'\\monoi{', line) # metavariables (options, args) are .I in man, mono italic in tex + line = re.sub(r'\\textbf\{', r'\\mono{', line) # literals (commands, etc) are .B in man, normal mono in tex + + print(line) + + + +if f != sys.stdin: + f.close() + diff --git a/esl_msafile.c b/esl_msafile.c index febabd06..4f3c300e 100644 --- a/esl_msafile.c +++ b/esl_msafile.c @@ -654,7 +654,10 @@ esl_msafile_GuessFileFormat(ESL_BUFFER *bf, int *ret_fmtcode, ESL_MSAFILE_FMTDAT else // if we haven't guessed so far, try selex. 
{ /* selex parser can handle psiblast too */ if (fmt_bysuffix == eslMSAFILE_SELEX) *ret_fmtcode = eslMSAFILE_SELEX; - else if (msafile_check_selex(bf) == eslOK) *ret_fmtcode = eslMSAFILE_SELEX; + else if (msafile_check_selex(bf) == eslOK) { + if (fmt_bysuffix == eslMSAFILE_PSIBLAST) *ret_fmtcode = eslMSAFILE_PSIBLAST; + else *ret_fmtcode = eslMSAFILE_SELEX; + } else ESL_XFAIL(eslENOFORMAT, errbuf, "couldn't guess alignment input format - doesn't even look like selex"); } @@ -717,13 +720,13 @@ esl_msafile_EncodeFormat(char *fmtstring) if (strcasecmp(fmtstring, "stockholm") == 0) return eslMSAFILE_STOCKHOLM; if (strcasecmp(fmtstring, "pfam") == 0) return eslMSAFILE_PFAM; if (strcasecmp(fmtstring, "a2m") == 0) return eslMSAFILE_A2M; - if (strcasecmp(fmtstring, "phylip") == 0) return eslMSAFILE_PHYLIP; - if (strcasecmp(fmtstring, "phylips") == 0) return eslMSAFILE_PHYLIPS; if (strcasecmp(fmtstring, "psiblast") == 0) return eslMSAFILE_PSIBLAST; if (strcasecmp(fmtstring, "selex") == 0) return eslMSAFILE_SELEX; if (strcasecmp(fmtstring, "afa") == 0) return eslMSAFILE_AFA; if (strcasecmp(fmtstring, "clustal") == 0) return eslMSAFILE_CLUSTAL; if (strcasecmp(fmtstring, "clustallike") == 0) return eslMSAFILE_CLUSTALLIKE; + if (strcasecmp(fmtstring, "phylip") == 0) return eslMSAFILE_PHYLIP; + if (strcasecmp(fmtstring, "phylips") == 0) return eslMSAFILE_PHYLIPS; return eslMSAFILE_UNKNOWN; } diff --git a/esl_sqio.c b/esl_sqio.c index 4442f442..0d04d28e 100644 --- a/esl_sqio.c +++ b/esl_sqio.c @@ -709,16 +709,15 @@ esl_sqio_IsAlignment(int fmt) int esl_sqio_EncodeFormat(char *fmtstring) { - if (strcasecmp(fmtstring, "daemon") == 0) return eslSQFILE_DAEMON; - if (strcasecmp(fmtstring, "ddbj") == 0) return eslSQFILE_DDBJ; - if (strcasecmp(fmtstring, "embl") == 0) return eslSQFILE_EMBL; if (strcasecmp(fmtstring, "fasta") == 0) return eslSQFILE_FASTA; - if (strcasecmp(fmtstring, "fmindex") == 0) return eslSQFILE_FMINDEX; + if (strcasecmp(fmtstring, "embl") == 0) return 
eslSQFILE_EMBL; if (strcasecmp(fmtstring, "genbank") == 0) return eslSQFILE_GENBANK; - if (strcasecmp(fmtstring, "hmmpgmd") == 0) return eslSQFILE_HMMPGMD; - if (strcasecmp(fmtstring, "ncbi") == 0) return eslSQFILE_NCBI; + if (strcasecmp(fmtstring, "ddbj") == 0) return eslSQFILE_DDBJ; if (strcasecmp(fmtstring, "uniprot") == 0) return eslSQFILE_UNIPROT; - + if (strcasecmp(fmtstring, "ncbi") == 0) return eslSQFILE_NCBI; + if (strcasecmp(fmtstring, "daemon") == 0) return eslSQFILE_DAEMON; + if (strcasecmp(fmtstring, "hmmpgmd") == 0) return eslSQFILE_HMMPGMD; + if (strcasecmp(fmtstring, "fmindex") == 0) return eslSQFILE_FMINDEX; return esl_msafile_EncodeFormat(fmtstring); } @@ -741,9 +740,10 @@ esl_sqio_DecodeFormat(int fmt) case eslSQFILE_GENBANK: return "GenBank"; case eslSQFILE_DDBJ: return "DDBJ"; case eslSQFILE_UNIPROT: return "UniProt"; + case eslSQFILE_NCBI: return "NCBI"; case eslSQFILE_DAEMON: return "daemon"; case eslSQFILE_HMMPGMD: return "hmmpgmd"; - case eslSQFILE_NCBI: return "NCBI"; + case eslSQFILE_FMINDEX: return "fmindex"; default: break; } esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "no such sqio format code %d", fmt); diff --git a/miniapps/esl-afetch.man.in b/miniapps/esl-afetch.man.in index c6d5b50f..c6435ea1 100644 --- a/miniapps/esl-afetch.man.in +++ b/miniapps/esl-afetch.man.in @@ -1,34 +1,23 @@ -.TH "esl-afetch" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-afetch" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-afetch - retrieve alignments from a multi-MSA database +esl\-afetch \- retrieve alignments from a multi-MSA database .SH SYNOPSIS -.TP -Single MSA retrieval: -.B esl-afetch -.I [options] -.I msafile -.I key - -.TP -Multiple MSA retrieval: -.B esl-afetch -f -.I [options] -.I msafile -.I keyfile +.nf +\fBesl\-afetch\fR [\fIoptions\fR] \fImsafile key\fR + (single MSA retrieval) -.TP -Indexing an MSA file for retrieval: -.B esl-afetch --index -.I msafile +\fBesl\-afetch \-f\fR 
[\fIoptions\fR] \fImsafile keyfile\fR + (multiple MSA retrieval, from a file of keys) +\fBesl\-afetch \-\-index \fImsafile\fR + (index an MSA file for retrieval) .SH DESCRIPTION -.B esl-afetch +.B esl\-afetch retrieves the alignment named .I key from an alignment database in file @@ -44,18 +33,19 @@ number (AC). .PP Alternatively, -.B esl-afetch -f +.B esl\-afetch \-f provides the ability to fetch many alignments at once. The -.I -f -option has it interpret the second argument as a , a -file consisting of one name or accession per line. +.B \-f +option has it interpret the second argument as a +.IR keyfile , +a file consisting of one name or accession per line. .PP The .I msafile should first be SSI indexed with -.B esl-afetch --index +.B esl\-afetch \-\-index for efficient retrieval. An SSI index is not required, but without one alignment retrieval may be painfully slow. @@ -63,52 +53,55 @@ be painfully slow. .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.B -f +.B \-f Interpret the second argument as a .I keyfile instead of as just one -.I key. +.IR key . The .I keyfile contains one name or accession per line. This option doesn't work with the -.B --index +.B \-\-index option. .TP -.BI -o " " +.BI \-o " " Output retrieved alignments to a file .I -instead of to -.I stdout. +instead of to stdout. .TP -.BI -O +.BI \-O Output retrieved alignment to a file named -.I . +.IR key . This is a convenience for saving some typing: instead of -.B esl-afetch -o RRM_1 msafile RRM_1 +.nf + \fB% esl\-afetch \-o RRM_1 msafile RRM_1\fR +.fi you can just type -.B esl-afetch -O msafile RRM_1. +.nf + \fB% esl\-afetch \-O msafile RRM_1\fR +.fi The -.B -O +.B \-O option only works if you're retrieving a single alignment; it is incompatible with -.B -f. +.B \-f. 
.TP -.B --index +.B \-\-index Instead of retrieving a .I key, the special command -.B esl-afetch --index +.B esl\-afetch \-\-index .I msafile produces an SSI index of the names and accessions of the alignments in diff --git a/miniapps/esl-alimanip.man.in b/miniapps/esl-alimanip.man.in index b48d88da..dcc30e08 100644 --- a/miniapps/esl-alimanip.man.in +++ b/miniapps/esl-alimanip.man.in @@ -1,19 +1,17 @@ -.TH "esl-alimanip" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-alimanip" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-alimanip - manipulate a multiple sequence alignment +esl\-alimanip \- manipulate a multiple sequence alignment .SH SYNOPSIS -.B esl-alimanip -.I [options] +.B esl\-alimanip +[\fIoptions\fR] .I msafile .SH DESCRIPTION -.pp -.B esl-alimanip +.B esl\-alimanip can manipulate the multiple sequence alignment(s) in .I msafile in various ways. Options exist to remove @@ -21,97 +19,121 @@ specific sequences, reorder sequences, designate reference columns using Stockholm "#=GC RF" markup, and add annotation that numbers columns. +.PP The alignments can be of protein or DNA/RNA sequences. All alignments in the same .I msafile must be either protein or DNA/RNA. The alphabet will be autodetected unless one of the options -.I --amino, -.I --dna, +.B \-\-amino, +.B \-\-dna, or -.I --rna +.B \-\-rna are given. + + .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.BI -o " " +.BI \-o " " Save the resulting, modified alignment in Stockholm format to a file .I . The default is to write it to standard output. .TP -.BI --informat " " -Specify that the input alignment be format -.I . -At preset, the only valid choices for +.BI \-\-informat " " +Assert that +.I msafile +is in alignment format +.IR , +bypassing format autodetection. +Common choices for .I -are: 'stockholm', 'pfam', and 'afa'. 
'pfam' is a special -case of Stockholm format in which each sequence is placed on a single -line, instead of being interleaved; 'afa' is aligned FASTA. By default -alignments are assumed to be in Stockholm format (either interleaved -or Pfam). +include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBa2m\fR or \fBA2M\fR both work). + .TP -.BI --outformat " " -Specify that the output alignment be format -.I . -Choices for +.BI \-\-outformat " " +Write the output in alignment format +.IR . +Common choices for .I -are: 'stockholm', 'pfam', 'a2m', 'psiblast', 'afa'. -By default the alignment is output in interleaved Stockholm format. +include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +The string +.I +is case-insensitive (\fBa2m\fR or \fBA2M\fR both work). +Default is +.BR stockholm . .TP -.B --devhelp +.B \-\-devhelp Print help, as with -.B "-h", +.B \-h, but also include undocumented developer options. These options are not listed below, are under development or experimental, and are not guaranteed to even work correctly. Use developer options at your own risk. The only resources for understanding what they actually do are the brief one-line description printed when -.B "--devhelp" +.B \-\-devhelp is enabled, and the source code. .SH EXPERT OPTIONS .TP -.BI --lnfract " " +.BI \-\-lnfract " " Remove any sequences with length less than .I fraction the length of the median length sequence in the alignment. .TP -.BI --lxfract " " +.BI \-\-lxfract " " Remove any sequences with length more than .I fraction the length of the median length sequence in the alignment. .TP -.BI --lmin " " +.BI \-\-lmin " " Remove any sequences with length less than .I residues. .TP -.BI --lmax " " +.BI \-\-lmax " " Remove any sequences with length more than .I residues. 
.TP -.BI --rfnfract " " +.BI \-\-rfnfract " " Remove any sequences with nongap RF length less than .I fraction the nongap RF length of the alignment. .TP -.BI --detrunc " " +.BI \-\-detrunc " " Remove any sequences that have all gaps in the first .I non-gap #=GC RF columns or the last @@ -119,13 +141,13 @@ non-gap #=GC RF columns or the last non-gap #=GC RF columns. .TP -.BI --xambig " " +.BI \-\-xambig " " Remove any sequences that has more than .I ambiguous (degenerate) residues. .TP -.BI --seq-r " " +.BI \-\-seq\-r " " Remove any sequences with names listed in file .I . Sequence names listed in @@ -134,7 +156,7 @@ can be separated by tabs, new lines, or spaces. The file must be in Stockholm format for this option to work. .TP -.BI --seq-k " " +.BI \-\-seq\-k " " Keep only sequences with names listed in file .I . Sequence names listed in @@ -146,65 +168,65 @@ they appeared in but the order from .I will be used if the -.B --k-reorder +.B \-\-k\-reorder option is enabled. The file must be in Stockholm format for this option to work. .TP -.BI --small +.B \-\-small With -.B --seq-k +.B \-\-seq\-k or -.B --seq-r, +.B \-\-seq\-r, operate in small memory mode. The alignment(s) will not be stored in memory, thus -.B --seq-k +.B \-\-seq\-k and -.B --seq-r +.B \-\-seq\-r will be able to work on very large alignments regardless of the amount of available RAM. The alignment file must be in Pfam format and -.BI --informat " pfam" +.B \-\-informat pfam and one of -.B --amino, -.B --dna, +.B \-\-amino, +.B \-\-dna, or -.B --rna +.B \-\-rna must be given as well. .TP -.BI --k-reorder +.B \-\-k\-reorder With -.BI --seq-k " ", +.BI \-\-seq\-k " ", reorder the kept sequences in the output alignment to the order from the list file .I . .TP -.BI --seq-ins " " +.BI \-\-seq\-ins " " Keep only sequences that have at least 1 inserted residue after nongap RF position .I . 
.TP -.BI --seq-ni " " +.BI \-\-seq\-ni " " With -.B --seq-ins +.B \-\-seq\-ins require at least .I inserted residues in a sequence for it to be kept. .TP -.BI --seq-xi " " +.BI \-\-seq\-xi " " With -.B --seq-ins +.B \-\-seq\-ins allow at most .I inserted residues in a sequence for it to be kept. .TP -.BI --trim " " +.BI \-\-trim " " File .I is an unaligned FASTA file containing truncated versions of each @@ -216,19 +238,19 @@ in If the alignment output format is Stockholm (the default output format), all per-column (GC) and per-residue (GR) annotation will be removed from the alignment when -.B --trim +.B \-\-trim is used. However, if -.B --t-keeprf +.B \-\-t\-keeprf is also used, the reference annotation (GC RF) will be kept. .TP -.B --t-keeprf +.B \-\-t\-keeprf Specify that the 'trimmed' alignment maintain the original reference (GC RF) annotation. Only works in combination with -.B --trim. +.B \-\-trim. .TP -.BI --minpp " " +.BI \-\-minpp " " Replace all residues in the alignments for which the posterior probability annotation (#=GR PP) is less than .I @@ -238,7 +260,7 @@ gaps. must be greater than 0.0 and less than or equal to 0.95. .TP -.BI --tree " " +.BI \-\-tree " " Reorder sequences by tree order. Perform single linkage clustering on the sequences in the alignment based on sequence identity given the alignment to define a 'tree' @@ -248,18 +270,18 @@ tree is output in Newick format to .I . .TP -.BI --reorder " " +.BI \-\-reorder " " Reorder sequences to the order listed in file .I . Each sequence in the alignment must be listed in .I . Use -.B --k-reorder +.B \-\-k\-reorder to reorder only a subset of sequences to a subset alignment file. The file must be in Stockholm format for this option to work. .TP -.BI --mask2rf " " +.BI \-\-mask2rf " " Read in the 'mask' file .I and use it to define new #=GC RF annotation for the @@ -280,58 +302,61 @@ existing RF annotation, all gap RF characters will remain gaps and nongap RF characters will be redefined as above. 
.TP -.BI --m-keeprf +.BI \-\-m\-keeprf With -.B --mask2rf, +.B \-\-mask2rf, do not overwrite existing nongap RF characters that are included by the input mask as 'x', leave them as the character they are. .TP -.BI --num-all +.BI \-\-num\-all Add annotation to the alignment numbering all of the columns in the alignment. .TP -.BI --num-rf +.BI \-\-num\-rf Add annotation to the alignment numbering the non-gap (non '.') #=GC RF columns of the alignment. .TP -.BI --rm-gc " " +.BI \-\-rm\-gc " " Remove certain types of #=GC annotation from the alignment. .I "" -must be either "RF", "SS_cons", "SA_cons", or "PP_cons", -at present. +must be one of: +.BR RF , +.BR SS_cons , +.BR SA_cons , +.BR PP_cons . .TP -.BI --sindi +.BI \-\-sindi Annotate individual secondary structures for each sequence by imposing the consensus secondary structure defined by the #=GC SS_cons annotation. .TP -.BI --post2pp +.BI \-\-post2pp Update Infernal's cmalign 0.72-1.0.2 posterior probability "POST" annotation to "PP" annotation, which is read by other miniapps, including -.B esl-alimask +.B esl\-alimask and -.B esl-alistat. +.B esl\-alistat. .TP -.B --amino +.B \-\-amino Assert that the .I msafile contains protein sequences. .TP -.B --dna +.B \-\-dna Assert that the .I msafile contains DNA sequences. .TP -.B --rna +.B \-\-rna Assert that the .I msafile contains RNA sequences. 
diff --git a/miniapps/esl-alimap.man.in b/miniapps/esl-alimap.man.in index 7a8751d9..0d9b59c4 100644 --- a/miniapps/esl-alimap.man.in +++ b/miniapps/esl-alimap.man.in @@ -1,18 +1,17 @@ -.TH "esl-alimap" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-alimap" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-alimap - map two alignments to each other +esl\-alimap \- map two alignments to each other .SH SYNOPSIS -.B esl-alimap -.I [options] +.B esl\-alimap +[\fIoptions\fR] .I msafile1 .I msafile2 .SH DESCRIPTION -.I esl-alimap +.B esl\-alimap is a highly specialized application that determines the optimal alignment mapping of columns between two alignments of the same sequences. An alignment mapping defines for each column in alignment 1 @@ -20,18 +19,21 @@ a matching column in alignment 2. The number of residues in the aligned sequences that are in common between the two matched columns are considered 'shared' by those two columns. +.PP For example, if the nth residue of sequence i occurs in alignment 1 column x and alignment 2 column y, then only a mapping of alignment 1 and 2 that includes column x mapping to column y would correctly map and share the residue. +.PP The optimal mapping of the two alignments is the mapping which maximizes the sum of shared residues between all pairs of matching columns. The fraction of total residues that are shared is reported as the coverage in the -.B esl-alimap +.B esl\-alimap output. +.PP Only the first alignments in .I msafile1 and @@ -39,18 +41,21 @@ and will be mapped to each other. If the files contain more than one alignment, all alignments after the first will be ignored. +.PP The two alignments (one from each file) must contain exactly the same sequences (if they were unaligned, they'd be identical) in precisely the same order. They must also be in Stockholm format. 
+.PP The output of -.B esl-alimap +.B esl\-alimap differs depending on whether one or both of the alignments contain reference (#=GC RF) annotation. If so, the coverage for residues from nongap RF positions will be reported separately from the total coverage. -.B esl-alimap +.PP +.B esl\-alimap uses a dynamic programming algorithm to compute the optimal mapping. The algorithm is similar to the Needleman-Wunsch-Sellers algorithm but the scores used at each step of the recursion are not @@ -58,131 +63,131 @@ residue-residue comparison scores but rather the number of residues shared between two columns. The -.BI --mask-a2a " ", -.BI --mask-a2rf " ", -.BI --mask-rf2a " ", +.BI \-\-mask\-a2a " ", +.BI \-\-mask\-a2rf " ", +.BI \-\-mask\-rf2a " ", and -.BI --mask-rf2rf " " +.BI \-\-mask\-rf2rf " " options create 'mask' files that pertain to the optimal mapping in slightly different ways. A mask file consists of a single line, of only '0' and '1' characters. These denote which positions of the alignment from -.B msafile1 +.I msafile1 map to positions of the alignment from -.B msafile2 +.I msafile2 as described below for each of the four respective masking options. These masks can be used to extract only those columns of the -.B msafile1 +.I msafile1 alignment that optimally map to columns of the -.B msafile2 +.I msafile2 alignment using the -.B esl-alimask +.B esl\-alimask miniapp. To extract the corresponding set of columns from .I msafile2 (that optimally map to columns of the alignment from -.B msafile1), +.IR msafile1 ), it is necessary to rerun the program with the order of the two msafiles reversed, save new masks, and use -.B esl-alimask +.B esl\-alimask again. .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options. .TP -.B -q +.B \-q Be quiet; don't print information the optimal mapping of each column, only report coverage and potentially save masks to optional output files. 
.TP -.BI --mask-a2a " " +.BI \-\-mask\-a2a " " Save a mask of '0's and '1's to file .I . A '1' at position x means that position x of the alignment from -.B msafile1 +.I msafile1 maps to an alignment position in the alignment from -.B msafile2 +.I msafile2 in the optimal map. .TP -.BI --mask-a2rf " " +.BI \-\-mask\-a2rf " " Save a mask of '0's and '1's to file .I . A '1' at position x means that position x of the alignment from -.B msafile1 +.I msafile1 maps to a nongap RF position in the alignment from -.B msafile2 +.I msafile2 in the optimal map. .TP -.BI --mask-rf2a " " +.BI \-\-mask\-rf2a " " Save a mask of '0's and '1's to file .I . A '1' at position x means that nongap RF position x of the alignment from -.B msafile1 +.I msafile1 maps to an alignment position in the alignment from -.B msafile2 +.I msafile2 in the optimal map. .TP -.BI --mask-rf2rf " " +.BI \-\-mask\-rf2rf " " Save a mask of '0's and '1's to file .I . A '1' at position x means that nongap RF position x of the alignment from -.B msafile1 +.I msafile1 maps to a nongap RF position in the alignment from -.B msafile2 +.I msafile2 in the optimal map. .TP -.BI --submap " " +.BI \-\-submap " " Specify that all of the columns from the alignment from -.B msafile1 +.I msafile1 exist identically (contain the same residues from all sequences) in the alignment from -.B msafile2. +.I msafile2. This makes the task of mapping trivial. However, not all columns of -.B msafile1 +.I msafile1 must exist in -.B msafile2. +.I msafile2. Save the mask to file .I . A '1' at position x of the mask means that position x of the alignment from -.B msafile1 +.I msafile1 is the same as position y of -.B msafile2, +.I msafile2, where y is the number of '1's that occur at positions <= x in the mask. .TP -.B --amino +.B \-\-amino Assert that -.I trusted_file +.I msafile1 and -.I test_file +.I msafile2 contain protein sequences. 
.TP -.B --dna +.B \-\-dna Assert that -.I trusted_file +.I msafile1 and -.I test_file +.I msafile2 contain DNA sequences. .TP -.B --rna +.B \-\-rna Assert that the -.I trusted_file +.I msafile1 and -.I test_file +.I msafile2 contain RNA sequences. diff --git a/miniapps/esl-alimask.man.in b/miniapps/esl-alimask.man.in index ccafdf62..fd505992 100644 --- a/miniapps/esl-alimask.man.in +++ b/miniapps/esl-alimask.man.in @@ -1,65 +1,44 @@ -.TH "esl-alimask" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-alimask" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-alimask - remove columns from a multiple sequence alignment +esl\-alimask \- remove columns from a multiple sequence alignment .SH SYNOPSIS -.TP -Remove columns based on a mask in an input file: -.B esl-alimask -.I [options] -.I msafile -.I maskfile +.nf +\fBesl\-alimask \fR[\fIoptions\fR] \fImsafile maskfile\fR + (remove columns based on a mask in an input file) -.TP -Remove a contiguous set of columns at the start and end of an alignment: -.B esl-alimask -t -.I [options] -.I msafile -.I coords +\fBesl\-alimask \-t \fR[\fIoptions\fR] \fImsafile coords\fR + (remove a contiguous set of columns at the start and end of an alignment) -.TP -Remove columns based on their frequency of gaps: -.B esl-alimask -g -.I [options] -.I msafile +\fBesl\-alimask \-g \fR[\fIoptions\fR] \fImsafile\fR + (remove columns based on their frequency of gaps) -.TP -Remove columns based on their posterior probability annotation: -.B esl-alimask -p -.I [options] -.I msafile +\fBesl\-alimask \-p \fR[\fIoptions\fR] \fImsafile\fR + (remove columns based on their posterior probability annotation) -.TP -Only remove columns that are gaps in the RF annotation: -.B esl-alimask --rf-is-mask -.I [options] -.I msafile +\fBesl\-alimask \-\-rf\-is\-mask \fR[\fIoptions\fR] \fImsafile\fR + (only remove columns that are gaps in the RF annotation) + +The \fB\-g\fR and \fB\-p\fR options may be used in combination. 
+.fi -.TP -The -.B -g -and -.B -p -options may be used in combination. .SH DESCRIPTION -.B esl-alimask +.B esl\-alimask reads a single input alignment, removes some columns from it (i.e. masks it), and outputs the masked alignment. -.B esl-alimask +.PP +.B esl\-alimask can be run in several different modes. -.B esl-alimask +.PP +.B esl\-alimask runs in "mask file mode" by default when two -command-line arguments -.I (msafile -and -.I maskfile) +command-line arguments (\fImsafile\fR and \fImaskfile\fR) are supplied. In this mode, a bit-vector mask in the .I maskfile defines which columns to keep/remove. The mask is a string that may @@ -81,14 +60,12 @@ is in Stockholm format and contains '#=GC RF' annotation. If the mask length is equal to the non-gap RF length, all gap RF columns will automatically be removed. -.B esl-alimask +.PP +.B esl\-alimask runs in "truncation mode" if the -.B -t +.B \-t option is used along with two command line arguments -.I (msafile -and -.I coords -). In this mode, +(\fImsafile\fR and \fIcoords\fR). In this mode, the alignment will be truncated by removing a contiguous set of columns from the beginning and end of the alignment. The second command line argument is the @@ -100,45 +77,44 @@ The string consists of start and end coordinates separated by any nonnumeric, nonwhitespace character or characters you like; for example, -.I 23..100 -, -.I 23/100 -, or -.I 23-100 +.BR 23..100 , +.BR 23/100 , +or +.B 23\-100 all work. To keep all alignment columns beginning at 23 until the end of the alignment, you -can omit the -.I end -; for example, -.I 23: +can omit the end; for example, +.B 23: would work. If the -.B --t-rf +.B \-\-t\-rf option is used in combination with -.B -t, +.B \-t, the coordinates in .I coords are interpreted as non-gap RF column coordinates. 
For example, with -.B --t-rf, +.B \-\-t\-rf, a .I coords string of -.I 23-100 +.B 23\-100 would remove all columns before the 23rd non-gap residue in the "#=GC RF" annotation and after the 100th non-gap RF residue. -.B esl-alimask +.PP +.B esl\-alimask runs in "RF mask" mode if the -.B --rf-is-mask +.B \-\-rf\-is\-mask option is enabled. In this mode, the alignment must be in Stockholm format and contain '#=GC RF' annotation. -.B esl-alimask +.B esl\-alimask will simply remove all columns that are gaps in the RF annotation. -.B esl-alimask +.PP +.B esl\-alimask runs in "gap frequency mode" if -.B -g +.B \-g is enabled. In this mode columns for which greater than .I fraction of the aligned sequences have gap residues will be removed. @@ -147,26 +123,28 @@ By default, is 0.5, but this value can be changed to .I with the -.BI --gapthresh " " +.BI \-\-gapthresh " " option. In this mode, if the alignment is in Stockholm format and has RF annotation, then all columns that are gaps in the RF annotation will automatically be removed, unless -.B --saveins +.B \-\-saveins is enabled. -.B esl-alimask +.PP +.B esl\-alimask runs in "posterior probability mode" if -.B -p +.B \-p is enabled. In this mode, masking is based on posterior probability annotation, and the input alignment must be in Stockholm format and contain '#=GR PP' (posterior probability) annotation for all sequences. As a special case, if -.B -p +.B \-p is used in combination with -.B --ppcons, +.B \-\-ppcons, then the input alignment need not have '#=GR PP' annotation, but must contain '#=GC PP_cons' (posterior probability consensus) annotation. +.PP Characters in Stockholm alignment posterior probability annotation (both '#=GR PP' and '#=GC PP_cons') can have 12 possible values: the ten digits '0-9', '*', and '.'. If '.', the position must correspond to @@ -179,18 +157,20 @@ probability of between 0.95 and 1.0. 
Higher posterior probabilities correspond to greater confidence that the aligned residue belongs where it appears in the alignment. +.PP When -.B -p +.B \-p is enabled with -.BI --ppcons " ", +.BI \-\-ppcons " ", columns which have a consensus posterior probability of less than .I will be removed during masking, and all other columns will not be removed. +.PP When -.B -p +.B \-p is enabled without -.B --ppcons, +.B \-\-ppcons, the number of each possible PP value in each column is counted. If .I @@ -204,110 +184,113 @@ do not meet this criterion will be removed. By default, the values of both and .I are 0.95, but they can be changed with the -.BI --pfract " " +.BI \-\-pfract " " and -.BI --pthresh " " +.BI \-\-pthresh " " options, respectively. +.PP In posterior probability mode, all columns that have 0 residues (i.e. that are 100% gaps) will be automatically removed, unless the -.B --pallgapok +.B \-\-pallgapok option is enabled, in which case such columns will not be removed. +.PP Importantly, during posterior probability masking, unless -.B --pavg +.B \-\-pavg is used, PP annotation values are always considered to be the minimum numerical value in their corresponding range. For example, a PP '9' character is converted to a numerical posterior probability of 0.85. If -.B --pavg +.B \-\-pavg is used, PP annotation values are considered to be the average numerical value in their range. For example, a PP '9' character is converted to a numerical posterior probability of 0.90. +.PP In posterior probability mode, if the alignment is in Stockholm format and has RF annotation, then all columns that are gaps in the RF annotation will automatically be removed, unless -.B --saveins +.B \-\-saveins is enabled. +.PP A single run of -.B esl-alimask +.B esl\-alimask can perform both gap frequency-based masking and posterior probability-based masking if both the -.B -g +.B \-g and -.B -p +.B \-p options are enabled. 
In this case, a gap frequency-based mask and a posterior probability-based mask are independently computed. These two masks are combined to create the final mask using a logical 'and' operation. Any column that is to be removed by either the gap or PP mask will be removed by the final mask. +.PP With the -.B --small +.B \-\-small option, -.B esl-alimask +.B esl\-alimask will operate in memory saving mode and the required RAM for the masking will be minimal (usually less than a Mb) and independent of the alignment size. To use -.B --small, +.BR \-\-small , the alignment alphabet must be specified with either -.B --amino, -.B --dna, +.BR \-\-amino , +.BR \-\-dna , or -.B --rna, +.BR \-\-rna , and the alignment must be in Pfam format (non-interleaved, 1 line/sequence Stockholm format). Pfam format is the default output format of INFERNAL's .B cmalign program. Without -.B --small +.B \-\-small the required RAM will be equal to roughly the size of the first input alignment (the size of the alignment file itself if it only contains one alignment). + .SH OUTPUT By default, -.B esl-alimask +.B esl\-alimask will print only the masked alignment to stdout and then exit. If the -.BI -o " " +.BI \-o " " option is used, the alignment will be saved to file .I , and information on the number of columns kept and removed will be printed to stdout. If -.B -q +.B \-q is used in combination with -.B -o, +.BR \-o , nothing is printed to stdout. +.PP The mask(s) computed by -.B esl-alimask +.B esl\-alimask when the -.B -t -, -.B -p -, -.B -g -, +.BR \-t , +.BR \-p , +.BR \-g , or -.B --rf-is-mask +.B \-\-rf\-is\-mask options are used can be saved to output files using the options -.BI --fmask-rf " ", -.BI --fmask-all " ", -.BI --gmask-rf " ", -.BI --gmask-all " ", -.BI --pmask-rf " ", -and -.BI --pmask-all " ". +\fB\-\-fmask\-rf\fI \fR, +\fB\-\-fmask\-all\fI \fR, +\fB\-\-gmask\-rf\fI \fR, +\fB\-\-gmask\-all\fI \fR, +\fB\-\-pmask\-rf\fI \fR, and +\fB\-\-pmask\-all\fI \fR. 
In all cases, .I will contain a single line, a bit vector of length -.B n, +.I , where -.B n +.I is the either the total number of columns in the alignment (for the options suffixed with 'all') or the number of non-gap columns in the RF annotation (for the options suffixed with 'rf'). The mask will be a @@ -318,179 +301,215 @@ the 'rf' suffixed options, the mask only applies to non-gap RF columns. The options beginning with 'f' will save the 'final' mask used to keep/remove columns from the alignment. The options beginning with 'g' save the masks based on gap frequency and require -.B -g. +.BR \-g . The options beginning with 'p' save the masks based on posterior probabilities and require -.B -p. +.BR \-p . + .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.BI -o " " +.BI \-o " " Output the final, masked alignment to file .I -instead of to -.I stdout. +instead of to stdout. When this option is used, information about the number of columns kept/removed is printed to stdout. .TP -.B -q +.B \-q Be quiet; do not print anything to stdout. This option can only be used in combination with the -.B -o +.B \-o option. .TP -.B --small +.B \-\-small Operate in memory saving mode. Required RAM will be independent of the size of the input alignment to mask, instead of roughly the size of the input alignment. When enabled, the alignment must be in Pfam Stockholm (non-interleaved 1 line/seq) format (see -esl-reformat) and the output alignment will be in Pfam format. +.BR esl\-reformat ) +and the output alignment will be in Pfam format. .TP -.BI --informat " " -Specify that the input alignment be format -.I . -Choices for +.BI \-\-informat " " +Assert that input +.I msafile +is in alignment format +.IR . +Common choices for .I -are: 'stockholm', 'pfam', 'a2m', 'psiblast', 'afa'. 
'pfam' is a special -case of Stockholm format in which each sequence is placed on a single -line, instead of being interleaved; 'afa' is aligned FASTA. By default -alignments are assumed to be in Stockholm format (either interleaved -or Pfam), unless -.B --small -is enabled, in which case alignments are assumed to be in -Pfam format. +include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBa2m\fR or \fBA2M\fR both work). +Default is +.B stockholm +format, unless +.B \-\-small +is used, in which case +.B pfam +format (non-interleaved Stockholm) is assumed. .TP -.BI --outformat " " -Specify that the output alignment be format -.I . -Choices for +.BI \-\-outformat " " +Write the output +.I msafile +in alignment format +.IR . +Common choices for .I -are: 'stockholm', 'pfam', 'a2m', 'psiblast', 'afa'. -By default the alignment is output in interleaved Stockholm format unless -.B --small -is enabled, in which case the alignment is output in Pfam format. +include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +The string +.I +is case-insensitive (\fBa2m\fR or \fBA2M\fR both work). +Default is +.BR stockholm , +unless +.B \-\-small +is enabled, in which case +.B pfam +(noninterleaved Stockholm) is the default output format. + .TP -.BI --fmask-rf " " +.BI \-\-fmask\-rf " " Save the non-gap RF-length final mask used to mask the alignment to file -.I . +.IR . The input alignment must be in Stockholm format and contain '#=GC RF' annotation for this option to be valid. See the OUTPUT section above for more details on output mask files. .TP -.BI --fmask-all " " +.BI \-\-fmask\-all " " Save the full alignment-length final mask used to mask the alignment to file -.I . +.IR . See the OUTPUT section above for more details on output mask files. 
.TP -.B --amino +.B \-\-amino Specify that the input alignment is a protein alignment. By default, -.B esl-alimask +.B esl\-alimask will try to autodetect the alphabet, but if the alignment is sufficiently small it may be ambiguous. This option defines the alphabet as protein. Importantly, if -.B --small +.B \-\-small is enabled, the alphabet must be specified with either -.B --amino, -.B --dna, +.BR \-\-amino , +.BR \-\-dna , or -.B --rna. +.BR \-\-rna . .TP -.B --dna +.B \-\-dna Specify that the input alignment is a DNA alignment. .TP -.B --rna +.B \-\-rna Specify that the input alignment is an RNA alignment. .TP -.B --t-rf -With -t, specify that the start and end coordinates defined in +.B \-\-t\-rf +With +.BR \-t , +specify that the start and end coordinates defined in the second command line argument .I coords correspond to non-gap RF coordinates. To use this option, the alignment must be in Stockholm format and have "#=GC RF" annotation. See the DESCRIPTION section for an example of using the -.B --t-rf +.B \-\-t\-rf option. .TP -.B --t-rmins -With -t, specify that all columns that are gaps in the reference (RF) +.B \-\-t\-rmins +With +.BR \-t , +specify that all columns that are gaps in the reference (RF) annotation in between the specified start and end coordinates be removed. By default, these columns will be kept. To use this option, the alignment must be in Stockholm format and have "#=GC RF" annotation. .TP -.BI --gapthresh " " -With -g, specify that a column is kept (included by mask) if no more +.BI \-\-gapthresh " " +With +.BR \-g , +specify that a column is kept (included by mask) if no more than .I -fraction of sequences in the alignment have a gap ('.', '-', or '_') +fraction of sequences in the alignment have a gap ('.', '\-', or '_') at that position. All other columns are removed (excluded by mask). By default, .I is 0.5. 
.TP -.BI --gmask-rf " " +.BI \-\-gmask\-rf " " Save the non-gap RF-length gap frequency-based mask used to mask the alignment to file -.I . +.IR . The input alignment must be in Stockholm format and contain '#=GC RF' annotation for this option to be valid. See the OUTPUT section above for more details on output mask files. .TP -.BI --gmask-all " " +.BI \-\-gmask\-all " " Save the full alignment-length gap frequency-based mask used to mask the alignment to file -.I . +.IR . See the OUTPUT section above for more details on output mask files. .TP -.BI --pfract " " -With -p, specify that a column is kept (included by mask) if the +.BI \-\-pfract " " +With +.BR \-p , +specify that a column is kept (included by mask) if the fraction of sequences with a non-gap residue in that column with a posterior probability of at least .I -(from -.BI --pthresh " " -) is +(from \fB\-\-pthresh\fI \fR) is .I -or greater. All other columns are removed (excluded by mask) +or greater. All other columns are removed (excluded by mask). By default .I is 0.95. .TP -.BI --pthresh " " -With -p, specify that a column is kept (included by mask) if +.BI \-\-pthresh " " +With +.BR \-p , +specify that a column is kept (included by mask) if .I -(from -.BI --pfract " " -) +(from \fB\-\-pfract \fI\fR) fraction of sequences with a non-gap residue in that column have a posterior probability of at least -.I . +.IR . All other columns are removed (excluded by mask). By default .I @@ -500,56 +519,62 @@ Due to the granularity of the PP annotation, different .I values within a range covered by a single PP character will be have the same effect on masking. For example, using -.BI --pthresh " 0.86" +.B \-\-pthresh 0.86 will have the same effect as using -.BI --pthresh " 0.94". +\fB\-\-pthresh 0.94\fR. 
.TP -.BI --pavg " " -With -p, specify that a column is kept (included by mask) if +.BI \-\-pavg " " +With +.BR \-p , +specify that a column is kept (included by mask) if the average posterior probability of non-gap residues in that column is at least -.I . +.IR . See the DESCRIPTION section for more on posterior probability (PP) masking. .TP -.BI --ppcons " " -With -p, use the '#=GC PP_cons' annotation to define which columns to +.BI \-\-ppcons " " +With +.BR \-p , +use the '#=GC PP_cons' annotation to define which columns to keep/remove. A column is kept (included by mask) if the PP_cons value for that column is .I or greater. Otherwise it is removed. .TP -.B --pallgapok -With -p, do not automatically remove any columns that are 100% gaps +.B \-\-pallgapok +With +.BR \-p , +do not automatically remove any columns that are 100% gaps (i.e. contain 0 aligned residues). By default, such columns will be removed. .TP -.BI --pmask-rf " " +.BI \-\-pmask\-rf " " Save the non-gap RF-length posterior probability-based mask used to mask the alignment to file -.I . +.IR . The input alignment must be in Stockholm format and contain '#=GC RF' annotation for this option to be valid. See the OUTPUT section above for more details on output mask files. .TP -.BI --pmask-all " " +.BI \-\-pmask\-all " " Save the full alignment-length posterior probability-based mask used to mask the alignment to file -.I . +.IR . See the OUTPUT section above for more details on output mask files. .TP -.B --keepins +.B \-\-keepins If -.B -p +.B \-p and/or -.B -g +.B \-g is enabled and the alignment is in Stockholm or Pfam format and has '#=GC RF' annotation, then allow columns that are gaps in the RF annotation to possibly be kept. By default, all gap RF columns would be removed @@ -561,9 +586,9 @@ To automatically remove all gap RF columns when using a .I maskfile as having length equal to the non-gap RF length in the alignment. 
To automatically remove all gap RF columns when using -.B -t, +.BR \-t , use the -.B --t-rmins +.B \-\-t\-rmins option. diff --git a/miniapps/esl-alimerge.man.in b/miniapps/esl-alimerge.man.in index 7776c9db..cdc512b1 100644 --- a/miniapps/esl-alimerge.man.in +++ b/miniapps/esl-alimerge.man.in @@ -1,34 +1,30 @@ -.TH "esl-alimerge" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-alimerge" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-alimerge - merge alignments based on their reference (RF) annotation +esl\-alimerge \- merge alignments based on their reference (RF) annotation .SH SYNOPSIS -.TP -Merge two alignment files: -.B esl-alimerge -.I [options] -.I alifile1 -.I alifile2 +.nf +\fBesl\-alimerge \fR[\fIoptions\fR] \fIalifile1 alifile2\fR + (merge two alignment files) + +\fBesl\-alimerge \-\-list \fR[\fIoptions\fR] \fIlistfile\fR + (merge many alignment files listed in a file) -.TP -Merge many alignment files listed in a file: -.B esl-alimerge --list -.I [options] -.I listfile .SH DESCRIPTION -.B esl-alimerge +.PP +.B esl\-alimerge reads more than one input alignments, merges them into a single alignment and outputs it. +.PP The input alignments must all be in Stockholm format. All alignments must have reference ('#=GC RF') annotation. Further, the RF annotation must be identical in all alignments once gap characters in the RF -annotation ('.','-','_') have been removed. This requirement allows +annotation ('.','\-','_') have been removed. This requirement allows alignments with different numbers of total columns to be merged together based on consistent RF annotation, such as alignments created by successive runs of the @@ -36,30 +32,32 @@ by successive runs of the program of the INFERNAL package using the same CM. Columns which have a gap character in the RF annotation are called 'insert' columns.
+.PP All sequence data in all input alignments will be included in the output alignment regardless of the output format (see -.B --outformat +.B \-\-outformat option below). However, sequences in the merged alignment will usually contain more gaps ('.') than they did in their respective input alignments. This is because -.B esl-alimerge +.B esl\-alimerge must add 100% gap columns to each individual input alignment so that insert columns in the other input alignments can be accomodated in the merged alignment. +.PP If the output format is Stockholm or Pfam, annotation will be transferred from the input alignments to the merged alignment as follows. All per-sequence ('#=GS') and per-residue ('#=GR') annotation is transferred. Per-file ('#=GF') annotation is transferred if it is present and identical in all alignments. Per-column ('#=GC') annotation is transferred if it is present and identical in all alignments once all -insert positions have been removed -.B and +insert positions have been removed and the '#=GC' annotation includes zero non-gap characters in insert columns. +.PP With the -.BI --list " " +.BI \-\-list " " option, .I is a file listing alignment files to merge. In the list file, blank @@ -67,90 +65,101 @@ lines and lines that start with '#' (comments) are ignored. Each data line contains a single word: the name of an alignment file to be merged. All alignments in each file will be merged. +.PP With the -.B --small +.B \-\-small option, -.B esl-alimerge +.B esl\-alimerge will operate in memory saving mode and the required RAM for the merge will be minimal (should be only a few Mb) and independent of the alignment sizes. To use -.B --small, +.BR \-\-small , all alignments must be in Pfam format (non-interleaved, 1 line/sequence Stockholm format). You can reformat alignments to Pfam using the -.B esl-reformat +.B esl\-reformat Easel miniapp. 
Without -.B --small +.B \-\-small the required RAM will be equal to roughly the size of the final merged alignment file which will necessarily be at least the summed size of all of the input alignment files to be merged and sometimes several times larger. If you're merging large alignments or you're experiencing very slow performance of -.B esl-alimerge, +.BR esl\-alimerge , try reformatting to Pfam and using -.B --small. +.BR \-\-small . + + .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.BI -o " " +.BI \-o " " Output merged alignment to file .I -instead of to -.I stdout. +instead of to stdout. .TP -.B -v +.B \-v Be verbose; print information on the size of the alignments being merged, -and the annotation transferred to the merged alignment to -.I stdout. +and the annotation transferred to the merged alignment to stdout. This option can only be used in combination with the -.B -o +.B \-o option (so that the printed info doesn't corrupt the output alignment file). .TP -.B --small +.B \-\-small Operate in memory saving mode. Required RAM will be independent of the sizes of the alignments to merge, instead of roughly the size of the eventual merged alignment. When enabled, all alignments must be in -Pfam Stockholm (non-interleaved 1 line/seq) format (see -esl-reformat) and the output alignment will be in Pfam format. +Pfam Stockholm (non-interleaved 1 line/seq) format; see +.BR esl\-reformat (1). +The output alignment will be in Pfam format. .TP -.B --rfonly +.B \-\-rfonly Only include columns that are not gaps in the GC RF annotation in the merged alignment. .TP -.BI --outformat " " -Specify that the output alignment be format -.I . -Choices for +.BI \-\-outformat " " +Write the output alignment in format +.IR . +Common choices for .I -are: 'stockholm', 'pfam', 'a2m', 'psiblast', 'afa'. By default -alignments are output in Stockholm format. 
'pfam' is a special case of -Stockholm format in which each sequence is placed on a single line, -nstead of being interleaved; 'afa' is aligned FASTA. +include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +The string +.I +is case-insensitive (\fBa2m\fR or \fBA2M\fR both work). +Default is +.BR stockholm . + .TP -.BI --rna +.B \-\-rna Specify that the input alignments are RNA alignments. By default -.B esl-alimerge +.B esl\-alimerge will try to autodetect the alphabet, but if the alignment is sufficiently small it may be ambiguous. This option defines the alphabet as RNA. .TP -.BI --dna +.B \-\-dna Specify that the input alignments are DNA alignments. .TP -.BI --amino +.B \-\-amino Specify that the input alignments are protein alignments. diff --git a/miniapps/esl-alipid.man.in b/miniapps/esl-alipid.man.in index c107b014..9ef46d79 100644 --- a/miniapps/esl-alipid.man.in +++ b/miniapps/esl-alipid.man.in @@ -1,19 +1,18 @@ -.TH "esl-alipid" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-alipid" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-alipid - calculate pairwise percent identities for all sequence pairs in an MSA +esl\-alipid \- calculate pairwise percent identities for all sequence pairs in an MSA .SH SYNOPSIS - -.B esl-alipid -.I [options] +.B esl\-alipid +[\fIoptions\fR] .I msafile + .SH DESCRIPTION -.pp -.B esl-alistat +.PP +.B esl\-alipid calculates the pairwise percent identity of each sequence pair in in the MSA(s) in .I msafile. @@ -29,34 +28,13 @@ and is the denominator used for the calculation: the shorter of the two (unaligned) sequence lengths. -.pp +.PP If .I msafile -is - (a single dash), alignment input is read from -.I stdin. +is \- (a single dash), alignment input is read from +stdin. -.pp -The -.I msafile -may be in any of several formats. -The format is autodetected by default. -See the -.B --informat -option to assert an input format.
- -.pp -The -.I msafile -must contain either protein or DNA/RNA sequences. -The nucleic or amino alphabet will be autodetected by default. -See the -.B --amino, -.B --dna, -or -.B --rna -options to assert an alphabet. - -.pp +.PP Only canonical residues are counted toward .I and @@ -66,32 +44,46 @@ Degenerate residue codes are not counted. .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.BI --informat " " -Specify that the input alignment is in +.BI \-\-informat " " +Assert that input +.I msafile +is in alignment format +.IR , +bypassing format autodetection. +Common choices for +.I +include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string .I -format. Valid format strings include -"stockholm", "pfam", "a2m", "phylip", "phylips", -"psiblast", "selex", "afa", "clustal", and "clustallike". +is case-insensitive (\fBa2m\fR or \fBA2M\fR both work). .TP -.B --amino +.B \-\-amino Assert that the .I msafile contains protein sequences. .TP -.B --dna +.B \-\-dna Assert that the .I msafile contains DNA sequences. .TP -.B --rna +.B \-\-rna Assert that the .I msafile contains RNA sequences. 
diff --git a/miniapps/esl-alirev.man.in b/miniapps/esl-alirev.man.in index aed0bdea..535cbf71 100644 --- a/miniapps/esl-alirev.man.in +++ b/miniapps/esl-alirev.man.in @@ -1,32 +1,29 @@ -.TH "esl-alirev" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-alirev" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-alirev - reverse complement a multiple alignment +esl\-alirev \- reverse complement a multiple alignment .SH SYNOPSIS - -.B esl-alirev -.I [options] +.B esl\-alirev +[\fIoptions\fR] .I msafile .SH DESCRIPTION -.pp -.B esl-alirev +.PP +.B esl\-alirev reads the multiple alignment in .I msafile -and outputs its reverse complement to -.I stdout. +and outputs its reverse complement to stdout. -.pp +.PP An example of where you might need to do this is when you've downloaded a chunk of multiway genomic alignment from one of the genome browsers, but your RNA of interest is on the opposite strand. -.pp +.PP Any per-column and per-residue annotation lines are reversed as well, including Stockholm format and old SELEX format annotations. Annotations that Easel recognizes as secondary structure annotation (a @@ -35,28 +32,19 @@ be "reverse complemented" to preserve proper bracketing orders: for example, ...<<<...>>> is reverse complemented to <<<...>>>..., not simply reversed to >>>...<<<..., which would be wrong. -.pp +.PP If .I msafile -is - (a single dash), alignment input is read from -.I stdin. +is \- (a single dash), alignment input is read from stdin. -.pp -The -.I msafile -may be in any of several formats. -The format is autodetected by default. -See the -.B --informat -option to assert an input format. -.pp +.PP By default the output alignment is written in the same format as the input alignment. See the -.B --outformat +.B \-\-outformat option to use a different output format. 
-.pp +.PP Because the alignment is parsed into Easel's digital internal representation, the output alignment may differ in certain details from the original alignment; these details should be inconsequential @@ -64,21 +52,21 @@ but may catch your eye. One is that if you have a reference annotation line, Easel's output will put consensus residues in upper case, nonconsensus (inserted) residues in lower case. Another is that the headers for some formats, such as Clustal format, are written with an -arbitrary version number -- so you may find yourself revcomping an +arbitrary version number - so you may find yourself revcomping an alignment in "MUSCLE (3.7) multiple sequence alignment" format and it could come out claiming to be a "CLUSTAL 2.1 multiple sequence alignment", just because Easel writes all of its Clustal format alignment files with that header. -.pp +.PP The .I msafile must contain nucleic acid sequences (DNA or RNA). The alphabet will be autodetected by default. See the -.B --dna +.B \-\-dna or -.B --rna +.B \-\-rna options to assert an alphabet. @@ -86,34 +74,59 @@ options to assert an alphabet. .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.BI --informat " " -Specify that the input alignment is in +.BI \-\-informat " " +Assert that input +.I msafile +is in alignment format +.IR , +bypassing format autodetection. +Common choices for +.I +include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string .I -format. Valid format strings include -"stockholm", "pfam", "a2m", "phylip", "phylips", -"psiblast", "selex", "afa", "clustal", and "clustallike". +is case-insensitive (\fBa2m\fR or \fBA2M\fR both work). .TP -.BI --outformat " " -Write the output alignment in +.BI \-\-outformat " " +Write the output alignment in alignment format +.IR . 
+Common choices for +.I +include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +The string .I -format. Valid format strings include -"stockholm", "pfam", "a2m", "phylip", "phylips", -"psiblast", "selex", "afa", "clustal", and "clustallike". +is case-insensitive (\fBa2m\fR or \fBA2M\fR both work). +Default is to use same format as the input +.IR msafile . .TP -.B --dna +.B \-\-dna Assert that the .I msafile contains DNA sequences. .TP -.B --rna +.B \-\-rna Assert that the .I msafile contains RNA sequences. diff --git a/miniapps/esl-alistat.man.in b/miniapps/esl-alistat.man.in index 8936aaac..aac395b2 100644 --- a/miniapps/esl-alistat.man.in +++ b/miniapps/esl-alistat.man.in @@ -1,19 +1,17 @@ -.TH "esl-alistat" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-alistat" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-alistat - summarize a multiple sequence alignment file +esl\-alistat \- summarize a multiple sequence alignment file .SH SYNOPSIS - -.B esl-alistat -.I [options] +.B esl\-alistat +[\fIoptions\fR] .I msafile .SH DESCRIPTION -.pp -.B esl-alistat +.PP +.B esl\-alistat summarizes the contents of the multiple sequence alignment(s) in .I msafile, such as the alignment name, format, alignment length (number of @@ -21,74 +19,66 @@ aligned columns), number of sequences, average pairwise % identity, and mean, smallest, and largest raw (unaligned) lengths of the sequences. +.PP If .I msafile -is - (a single dash), -multiple alignment input is read from -.I stdin. +is \- (a single dash), +multiple alignment input is read from stdin. + -The alignments can be of protein or DNA/RNA sequences. All alignments -in the same -.I msafile -must be either protein or DNA/RNA. The alphabet will be autodetected -unless one of the options -.I --amino, -.I --dna, -or -.I --rna -are given. 
These options may be useful in automated -pipelines to make -.B esl-alistat -more robust; alphabet autodetection is not infallible. +.PP The -.B --list, -.B --icinfo, -.B --rinfo, -.B --pcinfo, -.B --psinfo, -.B --cinfo, -.B --bpinfo, +.BR \-\-list , +.BR \-\-icinfo , +.BR \-\-rinfo , +.BR \-\-pcinfo , +.BR \-\-psinfo , +.BR \-\-cinfo , +.BR \-\-bpinfo , and -.B --iinfo, +.B \-\-iinfo options allow dumping various statistics on the alignment to optional output files as described for each of those options below. +.PP The -.B --small +.B \-\-small option allows summarizing alignments without storing them in memory and can be useful for large alignment files with sizes that approach or exceed the amount of available RAM. When -.B --small +.B \-\-small is used, -.B esl-alistat +.B esl\-alistat will print fewer statistics on the alignment, omitting data on the smallest and largest sequences and the average identity of the alignment. -.B --small +.B \-\-small only works on Pfam formatted alignments (a special type of non-interleaved Stockholm alignment in which each sequence occurs on a single line) and -.BI --informat " pfam" +.B \-\-informat pfam must be given with -.B --small. +.BR \-\-small . Further, when -.B --small +.B \-\-small is used, the alphabet must be specified with -.I --amino, -.I --dna, +.BR \-\-amino , +.BR \-\-dna , or -.I --rna. +.BR \-\-rna . + + .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.B -1 +.B \-1 Use a tabular output format with one line of statistics per alignment in .I msafile. @@ -101,81 +91,97 @@ Stockholm format). .SH EXPERT OPTIONS .TP -.BI --informat " " -Specify that the input alignment is in +.BI \-\-informat " " +Assert that input +.I msafile +is in alignment format +.IR , +bypassing format autodetection. +Common choices for +.I +include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . 
+For more information, and for codes for some less common formats, +see main documentation. +The string .I -format. At present, valid formats are limited to -Stockholm, Pfam, and AFA (aligned FASTA). +is case-insensitive (\fBa2m\fR or \fBA2M\fR both work). + .TP -.B --amino +.B \-\-amino Assert that the .I msafile contains protein sequences. .TP -.B --dna +.B \-\-dna Assert that the .I msafile contains DNA sequences. .TP -.B --rna +.B \-\-rna Assert that the .I msafile contains RNA sequences. .TP -.B --small +.B \-\-small Operate in small memory mode for Pfam formatted alignments. -.BI --informat " pfam" +.B \-\-informat pfam and one of -.B --amino, -.B --dna, +.BR \-\-amino , +.BR \-\-dna , or -.B --rna +.B \-\-rna must be given as well. .TP -.BI --list " " +.BI \-\-list " " List the names of all sequences in all alignments in .B msafile to file -.I . +.IR . Each sequence name is written on its own line. .TP -.BI --icinfo " " +.BI \-\-icinfo " " Dump the information content per position in tabular format to file -.I . +.IR . Lines prefixed with "#" are comment lines, which explain the meanings of each of the tab-delimited fields. .TP -.BI --rinfo " " +.BI \-\-rinfo " " Dump information on the frequency of gaps versus nongap residues per position in tabular format to file -.I . +.IR . Lines prefixed with "#" are comment lines, which explain the meanings of each of the tab-delimited fields. .TP -.BI --pcinfo " " +.BI \-\-pcinfo " " Dump per column information on posterior probabilities in tabular format to file -.I . +.IR . Lines prefixed with "#" are comment lines, which explain the meanings of each of the tab-delimited fields. .TP -.BI --psinfo " " +.BI \-\-psinfo " " Dump per sequence information on posterior probabilities in tabular format to file -.I . +.IR . Lines prefixed with "#" are comment lines, which explain the meanings of each of the tab-delimited fields. 
.TP -.BI --iinfo " " +.BI \-\-iinfo " " Dump information on inserted residues in tabular format to file -.I . +.IR . Insert columns of the alignment are those that are gaps in the reference (#=GC RF) annotation. This option only works if the input file is in Stockholm format with reference annotation. @@ -183,26 +189,26 @@ Lines prefixed with "#" are comment lines, which explain the meanings of each of the tab-delimited fields. .TP -.BI --cinfo " " +.BI \-\-cinfo " " Dump per-column residue counts to file -.I . +.IR . If used in combination with -.B --noambig +.B \-\-noambig ambiguous (degenerate) residues will be ignored and not counted. Otherwise, they will be marginalized. For example, in an RNA sequence file, a 'N' will be counted as 0.25 'A', 0.25 'C', 0.25 'G', and 0.25 'U'. .TP -.B --noambig +.B \-\-noambig With -.B --cinfo, +.BR \-\-cinfo , do not count ambiguous (degenerate) residues. .TP -.B --bpinfo +.B \-\-bpinfo Dump per-column basepair counts to file -.I . +.IR . Counts appear for each basepair in the consensus secondary structure (annotated as "#=GC SS_cons"). Only basepairs from sequences for which both paired positions are canonical residues will be counted. That is, any basepair that is a gap @@ -211,17 +217,21 @@ ignored and not counted. .TP -.B --weight +.B \-\-weight With -.B --icinfo, --rinfo, --pcinfo, --iinfo, --cinfo +.BR \-\-icinfo , +.BR \-\-rinfo , +.BR \-\-pcinfo , +.BR \-\-iinfo , +.BR \-\-cinfo , and -.B --bpinfo, +.BR \-\-bpinfo , weight counts based on #=GS WT annotation in the input -.B msafile. +.IR msafile . A residue or basepair from a sequence with a weight of -.I x +.I will be considered -.I x +.I counts. By default, raw, unweighted counts are reported; corresponding to each sequence having an equal weight of 1. 
diff --git a/miniapps/esl-compalign.man.in b/miniapps/esl-compalign.man.in index 6bf30ab4..ffe17680 100644 --- a/miniapps/esl-compalign.man.in +++ b/miniapps/esl-compalign.man.in @@ -1,14 +1,11 @@ -.TH "esl-compalign" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-compalign" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-compalign - compare two multiple sequence alignments - - +esl\-compalign \- compare two multiple sequence alignments .SH SYNOPSIS -.B esl-compalign -.I [options] +.B esl\-compalign +[\fIoptions\fR] .I trusted_file .I test_file @@ -16,10 +13,12 @@ esl-compalign - compare two multiple sequence alignments .SH DESCRIPTION -.I esl-compalign +.PP +.B esl\-compalign evaluates the accuracy of a predicted multiple sequence alignment with respect to a trusted alignment of the same sequences. +.PP The .I trusted_file and @@ -39,7 +38,8 @@ markup for each alignment. The number of nongap (non '.' characters) in the reference (RF) annotation must be identical between all corresponding alignments in the two files. -.I esl-compalign +.PP +.B esl\-compalign reads an alignment from each file, and compares them based on their 'reference' annotation. The number of correctly predicted residues for each sequence is computed as follows. A residue that is @@ -51,8 +51,9 @@ N+1 must also appear in a nongap RF column in the predicted alignment between nongap RF columns N and N+1 to be counted as 'correct', otherwise it is incorrect. +.PP The default output of -.B esl-compalign +.B esl\-compalign lists each sequence and the number of correctly and incorrectly predicted residues for that sequence. These counts are broken down into counts for residues in the predicted alignments that occur @@ -60,63 +61,66 @@ in 'match' columns and 'insert' columns. A 'match' column is one for which the RF annotation does not contain a gap. An 'insert' column is one for which the RF annotation does contain a gap. 
+ + .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options. .TP -.B -c +.B \-c Print per-column statistics instead of per-sequence statistics. .TP -.B -p +.B \-p Print statistics on accuracy versus posterior probability values. The .I test_file must be annotated with posterior probabilities (#=GR PP) for this option to work. + .SH EXPERT OPTIONS .TP -.BI --p-mask " " +.BI \-\-p\-mask " " This option may only be used in combination with the -.B -p +.B \-p option. Read a "mask" from file -.I . +.IR . The mask file must consist of a single line, of only '0' and '1' characters. There must be exactly RFLEN characters where RFLEN is the number of nongap characters in the RF annotation of all alignments in both .I trusted_file and -.I test_file. +.IR test_file . Positions of the mask that are '1' characters indicate that the corresponding nongap RF position is included by the mask. The posterior probability accuracy statistics for match columns will only pertain to positions that are included by the mask, those that are excluded will be ignored from the accuracy calculation. -.BI --c2dfile " " +.BI \-\-c2dfile " " Save a 'draw file' to file .I which can be read into the -.B esl-ssdraw +.B esl\-ssdraw miniapp. This draw file will define two postscript pages for -.B esl-ssdraw. +.BR esl\-ssdraw . The first page will depict the frequency of errors per match position and frequency of gaps per match position, indicated by magenta and yellow, respectively. The darker magenta, the more errors and the darker yellow, the more gaps. The second page will depict the frequency of errors in insert positions in shades of magenta, the darker the magenta the more errors in inserts after each position. See -.B esl-ssdraw +.B esl\-ssdraw documentation for more information on these diagrams. .TP -.B --amino +.B \-\-amino Assert that .I trusted_file and @@ -124,7 +128,7 @@ and contain protein sequences. 
.TP -.B --dna +.B \-\-dna Assert that .I trusted_file and @@ -132,7 +136,7 @@ and contain DNA sequences. .TP -.B --rna +.B \-\-rna Assert that the .I trusted_file and diff --git a/miniapps/esl-compstruct.man.in b/miniapps/esl-compstruct.man.in index d814003d..0f08e651 100644 --- a/miniapps/esl-compstruct.man.in +++ b/miniapps/esl-compstruct.man.in @@ -1,23 +1,21 @@ -.TH "esl-compstruct" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-compstruct" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-compstruct - calculate accuracy of RNA secondary structure predictions - +esl\-compstruct \- calculate accuracy of RNA secondary structure predictions .SH SYNOPSIS -.B esl-compstruct -.I [options] +.B esl\-compstruct +[\fIoptions\fR] .I trusted_file .I test_file - .SH DESCRIPTION -.I esl-compstruct +.PP +.B esl\-compstruct evaluates the accuracy of RNA secondary structure predictions on a per-base-pair basis. The @@ -27,7 +25,7 @@ secondary structure annotation. The .I test_file contains the same sequences, in the same order, with predicted RNA secondary structure annotation. -.I esl-compstruct +.B esl\-compstruct reads the structures and compares them, and calculates both the sensitivity (the number of true base pairs that are correctly predicted) @@ -50,13 +48,13 @@ pair (i,j). Mathews and colleagues (Mathews et al., JMB 288:911-940, 1999) use a more relaxed definition. Mathews defines "correct" as follows: a true pair (i,j) is correctly predicted if any of the following pairs are -predicted: (i,j), (i+1,j), (i-1,j), (i,j+1), or (i,j-1). This rule +predicted: (i,j), (i+1,j), (i\-1,j), (i,j+1), or (i,j\-1). This rule allows for "slipped helices" off by one base. The -.B -m +.B \-m option activates this rule for both sensitivity and for specificity. For specificity, the rule is reversed: predicted pair (i,j) is considered to be true if the true structure contains one of -the five pairs (i,j), (i+1,j), (i-1,j), (i,j+1), or (i,j-1). 
+the five pairs (i,j), (i+1,j), (i\-1,j), (i,j+1), or (i,j\-1). @@ -64,17 +62,17 @@ the five pairs (i,j), (i+1,j), (i-1,j), (i,j+1), or (i,j-1). .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.B -m +.B \-m Use the Mathews relaxed accuracy rule (see above), instead of requiring exact prediction of base pairs. .TP -.B -p +.B \-p Count pseudoknotted base pairs towards the accuracy, in either trusted or predicted structures. By default, pseudoknots are ignored. .IP @@ -82,22 +80,22 @@ Normally, only the .I trusted_file would have pseudoknot annotation, since most RNA secondary structure prediction programs do not predict pseudoknots. Using the -.B -p +.B \-p option allows you to penalize the prediction program for not predicting known pseudoknots. In a case where both the .I trusted_file and the .I test_file have pseudoknot annotation, the -.B -p +.B \-p option lets you count pseudoknots in evaluating the prediction accuracy. Beware, however, the case where you use a pseudoknot-capable prediction program to generate the -.I test_file, +.IR test_file , but the .I trusted_file does not have pseudoknot annotation; in this case, -.B -p +.B \-p will penalize any predicted pseudoknots when it calculates specificity, even if they're right, because they don't appear in the trusted annotation. This is probably not what you'd want to do. @@ -107,7 +105,7 @@ trusted annotation. This is probably not what you'd want to do. .SH EXPERT OPTIONS .TP -.B --quiet +.B \-\-quiet Don't print any verbose header information. (Used by regression test scripts, for example, to suppress version/date information.) 
diff --git a/miniapps/esl-construct.man.in b/miniapps/esl-construct.man.in index 79b74db3..1715361a 100644 --- a/miniapps/esl-construct.man.in +++ b/miniapps/esl-construct.man.in @@ -1,17 +1,16 @@ -.TH "esl-construct" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-construct" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-construct - describe or create a consensus secondary structure +esl\-construct \- describe or create a consensus secondary structure .SH SYNOPSIS -.B esl-construct -.I [options] +.B esl\-construct +[\fIoptions\fR] .I msafile .SH DESCRIPTION -.B esl-construct +.B esl\-construct reports information on existing consensus secondary structure annotation of an alignment or derives new consensus secondary structures based on structure annotation for individual aligned sequences. @@ -26,7 +25,7 @@ Stockholm format and contain RNA or DNA sequences. .PP By default, -.B esl-construct +.B esl\-construct generates lists the sequences in the alignment that have structure annotation and the number of basepairs in those structures. If the alignment also contains consensus structure annotation, the default output @@ -40,18 +39,18 @@ the other between columns k and l, if (i == k and j != l) or (j == l and i != k). .PP -.B esl-construct +.B esl\-construct can also be used to derive a new consensus structure based on structure annotation for individual sequences in the alignment by using any of the following options: -.B -x, -.B -r, -.B -c, -.BI --indi " ", -.BI --ffreq " ", -.B --fmin. +.BR \-x , +.BR \-r , +.BR \-c , +\fB\-\-indi \fI\fR, +\fB\-\-ffreq \fI\fR, +.BR \-\-fmin . These are described below. All of these options require the -.BI -o " " +.BI \-o " " option be used as well to specify that a new alignment file .I be created. Differences between the new alignment(s) and the input @@ -61,113 +60,117 @@ SS_cons) annotation and possibly reference (#=GC RF) annotation. 
.SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.B -a +.B \-a List all alignment positions that are involved in at least one conflicting basepair in at least one sequence to the screen, and then exit. .TP -.B -v +.B \-v Be verbose; with no other options, list individual sequence basepair conflicts as well as summary statistics. .TP -.B -x +.B \-x Compute a new consensus structure as the maximally sized set of basepairs (greatest number of basepairs) chosen from all individual structures that contains 0 conflicts. Output the alignment with the new SS_cons annotation. This option must be used in combination with the -.B -o +.B \-o option. .TP -.B -r +.B \-r Remove any consensus basepairs that conflict with >= 1 individual basepair and output the alignment with the new SS_cons annotation. This option must be used in combination with the -.B -o +.B \-o option. .TP -.B -c +.B \-c Define a new consensus secondary structure as the individual structure annotation that has the maximum number of consistent basepairs with the existing consensus secondary structure annotation. This option must be used in combination with the -.B -o +.B \-o option. .TP -.B --rfc +.B \-\-rfc With -.B -c, +.BR \-c , set the reference annotation (#=GC RF) as the sequence whose individual structure becomes the consensus structure. .TP -.BI --indi " " +.BI \-\-indi " " Define a new consensus secondary structure as the individual structure annotation from sequence named -.I . +.IR . This option must be used in combination with the -.B -o +.B \-o option. .TP -.B --rfindi +.B \-\-rfindi With -.BI --indi " ", +\fB\-\-indi \fI\fR, set the reference annotation (#=GC RF) as the sequence named -.B . +.BR . .TP -.BI --ffreq " " +.BI \-\-ffreq " " Define a new consensus structure as the set of basepairs between columns i:j that are paired in more than .I fraction of the individual sequence structures. 
This option must be used in combination with the -.B -o +.B \-o option. .TP -.B --fmin +.B \-\-fmin Same as -.BI --ffreq " " +.BI \-\-ffreq " " except find the maximal .I that gives a consistent consensus structure. A consistent structure has each base (alignment position) as a member of at most 1 basepair. .TP -.BI -o " ", +.BI \-o " ", Output the alignment(s) with new consensus structure annotation to file -.I . +.IR . .TP -.B --pfam -With -o, specify that the alignment output format be Pfam format, a +.B \-\-pfam +With +.BR -o , +specify that the alignment output format be Pfam format, a special type of non-interleaved Stockholm on which each sequence appears on a single line. .TP -.BI -l " " +.BI \-l " " Create a new file .I that lists the sequences that have at least one basepair that conflicts with a consensus basepair. .TP -.BI --lmax " " -With -l, only list sequences that have more than +.BI \-\-lmax " " +With +.BR \-l , +only list sequences that have more than .I basepairs that conflict with the consensus structure to the list file. diff --git a/miniapps/esl-histplot.man.in b/miniapps/esl-histplot.man.in index 1e3e6a1a..22f0c9e5 100644 --- a/miniapps/esl-histplot.man.in +++ b/miniapps/esl-histplot.man.in @@ -1,42 +1,40 @@ -.TH "esl-histplot" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-histplot" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-histplot - collate data histogram, output xmgrace datafile +esl\-histplot \- collate data histogram, output xmgrace datafile .SH SYNOPSIS -.B esl-histplot -.I [options] +.B esl\-histplot +[\fIoptions\fR] .I datafile .SH DESCRIPTION -.B esl-histplot +.PP +.B esl\-histplot summarizes numerical data in the input file .I datafile. -.pp +.PP One real-numbered value is taken from each line of the input file. Each line is split into whitespace-delimited fields, and one field is converted to data. 
By default this is the first field; this can be changed by the -.I -f +.B \-f option. -.pp +.PP Default output is a survival plot (Prob(value > x)) in xmgrace XY data -format, to -.B stdout. +format, to stdout. Output may be directed to a file with the -.I -o +.B \-o option. -.pp +.PP If .I datafile -is "-", input lines are read from -.B stdin +is \- (a single dash), input lines are read from stdin instead of opening a file. @@ -45,19 +43,19 @@ instead of opening a file. .SH OPTIONS .TP -.BI -f " " +.BI \-f " " Read data from whitespace-delimited field .I on each line, instead of the first field. Fields are numbered starting from 1. .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.BI -o " " +.BI \-o " " Send output to file .I instead of stdout. diff --git a/miniapps/esl-mask.man.in b/miniapps/esl-mask.man.in index c62548c9..fdae9532 100644 --- a/miniapps/esl-mask.man.in +++ b/miniapps/esl-mask.man.in @@ -1,40 +1,41 @@ -.TH "esl-mask" 1 "@EASEL_DATE@" "Easel @EASELVERSION@" "Easel Manual" +.TH "esl\-mask" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-mask - mask sequence residues with X's (or other characters) +esl\-mask \- mask sequence residues with X's (or other characters) .SH SYNOPSIS -.B esl-mask -.I [options] +.B esl\-mask +[\fIoptions\fR] .I seqfile .I maskfile .SH DESCRIPTION -.B esl-mask +.PP +.B esl\-mask reads lines from .I maskfile that give start/end coordinates for regions in each sequence in -.I seqfile, +.IR seqfile , masks these residues (changes them to X's), and outputs the masked sequence. +.PP The .I maskfile is a space-delimited file. Blank lines and lines that start with '#' (comments) are ignored. Each data line contains at least three fields: -.I seqname, -.I start, +.IR seqname , +.IR start , and -.I end. +.IR end . 
The .I seqname is the name of a sequence in the -.I seqfile, +.IR seqfile , and .I start and @@ -43,110 +44,122 @@ are coordinates defining a region in that sequence. The coordinates are indexed <1..L> with respect to a sequence of length . +.PP By default, the sequence names must appear in exactly the same order and number as the sequences in the -.I seqfile. +.IR seqfile. This is easy to enforce, because the format of .I maskfile is also legal as a list of names for -.B esl-sfetch, +.BR esl\-sfetch , so you can always fetch a temporary sequence file with -.B esl-sfetch +.B esl\-sfetch and pipe that to -.B esl-mask. +.BR esl\-mask . (Alternatively, see the -.B -R +.B \-R option for fetching from an SSI-indexed -.I seqfile.) +.IR seqfile .) +.PP The default is to mask the region indicated by -.I ... +\fI\fR..\fI\fR. Alternatively, everything but this region can be masked; see the -.B -r +.B \-r reverse masking option. +.PP The default is to mask residues by converting them to X's. Any other masking character can be chosen (see -.B -m -option), -or alternatively, masked residues can be lowercased (see -.B -l +.B \-m +option), or alternatively, masked residues can be lowercased (see +.B \-l option). -The -.I seqfile -can be in any sequence file format that Easel reads, such as FASTA -format. The format will be autodetected. Alternatively, for increased reliability, -you can specify the input format; see the -.B --informat -option. - .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.B -l +.B \-l Lowercase; mask by converting masked characters to lower case and unmasked characters to upper case. .TP -.BI -m " " +.BI \-m " " Mask by converting masked residues to .I instead of the default X. .TP -.BI -o " " +.BI \-o " " Send output to file .I instead of stdout. 
.TP -.B -r +.B \-r Reverse mask; mask everything outside the region .I start..end, as opposed to the default of masking that region. .TP -.B -R +.B \-R Random access; fetch sequences from -.I +.I seqfile rather than requiring that sequence names in -.I +.I maskfile and -.I +.I seqfile come in exactly the same order and number. The -.I must be SSI indexed (see -.B esl-sfetch --index.) +.I seqfile +must be SSI indexed (see \fBesl\-sfetch \-\-index\fR.) .TP -.BI -x " " +.BI \-x " " Extend all masked regions by up to residues on each side. For normal masking, this means masking -.I -..+. +\fI\fR\-\fI\fR..\fI\fR+\fI\fR. For reverse masking, this means masking -.I 1..-1+ +1..\fI\fR\-1+\fI\fR and -.I +1-..L -in a srquence of length L. +\fI\fR+1\-\fI\fR..L +in a sequence of length L. .TP -.BI --informat " " -Specify that the sequence file is in format -.I , -rather than allowing the program to autodetect -the file format. - - +.BI \-\-informat " " +Assert that input +.I seqfile +is in format +.IR , +bypassing format autodetection. +Common choices for +.I +include: +.BR fasta , +.BR embl , +.BR genbank. +Alignment formats also work; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work). 
diff --git a/miniapps/esl-reformat.man.in b/miniapps/esl-reformat.man.in index b6cd7c21..f49d1709 100644 --- a/miniapps/esl-reformat.man.in +++ b/miniapps/esl-reformat.man.in @@ -1,69 +1,84 @@ -.TH "esl-reformat" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-reformat" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-reformat - convert sequence file formats +esl\-reformat \- convert sequence file formats .SH SYNOPSIS -.B esl-reformat -.I [options] +.B esl\-reformat +[\fIoptions\fR] .I format .I seqfile .SH DESCRIPTION -.B esl-reformat +.PP +.B esl\-reformat reads the sequence file .I seqfile in any supported format, reformats it into a new format specified by -.I format, +.IR format , then outputs the reformatted text. -.pp +.PP The .I format -argument must (case-insensitively) match a supported sequence file format: -currently, limited to -.I fasta, -.I stockholm, -.I pfam, -or -.I afa -(aligned fasta). +argument must (case-insensitively) match a supported sequence file format. +Common choices for +.I format +include: +.BR fasta , +.BR embl , +.BR genbank. +If +.I seqfile +is an alignment file, +alignment output formats also work. +Common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work). -.pp +.PP Unaligned format files cannot be reformatted to aligned formats. However, aligned formats can be reformatted -to unaligned formats - gap characters are +to unaligned formats, in which case gap characters are simply stripped out. .SH OPTIONS .TP -.B -d +.B \-d DNA; convert U's to T's, to make sure a nucleic acid sequence is shown as DNA not RNA. See -.B -r. +.B \-r. .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. 
.TP -.B -l +.B \-l Lowercase; convert all sequence residues to lower case. See -.B -u. +.BR \-u . .TP -.B -n +.B \-n For DNA/RNA sequences, converts any character that's not unambiguous RNA/DNA (e.g. ACGTU/acgtu) to an N. Used to convert IUPAC ambiguity codes to N's, for software that can't handle all IUPAC codes (some @@ -74,35 +89,35 @@ a predictable fashion. .TP -.BI -o " " +.BI \-o " " Send output to file .I instead of stdout. .TP -.B -r +.B \-r RNA; convert T's to U's, to make sure a nucleic acid sequence is shown as RNA not DNA. See -.B -d. +.BR \-d . .TP -.B -u +.B \-u Uppercase; convert all sequence residues to upper case. See -.B -l. +.BR \-l . .TP -.B -x +.B \-x For DNA sequences, convert non-IUPAC characters (such as X's) to N's. This is for compatibility with benighted people who insist on using X instead of the IUPAC ambiguity character N. (X is for ambiguity in an amino acid residue). .IP Warning: like the -.B -n +.B \-n option, the code doesn't check that you are actually giving it DNA. It simply literally just converts non-IUPAC DNA symbols to N. So if you accidentally give it protein sequence, it will happily convert most @@ -115,9 +130,9 @@ every amino acid residue to an N. .TP -.BI --gapsym " " +.BI \-\-gapsym " " Convert all gap characters to -.I . +.IR . Used to prepare alignment files for programs with strict requirements for gap symbols. Only makes sense if the input @@ -125,15 +140,34 @@ the input is an alignment. .TP -.BI --informat " " -Specify that the sequence file is in format -.I , -rather than allowing the program to autodetect -the file format. - +.BI \-\-informat " " +Assert that input +.I seqfile +is in format +.IR , +bypassing format autodetection. +Common choices for +.I +include: +.BR fasta , +.BR embl , +.BR genbank. +Alignment formats also work; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . 
+For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work). .TP -.B --mingap +.B \-\-mingap If .I seqfile is an alignment, remove any columns that contain 100% gap or missing @@ -142,17 +176,17 @@ data characters, minimizing the overall length of the alignment. larger alignment.) .TP -.B --keeprf +.B \-\-keeprf When used in combination with -.B --mingap, +.BR \-\-mingap , never remove a column that is not a gap in the reference (#=GC RF) annotation, even if the column contains 100% gap characters in all aligned sequences. By default with -.B --mingap, +.BR \-\-mingap , nongap RF columns that are 100% gaps in all sequences are removed. .TP -.B --nogap +.B \-\-nogap Remove any aligned columns that contain any gap or missing data symbols at all. Useful as a prelude to phylogenetic analyses, where you only want to analyze columns containing 100% residues, so you want @@ -160,7 +194,7 @@ to strip out any columns with gaps in them. Only makes sense if the file is an alignment file. .TP -.B --wussify +.B \-\-wussify Convert RNA secondary structure annotation strings (both consensus and individual) from old "KHS" format, ><, to the new WUSS notation, <>. If the notation is already in WUSS format, this option will screw it @@ -168,7 +202,7 @@ up, without warning. Only SELEX and Stockholm format files have secondary structure markup at present. .TP -.B --dewuss +.B \-\-dewuss Convert RNA secondary structure annotation strings from the new WUSS notation, <>, back to the old KHS format, ><. If the annotation is already in KHS, this option will corrupt it, without warning. @@ -176,12 +210,12 @@ Only SELEX and Stockholm format files have secondary structure markup. .TP -.B --fullwuss +.B \-\-fullwuss Convert RNA secondary structure annotation strings from simple (input) WUSS notation to full (output) WUSS notation. 
.TP -.BI --replace " " +.BI \-\-replace " " .I must be in the format .I : @@ -193,26 +227,26 @@ separated by a ":" symbol. Each character from .I in the input file will be replaced by its counterpart (at the same position) from -.I . +.IR . Note that special characters in .I (such as "~") may need to be prefixed by a "\\" character. .TP -.B --small +.B \-\-small Operate in small memory mode for input alignment files in Pfam format. If not used, each alignment is stored in memory so the required memory will be roughly the size of the largest alignment in the input file. With -.B --small, +.BR \-\-small , input alignments are not stored in memory. This option only works in combination with -.BI --informat " pfam" +.B \-\-informat pfam and output format .I pfam or -.I afa. +.IR afa . diff --git a/miniapps/esl-selectn.man.in b/miniapps/esl-selectn.man.in index bd1e08a7..46dae022 100644 --- a/miniapps/esl-selectn.man.in +++ b/miniapps/esl-selectn.man.in @@ -1,20 +1,19 @@ -.TH "esl-selectn" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-selectn" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-selectn - select random subset of lines from file +esl\-selectn \- select random subset of lines from file .SH SYNOPSIS -.B esl-selectn -.I [options] +.B esl\-selectn +[\fIoptions\fR] .I nlines .I filename .SH DESCRIPTION -.pp -.B esl-selectn +.PP +.B esl\-selectn selects .I nlines lines at random from file @@ -22,49 +21,47 @@ lines at random from file and outputs them on .I stdout. -.pp +.PP If .I filename -is - (a single dash), -input is read from -.I stdin. +is \- (a single dash), +input is read from stdin. 
-.pp -Uses a reservoir sampling algorithm, which is efficient: requires only a single pass through + +.PP +Uses an efficient reservoir sampling algorithm that requires only a single pass through .I filename, and memory storage proportional to .I nlines -(and importantly, not to the size of +(and importantly, not to the size of the file .I filename itself). -.B esl-selectn +.B esl\-selectn can therefore be used to create large scale statistical sampling -experiments, especially in combination with other -.I Easel +experiments, especially in combination with other Easel miniapplications. .SH OPTIONS - .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.BI --seed " " +.BI \-\-seed " " Set the random number seed to .I , an integer greater than 0. The default is to use the current value of -.I time(). +.B time(). (As the return value of -.I time() +.B time() is likely to be in units of seconds, two calls to -.B esl-selectn +.B esl\-selectn within the same second will generate exactly the same sample; this may not be what you want.) diff --git a/miniapps/esl-seqrange.man.in b/miniapps/esl-seqrange.man.in index 4d29a631..454c1226 100644 --- a/miniapps/esl-seqrange.man.in +++ b/miniapps/esl-seqrange.man.in @@ -1,23 +1,20 @@ -.TH "esl-seqrange" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-seqrange" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-seqrange - determine a range of sequences for one of many parallel processes. 
+esl\-seqrange \- determine a range of sequences for one of many parallel processes .SH SYNOPSIS - -.TP -.B esl-sfetch -.I [options] +.B esl\-seqrange +[\fIoptions\fR] .I seqfile .I procidx .I nproc .SH DESCRIPTION -.pp -.B esl-seqrange -reads an indexed +.PP +.B esl\-seqrange +reads an SSI-indexed .I seqfile and determines the range of sequence indices in that file that process number @@ -25,19 +22,21 @@ number out of .I nproc total processes should operate on during a parallel processing of -.I seqfile. +.IR seqfile . -.pp +.PP The .I seqfile must be indexed first using -.B esl-sfetch --index . +.B esl\-sfetch \-\-index +.IR seqfile . This creates an SSI index file -.I .ssi. +.IR seqfile .ssi. An SSI file is required in order for -.B esl-seqrange +.B esl\-seqrange to work. +.PP Sequence index ranges are calculated using a simple rule: the number of sequences for each process should be identical, or as close as possible to identical, across all processes. The lengths of the sequences @@ -46,18 +45,38 @@ are not considered (even though they probably should be). .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.BI --informat " " -Specify that the sequence file is in format -.I , -where +.BI \-\-informat " " +Assert that input +.I seqfile +is in format +.IR , +bypassing format autodetection. +Common choices for .I -may be FASTA, GenBank, EMBL, UniProt, or DDBJ. This string -is case-insensitive ("genbank" or "GenBank" both work, for example). +include: +.BR fasta , +.BR embl , +.BR genbank. +Alignment formats also work; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work). 
+ + .SH SEE ALSO diff --git a/miniapps/esl-seqstat.man.in b/miniapps/esl-seqstat.man.in index f0c04db9..eeb5357c 100644 --- a/miniapps/esl-seqstat.man.in +++ b/miniapps/esl-seqstat.man.in @@ -1,76 +1,47 @@ -.TH "esl-seqstat" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-seqstat" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-seqstat - summarize contents of a sequence file +esl\-seqstat \- summarize contents of a sequence file .SH SYNOPSIS - -.B esl-seqstat -.I [options] +.B esl\-seqstat +[\fIoptions\fR] .I seqfile .SH DESCRIPTION -.pp -.B esl-seqstat +.PP +.B esl\-seqstat summarizes the contents of the -.I seqfile. +.IR seqfile . It prints the format, alphabet type, number of sequences, total number of residues, and the mean, smallest, and largest sequence length. -.pp +.PP If .I seqfile -is - (a single dash), -sequence input is read from -.I stdin. - -.pp -The sequence file may be in any of several different common unaligned -sequence formats including FASTA, GenBank, EMBL, UniProt, or DDBJ. It -may also be an alignment file, in Stockholm format for example. By -default the file format is autodetected. The -.I --informat -option allows you to specify the format and override -autodetection. This -option may be useful for making -.B esl-seqstat -more robust, because format autodetection may fail on unusual files. - -.pp -The sequences can be of protein or DNA/RNA sequences. All sequences -in the same -.I seqfile -must be either protein or DNA/RNA. The alphabet will be autodetected -unless one of the options -.I --amino, -.I --dna, -or -.I --rna -are given. These options may be useful in automated -pipelines to make -.B esl-alistat -more robust; alphabet autodetection is not infallible. +is \- (a single dash), +sequence input is read from stdin. + .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. 
.TP -.B -a +.B \-a Additionally show a summary statistic line showing the name, length, and description of each individual sequence. Each of these lines is prefixed by an = character, in order to allow these lines to be easily grepped out of the output. .TP -.B -c +.B \-c Additionally print the residue composition of the sequence file. @@ -78,28 +49,47 @@ Additionally print the residue composition of the sequence file. .SH EXPERT OPTIONS .TP -.BI --informat " " -Specify that the sequence file is in format -.I , -where +.BI \-\-informat " " +Assert that input +.I seqfile +is in format +.IR , +bypassing format autodetection. +Common choices for .I -may be FASTA, GenBank, EMBL, UniProt, DDBJ, or Stockholm. This string -is case-insensitive ("genbank" or "GenBank" both work, for example). +include: +.BR fasta , +.BR embl , +.BR genbank. +Alignment formats also work; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work). + .TP -.B --amino +.B \-\-amino Assert that the .I seqfile contains protein sequences. .TP -.B --dna +.B \-\-dna Assert that the .I seqfile contains DNA sequences. .TP -.B --rna +.B \-\-rna Assert that the .I seqfile contains RNA sequences. 
diff --git a/miniapps/esl-sfetch.man.in b/miniapps/esl-sfetch.man.in index f35c4954..545621b3 100644 --- a/miniapps/esl-sfetch.man.in +++ b/miniapps/esl-sfetch.man.in @@ -1,92 +1,71 @@ -.TH "esl-sfetch" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-sfetch" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-sfetch - retrieve (sub-)sequences from a sequence file +esl\-sfetch \- retrieve (sub-)sequences from a sequence file .SH SYNOPSIS -.TP -Single sequence retrieval: -.B esl-sfetch -.I [options] -.I seqfile -.I key +.nf +\fBesl\-sfetch\fR [\fIoptions\fR] \fIseqfile key\fR + (retrieve a single sequence by key) -.TP -Single subsequence retrieval: -.B esl-sfetch -c -.I .. -.I seqfile -.I key +\fBesl\-sfetch \-c \fIfrom\fB..\fIto \fR[\fIoptions\fR]\fI seqfile key\fR + (retrieve a single subsequence by key and coords) -.TP -Multiple sequence retrieval: -.B esl-sfetch -f -.I [options] -.I seqfile -.I keyfile +\fBesl\-sfetch \-f \fR[\fIoptions\fR] \fIseqfile keyfile\fR + (retrieve multiple sequences using a file of keys) -.TP -Multiple subsequence retrieval: -.B esl-sfetch -Cf -.I [options] -.I seqfile -.I subseq-coord-file +\fBesl\-sfetch \-Cf \fR[\fIoptions\fR] \fIseqfile subseq\-coord\-file\fR + (retrieve multiple subsequences using file of keys and coords) -.TP -Indexing a sequence file for retrieval: -.B esl-afetch --index -.I msafile +\fBesl\-sfetch \-\-index\fI seqfile\fR + (index a sequence file for retrievals) +.fi .SH DESCRIPTION -.pp -.B esl-sfetch +.PP +.B esl\-sfetch retrieves one or more sequences or subsequences from -.I seqfile. +.IR seqfile . -.pp +.PP The .I seqfile -should be indexed first using -.B esl-sfetch --index . +must be indexed using +\fBesl\-sfetch \-\-index\fI seqfile\fR. This creates an SSI index file -.I .ssi. -An SSI file is not necessary, but it greatly accelerates -retrieval. +.IR seqfile .ssi. 
-.pp +.PP To retrieve a single complete sequence, do -.B esl-sfetch +\fBesl\-sfetch\fI seqfile key\fR, where .I key is the name or accession of the desired sequence. -.pp +.PP To retrieve a single subsequence rather than a complete sequence, use the -.I -c start-end -option to provide start and end coordinates. The start -and end coordinates are provided as one string, separated -by any nonnumeric, nonwhitespace character or characters you like; -for example, -.I -c 23..100 -, -.I -c 23/100 -, or -.I -c 23-100 -all work. To retrieve a suffix of a subsequence, you -can omit the +\fB\-c \fIstart\fR..\fIend\fR +option to provide +.I start +and .I end -; for example, -.I -c 23: -would work. +coordinates. The +.I start +and +.I end +coordinates are provided as one string, separated +by any nonnumeric, nonwhitespace character or characters you like; +see the +.B \-c +option below for more details. -.pp +.PP To retrieve more than one complete sequence at once, you may use the -.I -f +.B \-f option, and the second command line argument will specify the name of a .I keyfile @@ -94,98 +73,72 @@ that contains a list of names or accessions, one per line; the first whitespace-delimited field on each line of this file is parsed as the name/accession. -.pp +.PP To retrieve more than one subsequence at once, use the -.I -C +.B \-C option in addition to -.I -f -, and now the second argument is parsed as a list of subsequence -coordinate lines, with each line containing at least four -whitespace-delimited fields: -.I new_name -.I from -.I to -.I name/accession. -For each such line, sequence -.I name/accession -is found, a subsequence -.I from..to is extracted, -and the subsequence is renamed -.I new_name -before being output. +.BR \-f , +and now the second argument is parsed as a list of subsequence +coordinate lines. See the +.B \-C +option below for more details, including the format of these lines. 
-.pp +.PP In DNA/RNA files, you may extract (sub-)sequences in reverse complement orientation in two different ways: either by providing a .I from coordinate that is greater than -.I to, +.IR to , or by providing the -.I -r +.B \-r option. -.pp -The sequence file may be in any of several different common unaligned -sequence formats including FASTA, GenBank, EMBL, UniProt, or DDBJ. It -may also be an alignment file, in Stockholm format for example. By -default the file format is autodetected. The -.I --informat -option allows you to specify the format and override -autodetection. This -option may be useful for making -.B esl-sfetch -more robust, because format autodetection may fail on unusual files. - -.pp +.PP When the -.I -f +.B \-f option is used to do multiple (sub-)sequence retrieval, the file -argument may be - (a single dash), in which case the list of +argument may be \- (a single dash), in which case the list of names/accessions (or subsequence coordinate lines) is read from standard input. However, because a standard input stream can't be SSI indexed, -(sub-)sequence retrieval from -.I stdin -may be slow. +(sub-)sequence retrieval from stdin may be slow. .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.BI -c " " +.BI \-c " coords" Retrieve a subsequence with start and end coordinates specified by the -.I +.I coords string. This string consists of start and end coordinates separated by any nonnumeric, nonwhitespace character or characters you like; for example, -.I -c 23..100 -, -.I -c 23/100 -, or -.I -c 23-100 +\fB\-c 23..100\fR, +\fB\-c 23/100\fR, or +\fB\-c 23\-100\fR all work. To retrieve a suffix of a subsequence, you can omit the .I end ; for example, -.I -c 23: +.B \-c 23: would work.
To specify reverse complement (for DNA/RNA sequence), -specify -.I +you can specify +.I from greater than -.I ; +.IR to ; for example, -.I -c 100..23 +.B \-c 100..23 retrieves the reverse complement strand from 100 to 23. .TP -.B -f +.B \-f Interpret the second argument as a .I keyfile instead of as just one @@ -194,93 +147,114 @@ The first whitespace-limited field on each line of .I keyfile is interpreted as a name or accession to be fetched. This option doesn't work with the -.B --index +.B \-\-index option. Any other fields on a line after the first one are ignored. Blank lines and lines beginning with # are ignored. .TP -.BI -o " " +.BI \-o " " Output retrieved sequences to a file .I -instead of to -.I stdout. +instead of to stdout. .TP -.BI -n " " +.BI \-n " " Rename the retrieved (sub-)sequence -.I . -This is incompatible with the -.I -f -option. +.IR . +Incompatible with +.BR \-f . .TP -.B -r -Reverse complement the retrieved (sub-)sequence. This only works for +.B \-r +Reverse complement the retrieved (sub-)sequence. Only accepted for DNA/RNA sequences. .TP -.B -C +.B \-C Multiple subsequence retrieval mode, with -.I -f +.B \-f option (required). Specifies that the second command line argument is to be parsed as a subsequence coordinate file, consisting of lines containing four whitespace-delimited fields: -.I new_name -.I from -.I to -.I name/accession. +.IR new_name , +.IR from , +.IR to , +.IR name/accession . For each such line, sequence .I name/accession is found, a subsequence -.I from..to is extracted, +\fIfrom\fR..\fIto\fR is extracted, and the subsequence is renamed .I new_name before being output. Any other fields after the first four are ignored. Blank lines -and lines beginning in # are ignored. +and lines beginning with # are ignored. .TP -.B -O +.B \-O Output retrieved sequence to a file named -.I . +.IR key . 
This is a convenience for saving some typing: instead of -.B esl-sfetch -o SRPA_HUMAN swissprot SRPA_HUMAN +.nf + \fB% esl\-sfetch \-o SRPA_HUMAN swissprot SRPA_HUMAN\fR +.fi you can just type -.B esl-sfetch -O swissprot SRPA_HUMAN. +.nf + \fB% esl\-sfetch \-O swissprot SRPA_HUMAN\fR +.fi The -.B -O +.B \-O option only works if you're retrieving a single alignment; it is incompatible with -.B -f. +.BR \-f . .TP -.B --index +.B \-\-index Instead of retrieving a .I key, the special command -.B esl-afetch --index -.I msafile +.B esl\-sfetch \-\-index +.I seqfile produces an SSI index of the names and accessions of the alignments in the -.I msafile. +.IR seqfile . Indexing should be done once on the -.I msafile +.I seqfile to prepare it for all future fetches. + .SH EXPERT OPTIONS .TP -.BI --informat " " -Specify that the sequence file is in format -.I , -where +.BI \-\-informat " " +Assert that +.I seqfile +is in format +.IR , +bypassing format autodetection. +Common choices for .I -may be FASTA, GenBank, EMBL, UniProt, DDBJ, or Stockholm. This string -is case-insensitive ("genbank" or "GenBank" both work, for example). +include: +.BR fasta , +.BR embl , +.BR genbank . +Alignment formats also work; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work).
diff --git a/miniapps/esl-shuffle.man.in b/miniapps/esl-shuffle.man.in index c1280779..16bd2261 100644 --- a/miniapps/esl-shuffle.man.in +++ b/miniapps/esl-shuffle.man.in @@ -1,114 +1,93 @@ -.TH "esl-shuffle" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-shuffle" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-shuffle - shuffling sequences or generating random ones +esl\-shuffle \- shuffling sequences or generating random ones .SH SYNOPSIS -.TP -Shuffling individual sequences: -.B esl-shuffle -.I [options] -.I seqfile +.nf +\fBesl\-shuffle \fR[\fIoptions\fR] \fIseqfile\fR + (shuffle sequences) -.TP -Generating random sequences: -.B esl-shuffle -G -.I [options] +\fBesl\-shuffle \-G \fR[\fIoptions\fR] + (generate random sequences) -.TP -Shuffling multiple sequence alignments columnwise: -.B esl-shuffle -A -.I [options] -.I msafile +\fBesl\-shuffle \-A \fR[\fIoptions\fR] \fImsafile\fR + (shuffle multiple sequence alignments columnwise) -.TP -Shuffling QRNA pairwise alignment input files: -.B esl-shuffle -Q -.I [options] -.I qrna-alignment-file +\fBesl\-shuffle \-Q \fR[\fIoptions\fR] \fIqrna\-alignment\-file\fR + (shuffle QRNA pairwise alignments) +.fi .SH DESCRIPTION -.pp -.B esl-shuffle +.PP +.B esl\-shuffle is capable of four different modes of operation. -.pp +.PP By default, -.B esl-shuffle +.B esl\-shuffle reads individual sequences from -.I seqfile -, shuffles them, and outputs the shuffled sequence. +.IR seqfile , +shuffles them, and outputs the shuffled sequences. By default, shuffling is done by preserving monoresidue composition; other options are listed below. -.pp +.PP With the -.I -G +.B \-G option, -.B esl-shuffle +.B esl\-shuffle generates some number of random sequences of some length in some alphabet. 
The -.I -N +.B \-N option controls the number (default is 1), the -.I -L +.B \-L option controls the length (default is 0), and the -.I --amino, -.I --dna, +.BR \-\-amino , +.BR \-\-dna , and -.I --rna +.B \-\-rna options control the alphabet. -.pp +.PP With the -.I -A +.B \-A option, -.B esl-shuffle +.B esl\-shuffle reads one or more multiple alignments from -.I +.I msafile and shuffles them columnwise. -.pp +.PP Finally, the -.I -Q +.B \-Q option is for shuffling pairwise alignments in QRNA input files. A QRNA input file is a quasi-FASTA file, where each successive pair of sequences is interpreted as a pairwise alignment; sequences may -contain gap characters (period, dash, or underscore: .-_) and these +contain gap characters (period, dash, or underscore: .\-_) and these pairs of sequences must have exactly the same aligned length. -.pp -An unaligned sequence file to be shuffled may be in any of several -different common unaligned sequence formats including FASTA, GenBank, -EMBL, UniProt, or DDBJ; alignment files are also valid, in which case -individual unaligned sequences are sequentially plucked from the -alignment. By default the file format is autodetected. The -.I --informat -option allows you to specify the format and override -autodetection. This -option may be useful for making -.B esl-shuffle -more robust, because format autodetection may fail on unusual files. + .SH GENERAL OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.BI -o " " +.BI \-o " " Direct output to a file named .I rather than to stdout. .TP -.BI -N " " +.BI \-N " " Generate .I sequences, or @@ -116,11 +95,11 @@ sequences, or perform independent shuffles per input sequence or alignment. .TP -.BI -L " " +.BI \-L " " Generate sequences of length -.I , +.IR , or truncate output shuffled sequences or alignments to a length of -.I . +.IR . 
@@ -131,35 +110,35 @@ These options only apply in default (sequence shuffling) mode. They are mutually exclusive. .TP -.B -m +.B \-m Monoresidue shuffling (the default): preserve monoresidue composition exactly. -Uses the so-called Fisher/Yates algorithm (Knuth's "Algorithm P"). +Uses the Fisher/Yates algorithm (aka Knuth's "Algorithm P"). .TP -.B -d +.B \-d Diresidue shuffling; preserve diresidue composition exactly. Uses the Altschul/Erickson algorithm (Altschul and Erickson, 1986). A more efficient algorithm (Kandel and Winkler 1996) is known but has not yet been implemented in Easel. .TP -.B -0 +.B \-0 0th order Markov generation: generate a sequence of the same length with the same 0th order Markov frequencies. Such a sequence will approximately preserve the monoresidue composition of the input. .TP -.B -1 +.B \-1 1st order Markov generation: generate a sequence of the same length with the same 1st order Markov frequencies. Such a sequence will approximately preserve the diresidue composition of the input. .TP -.B -r +.B \-r Reversal; reverse each input. .TP -.BI -w " " +.BI \-w " " Regionally shuffle the input in nonoverlapping windows of size .I residues, preserving exact monoresidue composition in each window. @@ -169,57 +148,73 @@ residues, preserving exact monoresidue composition in each window. .SH MULTIPLE ALIGNMENT SHUFFLING OPTIONS .TP -.B -b +.B \-b Sample columns with replacement, in order to generate a bootstrap-resampled alignment dataset. .SH SEQUENCE GENERATION OPTIONS -One of these must be selected. +One of these must be selected, if +.B \-G +is used. .TP -.B --amino +.B \-\-amino Generate amino acid sequences. .TP -.B --dna +.B \-\-dna Generate DNA sequences. .TP -.B --rna -Generate RNA sequences (the default). +.B \-\-rna +Generate RNA sequences. 
.SH EXPERT OPTIONS .TP -.BI --informat " " -Specify that the sequence file is in format -.I , -where +.BI \-\-informat " " +Assert that input +.I seqfile +is in format +.IR , +bypassing format autodetection. +Common choices for .I -may be FASTA, GenBank, EMBL, UniProt, DDBJ, or Stockholm. This string -is case-insensitive ("genbank" or "GenBank" both work, for example). - -.TP -.BI --seed " " +include: +.BR fasta , +.BR embl , +.BR genbank. +Alignment formats also work; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work). + + +.TP +.BI \-\-seed " " Specify the seed for the random number generator, where the seed .I -is an integer greater than zero. This can be used to make the results -of -.B esl-shuffle -reproducible. The default is to choose the random number generator -seed by calling -.B time(). -Note that because -.B time() -likely returns the time in units of seconds, -two calls to -.B esl-shuffle -within the same second will use the same seed and generate -identical random number sequences; you may want to avoid this. +is an integer greater than zero. This can be used to make the results of +.B esl\-shuffle +reproducible. +If +.I +is 0, the random number generator is seeded arbitrarily and +stochastic simulations will vary from run to run. +Arbitrary seeding (0) is the default. 
diff --git a/miniapps/esl-ssdraw.man.in b/miniapps/esl-ssdraw.man.in index 5b449d27..9c004960 100644 --- a/miniapps/esl-ssdraw.man.in +++ b/miniapps/esl-ssdraw.man.in @@ -1,21 +1,19 @@ -.TH "esl-ssdraw" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-ssdraw" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-ssdraw - create postscript secondary structure diagrams +esl\-ssdraw \- create postscript secondary structure diagrams .SH SYNOPSIS - -.B esl-ssdraw -.I [options] +.B esl\-ssdraw +[\fIoptions\fR] .I msafile .I postscript_template .I postscript_output_file .SH DESCRIPTION -.pp -.B esl-ssdraw +.PP +.B esl\-ssdraw reads an existing template consensus secondary structure diagram from .I postscript_template and creates new postscript diagrams including the template structure @@ -26,26 +24,29 @@ or some of the aligned sequences can be drawn separately, with nucleotides or posterior probabilities mapped onto the corresponding positions of the consensus structure. +.PP The alignment must be in Stockholm format with per-column reference annotation (#=GC RF). The sequences in the alignment must be RNA or DNA sequences. The -.B postscript_template +.I postscript_template file must contain one page that includes consensus nucleotides (positions), where is the number of nongap characters in the reference (RF) annotation of the first alignment in -.B msafile. +.IR msafile . The specific format required in the -.B postscript_template +.I postscript_template is described below in the INPUT section. Postscript diagrams will only be created for the first alignment in -.B msafile. +.IR msafile . + .SH OUTPUT +.PP By default (if run with zero command line options), -.B esl-ssdraw +.B esl\-ssdraw will create a six or seven page -.I postscript_output_file, +.IR postscript_output_file , with each page displaying a different alignment statistic. 
These pages display the alignment consensus sequence, information content per position, mutual information per position, frequency of inserts per position, @@ -53,50 +54,52 @@ average length of inserts per position, frequency of deletions (gaps) per position, and average posterior probability per position (if posterior probabilites exist in the alignment) If -.B -d +.B \-d is enabled, all of these pages plus additional ones, such as individual sequences (see discussion of -.B.--indi +.B \-\-indi below) will be drawn. These pages can be selected to be drawn individually by using the command line options -.B --cons, -.B --info, -.B --mutinfo, -.B --ifreq, -.B --iavglen, -.B --dall, +.BR \-\-cons , +.BR \-\-info , +.BR \-\-mutinfo , +.BR \-\-ifreq , +.BR \-\-iavglen , +.BR \-\-dall , and -.B --prob. +.BR \-\-prob . The calculation of the statistics for each of these options is discussed below in the description for each option. Importantly, only so-called 'consensus' positions of the alignment will be drawn. A consensus position is one that is a nongap nucleotide in the 'reference' annotation of the Stockholm alignment (#=GC RF) read from -.B msafile. +.IR msafile . +.PP By default, a consensus sequence for the input alignment will be calculated and displayed on the alignment statistic diagrams. The consensus sequence is defined as the most common nucleotide at each consensus position of the alignment. The consensus sequence will not be displayed if the -.B --no-cnt +.B \-\-no\-cnt option is used. The -.B --cthresh, -.B --cambig, +.BR \-\-cthresh , +.BR \-\-cambig , and -.B --athresh +.B \-\-athresh options affect the definition of the consensus sequence as explained below in the descriptions for those options. +.PP If the -.BI --tabfile " " +.BI \-\-tabfile " " option is used, a tab-delimited text file .I will be created that includes per-position lists of the numerical values for each of the calculated statistics that were drawn to -.B postscript_output_file. 
+.IR postscript_output_file . Comment lines in .I are prefixed with a '#' character and explain the meaning of @@ -104,42 +107,43 @@ each of the tab-delimited columns and how each of the statistics was calculated. If -.B --indi +.B \-\-indi is used, -.B esl-ssdraw +.B esl\-ssdraw will create diagrams showing each sequence in the alignment on a separate page, with aligned nucleotides in their corresponding position in the structure diagram. By default, basepaired nucleotides will be colored based on their basepair type: either Watson-Crick (A:U, U:A, C:G, or G:C), G:U or U:G, or non-canonical (the other ten possible basepairs). This coloring can be turned off with the -.B --no-bp +.B \-\-no\-bp option. Also by default, nucleotides that differ from the most common nucleotide at each aligned consensus position will be outlined. If the most common nucleotide occurs in more than 75% of sequences that do not have a gap at that position, the outline will be bold. Outlining can be turned off with the -.B --no-ol +.B \-\-no\-ol option. .PP With -.B --indi, +.BR \-\-indi , if the alignment contains posterior probability annotation (#=GR PP), the -.B postscript_output_file +.I postscript_output_file will contain an additional page for each sequence drawn with positions colored by the posterior probability of each aligned nucleotide. No posterior probability pages will be drawn if the -.B --no-pp +.B \-\-no\-pp option is used. -.B esl-ssdraw +.PP +.B esl\-ssdraw can also be used to draw 'mask' diagrams which color positions of the structure one of two colors depending on if they are included or excluded by a mask. This is enabled with the -.BI --mask-col " " +.BI \-\-mask\-col " " option. .I must contain a single line of characters, where is the @@ -150,103 +154,114 @@ indicates position is included by the mask. 
A page comparing the overlap of the .I mask from -.BI --mask-col +.B \-\-mask\-col and another mask in .I will be created if the -.BI --mask-diff " " +.BI \-\-mask\-diff " " option is used. +.PP If the -.BI --mask " " +.BI \-\-mask " " option is used, positions excluded by the mask in .I will be drawn differently (as open circles by default) than positions included by the mask. The style of the masked positions can be modified with the -.B --mask-u, -.B --mask-x, +.BR \-\-mask\-u , +.BR \-\-mask\-x , and -.B --mask-a options. +.B \-\-mask\-a +options. +.PP Finally, two different types of input files can be used to customize output diagrams using the -.B --dfile +.B \-\-dfile and -.B --efile +.B \-\-efile options, as described below. + + .SH INPUT +.PP The -.B postscript_template_file +.I postscript_template_file is a postscript file that must be in a very specific format in order for -.B esl-ssdraw +.B esl\-ssdraw to work. The specifics of the format, described below, are likely to change in future versions of -.B esl-ssdraw. +.BR esl\-ssdraw . The -.B postscript_output_file +.I postscript_output_file files generated by -.B esl-ssdraw +.B esl\-ssdraw will not be valid -.B postscript_template_file +.I postscript_template_file format (i.e. an output file from -.B esl-ssdraw +.B esl\-ssdraw cannot be used as an -.B postscript_template_file +.I postscript_template_file in a subsequent run of the program). +.PP An example -.B postscript_template_file -('trna-ssdraw.ps') is included with the Easel distribution in +.I postscript_template_file +('trna\-ssdraw.ps') is included with the Easel distribution in the 'testsuite/' subdirectory of the top-level 'easel' directory. +.PP The -.B postscript_template_file +.I postscript_template_file is a valid postscript file. It includes postscript commands for drawing a secondary structure. The commands specify x and y coordinates for placing each nucleotide on the page. 
The -.B postscript_template_file +.I postscript_template_file might also contain commands for drawing lines connecting basepaired positions and tick marks indicating every tenth position, though these are not required, as explained below. +.PP If you are unfamiliar with the postscript language, it may be useful for you to know that a postscript page is, by default, 612 points wide and 792 points tall. The (0,0) coordinate of a postscript file is at the bottom left corner of the page, (0,792) is the top left, (612,0) is the bottom right, and (612,792) is the top right. -.B esl-ssdraw +.B esl\-ssdraw uses 8 point by 8 point cells for drawing positions of the consensus secondary structure. The 'scale' section of the -.B postscript_template_file +.I postscript_template_file allows for different 'zoom levels', as described below. Also, it is important to know that postscript lines beginning with '%' are considered comments and do not include postscript commands. +.PP An -.B esl-ssdraw -.B postscript_template_file +.B esl\-ssdraw +.I postscript_template_file contains n >= 1 pages, each specifying a consensus secondary structure diagram. Each page is delimited by a 'showpage' line in an 'ignore' section (as described below). -.B esl-ssdraw +.B esl\-ssdraw will read all pages of the -.B postscript_template_file +.I postscript_template_file and then choose the appropriate one that corresponds with the alignment in -.B msafile +.I msafile based on the consensus (nongap RF) length of the alignment. For an alignment of consensus length , the first page of -.B postscript_template_file +.I postscript_template_file that has a structure diagram with consensus length will be used as the template structure for the alignment. +.PP Each page of -.B postscript_template_file +.I postscript_template_file contains blocks of text organized into seven different possible sections. 
Each section must begin with a single line '% begin ' and end with a single line '% end ' and @@ -259,14 +274,16 @@ bpconnects'. The n >=1 lines in between the begin and end lines of each section must be in a specific format that differs for each section as described below. +.PP Importantly, each page must end with an 'ignore' section that includes a single line 'showpage' between the begin and end lines. This lets -.B esl-ssdraw +.B esl\-ssdraw know that a page has ended and another might follow. +.PP Each page of a -.B postscript_template_file +.I postscript_template_file must include a single 'modelname' section. This section must include exactly one line in between its begin and end lines. This line must begin with a '%' character @@ -276,8 +293,9 @@ as the model name and will appear on each page of in the header section. If the name is more than 16 characters, it will be truncated in the output. +.PP Each page of a -.B postscript_template_file +.I postscript_template_file must include a single 'legend' section. This section must include exactly one line in between its begin and end lines. This line must be formatted as '% ', where is an integer @@ -288,14 +306,15 @@ consensus position ; specifies the size of a cell in the legend and specifies how many extra points should be between the right hand edge of the legend and the end of the page. the offset of the right hand end of the legend . For example, the line '% 34 --40. -30. 12 0.' specfies that the legend be placed 40 points to the left +\-40. \-30. 12 0.' specifies that the legend be placed 40 points to the left and 30 points below the 34th consensus position, that cells appearing in the legend be squares of size 12 points by 12 points, and that the right hand side of the legend flush against the right hand edge of the printable page. +.PP Each page of a -.B postscript_template_file +.I postscript_template_file must include a single 'scale' section.
This section must include exactly one line in between its begin and end lines. This line must be formatted as ' scale', where and are both positive @@ -313,39 +332,42 @@ of about 0.6), and for smaller RNAs, a scale of more than 1.0 might be desirable (tRNA (about 70 nt) uses a scale of 1.7). The best way to determine the exact scale to use is trial and error. +.PP Each page of a -.B postscript_template_file +.I postscript_template_file can include n >= 0 'regurgitate' sections. These sections can include any number of lines. The text in this section will not be parsed by -.B esl-ssdraw +.B esl\-ssdraw but will be included in each page of -.B postscript_output_file. +.IR postscript_output_file . The format of the lines in this section must therefore be valid postscript commands. An example of content that might be in a regurgitate section are commands to draw lines and text annotating the anticodon on a tRNA secondary structure diagram. +.PP Each page of a -.B postscript_template_file +.I postscript_template_file must include at least 1 'ignore' section. One of these sections must include a single line that reads 'showpage'. This section should be placed at the end of each page of the template file. Other ignore sections can include any number of lines. The text in these section will not be parsed by -.B esl-ssdraw +.B esl\-ssdraw nor will it be included in each page of -.B postscript_output_file. +.IR postscript_output_file . An ignore section can contain comments or postscript commands that draw features of the -.B postscript_template_file +.I postscript_template_file that are unwanted in the -.B postscript_output_file. +.IR postscript_output_file . +.PP Each page of a -.B postscript_template_file +.I postscript_template_file must include a single 'text nucleotides' section. This section must include exactly lines, indicating that the consensus secondary structure has exactly nucleotide positions.
Each line must be of @@ -355,12 +377,13 @@ specifying the location of the nucleotide on the page, they should be positive real numbers. The best way to determine what these coordinates should be is manually by trial and error, by inspecting the resulting structure as you add each nucleotide. Note that -.B esl-ssdraw +.B esl\-ssdraw will color an 8 point by 8 point cell for each position, so nucleotides should be placed about 8 points apart from each other. +.PP Each page of a -.B postscript_template_file +.I postscript_template_file may or may not include a single 'text positiontext' section. This section can include n >= 1 lines, each specifying text to be placed next to specific positions of the structure, for example, to number them. @@ -372,53 +395,58 @@ manually by trial and error, by inspecting the resulting diagram as you add each line. +.PP Each page of a -.B postscript_template_file +.I postscript_template_file may or may not include a single 'lines positionticks' section. This section can include n >= 1 lines, each specifying the location of a tick mark on the diagram. Each line must be of the format ' moveto show'. A tick mark (line of width 2.0) will be drawn from point (,) to point (,) on each page of -.B postscript_output_file. +.I postscript_output_file. Currently, the best way to determine what these coordinates should be is manually by trial and error, by inspecting the resulting diagram as you add each line. +.PP Each page of a -.B postscript_template_file +.I postscript_template_file may or may not include a single 'lines bpconnects' section. This section must include lines, where is the number of basepairs in the consensus structure of the input -.B msafile +.I msafile annotated as #=GC SS_cons. Each line should connect two basepaired positions in the consensus structure diagram. Each line must be of the format ' moveto show'. A line will be drawn from point (,) to point (,) on each page of -.B postscript_output_file. +.I postscript_output_file. 
Currently, the best way to determine what these coordinates should be is manually by trial and error, by inspecting the resulting diagram as you add each line. + + .SH REQUIRED MEMORY +.PP The memory required by -.B esl-ssdraw +.B esl\-ssdraw will be equal to roughly the larger of 2 Mb and the size of the first alignment in -.B msafile. +.IR msafile . If the -.B --small +.B \-\-small option is used, the memory required will be independent of the alignment size. To use -.B --small +.B \-\-small the alignment must be in Pfam format, a non-interleaved (1 line/seq) version of Stockholm format. If the -.B --indi +.B \-\-indi option is used, the required memory may exceed the size of the alignment by up to ten-fold, and the output @@ -429,80 +457,80 @@ may be up to 50 times larger than the .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.B -d +.B \-d Draw the default set of alignment summary diagrams: consensus sequence, information content, mutual information, insert frequency, average insert length, deletion frequency, and average posterior probability (if posterior probability annotation exists in the alignment). These diagrams are also drawn by default (if zero command line options are used), but using the -.B -d +.B \-d option allows the user to add additional pages, such as individual aligned sequences with -.B --indi. +.BR \-\-indi . .TP -.BI --mask " " +.BI \-\-mask " " Read the mask from file -.I , +.IR , and draw positions differently in -.B postscript_output_file +.I postscript_output_file depending on whether they are included or excluded by the mask. .I must contain a single line of length with only '0' and '1' characters. is the number of nongap characters in the reference (#=GC RF) annotation of the first alignment in -.B msafile +.IR msafile . A '0' at position of the mask indicates position is excluded by the mask, and a '1' indicates that position is included by the mask.
.TP -.B --small +.B \-\-small Operate in memory saving mode. Without -.B --indi, +.BR \-\-indi , required RAM will be independent of the size of the alignment in -.B msafile. +.IR msafile . With -.B --indi, +.BR \-\-indi , the required RAM will be roughly ten times the size of the alignment in -.B msafile. +.IR msafile . For -.B --small +.B \-\-small to work, the alignment must be in Pfam Stockholm (non-interleaved 1 line/seq) format. .TP -.B --rf +.B \-\-rf Add a page to -.B postscript_output_file +.I postscript_output_file showing the reference sequence from the #=GC RF annotation in -.B msafile. +.IR msafile . By default, basepaired nucleotides will be colored based on what type of basepair they are. To turn this off, use -.B --no-bp. +.BR \-\-no\-bp . This page is drawn by default (if zero command-line options are used). .TP -.B --info +.B \-\-info Add a page to -.B postscript_output_file +.I postscript_output_file with consensus (nongap RF) positions colored based on their information content from the alignment. -Information content is calculated as 2.0 - H, where H = sum_x p_x +Information content is calculated as 2.0 \- H, where H = \-sum_x p_x log_2 p_x for x in {A,C,G,U}. This page is drawn by default (if zero command-line options are used). .TP -.B --mutinfo +.B \-\-mutinfo Add a page to -.B postscript_output_file +.I postscript_output_file with basepaired consensus (nongap RF) positions colored based on the amount of mutual information they have in the alignment. Mutual information is sum_{x,y} p_{x,y} log_2 ((p_x * p_y) / p_{x,y}, where x @@ -517,9 +545,9 @@ basepair are counted. This page is drawn by default (if zero command-line options are used). .TP -.B --ifreq +.B \-\-ifreq Add a page to -.B postscript_output_file +.I postscript_output_file with each consensus (nongap RF) position colored based on the fraction of sequences that span each position that have at least 1 inserted nucleotide after the position. @@ -530,9 +558,9 @@ c >= a.
This page is drawn by default (if zero command-line options are used). .TP -.B --iavglen +.B \-\-iavglen Add a page to -.B postscript_output_file +.I postscript_output_file with each consensus (nongap RF) position colored based on average length of insertions that occur after it. The average is calculated as the total number of inserted nucleotides after position x, divided by the @@ -540,17 +568,17 @@ number of sequences that have at least 1 inserted nucleotide after position x (so the minimum possible average insert length is 1.0). .TP -.B --dall +.B \-\-dall Add a page to -.B postscript_output_file +.I postscript_output_file with each consensus (nongap RF) position colored based on the fraction of sequences that have a gap (delete) at the position. This page is drawn by default (if zero command-line options are used). .TP -.B --dint +.B \-\-dint Add a page to -.B postscript_output_file +.I postscript_output_file with each consensus (nongap RF) position colored based on the fraction of sequences that have an internal gap (delete) at the position. An internal gap in a sequence is one that occurs after (5' of) the @@ -559,9 +587,9 @@ sequence's first aligned nucleotide and after This page is drawn by default (if zero command-line options are used). .TP -.B --prob +.B \-\-prob Add a page to -.B postscript_output_file +.I postscript_output_file with positions colored based on average posterior probability (PP). The alignment must contain #=GR PP annotation for all sequences. PP annotation is converted to numerical PP values as follows: '*' = 0.975, '9' = @@ -570,9 +598,9 @@ converted to numerical PP values as follows: '*' = 0.975, '9' = This page is drawn by default (if zero command-line options are used). .TP -.B --span +.B \-\-span Add a page to -.B postscript_output_file +.I postscript_output_file with consensus (nongap RF) positions colored based on the fraction of sequences that 'span' the position. 
A sequence s spans consensus position x that is actual alignment @@ -581,104 +609,110 @@ b <= a and at least one nongap nucleotide aligned to a consensus position c >= a. This page is drawn by default (if zero command-line options are used). + .SH OPTIONS FOR DRAWING INDIVIDUAL ALIGNED SEQUENCES .TP -.B --indi +.B \-\-indi Add a page displaying the aligned nucleotides in their corresponding consensus positions of the structure diagram for each aligned sequence in the alignment. By default, basepaired nucleotides will be colored based on what type of basepair they are. To turn this off, use -.B --no-bp. +.B \-\-no\-bp. If posterior probability information (#=GR PP) exists in the alignment, one additional page per sequence will be drawn displaying the posterior probabilities. .TP -.B -f +.B \-f With -.B --indi, +.BR \-\-indi , force -.B esl-ssdraw +.B esl\-ssdraw to create a diagram, even if it is predicted to be large (> 100 Mb). By default, if the predicted size exceeds 100 Mb, -.B esl-ssdraw +.B esl\-ssdraw will fail with a warning. + .SH OPTIONS FOR OMITTING PARTS OF THE DIAGRAMS .TP -.BI --no-leg +.B \-\-no\-leg Omit the legend on all pages of -.B postscript_output_file. +.IR postscript_output_file . .TP -.BI --no-head +.B \-\-no\-head Omit the header on all pages of -.B postscript_output_file. +.IR postscript_output_file . .TP -.BI --no-foot +.B \-\-no\-foot Omit the footer on all pages of -.B postscript_output_file. +.IR postscript_output_file . + + .SH OPTIONS FOR SIMPLE TWO-COLOR MASK DIAGRAMS .TP -.B --mask-col +.B \-\-mask\-col With -.B --mask, -.B postscript_output_file +.BR \-\-mask , +.I postscript_output_file will contain exactly 1 page showing positions included by the mask as black squares, and positions excluded as pink squares. 
.TP -.BI --mask-diff " " +.BI \-\-mask\-diff " " With -.BI --mask " " +.BI \-\-mask " " and -.B mask-col, -.B postscript_output_file +.BR \-\-mask\-col , +.I postscript_output_file will contain one additional page comparing the mask from .I and the mask from -.I . +.IR . Positions will be colored based on whether they are included by one mask and not the other, excluded by both masks, and included by both masks. + .SH EXPERT OPTIONS FOR CONTROLLING INDIVIDUAL SEQUENCE DIAGRAMS .TP -.B --no-pp +.B \-\-no\-pp When used in combination with -.B --indi, +.BR \-\-indi , do not draw posterior probability structure diagrams for each sequence, even if the alignment has PP annotation. .TP -.B --no-bp +.B \-\-no\-bp Do not color basepaired nucleotides based on their basepair type. .TP -.B --no-ol +.B \-\-no\-ol When used in combination with -.B --indi, +.BR \-\-indi , do not outline nucleotides that differ from the majority rule consensus nucleotide given the alignment. .TP -.B --no-ntpp +.B \-\-no\-ntpp When used in combination with -.B --indi, +.BR \-\-indi , do not draw nucleotides on the individual sequence posterior probability diagrams. + .SH EXPERT OPTIONS RELATED TO CONSENSUS SEQUENCE DEFINITION .TP -.B --no-cnt +.B \-\-no\-cnt Do not draw consensus nucleotides on alignment statistic diagrams (such as information content diagrams). By default, the consensus nucleotide is defined as the most frequent nucleotide in the alignment at the @@ -688,19 +722,19 @@ fraction of the aligned sequences (that do not contain a gap at the position) are capitalized. By default .I is 0.75, but can be changed with the -.BI --cthresh " " +.BI \-\-cthresh " " option. .TP -.BI --cthresh " " +.BI \-\-cthresh " " Specify the threshold for capitalizing consensus nucleotides defined by the majority rule (i.e. when -.B --cambig +.B \-\-cambig is not enabled) as -.I . +.IR . 
.TP -.B --cambig +.B \-\-cambig Change how consensus nucleotides are calculated from majority rule to the least ambiguous IUPAC nucleotide that represents at least .I @@ -708,46 +742,48 @@ fraction of the nongap nucleotides at each consensus position. By default .I is 0.9, but can be changed with the -.BI --athresh " " +.BI \-\-athresh " " option. .TP -.BI --athresh " " +.BI \-\-athresh " " With -.B --cambig, +.BR \-\-cambig , specify the threshold for defining consensus nucleotides is the least ambiguous IUPAC nucleotide that represents at least .I fraction of the nongap nucleotides at each position. + .SH EXPERT OPTIONS CONTROLLING STYLE OF MASKING POSITIONS .TP -.B --mask-u +.B \-\-mask\-u With -.B --mask, +.BR \-\-mask , change the style of masked columns to squares. .TP -.B --mask-x +.B \-\-mask\-x With -.B --mask, -change the style of masked columns to 'x's +.BR \-\-mask , +change the style of masked columns to x's. .TP -.B --mask-a +.B \-\-mask\-a With -.B --mask +.B \-\-mask and -.B --mask-u +.B \-\-mask\-u or -.B --mask-x -draw the alternative style of square or 'x' masks +.B \-\-mask\-x +draw the alternative style of square or 'x' masks. + .SH EXPERT OPTIONS RELATED TO INPUT FILES .TP -.BI --dfile " " +.BI \-\-dfile " " Read the 'draw file' .I which specifies numerical values for each consensus position in one or @@ -785,14 +821,14 @@ single page. A draw file specifying pages should include exactly * ( + 4) lines. .TP -.BI --efile " " +.BI \-\-efile " " Read the 'expert draw file' .I which specifies the colors and nucleotides to draw on each consensus position in one or more postscript pages. Unlike with the -.B --dfile +.B \-\-dfile option, no legend will be drawn when -.B --efile +.B \-\-efile is used. For each page, the draw file must include lines, each with four or five tab-delimited tokens. The first four tokens on line @@ -811,13 +847,13 @@ single page. A expert draw file specifying pages should include exactly * ( + 1) lines. 
.TP -.BI --ifile " " +.BI \-\-ifile " " Read insert information from the file -.I , +.IR , which may have been created with INFERNAL's -.B cmalign +.BR cmalign (1) program. The insert information in -.B msafile +.I msafile will be ignored and the information from .I will supersede it. Inserts are columns that are gaps in the reference diff --git a/miniapps/esl-translate.man.in b/miniapps/esl-translate.man.in index e435e35b..3988cef8 100644 --- a/miniapps/esl-translate.man.in +++ b/miniapps/esl-translate.man.in @@ -1,72 +1,69 @@ -.TH "esl-translate" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-translate" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-translate - translate DNA sequence in six frames into individual ORFs +esl\-translate \- translate DNA sequence in six frames into individual ORFs .SH SYNOPSIS -.B esl-translate -.I [options] +.B esl\-translate +[\fIoptions\fR] .I seqfile .SH DESCRIPTION -.pp +.PP Given a .I seqfile containing DNA or RNA sequences, -.B esl-translate +.B esl\-translate outputs a six-frame translation of them as individual open reading frames in FASTA format. -.pp +.PP By default, only open reading frames greater than 20aa are reported. This minimum ORF length can be changed with the -.I -l +.B \-l option. -.pp +.PP By default, no specific initiation codon is required, and any amino acid can start an open reading frame. -This is because -.I esl-translate +This is so +.B esl\-translate may be used on sequence fragments, eukaryotic genes with introns, or other cases where we do not want to assume that ORFs are complete coding regions. This behavior can be changed. With the -.I -m +.B \-m option, ORFs start with an initiator AUG Met. With the -.I -M +.B \-M option, ORFs start with any of the initiation codons allowed by the genetic code. For example, the "standard" code (NCBI transl_table 1) allows AUG, CUG, and UUG as initiators. 
When -.I -m +.B \-m or -.I -M +.B \-M are used, an initiator is always translated to Met (even if the initiator is something like UUG or CUG that doesn't encode Met as an elongator). -.pp +.PP If .I seqfile -is - (a single dash), input is read from the stdin pipe. This +is \- (a single dash), input is read from the stdin pipe. This (combined with the output being a standard FASTA file) allows -.B esl-translate +.B esl\-translate to be used in command line incantations. If .I seqfile ends in .gz, it is assumed to be a gzip-compressed file, and Easel will try to read it as a stream from -.B gunzip -c. +\fBgunzip \-c\fR. .SH OUTPUT FORMAT -.pp - - +.PP The output FASTA name/description line contains information about the source and coordinates of each ORF. Each ORF is named .B orf1, @@ -76,11 +73,12 @@ FASTA name/desc line contains 4 additional fields, followed by the description of the source sequence: .TP -.B source= - is the name of the source DNA/RNA sequence. +\fBsource\fR=\fI\fR +.I +is the name of the source DNA/RNA sequence. .TP -.B coords=.. +\fBcoords\fR=\fIstart\fR..\fIend\fR Coords, 1..L, for the translated ORF in a source DNA sequence of length L. If start is greater than end, the ORF is on the bottom (reverse complement) strand. The start is the first nucleotide of the @@ -89,11 +87,11 @@ stop codon is not included in the coordinates (unlike in CDS annotation in GenBank, for example.) .TP -.B length= +\fBlength\fR=\fI\fR Length of the ORF in amino acids. .TP -.B frame= +\fBframe\fR=\fI\fR Which frame the ORF is in. Frames 1..3 are the top strand; 4..6 are the bottom strand. Frame 1 starts at nucleotide 1. Frame 4 starts at nucleotide L. @@ -102,10 +100,10 @@ nucleotide L. .SH ALTERNATIVE GENETIC CODES -.pp +.PP By default, the "standard" genetic code is used (NCBI transl_table 1). 
Any NCBI genetic code transl_table can be selected with the -.I -c +.B \-c option, as follows: .TP @@ -173,7 +171,7 @@ at a link titled .SH IUPAC DEGENERACY CODES IN DNA -.pp +.PP DNA sequences may contain IUPAC degeneracy codes, such as N, R, Y, etc. If all codons consistent with a degenerate codon translate to the same amino acid (or to a stop), that translation is done; otherwise, @@ -182,14 +180,14 @@ are stops). For example, in the standard code, UAR translates to * (stop), GGN translates to G (glycine), NNN translates to X, and UGR translates to X (it could be either a UGA stop or a UGG Trp). -.pp +.PP Degenerate initiation codons are handled essentially the same. If all codons consistent with the degenerate codon are legal initiators, then the codon is allowed to initiate a new ORF. Stop codons are never a legal initiator (not only with -.I -m +.B \-m or -.I -M +.B \-M but also with the default of allowing any amino acid to initiate), so degenerate codons consistent with a stop cannot be initiators. For example, NNN cannot initiate an ORF, nor can UGR -- even @@ -198,7 +196,7 @@ long stretches of N's as long ORFs of X's, which is probably a feature, given the prevalence of artificial runs of N's in genome sequence assemblies. -.pp +.PP Degenerate DNA codons are not translated to degenerate amino acids other than X, even when that is possible. For example, SAR and MUH are decoded as X, not Z (Q|E) and J (I|L). The extra complexity @@ -208,15 +206,15 @@ needed for a degenerate to degenerate translation doesn't seem worthwhile. .SH OPTIONS .TP -.B -h +.B \-h Print brief help. Includes version number and summary of all options. Also includes a list of the available NCBI transl_tables and their numerical codes, for the -.I -c +.B \-c option. .TP -.BI -c " " +.BI \-c " " Choose alternative genetic code .I where @@ -224,17 +222,17 @@ where is the numerical code of one of the NCBI transl_tables. 
.TP -.BI -l " " +.BI \-l " " Set the minimum reported ORF length to .I aa. .TP -.B -m +.B \-m Require ORFs to start with an initiator codon AUG (Met). .TP -.B -M +.B \-M Require ORFs to start with an initiator codon, as specified by the allowed initiator codons in the NCBI transl_table. In the default Standard code, AUG, CUG, and UUG are allowed as initiators. An @@ -242,41 +240,54 @@ initiation codon is always translated as Met, even if it does not normally encode Met as an elongator. .TP -.B -W +.B \-W Use a memory-efficient windowed sequence reader. The default is to read entire DNA sequences into memory, which may become memory limited for some very large eukaryotic chromosomes. The windowed reader cannot reverse complement a nonrewindable input stream, so either -.I +.I seqfile must be a file, or you must use -.I --watson +.B \-\-watson to limit translation to the top strand. .TP -.BI --informat " " -Assert that the input -.I -is in -.I -format. Valid formats include -FASTA, GenBank, EMBL, Uniprot, DDBJ, Stockholm, Clustal, and Phylip. +.BI \-\-informat " " +Assert that input +.I seqfile +is in format +.IR , +bypassing format autodetection. +Common choices for .I -is case-insensitive but must be a complete format name; for example, -.BI --informat " fasta" -or -.BI --informat " embl" -work. +include: +.BR fasta , +.BR embl , +.BR genbank . +Alignment formats also work; +common choices include: +.BR stockholm , +.BR a2m , +.BR afa , +.BR psiblast , +.BR clustal , +.BR phylip . +For more information, and for codes for some less common formats, +see main documentation. +The string +.I +is case-insensitive (\fBfasta\fR or \fBFASTA\fR both work). + .TP -.B --watson +.B \-\-watson Only translate the top strand. .TP -.B --crick +.B \-\-crick Only translate the bottom strand. 
diff --git a/miniapps/esl-weight.man.in b/miniapps/esl-weight.man.in index fd86c1ed..ab90fa57 100644 --- a/miniapps/esl-weight.man.in +++ b/miniapps/esl-weight.man.in @@ -1,103 +1,86 @@ -.TH "esl-weight" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" +.TH "esl\-weight" 1 "@EASEL_DATE@" "Easel @EASEL_VERSION@" "Easel Manual" .SH NAME -.TP -esl-weight - calculate sequence weights in MSA(s) +esl\-weight \- calculate sequence weights in MSA(s) .SH SYNOPSIS - -.B esl-weight -.I [options] +.B esl\-weight +[\fIoptions\fR] .I msafile .SH DESCRIPTION -.pp -.B esl-weight +.PP +.B esl\-weight calculates individual sequence weights for each alignment in -.I msafile, +.I msafile and outputs a new multiple sequence alignment file in Stockholm format with the weights annotated in Stockholm-format -.I #=GS WT +\fB#=GS \fIseqname\fB WT \fIweight\fR lines. The default weighting algorithm is the Gerstein/Sonnhammer/Chothia algorithm. -.pp +.PP If .I msafile -is - (a single dash), -MSA input is read from -.I stdin. +is \- (a single dash), +MSA input is read from stdin. -.pp -The sequences can be of protein or DNA/RNA sequences. All sequences -in the same -.I seqfile -must be either protein or DNA/RNA. The alphabet will be autodetected -unless one of the options -.I --amino, -.I --dna, -or -.I --rna -are given. These options may be useful in automated -pipelines to make -.B esl-weight -more robust; alphabet autodetection is not infallible. - - .SH OPTIONS .TP -.B -h +.B \-h Print brief help; includes version number and summary of all options, including expert options. .TP -.B -g +.B \-g Use the Gerstein/Sonnhammer/Chothia weighting algorithm; this is the default. .TP -.B -p -Use the Henikoff position-based weighting algorithm. +.B \-p +Use the Henikoff position-based weighting algorithm. This is faster +and more memory efficient than the default. .TP -.B -b +.B \-b "BLOSUM weights": use approximately the same rule used in constructing the BLOSUM score matrices. 
This involves single-linkage clustering at some fractional identity threshold (default 0.62; see -.I --id +.B \-\-id option), then for each cluster, splitting a total weight of one uniformly amongst all sequences in the cluster. + .SH EXPERT OPTIONS .TP -.BI --id " " +.BI \-\-id " " Sets the fractional identity threshold used by the BLOSUM weighting rule (option -.I -b -; required), to a number 0<=x<=1. Default is 0.62. +.BR \-b ; +required), to a number 0<=x<=1. Default is 0.62. .TP -.B --amino +.B \-\-amino Assert that the .I msafile contains protein sequences. .TP -.B --dna +.B \-\-dna Assert that the .I msafile contains DNA sequences. .TP -.B --rna +.B \-\-rna Assert that the .I msafile contains RNA sequences.