From a1acad5b0de31585785ae78876cc7363cc688af3 Mon Sep 17 00:00:00 2001 From: frbot <67206072+frbot@users.noreply.github.com> Date: Tue, 23 Jun 2020 21:36:44 +0100 Subject: [PATCH] Create split_multifasta.pl --- split_multifasta.pl | 422 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 422 insertions(+) create mode 100644 split_multifasta.pl diff --git a/split_multifasta.pl b/split_multifasta.pl new file mode 100644 index 0000000..7c14274 --- /dev/null +++ b/split_multifasta.pl @@ -0,0 +1,422 @@ +#!/usr/bin/perl + +#BEGIN{foreach (@INC) {s/\/usr\/local\/packages/\/local\/platform/}}; +#use lib (@INC,$ENV{"PERL_MOD_DIR"}); +#no lib "$ENV{PERL_MOD_DIR}/i686-linux"; +#no lib "."; + +=head1 NAME + +split_multifasta.pl - split a single FASTA file containing multiple sequences into separate files. + +=head1 SYNOPSIS + +USAGE: split_multifasta.pl + --input_file=/path/to/some_file.fsa + --output_dir=/path/to/somedir + [ --output_list=/path/to/somefile.list + --output_subdir_size=1000 + --output_subdir_prefix=fasta + --seqs_per_file=1 + --compress_output=1 + ] + + + split_multifasta.pl --in snapdmel.aa --output_dir=./ --f=snaa --seqs_per_file=1000 + +=head1 OPTIONS + +B<--input_file,-i> + The input multi-fasta file to split. + +B<--output_dir,-o> + The directory to which the output files will be written. + +B<--output_list,-s> + Write a list file containing the paths of each of the regular output files. This may be useful + for later scripts that can accept a list as input. + +B<--output_file_prefix,-f> + If defined, each file created will have this string prepended to its name. This is ignored unless + writing multiple sequences to each output file using the --seqs_per_file option with a value greater + than 1, else each file created will just be a number. + +B<--output_subdir_size,-u> + If defined, this script will create numbered subdirectories in the output directory, each + containing this many sequences files. Once this limit is reached, another subdirectory + is created. + +B<--output_subdir_prefix,-p> + To be used along with --output_subdir_size, this allows more control of the names of the + subdirectories created. Rather than just incrementing numbers (like 10), each subdirectory + will be named with this prefix (like prefix10). + +B<--compress_output,-c> + Output fasta files will be gzipped when written. + +B<--debug,-d> + Debug level. Use a large number to turn on verbose debugging. + +B<--log,-l> + Log file + +B<--help,-h> + This help message + +=head1 DESCRIPTION + +This script is used to split a single FASTA file containing multiple sequences into separate +files containing one sequence each. + +=head1 INPUT + +The input is defined with --input_file and should be a single fasta file. File extensions are +ignored. When creating this multi-entry FASTA file, one should take care to make the first +*word* after the > symbol a unique value, as it will be used as the file name for that sequence. +For example: + + >gi53791237 Tragulus javanicus p97bcnt gene for p97Bcnt + ACAGGAGAAGAGACTGAAGAGACACGTTCAGGAGAAGAGCAAGAGAAGCCTAAAGAAATGCAAGAAGTTA + AACTCACCAAATCACTTGTTGAAGAAGTCAGGTAACATGACATTCACAAACTTCAAAACTAGTTCTTTAA + AAAGGAACATCTCTCTTTTAATATGTATGCATTATTAATTTATTTACTCATTGGCGTGGAGGAGGAAATG + + >gi15387669 Corynebacterium callunae pCC1 plasmid + ATGCATGCTAGTGTGGTGAGTATGAGCACACACATTCATGGGCACCGCCGGGGTGCAGGGGGGCTTGCCC + CTTGTCCATGCGGGGTGTGGGGCTTGCCCCGCCGATAGAGACCGGCCACCACCATGGCACCCGGTCGCGG + GGTGATCGGCCACCACCACCGCCCCCGGCCACTCTCCCCCTGTCTAGGCCATATTTCAGGCCGTCCACTG + +Whitespace is ignored within the input file. See the OUTPUT section for more on creation of +output files. + +=head1 OUTPUT + +The name of each output sequence file is pulled from the FASTA header of that sequence. The +first *word* after the > symbol will be used as the file name, along with the extension .fsa. +The word is defined as all the text after the > symbol up to the first whitespace. + +If the above example were your input file, two files would be created: + + gi53791237.fsa + gi15387669.fsa + +Any characters other than a-z A-Z 0-9 . _ - in the ID will be changed into an +underscore. This only occurs in the file name; the original FASTA header within the file +will be unmodified. + +You can pass a path to the optional --output_list to create a text file containing the full paths +to each of the FASTA files created by this script. + +Two other optional arguments, --output_subdir_size and --output_subdir_prefix, can be used +on input sets that are too large to write out to one directory. This depends on the limitations +of your file system, but you usually don't want 100,000 files written in the same directory. + +If you have an FASTA file containing 95000 sequences, and use the following option: + + --output_dir=/some/path + --output_subdir_size=30000 + +The following will be created: + + directory file count + --------------------------------- + /some/path/1/ 30000 + /some/path/2/ 30000 + /some/path/3/ 30000 + /some/path/4/ 5000 + +If you choose to create a list file (and you probably want to), it will contain these proper paths. + +You may not want the subdirectories to simply be numbers, as above, so you can use the +--output_subdir_prefix option. For example: + + --output_dir=/some/path + --output_subdir_size=30000 + --output_subdir_prefix=fasta + +The following will be created: + + directory file count + --------------------------------- + /some/path/fasta1/ 30000 + /some/path/fasta2/ 30000 + /some/path/fasta3/ 30000 + /some/path/fasta4/ 5000 + +Finally, you can write multiple sequences to each output file using the --seqs_per_file option, which +can be used along with --outupt_subdir_size and --output_subdir_prefix. The main difference to note +is that, if you use --seqs_per_file, the fasta file created will no longer be named using values +taken from the header, since it will contain multiple headers. Instead, the file will simply be +named using sequential numbers starting at 1 (like 1.fsa). For example: + + --output_dir=/some/path + --output_subdir_size=3000 + --output_subdir_prefix=fasta + --seqs_per_file=10 + +The following will be created: + + directory file count + --------------------------------- + /some/path/fasta1/ 3000 + /some/path/fasta2/ 3000 + /some/path/fasta3/ 3000 + /some/path/fasta4/ 500 + +=head1 CONTACT + + Joshua Orvis + jorvis@tigr.org + +=cut + +use strict; +use Getopt::Long; +# qw(:config no_ignore_case no_auto_abbrev pass_through); +use Pod::Usage; +# BEGIN { +# use Ergatis::Logger; +# } + +my %options = (); +my $results = GetOptions (\%options, + 'input_file|i=s', + 'output_dir|o=s', + 'output_file_prefix|f=s', + 'output_list|s=s', + 'output_subdir_size|u=s', + 'output_subdir_prefix|p=s', + 'seqs_per_file|n|e=s', + 'compress_output|c=s', + 'log|l=s', + 'debug=s', + 'help|h') || pod2usage(); + +# my $logfile = $options{'log'} || Ergatis::Logger::get_default_logfilename(); +# my $logger = new Ergatis::Logger('LOG_FILE'=>$logfile, +# 'LOG_LEVEL'=>$options{'debug'}); +# $logger = $logger->get_logger(); + + +my $logfile = $options{'log'} || "log.file"; +my $logger = new logger('LOG_FILE'=>$logfile, + 'LOG_LEVEL'=>$options{'debug'}); + +## display documentation +if( $options{'help'} ){ + pod2usage( {-exitval => 0, -verbose => 2, -output => \*STDERR} ); +} + +## make sure everything passed was peachy +&check_parameters(\%options); + +## open the list file if one was passed +my $listfh; +if (defined $options{output_list}) { + open($listfh, ">$options{output_list}") || $logger->logdie("couldn't create $options{output_list} list file"); +} + +my $first = 1; +my $seq = ''; +my $header; + +my $sfh; + +## load the sequence file +if ($options{'input_file'} =~ /\.(gz|gzip)$/) { + open ($sfh, "<:gzip", $options{'input_file'}) + || $logger->logdie("can't open sequence file:\n$!"); +} else { + open ($sfh, "<$options{'input_file'}") + || $logger->logdie("can't open sequence file:\n$!"); +} + +my $sub_dir = 1; +my $seq_file_count = 0; + +## keep track of how many sequences are in the current output file +my $seqs_in_file = 0; +my $group_filename_prefix = 1; + +## holds the output file handle +my $ofh; + +while (<$sfh>) { + ## if we find a header line ... + if (/^\>(.*)/) { + + ## write the previous sequence before continuing with this one + unless ($first) { + &writeSequence(\$header, \$seq); + + ## reset the sequence + $seq = ''; + } + + $first = 0; + $header = $1; + + ## else we've found a sequence line + } else { + ## skip it if it is just whitespace + next if (/^\s*$/); + + ## record this portion of the sequence + $seq .= $_; + } +} + +## don't forget the last sequence +&writeSequence(\$header, \$seq); + +exit; + +sub check_parameters { + my $options = shift; + + ## make sure input_file and output_dir were passed + unless ( $options{input_file} && $options{output_dir} ) { + $logger->logdie("You must pass both --input_file and --output_dir"); + } + + ## make sure input_file exists + if (! -e $options{input_file} ) { + if ( -e "$options{input_file}.gz" ) { + $options{input_file} .= '.gz'; + } else { + $logger->logdie("the input file passed ($options{input_file}) cannot be read or does not exist"); + } + } + + ## make sure the output_dir exists + if (! -e "$options{output_dir}") { + $logger->logdie("the output directory passed could not be read or does not exist"); + } + + ## seqs_per_file, if passed, must be at least one + if (defined $options{seqs_per_file} && $options{seqs_per_file} < 1) { + $logger->logdie("seq_per_file setting cannot be less than one"); + } + + ## handle some defaults + $options{output_subdir_size} = 0 unless ($options{output_subdir_size}); + $options{output_subdir_prefix} = '' unless ($options{output_subdir_prefix}); + $options{seqs_per_file} = 1 unless ($options{seqs_per_file}); + $options{output_file_prefix} = '' unless ($options{output_file_prefix}); +} + +sub writeSequence { + my ($header, $seq) = @_; + + ## the id used to write the output file will be the first thing + ## in the header up to the first whitespace. get that. + $$header =~ /^(\S+)/ || $logger->logdie( "can't pull out an id on header $$header" ); + my $id = $1; + + ## because it is going to be the filename, we're going to take out the characters that are bad form to use + ## legal characters = a-z A-Z 0-9 - . _ + $id =~ s/[^a-z0-9\-_.]/_/gi; + + my $dirpath; + + ## if we're writing more than one sequence to a file, change the id from + ## fasta header to the current group file name + if ($options{seqs_per_file} > 1) { + $id = $group_filename_prefix; + + ## did the user ask for a file prefix? + if ( $options{output_file_prefix} ) { + $id = $options{output_file_prefix} . $id; + } + } + + + ## the path depends on whether we are using output subdirectories + if ($options{output_subdir_size}) { + $dirpath = "$options{'output_dir'}/$options{output_subdir_prefix}$sub_dir"; + } else { + $dirpath = "$options{'output_dir'}"; + } + + ## did the user ask for a file prefix? + my $filepath = "$dirpath/$id.fsa"; + + ## take any // out of the filepath + $filepath =~ s|/+|/|g; + + ## write the sequence + $logger->debug("Writing sequence to $filepath") if ($logger->is_debug()); + + ## open a new output file if we need to + ## if we're writing multiple sequences per file, we only open a new + ## one when $seqs_in_file = 0 (first sequence) + if ($seqs_in_file == 0) { + + ## if the directory we want to write to doesn't exist yet, create it + mkdir($dirpath) unless (-e $dirpath); + + + if ($options{'compress_output'}) { + open ($ofh, ">:gzip", $filepath.".gz") + || $logger->logdie("can't create '$filepath.gz':\n$!"); + } else { + open ($ofh, ">$filepath") || $logger->logdie("can't create '$filepath':\n$!"); + + } + $seq_file_count++; + + ## add the file we just wrote to the list, if we were asked to + if (defined $options{output_list}) { + print $listfh "$filepath\n"; + } + } + + ## if we're doing output subdirs and hit our size limit, increment to the next dir + if ($options{output_subdir_size} && $options{output_subdir_size} == $seq_file_count) { + $seq_file_count = 0; + $sub_dir++; + } + + ## write the sequence + print $ofh ">$$header\n$$seq\n"; + $seqs_in_file++; + + ## if we hit the limit of how many we want in each file, set the next file name and + ## reset the count of seqs within the file + if ($options{seqs_per_file} == $seqs_in_file) { + $seqs_in_file = 0; + $group_filename_prefix++; + } +} + + +package logger; + +sub new { + my $packname= shift; + my %args= @_; + my $self= \%args; + bless($self,$packname); + return $self; +} + +sub get_logger { + my $self= shift; + return $self; +} + +sub logdie { + my $self= shift; + die @_; +} + +sub debug { + my $self= shift; + warn @_; +} + +sub is_debug { + shift->{LOG_LEVEL} || 0; +} + + +1;