first commit

gawbul · May 20, 2015 · a77b6fd · a77b6fd
commit a77b6fd
Show file tree

Hide file tree

Showing 6 changed files with 288 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1 @@
+# fastq_rnd_seek
diff --git a/fastq_rnd_seek.py b/fastq_rnd_seek.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# import modules
+import os, random, sys
+
+# parse arguments
+if len(sys.argv) != 2:
+ print "Requires one input filename as an argument."
+ sys.exit()
+filename = sys.argv[1]
+
+# check file exists
+if not os.path.exists(filename):
+ print "Filename doesn't exist."
+ sys.exit()
+
+# define variables
+record_positions = []
+line_positions = {}
+
+# open file
+linenum = 0
+length = 0
+infh = open(filename, 'r')
+for line in infh:
+ line_positions[linenum] = length
+ if line.startswith("@"):
+ position = line_positions[linenum]
+ record_positions.append(position)
+ length += len(line)
+ linenum += 1
+infh.close()
+
+# sort positions
+record_positions = sorted(record_positions)
+print "Number of records: %d" % len(record_positions)
+
+# get random record
+rnd_record = random.choice(record_positions) 
+rnd_record_idx = record_positions.index(rnd_record)
+# check that we haven't selected the last record at random
+if rnd_record_idx == record_positions.index(record_positions[-1]):
+ next_record = rnd_record
+ rnd_record = record_positions[rnd_record_idx - 1]
+else:
+ next_record = record_positions[rnd_record_idx + 1]
+print "Selected record positions: %d, %d" % (rnd_record, next_record)
+print "Record length: %d" % (next_record - rnd_record)
+
+# get average record lengths
+record_lengths = []
+for record in record_positions[:-1]:
+ thispos = record
+ nextpos = record_positions[record_positions.index(record) + 1]
+ record_lengths.append(nextpos - thispos)
+print "Average record length: %d" % sum(record_lengths) / len(record_lengths)
+
+# read record
+infh = open(filename, 'r')
+infh.seek(rnd_record)
+record = infh.read((next_record - rnd_record) - 1)
+infh.close()
+
+# display record
+print "Seq:\n\n%s" % record
diff --git a/fastq_rnd_seek_proper.jl b/fastq_rnd_seek_proper.jl
@@ -0,0 +1,63 @@
+#!/Applications/Julia-0.3.5.app/Contents/Resources/julia/bin/julia
+# required modules
+require("argparse")
+using ArgParse
+
+# parse arguments
+function parse_arguments()
+ s = ArgParseSettings()
+ @add_arg_table s begin
+ "filename"
+ help = "Input filename."
+ required = true
+ "numsamples"
+ help = "Number of sequences to sample."
+ arg_type = Int
+ default = 10000
+ end
+ return parse_args(s)
+end
+
+# get random position in file
+function get_random_pos(rnd_num)
+ return rand(0:rnd_num)
+end
+
+# main function
+function main()
+ # retrieve parsed arguments
+ parsed_args = parse_arguments()
+
+ # set filename
+ filename = parsed_args["filename"]
+ numsamples = parsed_args["numsamples"]
+
+ # check file exists
+ if !isfile(filename)
+ throw(LoadError("", 0, "Filename $(filename) not found."))
+ end
+
+ # get file size
+ infilesize = filesize(filename)
+ println("$(infilesize)")
+
+ # open input file
+ infh = open(filename, "r")
+
+ # iterate number of samples times
+ for i = 1:numsamples
+ pos = get_random_pos(infilesize)
+ seek(infh, pos)
+ println(pos)
+ println(position(infh)) 
+ println(readline(infh))
+ exit()
+ end
+
+ # close file handle
+ close(infh)
+end
+
+# call main function
+main()
+
diff --git a/fastq_rnd_seek_proper.py b/fastq_rnd_seek_proper.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+# import modules
+import mmap, os, random, sys
+from Bio import SeqIO
+from StringIO import StringIO
+
+# parse arguments
+if len(sys.argv) != 3:
+ print "Requires an input filename and integer as arguments."
+ sys.exit()
+filename = sys.argv[1]
+try:
+ num_seqs = int(sys.argv[2])
+except:
+ print "Require a valid integer value for number of sequences to sample."
+ sys.exit()
+
+# check file exists
+if not os.path.exists(filename):
+ print "Filename %s doesn't exist." % filename
+ sys.exit()
+
+# get size of the file
+filesize = os.path.getsize(filename)
+
+# define variables
+observed_offsets = []
+
+# open file
+with open(filename, 'rb') as infh:
+ mm = mmap.mmap(infh.fileno(), 0, prot=mmap.PROT_READ)
+ for i in range(num_seqs):
+ failed = True
+ while failed:
+ # pick a random offset and seek to that position
+ random_offset = random.randrange(filesize)
+ mm.seek(random_offset)
+ # always seek to a position where there is a record in the file
+ if mm.rfind("\n@") == -1:
+ continue
+ # find the first record after the offset
+ current_record_offset = mm.find("\n@") + 1
+ # check not already seen this record
+ if current_record_offset in observed_offsets:
+ continue
+ # add current record to list of observed records 
+ observed_offsets.append(current_record_offset)
+ # seek to current record and find the next
+ mm.seek(current_record_offset)
+ next_record_offset = mm.find("\n@")
+ # check not already in the last record set as end of file if we are
+ if next_record_offset == -1:
+ next_record_offset = filesize
+ # read record
+ seq = mm.read(next_record_offset - current_record_offset)
+ # check if is a valid sequence
+ try:
+ record = SeqIO.read(StringIO(seq), 'fastq')
+ except:
+ continue
+ failed = False
+ mm.close()
diff --git a/fastq_rnd_seek_proper_re.py b/fastq_rnd_seek_proper_re.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+# import modules
+import mmap, os, random, re, sys
+
+# parse arguments
+if len(sys.argv) != 3:
+ print "Requires an input filename and integer as arguments."
+ sys.exit()
+filename = sys.argv[1]
+try:
+ num_seqs = int(sys.argv[2])
+except:
+ print "Require a valid integer value for number of sequences to sample."
+ sys.exit()
+
+# check file exists
+if not os.path.exists(filename):
+ print "Filename %s doesn't exist." % filename
+ sys.exit()
+
+# get size of the file
+filesize = os.path.getsize(filename)
+
+# define variables
+observed_offsets = []
+chunk_size = 4096
+
+# define FASTQ regex
+pattern = re.compile(r'@.+[\n\r]+[A-Z-]+[\n\r]+\+[\n\r]+.+[\n\r]{0,1}')
+
+# open file
+with open(filename, 'rb') as infh:
+ mm = mmap.mmap(infh.fileno(), 0, prot=mmap.PROT_READ)
+ for i in range(num_seqs):
+ match = None
+ while not match:
+ # pick a random offset and seek to that position
+ random_offset = random.randrange(filesize)
+ mm.seek(random_offset)
+ # read in a chunk of the file
+ if not random_offset <= filesize - chunk_size:
+ continue
+ chunk = mm.read(chunk_size)
+ match = re.search(pattern, chunk)
+ if not match:
+ continue
+ start, end = match.span()
+ record_offset = start
+ record = chunk[start:end]
+ # check not already seen this record
+ if record_offset in observed_offsets:
+ continue
+ # add current record to list of observed records 
+ observed_offsets.append(record_offset)
+ mm.close()
diff --git a/strandex.py b/strandex.py
@@ -0,0 +1,43 @@
+import os.path
+from six.moves import range
+import re
+import sys
+import argparse
+
+pattern = re.compile(r'@.+[\n\r]+.+[\n\r]+\+[\n\r]+.+[\n\r]{0,1}')
+
+def run(args):
+ file_size = os.path.getsize(args.fastq.name)
+ step = file_size // args.nreads
+ assert step > 0
+ last_record = ''
+ for i in range(0, file_size, step):
+ match = None
+ ahead = 4000
+ while not match:
+ args.fastq.seek(i)
+ if i + ahead > file_size:
+ break
+ else:
+ chunk = args.fastq.read(ahead)
+ match = re.search(pattern, chunk)
+ if match:
+ start, end = match.span()
+ record = chunk[start:end]
+ if record != last_record:
+ sys.stdout.write(record)
+ last_record = record
+ else:
+ ahead += ahead
+ args.fastq.close()
+
+
+def main():
+ parser = argparse.ArgumentParser(prog='strandex', description="sample an approximate number of reads from a fastq file without reading the entire file")
+ parser.add_argument('fastq', type=argparse.FileType('r'), help="input fastq file")
+ parser.add_argument('-n', '--nreads', type=int, default=2000000, help='approximate number of reads to sample from input (default: %(default)s)')
+ args = parser.parse_args()
+ run(args)
+
+if __name__ == "__main__":
+ main()