Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
gawbul committed May 20, 2015
0 parents commit a77b6fd
Show file tree
Hide file tree
Showing 6 changed files with 288 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# fastq_rnd_seek
64 changes: 64 additions & 0 deletions fastq_rnd_seek.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env python
# import modules
import os, random, sys

# parse arguments
if len(sys.argv) != 2:
print "Requires one input filename as an argument."
sys.exit()
filename = sys.argv[1]

# check file exists
if not os.path.exists(filename):
print "Filename doesn't exist."
sys.exit()

# define variables
record_positions = []
line_positions = {}

# open file
linenum = 0
length = 0
infh = open(filename, 'r')
for line in infh:
line_positions[linenum] = length
if line.startswith("@"):
position = line_positions[linenum]
record_positions.append(position)
length += len(line)
linenum += 1
infh.close()

# sort positions
record_positions = sorted(record_positions)
print "Number of records: %d" % len(record_positions)

# get random record
rnd_record = random.choice(record_positions)
rnd_record_idx = record_positions.index(rnd_record)
# check that we haven't selected the last record at random
if rnd_record_idx == record_positions.index(record_positions[-1]):
next_record = rnd_record
rnd_record = record_positions[rnd_record_idx - 1]
else:
next_record = record_positions[rnd_record_idx + 1]
print "Selected record positions: %d, %d" % (rnd_record, next_record)
print "Record length: %d" % (next_record - rnd_record)

# get average record lengths
record_lengths = []
for record in record_positions[:-1]:
thispos = record
nextpos = record_positions[record_positions.index(record) + 1]
record_lengths.append(nextpos - thispos)
print "Average record length: %d" % sum(record_lengths) / len(record_lengths)

# read record
infh = open(filename, 'r')
infh.seek(rnd_record)
record = infh.read((next_record - rnd_record) - 1)
infh.close()

# display record
print "Seq:\n\n%s" % record
63 changes: 63 additions & 0 deletions fastq_rnd_seek_proper.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/Applications/Julia-0.3.5.app/Contents/Resources/julia/bin/julia
# required modules
require("argparse")
using ArgParse

# parse arguments
function parse_arguments()
s = ArgParseSettings()
@add_arg_table s begin
"filename"
help = "Input filename."
required = true
"numsamples"
help = "Number of sequences to sample."
arg_type = Int
default = 10000
end
return parse_args(s)
end

# get random position in file
function get_random_pos(rnd_num)
return rand(0:rnd_num)
end

# main function
function main()
# retrieve parsed arguments
parsed_args = parse_arguments()

# set filename
filename = parsed_args["filename"]
numsamples = parsed_args["numsamples"]

# check file exists
if !isfile(filename)
throw(LoadError("", 0, "Filename $(filename) not found."))
end

# get file size
infilesize = filesize(filename)
println("$(infilesize)")

# open input file
infh = open(filename, "r")

# iterate number of samples times
for i = 1:numsamples
pos = get_random_pos(infilesize)
seek(infh, pos)
println(pos)
println(position(infh))
println(readline(infh))
exit()
end

# close file handle
close(infh)
end

# call main function
main()

62 changes: 62 additions & 0 deletions fastq_rnd_seek_proper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env python
# import modules
import mmap, os, random, sys
from Bio import SeqIO
from StringIO import StringIO

# parse arguments
if len(sys.argv) != 3:
print "Requires an input filename and integer as arguments."
sys.exit()
filename = sys.argv[1]
try:
num_seqs = int(sys.argv[2])
except:
print "Require a valid integer value for number of sequences to sample."
sys.exit()

# check file exists
if not os.path.exists(filename):
print "Filename %s doesn't exist." % filename
sys.exit()

# get size of the file
filesize = os.path.getsize(filename)

# define variables
observed_offsets = []

# open file
with open(filename, 'rb') as infh:
mm = mmap.mmap(infh.fileno(), 0, prot=mmap.PROT_READ)
for i in range(num_seqs):
failed = True
while failed:
# pick a random offset and seek to that position
random_offset = random.randrange(filesize)
mm.seek(random_offset)
# always seek to a position where there is a record in the file
if mm.rfind("\n@") == -1:
continue
# find the first record after the offset
current_record_offset = mm.find("\n@") + 1
# check not already seen this record
if current_record_offset in observed_offsets:
continue
# add current record to list of observed records
observed_offsets.append(current_record_offset)
# seek to current record and find the next
mm.seek(current_record_offset)
next_record_offset = mm.find("\n@")
# check not already in the last record set as end of file if we are
if next_record_offset == -1:
next_record_offset = filesize
# read record
seq = mm.read(next_record_offset - current_record_offset)
# check if is a valid sequence
try:
record = SeqIO.read(StringIO(seq), 'fastq')
except:
continue
failed = False
mm.close()
55 changes: 55 additions & 0 deletions fastq_rnd_seek_proper_re.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env python
# import modules
import mmap, os, random, re, sys

# parse arguments
if len(sys.argv) != 3:
print "Requires an input filename and integer as arguments."
sys.exit()
filename = sys.argv[1]
try:
num_seqs = int(sys.argv[2])
except:
print "Require a valid integer value for number of sequences to sample."
sys.exit()

# check file exists
if not os.path.exists(filename):
print "Filename %s doesn't exist." % filename
sys.exit()

# get size of the file
filesize = os.path.getsize(filename)

# define variables
observed_offsets = []
chunk_size = 4096

# define FASTQ regex
pattern = re.compile(r'@.+[\n\r]+[A-Z-]+[\n\r]+\+[\n\r]+.+[\n\r]{0,1}')

# open file
with open(filename, 'rb') as infh:
mm = mmap.mmap(infh.fileno(), 0, prot=mmap.PROT_READ)
for i in range(num_seqs):
match = None
while not match:
# pick a random offset and seek to that position
random_offset = random.randrange(filesize)
mm.seek(random_offset)
# read in a chunk of the file
if not random_offset <= filesize - chunk_size:
continue
chunk = mm.read(chunk_size)
match = re.search(pattern, chunk)
if not match:
continue
start, end = match.span()
record_offset = start
record = chunk[start:end]
# check not already seen this record
if record_offset in observed_offsets:
continue
# add current record to list of observed records
observed_offsets.append(record_offset)
mm.close()
43 changes: 43 additions & 0 deletions strandex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import os.path
from six.moves import range
import re
import sys
import argparse

pattern = re.compile(r'@.+[\n\r]+.+[\n\r]+\+[\n\r]+.+[\n\r]{0,1}')

def run(args):
file_size = os.path.getsize(args.fastq.name)
step = file_size // args.nreads
assert step > 0
last_record = ''
for i in range(0, file_size, step):
match = None
ahead = 4000
while not match:
args.fastq.seek(i)
if i + ahead > file_size:
break
else:
chunk = args.fastq.read(ahead)
match = re.search(pattern, chunk)
if match:
start, end = match.span()
record = chunk[start:end]
if record != last_record:
sys.stdout.write(record)
last_record = record
else:
ahead += ahead
args.fastq.close()


def main():
parser = argparse.ArgumentParser(prog='strandex', description="sample an approximate number of reads from a fastq file without reading the entire file")
parser.add_argument('fastq', type=argparse.FileType('r'), help="input fastq file")
parser.add_argument('-n', '--nreads', type=int, default=2000000, help='approximate number of reads to sample from input (default: %(default)s)')
args = parser.parse_args()
run(args)

if __name__ == "__main__":
main()

0 comments on commit a77b6fd

Please sign in to comment.