-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit a77b6fd
Showing
6 changed files
with
288 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# fastq_rnd_seek |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
#!/usr/bin/env python | ||
# import modules | ||
import os, random, sys | ||
|
||
# parse arguments | ||
if len(sys.argv) != 2: | ||
print "Requires one input filename as an argument." | ||
sys.exit() | ||
filename = sys.argv[1] | ||
|
||
# check file exists | ||
if not os.path.exists(filename): | ||
print "Filename doesn't exist." | ||
sys.exit() | ||
|
||
# define variables | ||
record_positions = [] | ||
line_positions = {} | ||
|
||
# open file | ||
linenum = 0 | ||
length = 0 | ||
infh = open(filename, 'r') | ||
for line in infh: | ||
line_positions[linenum] = length | ||
if line.startswith("@"): | ||
position = line_positions[linenum] | ||
record_positions.append(position) | ||
length += len(line) | ||
linenum += 1 | ||
infh.close() | ||
|
||
# sort positions | ||
record_positions = sorted(record_positions) | ||
print "Number of records: %d" % len(record_positions) | ||
|
||
# get random record | ||
rnd_record = random.choice(record_positions) | ||
rnd_record_idx = record_positions.index(rnd_record) | ||
# check that we haven't selected the last record at random | ||
if rnd_record_idx == record_positions.index(record_positions[-1]): | ||
next_record = rnd_record | ||
rnd_record = record_positions[rnd_record_idx - 1] | ||
else: | ||
next_record = record_positions[rnd_record_idx + 1] | ||
print "Selected record positions: %d, %d" % (rnd_record, next_record) | ||
print "Record length: %d" % (next_record - rnd_record) | ||
|
||
# get average record lengths | ||
record_lengths = [] | ||
for record in record_positions[:-1]: | ||
thispos = record | ||
nextpos = record_positions[record_positions.index(record) + 1] | ||
record_lengths.append(nextpos - thispos) | ||
print "Average record length: %d" % sum(record_lengths) / len(record_lengths) | ||
|
||
# read record | ||
infh = open(filename, 'r') | ||
infh.seek(rnd_record) | ||
record = infh.read((next_record - rnd_record) - 1) | ||
infh.close() | ||
|
||
# display record | ||
print "Seq:\n\n%s" % record |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#!/Applications/Julia-0.3.5.app/Contents/Resources/julia/bin/julia | ||
# required modules | ||
require("argparse") | ||
using ArgParse | ||
|
||
# parse arguments | ||
function parse_arguments() | ||
s = ArgParseSettings() | ||
@add_arg_table s begin | ||
"filename" | ||
help = "Input filename." | ||
required = true | ||
"numsamples" | ||
help = "Number of sequences to sample." | ||
arg_type = Int | ||
default = 10000 | ||
end | ||
return parse_args(s) | ||
end | ||
|
||
# get random position in file | ||
function get_random_pos(rnd_num) | ||
return rand(0:rnd_num) | ||
end | ||
|
||
# main function | ||
function main() | ||
# retrieve parsed arguments | ||
parsed_args = parse_arguments() | ||
|
||
# set filename | ||
filename = parsed_args["filename"] | ||
numsamples = parsed_args["numsamples"] | ||
|
||
# check file exists | ||
if !isfile(filename) | ||
throw(LoadError("", 0, "Filename $(filename) not found.")) | ||
end | ||
|
||
# get file size | ||
infilesize = filesize(filename) | ||
println("$(infilesize)") | ||
|
||
# open input file | ||
infh = open(filename, "r") | ||
|
||
# iterate number of samples times | ||
for i = 1:numsamples | ||
pos = get_random_pos(infilesize) | ||
seek(infh, pos) | ||
println(pos) | ||
println(position(infh)) | ||
println(readline(infh)) | ||
exit() | ||
end | ||
|
||
# close file handle | ||
close(infh) | ||
end | ||
|
||
# call main function | ||
main() | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/usr/bin/env python | ||
# import modules | ||
import mmap, os, random, sys | ||
from Bio import SeqIO | ||
from StringIO import StringIO | ||
|
||
# parse arguments | ||
if len(sys.argv) != 3: | ||
print "Requires an input filename and integer as arguments." | ||
sys.exit() | ||
filename = sys.argv[1] | ||
try: | ||
num_seqs = int(sys.argv[2]) | ||
except: | ||
print "Require a valid integer value for number of sequences to sample." | ||
sys.exit() | ||
|
||
# check file exists | ||
if not os.path.exists(filename): | ||
print "Filename %s doesn't exist." % filename | ||
sys.exit() | ||
|
||
# get size of the file | ||
filesize = os.path.getsize(filename) | ||
|
||
# define variables | ||
observed_offsets = [] | ||
|
||
# open file | ||
with open(filename, 'rb') as infh: | ||
mm = mmap.mmap(infh.fileno(), 0, prot=mmap.PROT_READ) | ||
for i in range(num_seqs): | ||
failed = True | ||
while failed: | ||
# pick a random offset and seek to that position | ||
random_offset = random.randrange(filesize) | ||
mm.seek(random_offset) | ||
# always seek to a position where there is a record in the file | ||
if mm.rfind("\n@") == -1: | ||
continue | ||
# find the first record after the offset | ||
current_record_offset = mm.find("\n@") + 1 | ||
# check not already seen this record | ||
if current_record_offset in observed_offsets: | ||
continue | ||
# add current record to list of observed records | ||
observed_offsets.append(current_record_offset) | ||
# seek to current record and find the next | ||
mm.seek(current_record_offset) | ||
next_record_offset = mm.find("\n@") | ||
# check not already in the last record set as end of file if we are | ||
if next_record_offset == -1: | ||
next_record_offset = filesize | ||
# read record | ||
seq = mm.read(next_record_offset - current_record_offset) | ||
# check if is a valid sequence | ||
try: | ||
record = SeqIO.read(StringIO(seq), 'fastq') | ||
except: | ||
continue | ||
failed = False | ||
mm.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#!/usr/bin/env python | ||
# import modules | ||
import mmap, os, random, re, sys | ||
|
||
# parse arguments | ||
if len(sys.argv) != 3: | ||
print "Requires an input filename and integer as arguments." | ||
sys.exit() | ||
filename = sys.argv[1] | ||
try: | ||
num_seqs = int(sys.argv[2]) | ||
except: | ||
print "Require a valid integer value for number of sequences to sample." | ||
sys.exit() | ||
|
||
# check file exists | ||
if not os.path.exists(filename): | ||
print "Filename %s doesn't exist." % filename | ||
sys.exit() | ||
|
||
# get size of the file | ||
filesize = os.path.getsize(filename) | ||
|
||
# define variables | ||
observed_offsets = [] | ||
chunk_size = 4096 | ||
|
||
# define FASTQ regex | ||
pattern = re.compile(r'@.+[\n\r]+[A-Z-]+[\n\r]+\+[\n\r]+.+[\n\r]{0,1}') | ||
|
||
# open file | ||
with open(filename, 'rb') as infh: | ||
mm = mmap.mmap(infh.fileno(), 0, prot=mmap.PROT_READ) | ||
for i in range(num_seqs): | ||
match = None | ||
while not match: | ||
# pick a random offset and seek to that position | ||
random_offset = random.randrange(filesize) | ||
mm.seek(random_offset) | ||
# read in a chunk of the file | ||
if not random_offset <= filesize - chunk_size: | ||
continue | ||
chunk = mm.read(chunk_size) | ||
match = re.search(pattern, chunk) | ||
if not match: | ||
continue | ||
start, end = match.span() | ||
record_offset = start | ||
record = chunk[start:end] | ||
# check not already seen this record | ||
if record_offset in observed_offsets: | ||
continue | ||
# add current record to list of observed records | ||
observed_offsets.append(record_offset) | ||
mm.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import os.path | ||
from six.moves import range | ||
import re | ||
import sys | ||
import argparse | ||
|
||
pattern = re.compile(r'@.+[\n\r]+.+[\n\r]+\+[\n\r]+.+[\n\r]{0,1}') | ||
|
||
def run(args): | ||
file_size = os.path.getsize(args.fastq.name) | ||
step = file_size // args.nreads | ||
assert step > 0 | ||
last_record = '' | ||
for i in range(0, file_size, step): | ||
match = None | ||
ahead = 4000 | ||
while not match: | ||
args.fastq.seek(i) | ||
if i + ahead > file_size: | ||
break | ||
else: | ||
chunk = args.fastq.read(ahead) | ||
match = re.search(pattern, chunk) | ||
if match: | ||
start, end = match.span() | ||
record = chunk[start:end] | ||
if record != last_record: | ||
sys.stdout.write(record) | ||
last_record = record | ||
else: | ||
ahead += ahead | ||
args.fastq.close() | ||
|
||
|
||
def main(): | ||
parser = argparse.ArgumentParser(prog='strandex', description="sample an approximate number of reads from a fastq file without reading the entire file") | ||
parser.add_argument('fastq', type=argparse.FileType('r'), help="input fastq file") | ||
parser.add_argument('-n', '--nreads', type=int, default=2000000, help='approximate number of reads to sample from input (default: %(default)s)') | ||
args = parser.parse_args() | ||
run(args) | ||
|
||
if __name__ == "__main__": | ||
main() |