diff subsamplr.py @ 0:b2915e7e9dfa
"GTSubsampler initial commit"
|          |                                 |
|----------|---------------------------------|
| author   | jpayne                          |
| date     | Fri, 19 Feb 2021 13:18:54 -0500 |
| parents  |                                 |
| children | a90a883f88f9                    |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/subsamplr.py	Fri Feb 19 13:18:54 2021 -0500
@@ -0,0 +1,110 @@
+from bz2 import open as bzopen
+from gzip import open as gzopen
+
+from contextlib import ExitStack
+from itertools import zip_longest
+from pathlib import Path
+from sys import argv
+
+import random
+
+
+usage = """
+usage: subsamplr.py <in.fastq[.gz|.bz2]> <out.fastq> <coverage> <genome size> [seed]
+       subsamplr.py <fwd reads> <rev reads> <fwd out> <rev out> <coverage> <genome size> [seed]
+"""
+
+def grouper(iterable, n, fillvalue=None):
+    "Collect data into fixed-length chunks or blocks"
+    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
+    args = [iter(iterable)] * n
+    return zip_longest(*args, fillvalue=fillvalue)
+
+# file compression signatures
+magics = {
+    b'\x1f\x8b\x08': gzopen,
+    b'\x42\x5a\x68': bzopen,
+}
+
+def sniff(path):
+    "Sniff the first three bytes of the file to determine format based on the magic number."
+    with open(path, 'rb') as fp:
+        magic = fp.read(3)
+    return magics.get(magic, open)
+
+
+def coverage(collection, genome_size):
+    "collection holds 1- or 2-tuples of FASTQ records (4-tuples of lines); a record's 2nd line is the read sequence"
+    return sum(len(read[0][1]) for read in collection) / genome_size # the reverse read doesn't contribute to this coverage estimate, so we can ignore it
+
+
+try:
+    fin, rin, fout, rout, cov, gen_size, *opts = argv[1:]
+    ins = [fin, rin]
+    outs = [fout, rout]
+except ValueError: # not enough values to unpack; fall back to single-end mode
+    try:
+        fin, fout, cov, gen_size, *opts = argv[1:]
+        ins = [fin]
+        outs = [fout]
+    except ValueError:
+        print(usage)
+        quit(1)
+try:
+    cov = float(cov)
+    gen_size = int(gen_size)
+except ValueError:
+    print("Desired coverage and assumed genome size should be numbers")
+    print(usage)
+    quit(1)
+
+seed = "ed2b99d842cddc1ac81d7c01a0bf0555"
+if opts:
+    seed = opts[0]
+random.seed(seed)
+
+assert len(ins) == len(outs)
+file_openers = [sniff(path) for path in ins] # output format determined by input format
+with ExitStack() as stack:
+    ins = [stack.enter_context(openn(path, 'r')) for openn, path in zip(file_openers, ins)] # opened input files
+    inns = [iter(grouper(inn, 4)) for inn in ins] # stateful 4-ply iterator over lines in the input
+    outs = [stack.enter_context(openn(path, 'w')) for openn, path in zip(file_openers, outs)] # opened output files
+
+    # https://en.m.wikipedia.org/wiki/Reservoir_sampling
+
+    reservoir = []
+    # the reservoir holds 1- or 2-tuples of 4-tuples representing the 4 lines of a fastq record
+    # fill it until it reaches the desired coverage, which consumes reads
+    # from the open files
+    for readpair in zip(*inns):
+        reservoir.append(readpair)
+        if coverage(reservoir, gen_size) > cov:
+            break
+
+    k = len(reservoir) # this is about how big the reservoir needs to be to get cov coverage
+    #W = exp(log(random.random()) / k)
+
+    random.shuffle(reservoir)
+
+    print(f"{k} reads selected to achieve {coverage(reservoir, gen_size):.3f}X coverage.")
+
+    # if the number of reads is too few to meet the coverage cutoff, then the iterators
+    # should be exhausted and this won't run
+    # this is essentially Algorithm R: record j (0-based, j = k + i) replaces a
+    # random reservoir slot with probability k / (j + 1)
+    for i, readpair in enumerate(zip(*inns)):
+        r = random.randint(0, k + i)
+        if r < k:
+            reservoir[r] = readpair
+
+    for readpair in reservoir: # output the sampled reads
+        for record, file in zip(readpair, outs):
+            defline, read, spacer, quals = record
+            file.write(defline)
+            file.write(read)
+            file.write(spacer)
+            file.write(quals)
+
+# ExitStack closes the files on exit, so explicit closes aren't needed:
+# [fp.close() for fp in ins]
+# [fp.close() for fp in outs]
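A note on the sampling step: the commented-out `W = exp(log(random.random()) / k)` line is the weight update from reservoir-sampling Algorithm L (Li, 1994), while the replacement loop the script actually runs is the classic one-draw-per-record Algorithm R. For comparison, here is a minimal sketch of Algorithm L over a generic Python iterable; `reservoir_sample_L` is an illustrative helper written for this note, not part of the repository:

```python
from itertools import islice
from math import exp, floor, log
import random

def reservoir_sample_L(iterable, k, rng=random):
    """Uniformly sample k items from a stream using Algorithm L.

    Rather than drawing one random number per item as Algorithm R does,
    it computes how many items to skip before the next replacement, so
    the number of draws grows with k*log(n/k) instead of n.
    """
    it = iter(iterable)
    reservoir = list(islice(it, k))
    if len(reservoir) < k:
        return reservoir            # stream shorter than the reservoir
    w = exp(log(rng.random()) / k)  # running acceptance threshold
    while True:
        # geometric number of items that will never enter the reservoir
        skip = floor(log(rng.random()) / log(1.0 - w))
        try:
            item = next(islice(it, skip, skip + 1))  # discard `skip` items, take the next
        except StopIteration:
            return reservoir        # stream exhausted
        reservoir[rng.randrange(k)] = item
        w *= exp(log(rng.random()) / k)

# e.g. reservoir_sample_L(range(1_000_000), 5)
```

Algorithm L pays off when the stream is much longer than the reservoir, since it skips records in bulk instead of drawing a number for each; in this script the per-record cost is dominated by decompression and I/O, so the simpler one-draw-per-record approach is a reasonable choice. As a sizing example, with 250 bp forward reads and a 2.8 Mbp genome, a 50X target fills the reservoir with about 50 × 2,800,000 / 250 = 560,000 records.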