jpayne@0: from bz2 import open as bzopen jpayne@0: from gzip import open as gzopen jpayne@0: jpayne@0: from contextlib import ExitStack jpayne@0: from itertools import zip_longest jpayne@0: from pathlib import Path jpayne@0: from sys import argv jpayne@0: jpayne@0: import random jpayne@0: jpayne@0: jpayne@0: usage = """ jpayne@0: jpayne@0: """ jpayne@0: jpayne@0: def grouper(iterable, n, fillvalue=None): jpayne@0: "Collect data into fixed-length chunks or blocks" jpayne@0: # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" jpayne@0: args = [iter(iterable)] * n jpayne@0: return zip_longest(*args, fillvalue=fillvalue) jpayne@0: jpayne@0: # file compression signatures jpayne@0: magics = { jpayne@0: b'\x1f\x8b\x08':gzopen, jpayne@0: b'\x42\x5a\x68':bzopen, jpayne@0: } jpayne@0: jpayne@0: def sniff(path): jpayne@0: "Sniff first three bytes of the file to determine format based on the magic number." jpayne@0: with open(path, 'rb') as fp: jpayne@0: magic = fp.read(3) jpayne@0: return magics.get(magic, open) jpayne@0: jpayne@0: jpayne@0: def coverage(collection, genome_size): jpayne@0: "Collection of 1 or 2 tuples, whose 2nd item is the read string" jpayne@0: return sum((len(read[0][1]) for read in collection)) / genome_size # reverse read pair doesn't contribute to coverage so we can ignore it jpayne@0: jpayne@0: jpayne@0: try: jpayne@0: fin, rin, fout, rout, cov, gen_size, *opts = argv[1:] jpayne@0: ins = [fin, rin] jpayne@0: outs = [fout, rout] jpayne@0: except ValueError: # not enough values to unpack jpayne@0: try: jpayne@0: fin, fout, cov, gen_size, *ops = argv[1:] jpayne@0: ins = [fin] jpayne@0: outs = [fout] jpayne@0: except ValueError: jpayne@0: print(usage) jpayne@0: quit(1) jpayne@0: try: jpayne@0: cov = float(cov) jpayne@0: gen_size = int(gen_size) jpayne@0: except ValueError: jpayne@0: print("Desired coverage and assumed genome size should be numbers") jpayne@0: print(usage) jpayne@0: quit(1) jpayne@0: jpayne@0: seed = "ed2b99d842cddc1ac81d7c01a0bf0555" jpayne@0: if opts: jpayne@0: seed = opts[0] jpayne@0: random.seed(seed) jpayne@0: jpayne@0: assert len(ins) == len(outs) jpayne@0: file_openers = [sniff(path) for path in ins] # output format determined by input format jpayne@0: with ExitStack() as stack: jpayne@0: ins = [stack.enter_context(openn(path, 'r')) for openn, path in zip(file_openers, ins)] # opened input files jpayne@0: inns = [iter(grouper(inn, 4)) for inn in ins] # stateful 4-ply iterator over lines in the input jpayne@0: outs = [stack.enter_context(openn(path, 'w')) for openn, path in zip(file_openers, outs)] # opened output files jpayne@0: jpayne@0: # https://en.m.wikipedia.org/wiki/Reservoir_sampling jpayne@0: jpayne@0: reservoir = [] jpayne@0: # this is going to be 1 or 2-tuples of 4-tuples representing the 4 lines of the fastq file jpayne@0: # we determine its current coverage (and thus its reservoir size) to fill it, which consumes reads jpayne@0: # from the open files jpayne@0: for readpair in zip(*inns): jpayne@0: reservoir.append(readpair) jpayne@0: if coverage(reservoir, gen_size) > cov: jpayne@0: break jpayne@0: jpayne@0: k = len(reservoir) # this is about how big the reservoir needs to be to get cov coverage jpayne@0: #W = exp(log(random.random()) / k) jpayne@0: jpayne@0: random.shuffle(reservoir) jpayne@0: jpayne@0: print(f"{k} reads selected to achieve {coverage(reservoir, gen_size):.3f}X coverage.") jpayne@0: jpayne@0: # if the number of reads is too few to meet the coverage cutoff, then the iterators jpayne@0: # should be exhausted and this won't run jpayne@0: # this is essentially Algorithm L, as I understand it jpayne@0: for i, readpair in enumerate(zip(*inns)): jpayne@0: r = random.randint(0, i) jpayne@0: if r < k: jpayne@0: reservoir[r] = readpair jpayne@0: jpayne@0: for readpair in reservoir: # output the sampled reads jpayne@0: for read, file in zip(readpair, outs): jpayne@0: defline, read, spacer, quals = read jpayne@0: file.write(defline) jpayne@0: file.write(read) jpayne@0: file.write(spacer) jpayne@0: file.write(quals) jpayne@0: jpayne@0: # [fp.close() for fp in ins] jpayne@0: # [fp.close() for fp in outs]