Mercurial > repos > jpayne > gtsubsampler
view subsamplr.py @ 0:b2915e7e9dfa
"GTSubsampler initial commit"
author | jpayne |
---|---|
date | Fri, 19 Feb 2021 13:18:54 -0500 |
parents | |
children | a90a883f88f9 |
line wrap: on
line source
from bz2 import open as bzopen from gzip import open as gzopen from contextlib import ExitStack from itertools import zip_longest from pathlib import Path from sys import argv import random usage = """ """ def grouper(iterable, n, fillvalue=None): "Collect data into fixed-length chunks or blocks" # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" args = [iter(iterable)] * n return zip_longest(*args, fillvalue=fillvalue) # file compression signatures magics = { b'\x1f\x8b\x08':gzopen, b'\x42\x5a\x68':bzopen, } def sniff(path): "Sniff first three bytes of the file to determine format based on the magic number." with open(path, 'rb') as fp: magic = fp.read(3) return magics.get(magic, open) def coverage(collection, genome_size): "Collection of 1 or 2 tuples, whose 2nd item is the read string" return sum((len(read[0][1]) for read in collection)) / genome_size # reverse read pair doesn't contribute to coverage so we can ignore it try: fin, rin, fout, rout, cov, gen_size, *opts = argv[1:] ins = [fin, rin] outs = [fout, rout] except ValueError: # not enough values to unpack try: fin, fout, cov, gen_size, *ops = argv[1:] ins = [fin] outs = [fout] except ValueError: print(usage) quit(1) try: cov = float(cov) gen_size = int(gen_size) except ValueError: print("Desired coverage and assumed genome size should be numbers") print(usage) quit(1) seed = "ed2b99d842cddc1ac81d7c01a0bf0555" if opts: seed = opts[0] random.seed(seed) assert len(ins) == len(outs) file_openers = [sniff(path) for path in ins] # output format determined by input format with ExitStack() as stack: ins = [stack.enter_context(openn(path, 'r')) for openn, path in zip(file_openers, ins)] # opened input files inns = [iter(grouper(inn, 4)) for inn in ins] # stateful 4-ply iterator over lines in the input outs = [stack.enter_context(openn(path, 'w')) for openn, path in zip(file_openers, outs)] # opened output files # https://en.m.wikipedia.org/wiki/Reservoir_sampling reservoir = [] # this is going to be 1 or 2-tuples of 4-tuples representing the 4 lines of the fastq file # we determine its current coverage (and thus its reservoir size) to fill it, which consumes reads # from the open files for readpair in zip(*inns): reservoir.append(readpair) if coverage(reservoir, gen_size) > cov: break k = len(reservoir) # this is about how big the reservoir needs to be to get cov coverage #W = exp(log(random.random()) / k) random.shuffle(reservoir) print(f"{k} reads selected to achieve {coverage(reservoir, gen_size):.3f}X coverage.") # if the number of reads is too few to meet the coverage cutoff, then the iterators # should be exhausted and this won't run # this is essentially Algorithm L, as I understand it for i, readpair in enumerate(zip(*inns)): r = random.randint(0, i) if r < k: reservoir[r] = readpair for readpair in reservoir: # output the sampled reads for read, file in zip(readpair, outs): defline, read, spacer, quals = read file.write(defline) file.write(read) file.write(spacer) file.write(quals) # [fp.close() for fp in ins] # [fp.close() for fp in outs]