Mercurial > repos > jpayne > gtsubsampler
comparison subsamplr.py @ 0:1fbf1bf3d180 draft default tip
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
| author | jpayne |
|---|---|
| date | Mon, 08 Dec 2025 20:22:33 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1fbf1bf3d180 |
|---|---|
| 1 from bz2 import open as bzopen | |
| 2 from gzip import open as gzopen | |
| 3 | |
| 4 from contextlib import ExitStack | |
| 5 from itertools import zip_longest | |
| 6 from pathlib import Path | |
| 7 from sys import argv | |
| 8 | |
| 9 import random | |
| 10 | |
| 11 | |
| 12 usage = """ | |
| 13 | |
| 14 """ | |
| 15 | |
| 16 def grouper(iterable, n, fillvalue=None): | |
| 17 "Collect data into fixed-length chunks or blocks" | |
| 18 # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" | |
| 19 args = [iter(iterable)] * n | |
| 20 return zip_longest(*args, fillvalue=fillvalue) | |
| 21 | |
| 22 # file compression signatures | |
| 23 magics = { | |
| 24 b'\x1f\x8b\x08':gzopen, | |
| 25 b'\x42\x5a\x68':bzopen, | |
| 26 } | |
| 27 | |
| 28 def sniff(path): | |
| 29 "Sniff first three bytes of the file to determine format based on the magic number." | |
| 30 with open(path, 'rb') as fp: | |
| 31 magic = fp.read(3) | |
| 32 return magics.get(magic, open) | |
| 33 | |
| 34 | |
| 35 def coverage(collection, genome_size): | |
| 36 "Collection of 1 or 2 tuples, whose 2nd item is the read string" | |
| 37 return sum((len(read[0][1]) for read in collection)) / genome_size # reverse read pair doesn't contribute to coverage so we can ignore it | |
| 38 | |
| 39 | |
| 40 try: | |
| 41 fin, rin, fout, rout, cov, gen_size, *opts = argv[1:] | |
| 42 ins = [fin, rin] | |
| 43 outs = [fout, rout] | |
| 44 except ValueError: # not enough values to unpack | |
| 45 try: | |
| 46 fin, fout, cov, gen_size, *opts = argv[1:] | |
| 47 ins = [fin] | |
| 48 outs = [fout] | |
| 49 except ValueError: | |
| 50 print(usage) | |
| 51 quit(1) | |
| 52 try: | |
| 53 cov = float(cov) | |
| 54 gen_size = int(gen_size) | |
| 55 except ValueError: | |
| 56 print("Desired coverage and assumed genome size should be numbers") | |
| 57 print(usage) | |
| 58 quit(1) | |
| 59 | |
| 60 seed = "ed2b99d842cddc1ac81d7c01a0bf0555" | |
| 61 if opts: | |
| 62 seed = opts[0] | |
| 63 random.seed(seed) | |
| 64 | |
| 65 assert len(ins) == len(outs) | |
| 66 file_openers = [sniff(path) for path in ins] # output format determined by input format | |
| 67 with ExitStack() as stack: | |
| 68 ins = [stack.enter_context(openn(path, 'r')) for openn, path in zip(file_openers, ins)] # opened input files | |
| 69 inns = [iter(grouper(inn, 4)) for inn in ins] # stateful 4-ply iterator over lines in the input | |
| 70 outs = [stack.enter_context(openn(path, 'w')) for openn, path in zip(file_openers, outs)] # opened output files | |
| 71 | |
| 72 for file in ins: | |
| 73 if hasattr(file, "name"): | |
| 74 print(file.name) | |
| 75 | |
| 76 # https://en.m.wikipedia.org/wiki/Reservoir_sampling | |
| 77 | |
| 78 reservoir = [] | |
| 79 # this is going to be 1 or 2-tuples of 4-tuples representing the 4 lines of the fastq file | |
| 80 # we determine its current coverage (and thus its reservoir size) to fill it, which consumes reads | |
| 81 # from the open files | |
| 82 reads = 0 | |
| 83 for i, readpair in enumerate(zip(*inns)): | |
| 84 reads += len(readpair[0][1]) | |
| 85 reservoir.append(readpair) | |
| 86 if reads / gen_size > cov: | |
| 87 break | |
| 88 | |
| 89 k = len(reservoir) # this is about how big the reservoir needs to be to get cov coverage | |
| 90 #W = exp(log(random.random()) / k) | |
| 91 | |
| 92 random.shuffle(reservoir) | |
| 93 | |
| 94 print(f"{k} reads selected to achieve {coverage(reservoir, gen_size):.3f}X coverage.") | |
| 95 | |
| 96 # if the number of reads is too few to meet the coverage cutoff, then the iterators | |
| 97 # should be exhausted and this won't run | |
| 98 # this is essentially Algorithm L, as I understand it | |
| 99 for i, readpair in enumerate(zip(*inns)): | |
| 100 r = random.randint(0, i) | |
| 101 if r < k: | |
| 102 reservoir[r] = readpair | |
| 103 | |
| 104 for readpair in reservoir: # output the sampled reads | |
| 105 for read, file in zip(readpair, outs): | |
| 106 defline, read, spacer, quals = read | |
| 107 file.write(defline) | |
| 108 file.write(read) | |
| 109 file.write(spacer) | |
| 110 file.write(quals) | |
| 111 | |
| 112 # [fp.close() for fp in ins] | |
| 113 # [fp.close() for fp in outs] |
