jpayne@0
|
1 from bz2 import open as bzopen
|
jpayne@0
|
2 from gzip import open as gzopen
|
jpayne@0
|
3
|
jpayne@0
|
4 from contextlib import ExitStack
|
jpayne@0
|
5 from itertools import zip_longest
|
jpayne@0
|
6 from pathlib import Path
|
jpayne@0
|
7 from sys import argv
|
jpayne@0
|
8
|
jpayne@0
|
9 import random
|
jpayne@0
|
10
|
jpayne@0
|
11
|
jpayne@0
|
12 usage = """
|
jpayne@0
|
13
|
jpayne@0
|
14 """
|
jpayne@0
|
15
|
jpayne@0
|
16 def grouper(iterable, n, fillvalue=None):
|
jpayne@0
|
17 "Collect data into fixed-length chunks or blocks"
|
jpayne@0
|
18 # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
|
jpayne@0
|
19 args = [iter(iterable)] * n
|
jpayne@0
|
20 return zip_longest(*args, fillvalue=fillvalue)
|
jpayne@0
|
21
|
jpayne@0
|
22 # file compression signatures
|
jpayne@0
|
23 magics = {
|
jpayne@0
|
24 b'\x1f\x8b\x08':gzopen,
|
jpayne@0
|
25 b'\x42\x5a\x68':bzopen,
|
jpayne@0
|
26 }
|
jpayne@0
|
27
|
jpayne@0
|
28 def sniff(path):
|
jpayne@0
|
29 "Sniff first three bytes of the file to determine format based on the magic number."
|
jpayne@0
|
30 with open(path, 'rb') as fp:
|
jpayne@0
|
31 magic = fp.read(3)
|
jpayne@0
|
32 return magics.get(magic, open)
|
jpayne@0
|
33
|
jpayne@0
|
34
|
jpayne@0
|
35 def coverage(collection, genome_size):
|
jpayne@0
|
36 "Collection of 1 or 2 tuples, whose 2nd item is the read string"
|
jpayne@0
|
37 return sum((len(read[0][1]) for read in collection)) / genome_size # reverse read pair doesn't contribute to coverage so we can ignore it
|
jpayne@0
|
38
|
jpayne@0
|
39
|
jpayne@0
|
40 try:
|
jpayne@0
|
41 fin, rin, fout, rout, cov, gen_size, *opts = argv[1:]
|
jpayne@0
|
42 ins = [fin, rin]
|
jpayne@0
|
43 outs = [fout, rout]
|
jpayne@0
|
44 except ValueError: # not enough values to unpack
|
jpayne@0
|
45 try:
|
jpayne@0
|
46 fin, fout, cov, gen_size, *ops = argv[1:]
|
jpayne@0
|
47 ins = [fin]
|
jpayne@0
|
48 outs = [fout]
|
jpayne@0
|
49 except ValueError:
|
jpayne@0
|
50 print(usage)
|
jpayne@0
|
51 quit(1)
|
jpayne@0
|
52 try:
|
jpayne@0
|
53 cov = float(cov)
|
jpayne@0
|
54 gen_size = int(gen_size)
|
jpayne@0
|
55 except ValueError:
|
jpayne@0
|
56 print("Desired coverage and assumed genome size should be numbers")
|
jpayne@0
|
57 print(usage)
|
jpayne@0
|
58 quit(1)
|
jpayne@0
|
59
|
jpayne@0
|
60 seed = "ed2b99d842cddc1ac81d7c01a0bf0555"
|
jpayne@0
|
61 if opts:
|
jpayne@0
|
62 seed = opts[0]
|
jpayne@0
|
63 random.seed(seed)
|
jpayne@0
|
64
|
jpayne@0
|
65 assert len(ins) == len(outs)
|
jpayne@0
|
66 file_openers = [sniff(path) for path in ins] # output format determined by input format
|
jpayne@0
|
67 with ExitStack() as stack:
|
jpayne@0
|
68 ins = [stack.enter_context(openn(path, 'r')) for openn, path in zip(file_openers, ins)] # opened input files
|
jpayne@0
|
69 inns = [iter(grouper(inn, 4)) for inn in ins] # stateful 4-ply iterator over lines in the input
|
jpayne@0
|
70 outs = [stack.enter_context(openn(path, 'w')) for openn, path in zip(file_openers, outs)] # opened output files
|
jpayne@0
|
71
|
jpayne@0
|
72 # https://en.m.wikipedia.org/wiki/Reservoir_sampling
|
jpayne@0
|
73
|
jpayne@0
|
74 reservoir = []
|
jpayne@0
|
75 # this is going to be 1 or 2-tuples of 4-tuples representing the 4 lines of the fastq file
|
jpayne@0
|
76 # we determine its current coverage (and thus its reservoir size) to fill it, which consumes reads
|
jpayne@0
|
77 # from the open files
|
jpayne@0
|
78 for readpair in zip(*inns):
|
jpayne@0
|
79 reservoir.append(readpair)
|
jpayne@0
|
80 if coverage(reservoir, gen_size) > cov:
|
jpayne@0
|
81 break
|
jpayne@0
|
82
|
jpayne@0
|
83 k = len(reservoir) # this is about how big the reservoir needs to be to get cov coverage
|
jpayne@0
|
84 #W = exp(log(random.random()) / k)
|
jpayne@0
|
85
|
jpayne@0
|
86 random.shuffle(reservoir)
|
jpayne@0
|
87
|
jpayne@0
|
88 print(f"{k} reads selected to achieve {coverage(reservoir, gen_size):.3f}X coverage.")
|
jpayne@0
|
89
|
jpayne@0
|
90 # if the number of reads is too few to meet the coverage cutoff, then the iterators
|
jpayne@0
|
91 # should be exhausted and this won't run
|
jpayne@0
|
92 # this is essentially Algorithm L, as I understand it
|
jpayne@0
|
93 for i, readpair in enumerate(zip(*inns)):
|
jpayne@0
|
94 r = random.randint(0, i)
|
jpayne@0
|
95 if r < k:
|
jpayne@0
|
96 reservoir[r] = readpair
|
jpayne@0
|
97
|
jpayne@0
|
98 for readpair in reservoir: # output the sampled reads
|
jpayne@0
|
99 for read, file in zip(readpair, outs):
|
jpayne@0
|
100 defline, read, spacer, quals = read
|
jpayne@0
|
101 file.write(defline)
|
jpayne@0
|
102 file.write(read)
|
jpayne@0
|
103 file.write(spacer)
|
jpayne@0
|
104 file.write(quals)
|
jpayne@0
|
105
|
jpayne@0
|
106 # [fp.close() for fp in ins]
|
jpayne@0
|
107 # [fp.close() for fp in outs]
|