Mercurial > repos > jpayne > gtsubsampler
annotate subsamplr.py @ 0:1fbf1bf3d180 draft default tip
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
| author | jpayne |
|---|---|
| date | Mon, 08 Dec 2025 20:22:33 +0000 |
| parents | |
| children |
| rev | line source |
|---|---|
|
0
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
1 from bz2 import open as bzopen |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
2 from gzip import open as gzopen |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
3 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
4 from contextlib import ExitStack |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
5 from itertools import zip_longest |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
6 from pathlib import Path |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
7 from sys import argv |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
8 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
9 import random |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
10 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
11 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
12 usage = """ |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
13 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
14 """ |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
15 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
16 def grouper(iterable, n, fillvalue=None): |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
17 "Collect data into fixed-length chunks or blocks" |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
18 # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
19 args = [iter(iterable)] * n |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
20 return zip_longest(*args, fillvalue=fillvalue) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
21 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
22 # file compression signatures |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
23 magics = { |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
24 b'\x1f\x8b\x08':gzopen, |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
25 b'\x42\x5a\x68':bzopen, |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
26 } |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
27 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
28 def sniff(path): |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
29 "Sniff first three bytes of the file to determine format based on the magic number." |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
30 with open(path, 'rb') as fp: |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
31 magic = fp.read(3) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
32 return magics.get(magic, open) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
33 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
34 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
35 def coverage(collection, genome_size): |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
36 "Collection of 1 or 2 tuples, whose 2nd item is the read string" |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
37 return sum((len(read[0][1]) for read in collection)) / genome_size # reverse read pair doesn't contribute to coverage so we can ignore it |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
38 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
39 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
40 try: |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
41 fin, rin, fout, rout, cov, gen_size, *opts = argv[1:] |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
42 ins = [fin, rin] |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
43 outs = [fout, rout] |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
44 except ValueError: # not enough values to unpack |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
45 try: |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
46 fin, fout, cov, gen_size, *opts = argv[1:] |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
47 ins = [fin] |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
48 outs = [fout] |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
49 except ValueError: |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
50 print(usage) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
51 quit(1) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
52 try: |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
53 cov = float(cov) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
54 gen_size = int(gen_size) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
55 except ValueError: |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
56 print("Desired coverage and assumed genome size should be numbers") |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
57 print(usage) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
58 quit(1) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
59 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
60 seed = "ed2b99d842cddc1ac81d7c01a0bf0555" |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
61 if opts: |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
62 seed = opts[0] |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
63 random.seed(seed) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
64 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
65 assert len(ins) == len(outs) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
66 file_openers = [sniff(path) for path in ins] # output format determined by input format |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
67 with ExitStack() as stack: |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
68 ins = [stack.enter_context(openn(path, 'r')) for openn, path in zip(file_openers, ins)] # opened input files |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
69 inns = [iter(grouper(inn, 4)) for inn in ins] # stateful 4-ply iterator over lines in the input |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
70 outs = [stack.enter_context(openn(path, 'w')) for openn, path in zip(file_openers, outs)] # opened output files |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
71 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
72 for file in ins: |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
73 if hasattr(file, "name"): |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
74 print(file.name) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
75 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
76 # https://en.m.wikipedia.org/wiki/Reservoir_sampling |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
77 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
78 reservoir = [] |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
79 # this is going to be 1 or 2-tuples of 4-tuples representing the 4 lines of the fastq file |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
80 # we determine its current coverage (and thus its reservoir size) to fill it, which consumes reads |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
81 # from the open files |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
82 reads = 0 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
83 for i, readpair in enumerate(zip(*inns)): |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
84 reads += len(readpair[0][1]) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
85 reservoir.append(readpair) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
86 if reads / gen_size > cov: |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
87 break |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
88 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
89 k = len(reservoir) # this is about how big the reservoir needs to be to get cov coverage |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
90 #W = exp(log(random.random()) / k) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
91 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
92 random.shuffle(reservoir) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
93 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
94 print(f"{k} reads selected to achieve {coverage(reservoir, gen_size):.3f}X coverage.") |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
95 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
96 # if the number of reads is too few to meet the coverage cutoff, then the iterators |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
97 # should be exhausted and this won't run |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
98 # this is essentially Algorithm L, as I understand it |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
99 for i, readpair in enumerate(zip(*inns)): |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
100 r = random.randint(0, i) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
101 if r < k: |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
102 reservoir[r] = readpair |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
103 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
104 for readpair in reservoir: # output the sampled reads |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
105 for read, file in zip(readpair, outs): |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
106 defline, read, spacer, quals = read |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
107 file.write(defline) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
108 file.write(read) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
109 file.write(spacer) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
110 file.write(quals) |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
111 |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
112 # [fp.close() for fp in ins] |
|
1fbf1bf3d180
planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff
changeset
|
113 # [fp.close() for fp in outs] |
