annotate subsamplr.py @ 0:1fbf1bf3d180 draft default tip

planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
author jpayne
date Mon, 08 Dec 2025 20:22:33 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
1 from bz2 import open as bzopen
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
2 from gzip import open as gzopen
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
3
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
4 from contextlib import ExitStack
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
5 from itertools import zip_longest
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
6 from pathlib import Path
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
7 from sys import argv
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
8
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
9 import random
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
10
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
11
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
12 usage = """
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
13
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
14 """
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
15
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
16 def grouper(iterable, n, fillvalue=None):
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
17 "Collect data into fixed-length chunks or blocks"
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
18 # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
19 args = [iter(iterable)] * n
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
20 return zip_longest(*args, fillvalue=fillvalue)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
21
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
22 # file compression signatures
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
23 magics = {
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
24 b'\x1f\x8b\x08':gzopen,
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
25 b'\x42\x5a\x68':bzopen,
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
26 }
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
27
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
28 def sniff(path):
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
29 "Sniff first three bytes of the file to determine format based on the magic number."
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
30 with open(path, 'rb') as fp:
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
31 magic = fp.read(3)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
32 return magics.get(magic, open)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
33
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
34
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
35 def coverage(collection, genome_size):
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
36 "Collection of 1 or 2 tuples, whose 2nd item is the read string"
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
37 return sum((len(read[0][1]) for read in collection)) / genome_size # reverse read pair doesn't contribute to coverage so we can ignore it
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
38
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
39
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
40 try:
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
41 fin, rin, fout, rout, cov, gen_size, *opts = argv[1:]
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
42 ins = [fin, rin]
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
43 outs = [fout, rout]
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
44 except ValueError: # not enough values to unpack
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
45 try:
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
46 fin, fout, cov, gen_size, *opts = argv[1:]
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
47 ins = [fin]
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
48 outs = [fout]
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
49 except ValueError:
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
50 print(usage)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
51 quit(1)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
52 try:
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
53 cov = float(cov)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
54 gen_size = int(gen_size)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
55 except ValueError:
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
56 print("Desired coverage and assumed genome size should be numbers")
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
57 print(usage)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
58 quit(1)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
59
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
60 seed = "ed2b99d842cddc1ac81d7c01a0bf0555"
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
61 if opts:
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
62 seed = opts[0]
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
63 random.seed(seed)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
64
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
65 assert len(ins) == len(outs)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
66 file_openers = [sniff(path) for path in ins] # output format determined by input format
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
67 with ExitStack() as stack:
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
68 ins = [stack.enter_context(openn(path, 'r')) for openn, path in zip(file_openers, ins)] # opened input files
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
69 inns = [iter(grouper(inn, 4)) for inn in ins] # stateful 4-ply iterator over lines in the input
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
70 outs = [stack.enter_context(openn(path, 'w')) for openn, path in zip(file_openers, outs)] # opened output files
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
71
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
72 for file in ins:
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
73 if hasattr(file, "name"):
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
74 print(file.name)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
75
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
76 # https://en.m.wikipedia.org/wiki/Reservoir_sampling
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
77
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
78 reservoir = []
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
79 # this is going to be 1 or 2-tuples of 4-tuples representing the 4 lines of the fastq file
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
80 # we determine its current coverage (and thus its reservoir size) to fill it, which consumes reads
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
81 # from the open files
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
82 reads = 0
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
83 for i, readpair in enumerate(zip(*inns)):
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
84 reads += len(readpair[0][1])
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
85 reservoir.append(readpair)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
86 if reads / gen_size > cov:
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
87 break
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
88
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
89 k = len(reservoir) # this is about how big the reservoir needs to be to get cov coverage
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
90 #W = exp(log(random.random()) / k)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
91
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
92 random.shuffle(reservoir)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
93
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
94 print(f"{k} reads selected to achieve {coverage(reservoir, gen_size):.3f}X coverage.")
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
95
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
96 # if the number of reads is too few to meet the coverage cutoff, then the iterators
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
97 # should be exhausted and this won't run
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
98 # this is essentially Algorithm L, as I understand it
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
99 for i, readpair in enumerate(zip(*inns)):
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
100 r = random.randint(0, i)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
101 if r < k:
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
102 reservoir[r] = readpair
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
103
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
104 for readpair in reservoir: # output the sampled reads
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
105 for read, file in zip(readpair, outs):
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
106 defline, read, spacer, quals = read
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
107 file.write(defline)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
108 file.write(read)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
109 file.write(spacer)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
110 file.write(quals)
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
111
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
112 # [fp.close() for fp in ins]
1fbf1bf3d180 planemo upload commit 1f45ac19d30ec5c5bf7d44a0b09990369a139668
jpayne
parents:
diff changeset
113 # [fp.close() for fp in outs]