Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/parallel.py @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 import sys | |
2 import multiprocessing | |
3 from . import helpers | |
4 import pybedtools | |
5 | |
6 | |
def _parallel_wrap(
    orig_bedtool,
    shuffle_kwargs,
    genome_fn,
    method,
    method_args,
    method_kwargs,
    sort=False,
    shuffle=True,
    reduce_func=None,
):
    """
    Run one iteration of a parallel_apply job.

    Optionally shuffle (and then sort) `orig_bedtool`, call its `method`
    with `method_args`/`method_kwargs`, and optionally post-process the
    result with `reduce_func`.

    See parallel_apply docstring for details

    """
    # Tempfile hygiene: every intermediate BedTool created here is closed
    # or deleted as soon as it is no longer needed.
    if shuffle:
        shuffled = orig_bedtool.shuffle(g=genome_fn, **shuffle_kwargs)
        if sort:
            target = shuffled.sort()
            helpers.close_or_delete(shuffled)
        else:
            target = shuffled
    else:
        target = orig_bedtool

    result = getattr(target, method)(*method_args, **method_kwargs)

    if shuffle:
        # `target` is a temp file we made above; the caller's original
        # BedTool is never cleaned up here.
        helpers.close_or_delete(target)

    if not reduce_func:
        return result

    reduced = reduce_func(result)
    if isinstance(result, pybedtools.BedTool):
        helpers.close_or_delete(result)
    return reduced
49 | |
50 | |
def parallel_apply(
    orig_bedtool,
    method,
    genome=None,
    genome_fn=None,
    method_args=None,
    method_kwargs=None,
    shuffle_kwargs=None,
    shuffle=True,
    reduce_func=None,
    processes=1,
    sort=False,
    _orig_pool=None,
    iterations=1000,
    debug=False,
    report_iterations=False,
):
    """
    Call an arbitrary BedTool method many times in parallel.

    An example use-case is to generate a null distribution of intersections,
    and then compare this to the actual intersections.

    **Important:** due to a known file handle leak in BedTool.__len__, it's
    best to simply check the number of lines in the file, as in the below
    function. This works because BEDTools programs strip any non-interval lines
    in the results.

    >>> # set up example BedTools
    >>> a = pybedtools.example_bedtool('a.bed')
    >>> b = pybedtools.example_bedtool('b.bed')

    >>> # Method of `a` to call:
    >>> method = 'intersect'

    >>> # Kwargs provided to `a.intersect` each iteration
    >>> method_kwargs = dict(b=b, u=True)

    >>> # Function that will be called on the results of
    >>> # `a.intersect(**method_kwargs)`.
    >>> def reduce_func(x):
    ...     return sum(1 for _ in open(x.fn))

    >>> # Create a small artificial genome for this test (generally you'd
    >>> # use an assembly name, like "hg19"):
    >>> genome = dict(chr1=(0, 1000))

    >>> # Do 10 iterations using 1 process for this test (generally you'd
    >>> # use 1000+ iterations, and as many processes as you have CPUs)
    >>> results = pybedtools.parallel.parallel_apply(a, method, genome=genome,
    ... method_kwargs=method_kwargs, iterations=10, processes=1,
    ... reduce_func=reduce_func, debug=True, report_iterations=True)

    >>> # get results
    >>> print(list(results))
    [1, 0, 1, 2, 4, 2, 2, 1, 2, 4]

    >>> # We can compare this to the actual intersection:
    >>> reduce_func(a.intersect(**method_kwargs))
    3

    Alternatively, we could use the `a.jaccard` method, which already does the
    reduction to a dictionary. However, the Jaccard method requires the input
    to be sorted. Here, we specify `sort=True` to sort each shuffled BedTool
    before calling its `jaccard` method.

    >>> from pybedtools.parallel import parallel_apply
    >>> a = pybedtools.example_bedtool('a.bed')
    >>> results = parallel_apply(a, method='jaccard', method_args=(b,),
    ... genome=genome, iterations=3, processes=1, sort=True, debug=True)
    >>> for i in results:
    ...     print(sorted(i.items()))
    [('intersection', 12), ('jaccard', 0.0171184), ('n_intersections', 1), ('union', 701)]
    [('intersection', 0), ('jaccard', 0.0), ('n_intersections', 0), ('union', 527)]
    [('intersection', 73), ('jaccard', 0.137996), ('n_intersections', 1), ('union', 529)]

    Parameters
    ----------
    orig_bedtool : BedTool

    method : str
        The method of `orig_bedtool` to run

    method_args : tuple
        Passed directly to getattr(orig_bedtool, method)()

    method_kwargs : dict
        Passed directly to getattr(orig_bedtool, method)()

    shuffle : bool
        If True, then `orig_bedtool` will be shuffled at each iteration and
        that shuffled version's `method` will be called with `method_args` and
        `method_kwargs`.

    shuffle_kwargs : dict
        If `shuffle` is True, these are passed to `orig_bedtool.shuffle()`.
        You do not need to pass the genome here; that's handled separately by
        the `genome` and `genome_fn` kwargs.

    iterations : int
        Number of iterations to perform

    genome : string or dict
        If string, then assume it is the assembly name (e.g., hg19) and get
        a dictionary of chromsizes for that assembly, then converts to
        a filename.

    genome_fn : str
        Mutually exclusive with `genome`; `genome_fn` must be an existing
        filename with the chromsizes. Use the `genome` kwarg instead if you'd
        rather supply an assembly or dict.

    reduce_func : callable
        Function or other callable object that accepts, as its only argument,
        the results from `orig_bedtool.method()`. For example, if you care
        about the number of results, then you can use `reduce_func=len`.

    processes : int
        Number of processes to run. If `processes=1`, then multiprocessing is
        not used (making it much easier to debug). This argument is ignored if
        `_orig_pool` is provided.

    sort : bool
        If both `shuffle` and `sort` are True, then the shuffled BedTool will
        then be sorted. Use this if `method` requires sorted input.

    _orig_pool : multiprocessing.Pool instance
        If provided, uses `_orig_pool` instead of creating one. In this case,
        `processes` will be ignored.

    debug : bool
        If True, then use the current iteration index as the seed to shuffle.

    report_iterations : bool
        If True, then report the number of iterations to stderr.
    """

    shuffle_kwargs = shuffle_kwargs or {}
    method_args = method_args or ()
    if not isinstance(method_args, (list, tuple)):
        raise ValueError(
            "method_args must be a list or tuple, got %s" % type(method_args)
        )
    method_kwargs = method_kwargs or {}

    if genome_fn and genome:
        # NOTE: fixed garbled message ("only of of ...") in this error.
        raise ValueError("only one of genome_fn or genome should be provided")

    if shuffle:
        if not genome_fn:
            if not genome:
                raise ValueError(
                    "shuffle=True, so either genome_fn or genome must be provided"
                )
            # Convert assembly name / chromsizes dict to a chromsizes file
            # that `BedTool.shuffle` can use.
            genome_fn = pybedtools.chromsizes_to_file(genome)

    _parallel_wrap_kwargs = dict(
        orig_bedtool=orig_bedtool,
        shuffle_kwargs=shuffle_kwargs,
        genome_fn=genome_fn,
        method=method,
        method_args=method_args,
        method_kwargs=method_kwargs,
        shuffle=shuffle,
        reduce_func=reduce_func,
        sort=sort,
    )

    def add_seed(i, kwargs):
        # In debug mode, seed each shuffle with the iteration index so runs
        # are reproducible (mutates the shared shuffle_kwargs in place; the
        # seed is overwritten on every iteration, so this is safe serially
        # and each apply_async snapshot below sees its own value).
        if debug and shuffle:
            kwargs["shuffle_kwargs"]["seed"] = i
        return kwargs

    if processes == 1:
        # Serial path: skip multiprocessing entirely (much easier to debug).
        for it in range(iterations):
            yield _parallel_wrap(**add_seed(it, _parallel_wrap_kwargs))
        return

    if _orig_pool:
        p = _orig_pool
        owns_pool = False
    else:
        p = multiprocessing.Pool(processes)
        owns_pool = True

    try:
        results = [
            p.apply_async(_parallel_wrap, (), add_seed(it, _parallel_wrap_kwargs))
            for it in range(iterations)
        ]
        for i, r in enumerate(results):
            yield r.get()
            if report_iterations:
                # Report 1-based progress (the original wrote the 0-based
                # index, which under-reported by one).
                sys.stderr.write("%s\r" % (i + 1))
                sys.stderr.flush()
    finally:
        # Fix resource leak: close and join a pool we created ourselves.
        # A caller-supplied `_orig_pool` is left for the caller to manage.
        if owns_pool:
            p.close()
            p.join()