CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pybedtools/parallel.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author: jpayne
date:   Tue, 18 Mar 2025 16:23:26 -0400
import sys
import multiprocessing
from . import helpers
import pybedtools


def _parallel_wrap(
    orig_bedtool,
    shuffle_kwargs,
    genome_fn,
    method,
    method_args,
    method_kwargs,
    sort=False,
    shuffle=True,
    reduce_func=None,
):
    """
    Given a BedTool object `orig_bedtool`, call its `method` with
    `method_args` and `method_kwargs`, then call `reduce_func` on the result.

    See the parallel_apply docstring for details.
    """

    # note: be careful about cleaning up tempfiles
    if not shuffle:
        to_use = orig_bedtool
    else:
        shuffled = orig_bedtool.shuffle(g=genome_fn, **shuffle_kwargs)
        if sort:
            to_use = shuffled.sort()
            helpers.close_or_delete(shuffled)
        else:
            to_use = shuffled

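    # Dispatch the named method on the (possibly shuffled and sorted) BedTool.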
    result = getattr(to_use, method)(*method_args, **method_kwargs)

    if shuffle:
        helpers.close_or_delete(to_use)

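    # Optionally collapse the raw result (e.g., to a count), then close or
    # delete the intermediate BedTool so tempfiles don't accumulate across
    # iterations.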
    if reduce_func:
        reduced = reduce_func(result)
        if isinstance(result, pybedtools.BedTool):
            helpers.close_or_delete(result)
        return reduced
    else:
        return result


def parallel_apply(
    orig_bedtool,
    method,
    genome=None,
    genome_fn=None,
    method_args=None,
    method_kwargs=None,
    shuffle_kwargs=None,
    shuffle=True,
    reduce_func=None,
    processes=1,
    sort=False,
    _orig_pool=None,
    iterations=1000,
    debug=False,
    report_iterations=False,
):
68 """
69 Call an arbitrary BedTool method many times in parallel.
70
71 An example use-case is to generate a null distribution of intersections,
72 and then compare this to the actual intersections.
73
74 **Important:** due to a known file handle leak in BedTool.__len__, it's
75 best to simply check the number of lines in the file, as in the below
76 function. This works because BEDTools programs strip any non-interval lines
77 in the results.
78
79 >>> # set up example BedTools
80 >>> a = pybedtools.example_bedtool('a.bed')
81 >>> b = pybedtools.example_bedtool('b.bed')
82
83 >>> # Method of `a` to call:
84 >>> method = 'intersect'
85
86 >>> # Kwargs provided to `a.intersect` each iteration
87 >>> method_kwargs = dict(b=b, u=True)
88
89 >>> # Function that will be called on the results of
90 >>> # `a.intersect(**method_kwargs)`.
91 >>> def reduce_func(x):
92 ... return sum(1 for _ in open(x.fn))
93
94 >>> # Create a small artificial genome for this test (generally you'd
95 >>> # use an assembly name, like "hg19"):
96 >>> genome = dict(chr1=(0, 1000))
97
98 >>> # Do 10 iterations using 1 process for this test (generally you'd
99 >>> # use 1000+ iterations, and as many processes as you have CPUs)
100 >>> results = pybedtools.parallel.parallel_apply(a, method, genome=genome,
101 ... method_kwargs=method_kwargs, iterations=10, processes=1,
102 ... reduce_func=reduce_func, debug=True, report_iterations=True)
103
104 >>> # get results
105 >>> print(list(results))
106 [1, 0, 1, 2, 4, 2, 2, 1, 2, 4]
107
108 >>> # We can compare this to the actual intersection:
109 >>> reduce_func(a.intersect(**method_kwargs))
110 3
111
112 Alternatively, we could use the `a.jaccard` method, which already does the
113 reduction to a dictionary. However, the Jaccard method requires the input
114 to be sorted. Here, we specify `sort=True` to sort each shuffled BedTool
115 before calling its `jaccard` method.
116
117 >>> from pybedtools.parallel import parallel_apply
118 >>> a = pybedtools.example_bedtool('a.bed')
119 >>> results = parallel_apply(a, method='jaccard', method_args=(b,),
120 ... genome=genome, iterations=3, processes=1, sort=True, debug=True)
121 >>> for i in results:
122 ... print(sorted(i.items()))
123 [('intersection', 12), ('jaccard', 0.0171184), ('n_intersections', 1), ('union', 701)]
124 [('intersection', 0), ('jaccard', 0.0), ('n_intersections', 0), ('union', 527)]
125 [('intersection', 73), ('jaccard', 0.137996), ('n_intersections', 1), ('union', 529)]
126
    Parameters
    ----------
    orig_bedtool : BedTool

    method : str
        The method of `orig_bedtool` to run.

    method_args : tuple
        Passed directly to getattr(orig_bedtool, method)().

    method_kwargs : dict
        Passed directly to getattr(orig_bedtool, method)().

    shuffle : bool
        If True, then `orig_bedtool` will be shuffled at each iteration and
        that shuffled version's `method` will be called with `method_args` and
        `method_kwargs`.

    shuffle_kwargs : dict
        If `shuffle` is True, these are passed to `orig_bedtool.shuffle()`.
        You do not need to pass the genome here; that's handled separately by
        the `genome` and `genome_fn` kwargs.

    iterations : int
        Number of iterations to perform.

    genome : string or dict
        If a string, it is assumed to be an assembly name (e.g., "hg19"); the
        chromsizes for that assembly are looked up and written to a genome
        file.

    genome_fn : str
        Mutually exclusive with `genome`; `genome_fn` must be an existing
        file containing the chromsizes. Use the `genome` kwarg instead if
        you'd rather supply an assembly name or a dict.

    reduce_func : callable
        Function or other callable object that accepts, as its only argument,
        the results from `orig_bedtool.method()`. For example, if you care
        about the number of results, then you can use `reduce_func=len`.

    processes : int
        Number of processes to run. If `processes=1`, then multiprocessing is
        not used (making it much easier to debug). This argument is ignored if
        `_orig_pool` is provided.

    sort : bool
        If both `shuffle` and `sort` are True, then the shuffled BedTool will
        also be sorted. Use this if `method` requires sorted input.

    _orig_pool : multiprocessing.Pool instance
        If provided, uses `_orig_pool` instead of creating one. In this case,
        `processes` will be ignored.

    debug : bool
        If True, then use the current iteration index as the seed for each
        shuffle, making results reproducible.

    report_iterations : bool
        If True, then report the current iteration number to stderr.
    """

    shuffle_kwargs = shuffle_kwargs or {}
    method_args = method_args or ()
    if not isinstance(method_args, list) and not isinstance(method_args, tuple):
        raise ValueError(
            "method_args must be a list or tuple, got %s" % type(method_args)
        )
    method_kwargs = method_kwargs or {}

    if genome_fn and genome:
        raise ValueError("only one of genome_fn or genome should be provided")

    if shuffle:
        if not genome_fn:
            if not genome:
                raise ValueError(
                    "shuffle=True, so either genome_fn or genome must be provided"
                )
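            # Convert the assembly name or chromsizes dict into a genome file
            # that shuffle() can use.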
            genome_fn = pybedtools.chromsizes_to_file(genome)

    _parallel_wrap_kwargs = dict(
        orig_bedtool=orig_bedtool,
        shuffle_kwargs=shuffle_kwargs,
        genome_fn=genome_fn,
        method=method,
        method_args=method_args,
        method_kwargs=method_kwargs,
        shuffle=shuffle,
        reduce_func=reduce_func,
        sort=sort,
    )

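    # In debug mode, seed each iteration's shuffle with the iteration index so
    # that runs are reproducible.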
    def add_seed(i, kwargs):
        if debug and shuffle:
            kwargs["shuffle_kwargs"]["seed"] = i
        return kwargs

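    # With a single process, run everything serially in this generator; no
    # pool is created, which makes debugging much simpler.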
    if processes == 1:
        for it in range(iterations):
            yield _parallel_wrap(**add_seed(it, _parallel_wrap_kwargs))
        return

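    # Otherwise reuse the caller-supplied pool if one was given; if not,
    # create a pool with the requested number of worker processes.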
    if _orig_pool:
        p = _orig_pool
    else:
        p = multiprocessing.Pool(processes)

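    # Submit every iteration up front; apply_async() returns AsyncResult
    # objects, and .get() below yields their results in submission order.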
    results = [
        p.apply_async(_parallel_wrap, (), add_seed(it, _parallel_wrap_kwargs))
        for it in range(iterations)
    ]
    for i, r in enumerate(results):
        yield r.get()
        if report_iterations:
            sys.stderr.write("%s\r" % i)
            sys.stderr.flush()
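

# ---------------------------------------------------------------------------
# Illustrative caller-side sketch: it mirrors the doctest above and assumes
# BEDTools plus the bundled example files a.bed / b.bed are available. In
# a user's own script, parallel_apply would be imported via
# `from pybedtools.parallel import parallel_apply`.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    a = pybedtools.example_bedtool("a.bed")
    b = pybedtools.example_bedtool("b.bed")

    def count_lines(x):
        # Count intervals by counting lines in the result file, avoiding the
        # BedTool.__len__ file handle leak noted in the docstring.
        return sum(1 for _ in open(x.fn))

    # Null distribution of intersection counts from 10 shuffles of `a`,
    # compared with the observed count for the unshuffled intervals.
    null_counts = list(
        parallel_apply(
            a,
            "intersect",
            genome=dict(chr1=(0, 1000)),
            method_kwargs=dict(b=b, u=True),
            reduce_func=count_lines,
            iterations=10,
            processes=1,
            debug=True,
        )
    )
    observed = count_lines(a.intersect(b=b, u=True))
    print("null:", null_counts)
    print("observed:", observed)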