Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/gprofiler/gprofiler.py @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 from typing import Union, List, Dict, Any | |
2 | |
3 import requests | |
4 | |
5 from gprofiler.version import __version__ | |
6 | |
7 | |
class GProfiler:
    '''
    Client for the g:Profiler toolkit web API.

    Exposes one method per tool: `profile` (g:GOSt), `convert` (g:Convert),
    `orth` (g:Orth) and `snpense` (g:SNPense). Every successful call stores the
    `meta` section of the server response on `self.meta`.
    '''

    def __init__(self, user_agent: str = '', base_url: Union[str, None] = None, return_dataframe: bool = False):
        '''
        A class representing the g:Profiler toolkit. Contains methods for
        querying the g:GOSt, g:Convert, g:Orth and g:SNPense tools. Please see the
        g:Profiler web tool (https://biit.cs.ut.ee/gprofiler/) for extensive documentation on all the options to
        the methods.

        :param user_agent: free-form suffix appended to the `User-Agent` header sent with every request.
        :param base_url: the URL used for the g:Profiler service. Defaults to https://biit.cs.ut.ee/gprofiler
        :param return_dataframe: if True, query results are presented as pandas DataFrames.
        '''
        self.user_agent = 'gprofiler-python {version}/{user_agent}'.format(version=__version__, user_agent=user_agent)

        if base_url is None:
            self.base_url = 'https://biit.cs.ut.ee/gprofiler'
        else:
            self.base_url = base_url

        self.return_dataframe = return_dataframe
        if return_dataframe:
            # Import eagerly so that a missing pandas installation is reported
            # at construction time rather than on the first query.
            self._pandas = self._get_pandas_module()

        self.meta = None

    @staticmethod
    def _get_pandas_module():
        '''
        Return the pandas module.

        A module already bound to `pd` or `pandas` in this module's globals is
        reused; otherwise pandas is imported on demand, so it stays an optional
        dependency.
        '''
        namespace = globals()
        for alias in ('pd', 'pandas'):
            candidate = namespace.get(alias)
            if getattr(candidate, '__name__', '') == 'pandas':
                return candidate

        import importlib
        return importlib.import_module('pandas')

    def __getattr__(self, item):
        # Only reached when normal attribute lookup fails. Used to give a
        # helpful message for the method names of the pre-1.0 interface.
        if item in ('gprofile', 'gorth', 'gconvert'):
            raise NotImplementedError(
                '`{}` has been renamed `{}` and has a new interface\n'
                'To use the previous version use the command '
                '`pip install --upgrade --no-deps --force-reinstall gprofiler-official==0.3.5`\n'
                .format(item, item[1:])
            )
        raise AttributeError('{} is not an attribute of {}'.format(item, self.__class__.__name__))

    # ------------------------------------------------------------------
    # private helpers shared by all tools
    # ------------------------------------------------------------------

    def _endpoint_url(self, path: str) -> str:
        '''Join `base_url` and an API path, tolerating a trailing slash on `base_url`.'''
        return '{}/api/{}'.format(self.base_url.rstrip('/'), path)

    @staticmethod
    def _error_message(response) -> str:
        '''
        Extract a human readable message from a failed response.

        Uses the `message` field of the JSON body when there is one and falls
        back to a generic message carrying the HTTP status code otherwise.
        '''
        try:
            return response.json()['message']
        except Exception:
            # The body may not be JSON at all, or may lack a `message` field.
            # `Exception` (not a bare except) keeps KeyboardInterrupt and
            # SystemExit propagating.
            return 'query failed with error {}'.format(response.status_code)

    def _post(self, path: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        '''
        POST `payload` as JSON to the given API path and return the decoded body.

        Stores the `meta` section of the response on `self.meta`.

        :raises AssertionError: if the server does not answer with HTTP 200.
            The exception type is kept for backwards compatibility with
            callers that already catch it.
        '''
        # NOTE(review): no timeout is passed to requests, so a stalled server
        # blocks the caller; adding one would change behaviour, so it is left out.
        response = requests.post(
            self._endpoint_url(path),
            json=payload,
            headers={'User-Agent': self.user_agent},
        )
        if response.status_code != 200:
            raise AssertionError(self._error_message(response))

        body = response.json()
        self.meta = body['meta']
        return body

    def _pandas_module(self):
        '''Return pandas, importing it lazily if `return_dataframe` was enabled after construction.'''
        pandas = getattr(self, '_pandas', None)
        if pandas is None:
            pandas = self._get_pandas_module()
            self._pandas = pandas
        return pandas

    def _format_results(self, records: List[Dict[str, Any]], columns: List[str]):
        '''
        Restrict `records` to `columns` and return them in the configured format.

        :param records: the `result` list of a server response.
        :param columns: names of the fields to keep, in output order.
        :return: a list of dicts, or a pandas DataFrame when `return_dataframe`
            is set. An empty result yields an empty DataFrame that still has
            the expected columns.
        '''
        if not self.return_dataframe:
            wanted = set(columns)
            return [{key: value for key, value in record.items() if key in wanted} for record in records]

        pandas = self._pandas_module()
        frame = pandas.DataFrame(records)
        if len(frame) == 0:
            # Selecting columns on an empty frame would raise KeyError.
            return pandas.DataFrame(columns=columns)
        return frame[columns]

    @staticmethod
    def _resolve_intersections(results: List[Dict[str, Any]], meta: Dict[str, Any]) -> None:
        '''
        Rewrite, in place, the `intersections` field of every result.

        On input `intersections` holds one entry per gene listed under
        `ensgs` for the result's query; a falsy entry means the gene is not
        part of the intersection. On output `evidences` holds the truthy
        entries and `intersections` holds the matching gene names.
        '''
        per_query = meta['genes_metadata']['query']

        # For every query build a map from mapped gene ID back to the name
        # the user supplied.
        reverse_mappings = {}
        for query_name in meta['query_metadata']['queries']:
            reverse_mapping = {}
            for input_name, mapped_ids in per_query[query_name]['mapping'].items():
                if len(mapped_ids) == 1:
                    # one-to-one mapping
                    reverse_mapping[mapped_ids[0]] = input_name
                else:
                    # one-to-many mapping, we'll use the gene ID
                    for mapped_id in mapped_ids:
                        reverse_mapping[mapped_id] = mapped_id
            reverse_mappings[query_name] = reverse_mapping

        for result in results:
            query_name = result['query']
            reverse_mapping = reverse_mappings[query_name]
            genes = [reverse_mapping[gene_id] for gene_id in per_query[query_name]['ensgs']]
            raw = result['intersections']
            result['evidences'] = [evidence for evidence in raw if evidence]
            result['intersections'] = [gene for evidence, gene in zip(raw, genes) if evidence]

    # ------------------------------------------------------------------
    # public tools
    # ------------------------------------------------------------------

    def profile(
            self,
            query: Union[str, List[str], Dict[str, List[str]]],
            organism: str = 'hsapiens',
            sources: List[str] = tuple(),
            user_threshold: float = 0.05,
            all_results: bool = False,
            ordered: bool = False,
            no_evidences: bool = True,
            combined: bool = False,
            measure_underrepresentation: bool = False,
            no_iea: bool = False,
            domain_scope: str = 'annotated',
            numeric_namespace: str = '',
            significance_threshold_method: str = 'g_SCS',
            background: str = None,

    ) -> List[Dict[str, Any]]:
        """
        performs functional profiling of gene lists using various kinds of biological evidence.
        The tool performs statistical enrichment analysis to find over-representation of information from Gene Ontology terms,
        biological pathways, regulatory DNA elements, human disease gene annotations, and protein-protein interaction networks.

        :param query: list of genes to profile. For running multiple queries at once, accepts a dictionary of lists as well.
        :param organism: Organism id for profiling. For full list see https://biit.cs.ut.ee/gprofiler/page/organism-list
        :param sources: List of annotation sources to include in analysis. Defaults to all known.
        :param user_threshold: Significance threshold for analysis.
        :param all_results: If True, return all analysis results regardless of statistical significance.
        :param ordered: If True, considers the order of input query to be significant. See https://biit.cs.ut.ee/gprofiler/page/docs#ordered_gene_lists
        :param no_evidences: If False, the results include lists of intersections and evidences for the intersections
        :param combined: If True, performs all queries and combines the results into a single table. NB! changes the output format.
        :param measure_underrepresentation: if True, performs test for significantly under-represented functional terms.
        :param no_iea: If True, excludes electronically annotated Gene Ontology terms before analysis.
        :param domain_scope: "known" for using all known genes as background, "annotated" to use all genes annotated for particular datasource.
            Overridden with "custom" whenever `background` is given.
        :param numeric_namespace: name for the numeric namespace to use if there are numeric values in the query.
        :param significance_threshold_method: method for multiple correction. "g_SCS"|"bonferroni"|"fdr". https://biit.cs.ut.ee/gprofiler/page/docs#significance_threhshold
        :param background: List of genes to use as a statistical background.
        :return: a list of result dicts, or a pandas DataFrame if `return_dataframe` is set.
        :raises AssertionError: if the server does not answer with HTTP 200.
        """

        if background is not None:
            domain_scope = 'custom'

        res = self._post(
            'gost/profile/',
            {
                'organism': organism,  # string, eg "hsapiens"
                'query': query,  # whitespace-delimited string or list of strings or object of strings to lists of strings
                'sources': sources,  # list of strings
                'user_threshold': user_threshold,  # significance threshold, defaults to 0.05
                'all_results': all_results,  # bool
                'no_evidences': no_evidences,  # bool - if set to true, saves on database lookups
                'combined': combined,  # bool, set to true for g:Cocoa output
                'measure_underrepresentation': measure_underrepresentation,  # bool
                'no_iea': no_iea,  # bool
                'numeric_ns': numeric_namespace,  # string
                'domain_scope': domain_scope,  # string 'known'|'annotated'|'custom'
                'ordered': ordered,  # bool, set to true for ordered query
                'significance_threshold_method': significance_threshold_method,  # string, "g_SCS"|"bonferroni"|"fdr", "g_SCS" by default
                'background': background if background is not None else '',  # string, background name or query string
            },
        )

        if combined:
            # The combined output reports per-query values as plural fields.
            columns = [
                'source',
                'native',
                'name',
                'p_values',
                'description',
                'term_size',
                'query_sizes',
                'intersection_sizes',
                'effective_domain_size',
                'parents',
            ]
        else:
            columns = [
                'source',
                'native',
                'name',
                'p_value',
                'significant',
                'description',
                'term_size',
                'query_size',
                'intersection_size',
                'effective_domain_size',
                'precision',
                'recall',
                'query',
                'parents',
            ]
            if not no_evidences:
                columns.extend(['intersections', 'evidences'])
                self._resolve_intersections(res['result'], res['meta'])

        return self._format_results(res['result'], columns)

    def convert(
            self,
            query: Union[str, List[str], Dict[str, List[str]]],
            organism: str = 'hsapiens',
            target_namespace: str = 'ENSG',
            numeric_namespace: str = 'ENTREZGENE'
    ) -> List[Dict[str, Any]]:
        """
        Query g:Convert.

        :param query: list of genes to convert
        :param organism: organism id
        :param target_namespace: namespace to convert into
        :param numeric_namespace: namespace to use for numeric values in the query.
        :return: a list of result dicts, or a pandas DataFrame if `return_dataframe` is set.
        :raises AssertionError: if the server does not answer with HTTP 200.
        """
        res = self._post(
            'convert/convert',
            {
                'organism': organism,
                'query': query,
                'target': target_namespace,
                'numeric_ns': numeric_namespace,
                'output': 'json',
            },
        )
        columns = ['incoming', 'converted', 'n_incoming', 'n_converted', 'name', 'description', 'namespaces', 'query']
        return self._format_results(res['result'], columns)

    def orth(self,
             query: List[str],
             organism: str = "hsapiens",
             target: str = "mmusculus",
             aresolve: Dict[str, str] = None,
             numeric_namespace: str = 'ENTREZGENE'):
        """
        Query g:Orth.

        :param query: list of genes to look up.
        :param organism: organism id of the query genes.
        :param target: organism id sent as the `target` field of the request.
        :param aresolve: sent unchanged as the `aresolve` field of the request.
        :param numeric_namespace: namespace to use for numeric values in the query.
        :return: a list of result dicts, or a pandas DataFrame if `return_dataframe` is set.
        :raises AssertionError: if the server does not answer with HTTP 200.
        """
        res = self._post(
            'orth/orth',
            {
                'organism': organism,
                'query': query,
                'target': target,
                'numeric_ns': numeric_namespace,
                'aresolve': aresolve,
                'output': 'json',
            },
        )
        columns = ['incoming', 'converted', 'ortholog_ensg', 'n_incoming', 'n_converted', 'n_result', 'name',
                   'description', 'namespaces']
        return self._format_results(res['result'], columns)

    def snpense(self,
                query: List[str]):
        """
        Query g:SNPense.

        :param query: list of identifiers to look up (the result rows carry an `rs_id` field).
        :return: a list of result dicts, or a pandas DataFrame if `return_dataframe` is set.
        :raises AssertionError: if the server does not answer with HTTP 200.
        """
        res = self._post(
            'snpense/snpense',
            {
                'query': query,
                'output': 'json+',
            },
        )
        columns = ['rs_id', 'chromosome', 'strand', 'start', 'end', 'ensgs', 'gene_names', 'variants']
        return self._format_results(res['result'], columns)