comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/gprofiler/gprofiler.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 from typing import Union, List, Dict, Any
2
3 import requests
4
5 from gprofiler.version import __version__
6
7
class GProfiler():
    """
    Thin HTTP client for the g:Profiler web service.

    Each public method posts a JSON payload to one of the service endpoints
    (g:GOSt, g:Convert, g:Orth, g:SNPense), stores the ``meta`` section of the
    response on ``self.meta`` and returns the ``result`` section, either as a
    list of dictionaries or as a pandas DataFrame.
    """

    def __init__(self, user_agent: str = '', base_url: str = None, return_dataframe: bool = False):
        '''
        A class representing the g:Profiler toolkit. Contains methods for
        querying the g:GOSt, g:Convert, g:Orth and g:SNPense tools. Please see the
        g:Profiler web tool (https://biit.cs.ut.ee/gprofiler/) for extensive documentation on all the options to
        the methods.

        :param user_agent: free-form suffix appended to the ``User-Agent`` header sent with every request.
        :param base_url: the URL used for the g:Profiler service. Defaults to https://biit.cs.ut.ee/gprofiler
        :param return_dataframe: if True, query results are presented as pandas DataFrames.
        '''
        self.user_agent = 'gprofiler-python {version}/{user_agent}'.format(version=__version__, user_agent=user_agent)

        if base_url is None:
            self.base_url = 'https://biit.cs.ut.ee/gprofiler'
        else:
            self.base_url = base_url

        self.return_dataframe = return_dataframe
        if return_dataframe:
            # Import pandas eagerly so a missing dependency fails at construction time.
            self._pandas = self._get_pandas_module()

        self.meta = None

    @staticmethod
    def _get_pandas_module():
        """
        Return the pandas module.

        Reuses a module already bound as ``pd`` or ``pandas`` in this module's
        globals, otherwise imports it on demand.

        :return: the pandas module object.
        :raises ImportError: if pandas is not installed.
        """
        def is_pandas_module(candidate):
            return getattr(candidate, '__name__', '') == 'pandas'

        namespace = globals()
        for alias in ('pd', 'pandas'):
            if alias in namespace and is_pandas_module(namespace[alias]):
                return namespace[alias]

        import importlib
        return importlib.import_module('pandas')

    def __getattr__(self, item):
        """
        Fallback attribute lookup, invoked only when normal lookup fails.

        :param item: name of the missing attribute.
        :raises NotImplementedError: for the legacy method names ``gprofile``, ``gorth`` and ``gconvert``,
            pointing the caller to the renamed method.
        :raises AttributeError: for every other missing attribute.
        """
        if item in ['gprofile', 'gorth', 'gconvert']:
            template = (
                '`{}` has been renamed `{}` and has a new interface\n'
                'To use the previous version use the command '
                '`pip install --upgrade --no-deps --force-reinstall gprofiler-official==0.3.5`\n'
            )
            # The new name is the old one without its leading "g".
            raise NotImplementedError(template.format(item, item[1:]))
        raise AttributeError('{} is not an attribute of {}'.format(item, self.__class__.__name__))

    def _endpoint(self, path: str) -> str:
        """
        Join ``self.base_url`` and an API path with exactly one slash between them.

        :param path: endpoint path relative to the base URL, e.g. ``api/convert/convert``.
        :return: the full endpoint URL.
        """
        return '{}/{}'.format(self.base_url.rstrip('/'), path.lstrip('/'))

    @staticmethod
    def _parse_response(response) -> Dict[str, Any]:
        """
        Decode a service response, raising on any non-200 status.

        :param response: object exposing ``status_code`` and ``json()`` (a ``requests`` response).
        :return: the decoded JSON body.
        :raises AssertionError: if the status code is not 200. The message is the ``message`` field of
            the JSON body when available, otherwise a generic text containing the status code.
        """
        if response.status_code != 200:
            try:
                message = response.json()['message']
            except Exception:
                # Body is not JSON or has no usable 'message' field; fall back to the status code.
                message = 'query failed with error {}'.format(response.status_code)
            raise AssertionError(message)
        return response.json()

    def _post(self, path: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """
        POST a JSON payload to an endpoint and return the decoded response.

        :param path: endpoint path relative to the base URL.
        :param payload: JSON-serialisable request body.
        :return: the decoded JSON body.
        :raises AssertionError: if the service answers with a non-200 status.
        """
        response = requests.post(
            self._endpoint(path),
            json=payload,
            headers={'User-Agent': self.user_agent},
        )
        return self._parse_response(response)

    def _format_result(self, rows: List[Dict[str, Any]], columns: List[str]):
        """
        Restrict result rows to the requested columns in the configured output format.

        :param rows: the ``result`` list of a service response.
        :param columns: column names to keep, in output order.
        :return: a list of dictionaries holding only the requested keys, or, when
            ``self.return_dataframe`` is set, a DataFrame with exactly these columns
            (an empty DataFrame with these columns when there are no rows).
        """
        if not self.return_dataframe:
            wanted = set(columns)
            return [{key: value for key, value in row.items() if key in wanted} for row in rows]

        # Read the instance dict directly so a missing attribute does not go through __getattr__.
        pandas = self.__dict__.get('_pandas')
        if pandas is None:
            # return_dataframe was switched on after construction.
            pandas = self._pandas = self._get_pandas_module()

        frame = pandas.DataFrame(rows)
        if len(frame) == 0:
            # Selecting columns on an empty frame would raise KeyError.
            return pandas.DataFrame(columns=columns)
        return frame[columns]

    @staticmethod
    def _attach_evidences(results: List[Dict[str, Any]], meta: Dict[str, Any]) -> None:
        """
        Rewrite, in place, the ``intersections`` field of each g:GOSt result.

        On input ``intersections`` is a list parallel to the query's ``ensgs`` list in the metadata, with a
        falsy entry for genes that do not intersect the term. Afterwards ``evidences`` holds the truthy
        entries and ``intersections`` holds the matching gene identifiers.

        :param results: the ``result`` list of a g:GOSt response; modified in place.
        :param meta: the ``meta`` section of the same response.
        """
        genes_metadata = meta['genes_metadata']['query']

        reverse_mappings = {}
        for query_name in meta['query_metadata']['queries']:
            reverse_mapping = {}
            for input_id, ensgs in genes_metadata[query_name]['mapping'].items():
                if len(ensgs) == 1:
                    # one-to-one mapping: report the identifier the user supplied
                    reverse_mapping[ensgs[0]] = input_id
                else:
                    # one-to-many mapping: the input is ambiguous, so report the gene ID itself
                    for ensg in ensgs:
                        reverse_mapping[ensg] = ensg
            reverse_mappings[query_name] = reverse_mapping

        for result in results:
            mapping = reverse_mappings[result['query']]
            genes = [mapping[ensg] for ensg in genes_metadata[result['query']]['ensgs']]
            flags = result['intersections']
            result['evidences'] = [flag for flag in flags if flag]
            result['intersections'] = [gene for flag, gene in zip(flags, genes) if flag]

    def profile(
            self,
            query: Union[str, List[str], Dict[str, List[str]]],
            organism: str = 'hsapiens',
            sources: List[str] = tuple(),
            user_threshold: float = 0.05,
            all_results: bool = False,
            ordered: bool = False,
            no_evidences: bool = True,
            combined: bool = False,
            measure_underrepresentation: bool = False,
            no_iea: bool = False,
            domain_scope: str = 'annotated',
            numeric_namespace: str = '',
            significance_threshold_method: str = 'g_SCS',
            background: Union[str, List[str], None] = None,

    ) -> List[Dict[str, Any]]:
        """
        performs functional profiling of gene lists using various kinds of biological evidence.
        The tool performs statistical enrichment analysis to find over-representation of information from Gene Ontology terms,
        biological pathways, regulatory DNA elements, human disease gene annotations, and protein-protein interaction networks.

        :param query: list of genes to profile. For running multiple queries at once, accepts a dictionary of lists as well.
        :param organism: Organism id for profiling. For full list see https://biit.cs.ut.ee/gprofiler/page/organism-list
        :param sources: List of annotation sources to include in analysis. Defaults to all known.
        :param user_threshold: Significance threshold for analysis.
        :param all_results: If True, return all analysis results regardless of statistical significance.
        :param ordered: If True, considers the order of input query to be significant. See https://biit.cs.ut.ee/gprofiler/page/docs#ordered_gene_lists
        :param no_evidences: If False, the results include lists of intersections and evidences for the intersections
        :param combined: If True, performs all queries and combines the results into a single table. NB! changes the output format.
        :param measure_underrepresentation: if True, performs test for significantly under-represented functional terms.
        :param no_iea: If True, excludes electronically annotated Gene Ontology terms before analysis.
        :param domain_scope: "known" for using all known genes as background, "annotated" to use all genes annotated for particular datasource.
            Overridden with "custom" whenever ``background`` is given.
        :param numeric_namespace: name for the numeric namespace to use if there are numeric values in the query.
        :param significance_threshold_method: method for multiple correction. "g_SCS"|"bonferroni"|"fdr". https://biit.cs.ut.ee/gprofiler/page/docs#significance_threhshold
        :param background: List of genes to use as a statistical background.
        :return: the filtered results as a list of dictionaries, or as a pandas DataFrame when
            ``return_dataframe`` was set on the instance.
        :raises AssertionError: if the service answers with a non-200 status.
        """

        if background is not None:
            domain_scope = 'custom'

        res = self._post(
            'api/gost/profile/',
            {
                'organism': organism,  # string, eg "hsapiens"
                'query': query,  # whitespace-delimited string or list of strings or object of strings to lists of strings
                'sources': sources,  # list of strings
                'user_threshold': user_threshold,  # significance threshold, defaults to 0.05
                'all_results': all_results,  # bool
                'no_evidences': no_evidences,  # bool - if set to true, saves on database lookups
                'combined': combined,  # bool, set to true for g:Cocoa output
                'measure_underrepresentation': measure_underrepresentation,  # bool
                'no_iea': no_iea,  # bool
                'numeric_ns': numeric_namespace,  # string
                'domain_scope': domain_scope,  # string 'known'|'annotated'|'custom'
                'ordered': ordered,  # bool, set to true for ordered query
                'significance_threshold_method': significance_threshold_method,  # string, "g_SCS"|"bonferroni"|"fdr", "g_SCS" by default
                'background': background if background is not None else '',  # string, background name or query string
            },
        )

        meta = res['meta']
        self.meta = meta

        if not combined:
            columns = ['source',
                       'native',
                       'name',
                       'p_value',
                       'significant',
                       'description',
                       'term_size',
                       'query_size',
                       'intersection_size',
                       'effective_domain_size',
                       'precision',
                       'recall',
                       'query',
                       'parents']
            if not no_evidences:
                columns.append('intersections')
                columns.append('evidences')
                self._attach_evidences(res['result'], meta)
        else:
            # The combined output carries per-query lists, hence the plural field names.
            columns = [
                'source',
                'native',
                'name',
                'p_values',
                'description',
                'term_size',
                'query_sizes',
                'intersection_sizes',
                'effective_domain_size',
                'parents']

        return self._format_result(res['result'], columns)

    def convert(
            self,
            query: Union[str, List[str], Dict[str, List[str]]],
            organism: str = 'hsapiens',
            target_namespace: str = 'ENSG',
            numeric_namespace: str = 'ENTREZGENE'
    ) -> List[Dict[str, Any]]:
        """
        Query g:Convert.

        :param query: list of genes to convert
        :param organism: organism id
        :param target_namespace: namespace to convert into
        :param numeric_namespace: value sent as ``numeric_ns`` in the request.
        :return: the filtered results as a list of dictionaries, or as a pandas DataFrame when
            ``return_dataframe`` was set on the instance.
        :raises AssertionError: if the service answers with a non-200 status.
        """
        res = self._post(
            'api/convert/convert',
            {
                'organism': organism,
                'query': query,
                'target': target_namespace,
                'numeric_ns': numeric_namespace,
                'output': 'json'
            },
        )

        self.meta = res['meta']
        columns = ['incoming', 'converted', 'n_incoming', 'n_converted', 'name', 'description', 'namespaces', 'query']
        return self._format_result(res['result'], columns)

    def orth(self,
             query: List[str],
             organism: str = "hsapiens",
             target: str = "mmusculus",
             aresolve: Dict[str, str] = None,
             numeric_namespace: str = 'ENTREZGENE'):
        """
        Query g:Orth.

        :param query: value sent as ``query`` in the request.
        :param organism: value sent as ``organism`` in the request.
        :param target: value sent as ``target`` in the request.
        :param aresolve: value sent as ``aresolve`` in the request.
        :param numeric_namespace: value sent as ``numeric_ns`` in the request.
        :return: the filtered results as a list of dictionaries, or as a pandas DataFrame when
            ``return_dataframe`` was set on the instance.
        :raises AssertionError: if the service answers with a non-200 status.
        """
        res = self._post(
            'api/orth/orth',
            {
                'organism': organism,
                'query': query,
                'target': target,
                'numeric_ns': numeric_namespace,
                'aresolve': aresolve,
                'output': 'json'
            },
        )

        self.meta = res['meta']
        columns = ['incoming', 'converted', 'ortholog_ensg', 'n_incoming', 'n_converted', 'n_result', 'name', 'description', 'namespaces']
        return self._format_result(res['result'], columns)

    def snpense(self,
                query: List[str]):
        """
        Query g:SNPense.

        :param query: value sent as ``query`` in the request.
        :return: the filtered results as a list of dictionaries, or as a pandas DataFrame when
            ``return_dataframe`` was set on the instance.
        :raises AssertionError: if the service answers with a non-200 status.
        """
        res = self._post(
            'api/snpense/snpense',
            {
                'query': query,
                'output': 'json+'
            },
        )

        self.meta = res['meta']
        columns = ['rs_id', 'chromosome', 'strand', 'start', 'end', 'ensgs', 'gene_names', 'variants']
        return self._format_result(res['result'], columns)