from typing import Any, Dict, List, Optional, Union

import requests

from gprofiler.version import __version__


class GProfiler():
    """
    A class representing the g:Profiler toolkit. Contains methods for
    querying the g:GOSt, g:Convert, g:Orth and g:SNPense tools. Please see the
    g:Profiler web tool (https://biit.cs.ut.ee/gprofiler/) for extensive documentation on all the options to
    the methods.

    After every successful query the ``meta`` section of the server response is
    stored in ``self.meta`` (``None`` until the first query succeeds).
    """

    # Method names of the previous interface; accessing them raises a migration hint.
    _RENAMED_METHODS = ('gprofile', 'gorth', 'gconvert')

    def __init__(self, user_agent: str = '', base_url: Optional[str] = None, return_dataframe: bool = False):
        '''
        Create a client for the g:Profiler service.

        :param user_agent: free-form suffix appended to the ``User-Agent`` header sent with every request.
        :param base_url: the URL used for the g:Profiler service. Defaults to https://biit.cs.ut.ee/gprofiler
        :param return_dataframe: if True, query results are presented as pandas DataFrames.
        '''
        self.user_agent = 'gprofiler-python {version}/{user_agent}'.format(version=__version__, user_agent=user_agent)
        self.base_url = 'https://biit.cs.ut.ee/gprofiler' if base_url is None else base_url

        self.return_dataframe = return_dataframe
        if return_dataframe:
            # Resolve pandas eagerly so a missing installation fails at construction time.
            self._pandas = self._get_pandas_module()

        self.meta = None

    @staticmethod
    def _get_pandas_module():
        """
        Return the pandas module.

        A module already bound to ``pd`` or ``pandas`` in this module's globals is
        reused; otherwise pandas is imported on demand.

        :raises ImportError: if pandas has to be imported and is not installed.
        """
        namespace = globals()
        for alias in ('pd', 'pandas'):
            candidate = namespace.get(alias)
            if getattr(candidate, '__name__', '') == 'pandas':
                return candidate

        import importlib
        return importlib.import_module('pandas')

    def __getattr__(self, item):
        """
        Fallback attribute hook, invoked only when normal attribute lookup fails.

        :raises NotImplementedError: for the method names of the previous interface
            (``gprofile``, ``gorth``, ``gconvert``), pointing at the new name.
        :raises AttributeError: for every other unknown attribute.
        """
        if item in self._RENAMED_METHODS:
            message = (
                '`{}` has been renamed `{}` and has a new interface\n'
                'To use the previous version use the command '
                '`pip install --upgrade --no-deps --force-reinstall gprofiler-official==0.3.5`\n'
            ).format(item, item[1:])
            raise NotImplementedError(message)
        raise AttributeError('{} is not an attribute of {}'.format(item, self.__class__.__name__))

    def _post(self, endpoint: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """
        POST ``payload`` as JSON to ``endpoint`` and return the decoded response body.

        The ``meta`` section of a successful response is stored in ``self.meta``.

        :param endpoint: path below ``base_url``, without a leading slash.
        :param payload: JSON-serialisable request body.
        :raises AssertionError: if the server answers with a status other than 200. The text is the
            ``message`` field of the error body when available, else a generic message with the status code.
        """
        # Strip a trailing slash so a base URL given as ".../gprofiler/" does not produce "//api/...".
        url = '{}/{}'.format(self.base_url.rstrip('/'), endpoint)
        r = requests.post(url, json=payload, headers={'User-Agent': self.user_agent})

        if r.status_code != 200:
            try:
                message = r.json()['message']
            except (ValueError, KeyError, TypeError):
                # Error body is not JSON, or is JSON without a usable 'message' entry.
                message = 'query failed with error {}'.format(r.status_code)
            raise AssertionError(message)

        res = r.json()
        self.meta = res['meta']
        return res

    def _format_result(self, records: List[Dict[str, Any]], columns: List[str]):
        """
        Restrict ``records`` to ``columns`` and return them in the configured output format.

        :param records: result rows as returned by the service.
        :param columns: names (and, for DataFrames, order) of the fields to keep.
        :return: a list of dicts holding only the requested keys, or, when ``return_dataframe`` is set,
            a pandas DataFrame with exactly ``columns`` (empty but with the right columns if there are no rows).
        """
        if not self.return_dataframe:
            wanted = set(columns)
            return [{k: v for k, v in record.items() if k in wanted} for record in records]

        # ``return_dataframe`` may have been switched on after construction, in which
        # case pandas has not been resolved yet (getattr falls back through __getattr__).
        pandas = getattr(self, '_pandas', None)
        if pandas is None:
            pandas = self._pandas = self._get_pandas_module()

        df = pandas.DataFrame(records)
        if len(df) == 0:
            # An empty frame has no columns to select from; build the expected header instead.
            return pandas.DataFrame(columns=columns)
        return df[columns]

    @staticmethod
    def _attach_evidences(results: List[Dict[str, Any]], meta: Dict[str, Any]) -> None:
        """
        Rewrite, in place, the ``intersections`` field of every g:GOSt result row.

        The incoming ``intersections`` list is positional (one entry per gene listed under
        ``ensgs`` in the query metadata). Its non-empty entries are moved to a new ``evidences``
        field, and ``intersections`` is replaced by the names of the genes those entries belong to.

        :param results: result rows of a non-combined query run with evidences enabled.
        :param meta: the ``meta`` section of the same response.
        """
        genes_meta = meta['genes_metadata']['query']

        # For each query: gene ID -> name to report for that gene.
        reverse_mappings = {}
        for query_name in meta['query_metadata']['queries']:
            reverse_mapping = {}
            for query_gene, gene_ids in genes_meta[query_name]['mapping'].items():
                if len(gene_ids) == 1:
                    # one-to-one mapping: report the name the user supplied
                    reverse_mapping[gene_ids[0]] = query_gene
                else:
                    # one-to-many mapping, we'll use the gene ID
                    for gene_id in gene_ids:
                        reverse_mapping[gene_id] = gene_id
            reverse_mappings[query_name] = reverse_mapping

        # The gene list only depends on the query, so build it once per query, on first use.
        genes_by_query = {}
        for result in results:
            query_name = result['query']
            if query_name not in genes_by_query:
                mapping = reverse_mappings[query_name]
                genes_by_query[query_name] = [mapping[gene_id] for gene_id in genes_meta[query_name]['ensgs']]
            genes = genes_by_query[query_name]

            flags = result['intersections']
            result['evidences'] = [evidence for evidence in flags if evidence]
            result['intersections'] = [gene for evidence, gene in zip(flags, genes) if evidence]

    def profile(
            self,
            query: Union[str, List[str], Dict[str, List[str]]],
            organism: str = 'hsapiens',
            sources: List[str] = tuple(),
            user_threshold: float = 0.05,
            all_results: bool = False,
            ordered: bool = False,
            no_evidences: bool = True,
            combined: bool = False,
            measure_underrepresentation: bool = False,
            no_iea: bool = False,
            domain_scope: str = 'annotated',
            numeric_namespace: str = '',
            significance_threshold_method: str = 'g_SCS',
            background: Optional[Union[str, List[str]]] = None,

    ) -> List[Dict[str, Any]]:
        """
        performs functional profiling of gene lists using various kinds of biological evidence.
        The tool performs statistical enrichment analysis to find over-representation of information from Gene Ontology terms,
        biological pathways, regulatory DNA elements, human disease gene annotations, and protein-protein interaction networks.

        :param query: list of genes to profile. For running multiple queries at once, accepts a dictionary of lists as well.
        :param organism: Organism id for profiling. For full list see https://biit.cs.ut.ee/gprofiler/page/organism-list
        :param sources: List of annotation sources to include in analysis. Defaults to all known.
        :param user_threshold: Significance threshold for analysis.
        :param all_results: If True, return all analysis results regardless of statistical significance.
        :param ordered: If True, considers the order of input query to be significant. See https://biit.cs.ut.ee/gprofiler/page/docs#ordered_gene_lists
        :param no_evidences: If False, the results include lists of intersections and evidences for the intersections
        :param combined: If True, performs all queries and combines the results into a single table. NB! changes the output format.
        :param measure_underrepresentation: if True, performs test for significantly under-represented functional terms.
        :param no_iea: If True, excludes electronically annotated Gene Ontology terms before analysis.
        :param domain_scope: "known" for using all known genes as background, "annotated" to use all genes annotated for particular datasource.
            Forced to "custom" whenever ``background`` is given.
        :param numeric_namespace: name for the numeric namespace to use if there are numeric values in the query.
        :param significance_threshold_method: method for multiple correction. "g_SCS"|"bonferroni"|"fdr". https://biit.cs.ut.ee/gprofiler/page/docs#significance_threhshold
        :param background: List of genes to use as a statistical background.
        :return: the result rows restricted to the documented columns: a list of dicts, or a pandas DataFrame
            if the instance was created with ``return_dataframe=True``.
        :raises AssertionError: if the service answers with a non-200 status.
        """

        if background is not None:
            domain_scope = 'custom'

        res = self._post(
            'api/gost/profile/',
            {
                'organism': organism,  # string, eg "hsapiens"
                'query': query,  # whitespace-delimited string or list of strings or object of strings to lists of strings
                'sources': sources,  # list of strings
                'user_threshold': user_threshold,  # significance threshold, defaults to 0.05
                'all_results': all_results,  # bool
                'no_evidences': no_evidences,  # bool - if set to true, saves on database lookups
                'combined': combined,  # bool, set to true for g:Cocoa output
                'measure_underrepresentation': measure_underrepresentation,  # bool
                'no_iea': no_iea,  # bool
                'numeric_ns': numeric_namespace,  # string
                'domain_scope': domain_scope,  # string 'known'|'annotated'|'custom'
                'ordered': ordered,  # bool, set to true for ordered query
                'significance_threshold_method': significance_threshold_method,  # string, "g_SCS"|"bonferroni"|"fdr", "g_SCS" by default
                'background': background if background is not None else '',  # string, background name or query string
            },
        )

        if combined:
            columns = [
                'source',
                'native',
                'name',
                'p_values',
                'description',
                'term_size',
                'query_sizes',
                'intersection_sizes',
                'effective_domain_size',
                'parents',
            ]
        else:
            columns = [
                'source',
                'native',
                'name',
                'p_value',
                'significant',
                'description',
                'term_size',
                'query_size',
                'intersection_size',
                'effective_domain_size',
                'precision',
                'recall',
                'query',
                'parents',
            ]
            if not no_evidences:
                columns.extend(('intersections', 'evidences'))

        if not no_evidences and not combined:
            self._attach_evidences(res['result'], res['meta'])

        return self._format_result(res['result'], columns)

    def convert(
            self,
            query: Union[str, List[str], Dict[str, List[str]]],
            organism: str = 'hsapiens',
            target_namespace: str = 'ENSG',
            numeric_namespace: str = 'ENTREZGENE'
    ) -> List[Dict[str, Any]]:
        """
        Query g:Convert.

        :param query: list of genes to convert
        :param organism: organism id
        :param target_namespace: namespace to convert into
        :param numeric_namespace: sent to the service as ``numeric_ns``.
        :return: the result rows restricted to the documented columns: a list of dicts, or a pandas DataFrame
            if the instance was created with ``return_dataframe=True``.
        :raises AssertionError: if the service answers with a non-200 status.
        """
        res = self._post(
            'api/convert/convert',
            {
                'organism': organism,
                'query': query,
                'target': target_namespace,
                'numeric_ns': numeric_namespace,
                'output': 'json',
            },
        )
        columns = ['incoming', 'converted', 'n_incoming', 'n_converted', 'name', 'description', 'namespaces', 'query']
        return self._format_result(res['result'], columns)

    def orth(self,
             query: List[str],
             organism: str = "hsapiens",
             target: str = "mmusculus",
             aresolve: Optional[Dict[str, str]] = None,
             numeric_namespace: str = 'ENTREZGENE'):
        """
        Query g:Orth.

        :param query: sent to the service as ``query``.
        :param organism: sent to the service as ``organism``.
        :param target: sent to the service as ``target``.
        :param aresolve: sent to the service unchanged as ``aresolve``.
        :param numeric_namespace: sent to the service as ``numeric_ns``.
        :return: the result rows restricted to the documented columns: a list of dicts, or a pandas DataFrame
            if the instance was created with ``return_dataframe=True``.
        :raises AssertionError: if the service answers with a non-200 status.
        """
        res = self._post(
            'api/orth/orth',
            {
                'organism': organism,
                'query': query,
                'target': target,
                'numeric_ns': numeric_namespace,
                'aresolve': aresolve,
                'output': 'json',
            },
        )
        columns = ['incoming', 'converted', 'ortholog_ensg', 'n_incoming', 'n_converted', 'n_result', 'name', 'description', 'namespaces']
        return self._format_result(res['result'], columns)

    def snpense(self,
                query: List[str]):
        """
        Query g:SNPense.

        :param query: sent to the service as ``query``.
        :return: the result rows restricted to the documented columns: a list of dicts, or a pandas DataFrame
            if the instance was created with ``return_dataframe=True``.
        :raises AssertionError: if the service answers with a non-200 status.
        """
        res = self._post(
            'api/snpense/snpense',
            {
                'query': query,
                'output': 'json+',
            },
        )
        columns = ['rs_id', 'chromosome', 'strand', 'start', 'end', 'ensgs', 'gene_names', 'variants']
        return self._format_result(res['result'], columns)