Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/gprofiler/gprofiler.py @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/gprofiler/gprofiler.py Tue Mar 18 17:55:14 2025 -0400 @@ -0,0 +1,348 @@ +from typing import Union, List, Dict, Any + +import requests + +from gprofiler.version import __version__ + + +class GProfiler(): + def __init__(self, user_agent: str = '', base_url: str = None, return_dataframe: bool = False): + ''' + A class representing the g:Profiler toolkit. Contains methods for + querying the g:GOSt, g:Convert, g:Orth and g:SNPense tools. Please see the + g:Profiler web tool (https://biit.cs.ut.ee/gprofiler/) for extensive documentation on all the options to + the methods. + + :param user_agent: the URL used for the g:Profiler service. + :param base_url: the URL used for the g:Profiler service. + :param return_dataframe: if True, query results are presented as pandas DataFrames. + ''' + self.user_agent = 'gprofiler-python {version}/{user_agent}'.format(version=__version__, user_agent=user_agent) + + if base_url is None: + self.base_url = 'https://biit.cs.ut.ee/gprofiler' + else: + self.base_url = base_url + + self.return_dataframe = return_dataframe + if return_dataframe: + self._pandas = self._get_pandas_module() + + self.meta = None + + @staticmethod + def _get_pandas_module(): + is_pandas_module = lambda x: getattr(x, '__name__', '') == 'pandas' + namespace = globals() + if 'pd' in namespace and is_pandas_module(namespace['pd']): + return namespace['pd'] + elif 'pandas' in namespace and is_pandas_module(namespace['pandas']): + return namespace['pandas'] + else: + import importlib + return importlib.import_module('pandas') + + + def __getattr__(self, item): + if item in ['gprofile', 'gorth', 'gconvert']: + raise NotImplementedError('''`{}` has been renamed `{}` and has a new interface +To use the previous version use the command `pip install --upgrade --no-deps --force-reinstall gprofiler-official==0.3.5` + '''.format(item, item[1:])) + raise AttributeError('{} is not an attribute of {}'.format(item, self.__class__.__name__)) + + + + def profile( + self, + query: Union[str, List[str], Dict[str, List[str]]], + organism: str = 'hsapiens', + sources: List[str] = tuple(), + user_threshold: float = 0.05, + all_results: bool = False, + ordered: bool = False, + no_evidences: bool = True, + combined: bool = False, + measure_underrepresentation: bool = False, + no_iea: bool = False, + domain_scope: str = 'annotated', + numeric_namespace: str = '', + significance_threshold_method: str = 'g_SCS', + background: str = None, + + ) -> List[Dict[str, Any]]: + """ + performs functional profiling of gene lists using various kinds of biological evidence. + The tool performs statistical enrichment analysis to find over-representation of information from Gene Ontology terms, + biological pathways, regulatory DNA elements, human disease gene annotations, and protein-protein interaction networks. + + + + :param query: list of genes to profile. For running multiple queries at once, accepts a dictionary of lists as well. + :param organism: Organism id for profiling. For full list see https://biit.cs.ut.ee/gprofiler/page/organism-list + :param sources: List of annotation sources to include in analysis. Defaults to all known. + :param user_threshold: Significance threshold for analysis. + :param all_results: If True, return all analysis results regardless of statistical significance. + :param ordered: If True, considers the order of input query to be significant. See https://biit.cs.ut.ee/gprofiler/page/docs#ordered_gene_lists + :param no_evidences: If False, the results include lists of intersections and evidences for the intersections + :param combined: If True, performs all queries and combines the results into a single table. NB! changes the output format. + :param measure_underrepresentation: if True, performs test for significantly under-represented functional terms. + :param no_iea: If True, excludes electronically annotated Gene Ontology terms before analysis. + :param domain_scope: "known" for using all known genes as background, "annotated" to use all genes annotated for particular datasource. + :param numeric_namespace: name for the numeric namespace to use if there are numeric values in the query. + :param significance_threshold_method: method for multiple correction. "g_SCS"|"bonferroni"|"fdr". https://biit.cs.ut.ee/gprofiler/page/docs#significance_threhshold + :param background: List of genes to use as a statistical background. + :return: + """ + + if background is not None: + domain_scope = 'custom' + + r = requests.post( + '{}/api/gost/profile/'.format(self.base_url.rstrip("/")), + json={ + 'organism': organism, # string, eg "hsapiens" + 'query': query, # whitespace-delimited string or list of strings or object of strings to lists of strings + 'sources': sources, # list of strings, for example: + 'user_threshold': user_threshold, # significance threshold, defaults to 0.05 + 'all_results': all_results, # bool + 'no_evidences': no_evidences, # bool - if set to true, saves on database lookups + 'combined': combined, # bool, set to true for g:Cocoa output + 'measure_underrepresentation': measure_underrepresentation, # bool + 'no_iea': no_iea, # bool + 'numeric_ns': numeric_namespace, # string + 'domain_scope': domain_scope, # string 'known'|'annotated'|'custom' + 'ordered': ordered, # bool, set to true for ordered query + 'significance_threshold_method': significance_threshold_method, # string, "g_SCS"|"bonferroni"|"fdr", "g_SCS"by default + 'background': background if background is not None else '' # string, background name or query string + + } + , headers={'User-Agent': self.user_agent}) + + if r.status_code != 200: + message = '' + try: + message = r.json()['message'] + except: + message = 'query failed with error {}'.format(r.status_code) + raise AssertionError(message) + res = r.json() + + meta = res['meta'] + self.meta = meta + + if not combined: + columns = ['source', + 'native', + 'name', + 'p_value', + 'significant', + 'description', + 'term_size', + 'query_size', + 'intersection_size', + 'effective_domain_size', + 'precision', + 'recall', + 'query', + 'parents'] + if not no_evidences: + columns.append('intersections') + columns.append('evidences') + else: + columns = [ + 'source', + 'native', + 'name', + 'p_values', + 'description', + 'term_size', + 'query_sizes', + 'intersection_sizes', + 'effective_domain_size', + 'parents'] + + queries = (meta['query_metadata']['queries'].keys()) + + if not no_evidences and not combined: + reverse_mappings = {} + for query in queries: + mapping = (meta['genes_metadata']['query'][query]['mapping']) + reverse_mapping = {} + for k, v in mapping.items(): + if len(v) == 1: + # one-to-one mapping + reverse_mapping[v[0]] = k + else: + # one-to=many mapping, we'll use the gene ID + for i in v: + reverse_mapping[i] = i + reverse_mappings[query] = reverse_mapping + + for result in res['result']: + mapping = reverse_mappings[result['query']] + genes = [] + for i in meta['genes_metadata']['query'][result['query']]['ensgs']: + genes.append(mapping[i]) + result['evidences'] = [i for i in result['intersections'] if i] + result['intersections'] = ([gene for ev, gene in zip(result['intersections'], genes) if ev]) + + if not self.return_dataframe: + columns = set(columns) + + # filter the columns + result = [{k: v for k, v in i.items() if k in columns} for i in res['result']] + return result + + else: + + df = self._pandas.DataFrame(res['result']) + + if len(df) > 0: + df = df[columns] + + else: + return self._pandas.DataFrame(columns=columns) + return df + + def convert( + self, + query: Union[str, List[str], Dict[str, List[str]]], + organism: str = 'hsapiens', + target_namespace: str = 'ENSG', + numeric_namespace: str = 'ENTREZGENE' + ) -> List[Dict[str, Any]]: + """ + Query g:Convert. + + :param query: list of genes to convert + :param organism: organism id + :param target_namespace: namespace to convert into + :param numeric_namespace + """ + r = requests.post( + '{}/api/convert/convert'.format(self.base_url), + json={ + 'organism': organism, + 'query': query, + 'target': target_namespace, + 'numeric_ns': numeric_namespace, + 'output': 'json' + }, + headers={'User-Agent': self.user_agent} + ) + + if r.status_code != 200: + message = '' + try: + message = r.json()['message'] + except: + message = 'query failed with error {}'.format(r.status_code) + raise AssertionError(message) + res = r.json() + + meta = res['meta'] + self.meta = meta + columns = ['incoming', 'converted', 'n_incoming', 'n_converted', 'name', 'description', 'namespaces', 'query'] + + if not self.return_dataframe: + columns = set(columns) + + # filter the columns + result = [{k: v for k, v in i.items() if k in columns} for i in res['result']] + return result + + df = self._pandas.DataFrame(res['result']) + df = df[columns] + + return df + + def orth(self, + query: List[str], + organism: str = "hsapiens", + target: str = "mmusculus", + aresolve: Dict[str, str] = None, + numeric_namespace: str = 'ENTREZGENE'): + """ + Query g:Orth. + + + :param query: + :param organism: + :param target: + :param aresolve: + :param numeric_namespace: + """ + r = requests.post( + '{}/api/orth/orth'.format(self.base_url), + json={ + 'organism': organism, + 'query': query, + 'target': target, + 'numeric_ns': numeric_namespace, + 'aresolve': aresolve, + 'output': 'json' + }, + headers={'User-Agent': self.user_agent} + ) + + if r.status_code != 200: + message = '' + try: + message = r.json()['message'] + except: + message = 'query failed with error {}'.format(r.status_code) + raise AssertionError(message) + res = r.json() + meta = res['meta'] + self.meta = meta + columns = ['incoming', 'converted', 'ortholog_ensg', 'n_incoming', 'n_converted', 'n_result', 'name', 'description', 'namespaces'] + if not self.return_dataframe: + columns = set(columns) + + # filter the columns + result = [{k: v for k, v in i.items() if k in columns} for i in res['result']] + return result + + + df = self._pandas.DataFrame(res['result']) + df = df[columns] + + return df + + def snpense(self, + query: List[str]): + """ + + :param query: + """ + r = requests.post( + '{}/api/snpense/snpense'.format(self.base_url), + json={ + 'query': query, + 'output': 'json+' + }, + headers={'User-Agent': self.user_agent} + ) + + if r.status_code != 200: + message = '' + try: + message = r.json()['message'] + except: + message = 'query failed with error {}'.format(r.status_code) + raise AssertionError(message) + res = r.json() + meta = res['meta'] + self.meta = meta + columns = ['rs_id', 'chromosome', 'strand', 'start', 'end', 'ensgs', 'gene_names', 'variants'] + if not self.return_dataframe: + columns = set(columns) + + # filter the columns + result = [{k: v for k, v in i.items() if k in columns} for i in res['result']] + return result + + df = self._pandas.DataFrame(res['result']) + df = df[columns] + + return df