diff lexmapr/run_summary.py @ 3:be95a7ce968a tip

"planemo upload"
author kkonganti
date Tue, 13 Sep 2022 11:32:24 -0400
parents 5244e7465767
children
--- a/lexmapr/run_summary.py	Wed Aug 31 14:32:14 2022 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,173 +0,0 @@
-"""Reports and visualizes results"""
-
-import logging, os, pandas, re, shutil, time
-import matplotlib.pyplot as plt
-import seaborn as sns
-import lexmapr.ontology_reasoner as ontr
-
-logging.getLogger('matplotlib').setLevel(logging.WARNING)
-
-
-def _split_results(pandas_series, x_col, y_col, split_delim=True):
-    '''Format a value-count series as a dataframe, splitting |-delimited terms;
-       if split_delim, also strip the trailing :-delimited accession from each term'''
-    graph_dic = {}
-    for term, count in pandas_series.items():
-        for y in term.split('|'):
-            graph_dic[y] = graph_dic.get(y, 0) + count
-    if split_delim:
-        graph_pd = pandas.DataFrame({x_col: [':'.join(x.split(':')[:-1]) for x in graph_dic],
-                                     y_col: list(graph_dic.values())})
-    else:
-        graph_pd = pandas.DataFrame({x_col: list(graph_dic.keys()),
-                                     y_col: list(graph_dic.values())})
-    return graph_pd
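-
-# Illustrative sketch (hypothetical values, not part of the original module):
-# a value_counts() series whose index holds |-delimited 'label:ID' terms
-# becomes a tidy two-column dataframe, e.g.
-#   counts = pandas.Series({'apple:FOODON_12345678|fruit:FOODON_87654321': 2})
-#   _split_results(counts, 'x', 'y')
-#   # -> rows ('apple', 2) and ('fruit', 2); split_delim=True strips the IDs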
-
-
-def _get_ontols(map_res, match_col, bin_col):
-    '''Make Ontology_accession instances for matched terms and group them under their bins'''
-    red_res = map_res[map_res[bin_col].notna()]
-    mapped_terms = _split_results(red_res[match_col].value_counts(), 'x', 'y', split_delim=False)
-    mapped_bins = _split_results(red_res[bin_col].value_counts(), 'x', 'y', split_delim=False)
-    ontol_sets = {}
-    lcp_set = set()
-    term_set = set()
-    for y in list(mapped_bins['x']):
-        ontol_sets[ontr.Ontology_accession.make_instance(y)] = set()
-        time.sleep(0.05)
-    for x in list(mapped_terms['x']):
-        if x == 'No Match':
-            continue
-        term_ontol = ontr.Ontology_accession.make_instance(x)
-        if term_ontol.ancestors == 'not assigned yet':
-            term_ontol.get_family('ancestors')
-            time.sleep(0.05)
-        if term_ontol.ancestors == ['none found']:
-            continue
-        for y in ontol_sets:
-            if y in term_ontol.ancestors:
-                ontol_sets[y].add(term_ontol)
-    for y in ontol_sets:
-        if ontol_sets[y] != set():
-            lcp_set.add(y)
-            term_set = term_set | ontol_sets[y]
-    if len(term_set) > 100:
-        term_list = [x.id for x in term_set]
-        terms_string = ''
-        # Chunk by index so any trailing IDs (when the count is not a
-        # multiple of four) are still logged, unlike zipping offset slices
-        for i in range(0, len(term_list), 4):
-            terms_string += '\n\t\t' + '\t'.join(term_list[i:i+4])
-        logging.info(f'Not drawing {bin_col} graph with {len(term_list)} child nodes:{terms_string}\n')
-        return ([], [])
-    return (list(lcp_set), list(term_set))
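-
-# Hypothetical illustration (placeholder labels and IDs, not from the module):
-# if a bin column holds 'plant food product:FOODON_11111111' and a matched
-# component is 'apple:FOODON_22222222', the apple accession lands in that
-# bin's set only when the bin accession appears among the apple term's
-# ancestors, so the call would return ([bin_accession], [apple_accession]).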
-
-
-def report_results(out_file, arg_bins):
-    '''Print mapping counts to log'''
-    mapping_results = pandas.read_csv(out_file, header=0, delimiter='\t')
-    match_status = mapping_results['Match_Status (Macro Level)'].value_counts()
-    logging.info(f"\t\tNo. unique terms: {len(mapping_results['Sample_Desc'])}")
-    for x in match_status.items():
-        logging.info(f'\t\tNo. {x[0]}: {x[1]}')
-    for x in arg_bins:
-        logging.info(f'\t\tNo. mapped under {x}: {mapping_results[x].count()}')
-
-
-def report_cache(term_cache):
-    '''Print mapping counts to log from cache, counting unique terms only'''
-    # TODO: add counts for bins?
-    logging.info(f'\t\tNo. unique terms: {len(term_cache)-1}')
-    no_match = 0
-    full_match = 0
-    syno_match = 0
-    comp_match = 0
-    for x in term_cache:
-        if re.search('No Match', term_cache[x]):
-            no_match += 1
-        if re.search('Full Term Match', term_cache[x]):
-            full_match += 1
-        if re.search('Synonym Match', term_cache[x]):
-            syno_match += 1
-        if re.search('Component Match', term_cache[x]):
-            comp_match += 1
-    logging.info(f'\t\tNo. Unique Full Term Match: {full_match}')
-    logging.info(f'\t\tNo. Unique Synonym Match: {syno_match}')
-    logging.info(f'\t\tNo. Unique Component Match: {comp_match}')
-    logging.info(f'\t\tNo. Unique No Match: {no_match}')
-    return {'No Match': no_match, 'Full Term Match': full_match,
-            'Synonym Match': syno_match, 'Component Match': comp_match}
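-
-# Sketch of the assumed cache shape (hypothetical entries; values are the
-# result rows the mapper writes, and the len(term_cache)-1 above suggests
-# one non-term header entry):
-#   term_cache = {'HEADER': '...', 'apple': '...\tFull Term Match\t...'}
-#   report_cache(term_cache)  # -> {'Full Term Match': 1, 'No Match': 0, ...}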
-
-
-def figure_folder():
-    '''Prepare figures folder'''
-    try:
-        shutil.rmtree('lexmapr_figures/')
-    except FileNotFoundError:
-        pass
-    os.mkdir('lexmapr_figures/')
-
-
-def visualize_cache(match_counts):
-    '''Generate graph'''
-    # TODO: add graphing for bins?
-    x_col = 'Match status'
-    y_col = 'No. samples matched'
-    sns_fig = sns.barplot(x=list(match_counts.keys()),
-                          y=list(match_counts.values()), ci=None).get_figure()
-    plt.xlabel(x_col)
-    plt.ylabel(y_col)
-    plt.xticks(rotation=90)
-    plt.tight_layout()
-    sns_fig.savefig('lexmapr_figures/mapping_results.png')
-    logging.info('Did not attempt to make bin graphs')
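-
-# The dict returned by report_cache above is exactly the shape this function
-# plots, e.g. visualize_cache(report_cache(term_cache)) after figure_folder().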
-
-
-def visualize_results(out_file, arg_bins):
-    '''Generate graphs'''
-    map_res = pandas.read_csv(out_file, delimiter='\t')
-    x_col = 'Match status'
-    y_col = 'No. samples matched'
-    match_status = map_res['Match_Status (Macro Level)'].value_counts()
-    match_res = _split_results(match_status, x_col, y_col, False)
-    match_res = match_res.sort_values(y_col, ascending=False)
-    sns_fig = sns.barplot(x=x_col, y=y_col, data=match_res, ci=None).get_figure()
-    plt.xticks(rotation=90)
-    plt.tight_layout()
-    sns_fig.savefig('lexmapr_figures/mapping_results.png')
-
-    if map_res.shape[0] >= 1000:
-        logging.info('Did not attempt to make bin graphs because there are too many rows')
-        return
-
-    if arg_bins:
-        x_col = 'Bin'
-        bin_counts = {}
-        for x in arg_bins:
-            bin_counts[x] = sum(map_res[x].value_counts())
-            bin_res = _split_results(map_res[x].value_counts(), x_col, y_col)
-            if not bin_res.empty:
-                bin_res = bin_res.sort_values(y_col, ascending=False)
-                plt.clf()
-                sns_fig = sns.barplot(x=x_col, y=y_col, data=bin_res, ci=None).get_figure()
-                plt.xticks(rotation=90)
-                plt.tight_layout()
-                sns_fig.savefig(f'lexmapr_figures/{x}_binning.png')
-
-        plt.clf()
-        bin_pd = pandas.DataFrame({x_col: list(bin_counts.keys()),
-                                   y_col: list(bin_counts.values())})
-        bin_pd = bin_pd.sort_values(y_col, ascending=False)
-        sns_fig = sns.barplot(x=x_col, y=y_col, data=bin_pd, ci=None).get_figure()
-        plt.xticks(rotation=90)
-        plt.tight_layout()
-        sns_fig.savefig('lexmapr_figures/binning_results.png')
-
-        # TODO: make node colors vary with frequency and color ones that are both top and bottom?
-        for x in arg_bins:
-            print(f'\tMight generate {x} ontology graph...'.ljust(80),end='\r')
-            lcp_list, term_list = _get_ontols(map_res, 'Matched_Components', x)
-            if lcp_list != [] and term_list != []:
-                bin_package = ontr.Ontology_package('.', list(term_list))
-                bin_package.set_lcp(lcp_list)
-                bin_package.visualize_terms(f'lexmapr_figures/{x}_terms.png',
-                                            show_lcp=True, fill_out=True, trim_nodes=True)
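-
-# Hypothetical end-to-end sketch (the TSV path and bin column name are
-# placeholders; the functions are the ones defined above):
-#   report_results('mapping_output.tsv', ['my_bin_column'])
-#   figure_folder()
-#   visualize_results('mapping_output.tsv', ['my_bin_column'])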