Mercurial > repos > kkonganti > cfsan_lexmapr2
diff lexmapr/run_summary.py @ 3:be95a7ce968a tip
"planemo upload"
author | kkonganti |
---|---|
date | Tue, 13 Sep 2022 11:32:24 -0400 |
parents | 5244e7465767 |
children |
line wrap: on
line diff
# --- a/lexmapr/run_summary.py Wed Aug 31 14:32:14 2022 -0400
# +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
# @@ -1,173 +0,0 @@
"""Reports and visualizes results"""

import logging
import os
import re
import shutil
import time

import matplotlib.pyplot as plt
import pandas
import seaborn as sns

import lexmapr.ontology_reasoner as ontr

logging.getLogger('matplotlib').setLevel(logging.WARNING)


def _split_results(pandas_series, x_col, y_col, split_delim=True):
    '''Format a value count series to a dataframe, splitting |-delimited terms

    Each index label of pandas_series is split on '|' and the series value is
    added to the running total of every resulting piece.

    :param pandas_series: series whose index labels are strings, e.g. the
        output of Series.value_counts()
    :param x_col: name of the output column that holds the labels
    :param y_col: name of the output column that holds the summed counts
    :param split_delim: when True, the text after the last ':' of each label
        (and that ':') is dropped from the label written to x_col
    :return: pandas.DataFrame with the two columns x_col and y_col
    '''
    graph_dic = {}
    for label, count in pandas_series.items():
        for piece in label.split('|'):
            graph_dic[piece] = graph_dic.get(piece, 0) + count
    if split_delim:
        labels = [':'.join(key.split(':')[:-1]) for key in graph_dic]
    else:
        labels = list(graph_dic)
    return pandas.DataFrame({x_col: labels, y_col: list(graph_dic.values())})


def _format_term_rows(term_ids, per_row=4):
    '''Lay out term ids as tab-separated rows, each prefixed by newline + two tabs

    Unlike zipping strided slices, the final partial row is kept, so no id is
    dropped when the number of ids is not a multiple of per_row.
    '''
    return ''.join(
        '\n\t\t' + '\t'.join(str(term_id) for term_id in term_ids[start:start + per_row])
        for start in range(0, len(term_ids), per_row)
    )


def _get_ontols(map_res, match_col, bin_col):
    '''Make instances of Ontology_accessions and group as relevant

    Only rows of map_res with a non-null bin_col are considered. Every matched
    term (from match_col) is attached to each bin (from bin_col) that appears
    among the term's ancestors.

    :param map_res: pandas.DataFrame of mapping results
    :param match_col: name of the column holding |-delimited matched terms
    :param bin_col: name of the column holding |-delimited bin terms
    :return: (list of bins that captured at least one term, list of captured
        terms); ([], []) when more than 100 terms were captured, in which
        case the term ids are written to the log instead
    '''
    red_res = map_res[map_res[bin_col].notna()]
    mapped_terms = _split_results(red_res[match_col].value_counts(), 'x', 'y',
                                  split_delim=False)
    mapped_bins = _split_results(red_res[bin_col].value_counts(), 'x', 'y',
                                 split_delim=False)

    ontol_sets = {}
    for bin_label in mapped_bins['x']:
        ontol_sets[ontr.Ontology_accession.make_instance(bin_label)] = set()
        # NOTE(review): presumably throttles lookups made by ontology_reasoner - confirm
        time.sleep(0.05)

    for term_label in mapped_terms['x']:
        if term_label == 'No Match':
            continue
        term_ontol = ontr.Ontology_accession.make_instance(term_label)
        # Ancestors are only fetched for instances that do not have them yet
        if term_ontol.ancestors == 'not assigned yet':
            term_ontol.get_family('ancestors')
            time.sleep(0.05)
        if term_ontol.ancestors == ['none found']:
            continue
        for bin_ontol, bin_terms in ontol_sets.items():
            if bin_ontol in term_ontol.ancestors:
                bin_terms.add(term_ontol)

    lcp_set = set()
    term_set = set()
    for bin_ontol, bin_terms in ontol_sets.items():
        if bin_terms:
            lcp_set.add(bin_ontol)
            term_set |= bin_terms

    if len(term_set) > 100:
        term_list = [term.id for term in term_set]
        terms_string = _format_term_rows(term_list)
        logging.info(f'Not drawing {bin_col} graph with {len(term_list)} child nodes:\n'
                     f'{terms_string}\n')
        return ([], [])
    return (list(lcp_set), list(term_set))


def report_results(out_file, arg_bins):
    '''Print mapping counts to log

    :param out_file: path of the tab-delimited mapping results file
    :param arg_bins: iterable of bin column names present in out_file
    '''
    mapping_results = pandas.read_csv(out_file, header=0, delimiter='\t')
    match_status = mapping_results['Match_Status (Macro Level)'].value_counts()
    logging.info(f"\t\tNo. unique terms: {len(mapping_results['Sample_Desc'])}")
    for status, count in match_status.items():
        logging.info(f'\t\tNo. {status}: {count}')
    for bin_name in arg_bins:
        logging.info(f'\t\tNo. mapped under {bin_name}: {mapping_results[bin_name].count()}')


def report_cache(term_cache):
    '''Print mapping counts to log from cache, only count unique terms

    :param term_cache: mapping whose values are searched for the match status
        labels; one entry may be counted under several labels
    :return: dict of match status label to number of cache entries containing it
    '''
    # TODO: add counts for bins?
    # NOTE(review): the -1 presumably discounts a header entry in the cache - confirm
    logging.info(f'\t\tNo. unique terms: {len(term_cache)-1}')
    match_counts = {'No Match': 0, 'Full Term Match': 0,
                    'Synonym Match': 0, 'Component Match': 0}
    for key in term_cache:
        for status in match_counts:
            if re.search(status, term_cache[key]):
                match_counts[status] += 1
    # Logged in a different order than the returned dict is keyed
    for status in ('Full Term Match', 'Synonym Match', 'Component Match', 'No Match'):
        logging.info(f'\t\tNo. Unique {status}: {match_counts[status]}')
    return match_counts


def figure_folder():
    '''Prepare figures folder

    Removes any existing lexmapr_figures/ directory in the current working
    directory, then creates it empty.
    '''
    try:
        shutil.rmtree('lexmapr_figures/')
    except FileNotFoundError:
        pass
    os.mkdir('lexmapr_figures/')


def _save_barplot(fig_path, **barplot_kwargs):
    '''Draw a bar plot on the current axes, rotate the x tick labels, save to fig_path'''
    # NOTE(review): seaborn >= 0.12 deprecates ci= in favour of errorbar=None;
    # ci=None is kept so that older seaborn releases keep working
    sns_fig = sns.barplot(ci=None, **barplot_kwargs).get_figure()
    plt.xticks(rotation=90)
    plt.tight_layout()
    sns_fig.savefig(fig_path)


def visualize_cache(match_counts):
    '''Generate graph

    :param match_counts: dict of match status label to count, as returned by
        report_cache
    '''
    # TODO: add graphing for bins?
    _save_barplot('lexmapr_figures/mapping_results.png',
                  x=list(match_counts.keys()), y=list(match_counts.values()))
    logging.info('Did not attempt to make bin graphs')


def visualize_results(out_file, arg_bins):
    '''Generate graphs

    Always writes the match status bar plot. Bin bar plots and ontology graphs
    are only attempted when the results file has fewer than 1000 rows.

    :param out_file: path of the tab-delimited mapping results file
    :param arg_bins: list of bin column names present in out_file
    '''
    map_res = pandas.read_csv(out_file, delimiter='\t')
    x_col = 'Match status'
    y_col = 'No. samples matched'
    match_status = map_res['Match_Status (Macro Level)'].value_counts()
    match_res = _split_results(match_status, x_col, y_col, False)
    match_res = match_res.sort_values(y_col, ascending=False)
    _save_barplot('lexmapr_figures/mapping_results.png', x=x_col, y=y_col, data=match_res)

    if map_res.shape[0] >= 1000:
        logging.info('Did not attempt to make bin because too many rows')
        return

    if arg_bins != []:
        x_col = 'Bin'
        bin_counts = {}
        for bin_name in arg_bins:
            bin_value_counts = map_res[bin_name].value_counts()
            bin_counts[bin_name] = sum(bin_value_counts)
            bin_res = _split_results(bin_value_counts, x_col, y_col)
            if not bin_res.empty:
                bin_res = bin_res.sort_values(y_col, ascending=False)
                plt.clf()
                _save_barplot(f'lexmapr_figures/{bin_name}_binning.png',
                              x=x_col, y=y_col, data=bin_res)

        plt.clf()
        bin_pd = pandas.DataFrame({x_col: list(bin_counts.keys()),
                                   y_col: list(bin_counts.values())})
        bin_pd = bin_pd.sort_values(y_col, ascending=False)
        _save_barplot('lexmapr_figures/binning_results.png', x=x_col, y=y_col, data=bin_pd)

        # TODO: make node colors vary with frequency and color ones that are both top and bottom?
        for bin_name in arg_bins:
            print(f'\tMight generate {bin_name} ontology graph...'.ljust(80), end='\r')
            lcp_list, term_list = _get_ontols(map_res, 'Matched_Components', bin_name)
            if lcp_list and term_list:
                bin_package = ontr.Ontology_package('.', list(term_list))
                bin_package.set_lcp(lcp_list)
                bin_package.visualize_terms(f'lexmapr_figures/{bin_name}_terms.png',
                                            show_lcp=True, fill_out=True, trim_nodes=True)