annotate lexmapr/run_summary.py @ 4:819eff1bd7ac tip

"planemo upload"
author cstrittmatter
date Wed, 29 Jun 2022 15:30:52 -0400
parents f298f3e5c515
children
rev   line source
cstrittmatter@0 1 """Reports and visualizes results"""
cstrittmatter@0 2
cstrittmatter@0 3 import logging, os, pandas, re, shutil, time
cstrittmatter@0 4 import matplotlib.pyplot as plt
cstrittmatter@0 5 import seaborn as sns
cstrittmatter@0 6 import lexmapr.ontology_reasoner as ontr
cstrittmatter@0 7
cstrittmatter@0 8 logging.getLogger('matplotlib').setLevel(logging.WARNING)
cstrittmatter@0 9
cstrittmatter@0 10
def _split_results(pandas_series, x_col, y_col, split_delim=True):
    '''Format a value count series to a dataframe, splitting |-delimited terms

    Each index entry of ``pandas_series`` is split on ``|`` and the entry's
    count is credited to every piece, so a piece that occurs in several
    entries ends up with the sum of their counts.

    :param pandas_series: series whose index holds the (string) terms and
        whose values hold the counts, e.g. the result of ``value_counts()``
    :param x_col: name of the output column holding the term labels
    :param y_col: name of the output column holding the summed counts
    :param split_delim: if True, drop everything from the last ``:`` onwards
        in each label; labels without a ``:`` are kept unchanged
    :return: dataframe with one row per distinct piece, in first-seen order
    '''
    graph_dic = {}
    for terms, count in pandas_series.items():
        for term in terms.split('|'):
            graph_dic[term] = graph_dic.get(term, 0) + count

    if split_delim:
        labels = []
        for term in graph_dic:
            head, sep, _ = term.rpartition(':')
            # A term with no ':' has nothing to trim; keep it whole instead
            # of reducing it to an empty label
            labels.append(head if sep else term)
    else:
        labels = list(graph_dic)
    return pandas.DataFrame({x_col: labels, y_col: list(graph_dic.values())})
cstrittmatter@0 27
cstrittmatter@0 28
def _get_ontols(map_res, match_col, bin_col):
    '''Make instances of Ontology_accessions and group as relevant

    Only rows of ``map_res`` with a value in ``bin_col`` are considered.
    Every |-delimited entry of ``bin_col`` becomes a candidate parent, and
    every |-delimited entry of ``match_col`` (other than 'No Match') is
    attached to each candidate parent found among its ancestors.

    :param map_res: dataframe of mapping results
    :param match_col: name of the column holding the matched terms
    :param bin_col: name of the column holding the bin terms
    :return: tuple ``(parents, terms)`` of lists: the bin accessions with at
        least one term under them, and the term accessions that fell under
        any bin. Both lists are empty when more than 100 terms were found;
        the term ids are logged instead.
    '''
    red_res = map_res[map_res[bin_col].notna()]
    mapped_terms = _split_results(red_res[match_col].value_counts(), 'x', 'y',
                                  split_delim=False)
    mapped_bins = _split_results(red_res[bin_col].value_counts(), 'x', 'y',
                                 split_delim=False)

    # One (initially empty) set of child terms per bin accession
    ontol_sets = {}
    for bin_term in mapped_bins['x']:
        ontol_sets[ontr.Ontology_accession.make_instance(bin_term)] = set()
        # NOTE(review): presumably paces the lookups done by
        # ontology_reasoner - confirm before changing
        time.sleep(0.05)

    for term in mapped_terms['x']:
        if term == 'No Match':
            continue
        term_ontol = ontr.Ontology_accession.make_instance(term)
        # Ancestors are only fetched when the accession has none assigned yet
        if term_ontol.ancestors == 'not assigned yet':
            term_ontol.get_family('ancestors')
            time.sleep(0.05)
        if term_ontol.ancestors == ['none found']:
            continue
        for bin_ontol, children in ontol_sets.items():
            if bin_ontol in term_ontol.ancestors:
                children.add(term_ontol)

    # Keep only the bins that ended up with at least one term under them
    lcp_set = set()
    term_set = set()
    for bin_ontol, children in ontol_sets.items():
        if children:
            lcp_set.add(bin_ontol)
            term_set |= children

    if len(term_set) > 100:
        term_ids = [term_ontol.id for term_ontol in term_set]
        # List the ids four to a line; slicing (rather than zipping four
        # strided views) keeps the final, possibly shorter, row
        rows = (term_ids[i:i + 4] for i in range(0, len(term_ids), 4))
        terms_string = ''.join(
            '\n\t\t' + '\t'.join(f'{term_id}' for term_id in row)
            for row in rows)
        logging.info(f'Not drawing {bin_col} graph with {len(term_ids)} child nodes:\n'
                     f'{terms_string}\n')
        return([], [])
    return(list(lcp_set), list(term_set))
cstrittmatter@0 65
cstrittmatter@0 66
def report_results(out_file, arg_bins):
    '''Log summary counts for a finished mapping run

    Reads the tab-delimited results at ``out_file`` and logs the number of
    rows, the number of rows per 'Match_Status (Macro Level)' value and, for
    each column named in ``arg_bins``, the number of non-missing entries.

    :param out_file: path of the tab-delimited results file (with header)
    :param arg_bins: iterable of column names to report a count for
    '''
    results = pandas.read_csv(out_file, header=0, delimiter='\t')
    status_counts = results['Match_Status (Macro Level)'].value_counts()
    n_terms = len(results['Sample_Desc'])
    logging.info(f'\t\tNo. unique terms: {n_terms}')
    for status, n_status in status_counts.items():
        logging.info(f'\t\tNo. {status}: {n_status}')
    for bin_name in arg_bins:
        n_binned = results[bin_name].count()
        logging.info(f'\t\tNo. mapped under {bin_name}: {n_binned}')
cstrittmatter@0 76
cstrittmatter@0 77
def report_cache(term_cache):
    '''Log and return per-status counts of the cached terms

    Every value of ``term_cache`` is searched for each match status and may
    count towards more than one of them. The number of unique terms is
    logged as one less than the size of the cache.

    :param term_cache: mapping whose values are searched with ``re.search``
    :return: dict of counts keyed by 'No Match', 'Full Term Match',
        'Synonym Match' and 'Component Match', in that order
    '''
    # TODO: add counts for bins?
    logging.info(f'\t\tNo. unique terms: {len(term_cache)-1}')
    statuses = ('No Match', 'Full Term Match', 'Synonym Match', 'Component Match')
    tally = dict.fromkeys(statuses, 0)
    for key in term_cache:
        cached = term_cache[key]
        for status in statuses:
            if re.search(status, cached):
                tally[status] += 1
    # The matches are reported first and the no-match count last
    for status in statuses[1:] + statuses[:1]:
        logging.info(f'\t\tNo. Unique {status}: {tally[status]}')
    return(tally)
cstrittmatter@0 101
cstrittmatter@0 102
def figure_folder():
    '''Start from an empty lexmapr_figures/ folder in the working directory

    An existing folder is removed together with its contents before a new,
    empty one is made.
    '''
    figures_dir = 'lexmapr_figures/'
    try:
        shutil.rmtree(figures_dir)
    except FileNotFoundError:
        # No folder from an earlier run, so there is nothing to clear out
        pass
    os.mkdir(figures_dir)
cstrittmatter@0 110
cstrittmatter@0 111
def visualize_cache(match_counts):
    '''Generate graph of the match status counts

    Draws one bar per entry of ``match_counts`` and saves the figure as
    lexmapr_figures/mapping_results.png (the folder is not created here).
    No per-bin graphs are drawn, which is noted in the log.

    :param match_counts: dict of match status -> count; keys become the bar
        labels and values the bar heights
    '''
    # TODO: add graphing for bins?
    x_col = 'Match status'
    y_col = 'No. samples matched'
    sns_fig = sns.barplot(x=list(match_counts.keys()),
                          y=list(match_counts.values()), ci=None).get_figure()
    # Plain lists carry no column names, so label the axes explicitly
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.xticks(rotation=90)
    plt.tight_layout()
    sns_fig.savefig('lexmapr_figures/mapping_results.png')
    logging.info('Did not attempt to make bin graphs')
cstrittmatter@0 123
cstrittmatter@0 124
def _save_barplot(data, x_col, y_col, out_path):
    '''Draw the rows of data as bars, largest y_col first, and save to out_path'''
    ordered = data.sort_values(y_col, ascending=False)
    sns_fig = sns.barplot(x=x_col, y=y_col, data=ordered, ci=None).get_figure()
    plt.xticks(rotation=90)
    plt.tight_layout()
    sns_fig.savefig(out_path)


def visualize_results(out_file, arg_bins):
    '''Generate graphs from a finished mapping run

    Always saves lexmapr_figures/mapping_results.png with the number of rows
    per match status. If the results have fewer than 1000 rows and bins were
    requested, additionally saves one <bin>_binning.png per bin that has any
    entries, an overall binning_results.png, and, where _get_ontols returns
    terms to draw, one <bin>_terms.png ontology graph per bin.

    :param out_file: path of the tab-delimited results file
    :param arg_bins: list of bin column names; may be empty
    '''
    map_res = pandas.read_csv(out_file, delimiter='\t')
    y_col = 'No. samples matched'
    match_status = map_res['Match_Status (Macro Level)'].value_counts()
    match_res = _split_results(match_status, 'Match status', y_col, False)
    _save_barplot(match_res, 'Match status', y_col,
                  'lexmapr_figures/mapping_results.png')

    if map_res.shape[0] >= 1000:
        logging.info('Did not attempt to make bin because too many rows')
        return
    if not arg_bins:
        return

    x_col = 'Bin'
    bin_counts = {}
    for bin_name in arg_bins:
        value_counts = map_res[bin_name].value_counts()
        bin_counts[bin_name] = sum(value_counts)
        bin_res = _split_results(value_counts, x_col, y_col)
        # A bin without any entries gets no graph of its own
        if not bin_res.empty:
            # Clear the bars of the previous graph from the shared figure
            plt.clf()
            _save_barplot(bin_res, x_col, y_col,
                          f'lexmapr_figures/{bin_name}_binning.png')

    plt.clf()
    bin_pd = pandas.DataFrame({x_col: list(bin_counts.keys()),
                               y_col: list(bin_counts.values())})
    _save_barplot(bin_pd, x_col, y_col, 'lexmapr_figures/binning_results.png')

    # TODO: make node colors vary with frequency and color ones that are both top and bottom?
    for bin_name in arg_bins:
        # Carriage return lets the next console message overwrite this line
        print(f'\tMight generate {bin_name} ontology graph...'.ljust(80),end='\r')
        lcp_list, term_list = _get_ontols(map_res, 'Matched_Components', bin_name)
        # _get_ontols hands back empty lists when there is nothing to draw
        if lcp_list and term_list:
            # NOTE(review): meaning of the '.' argument is defined by
            # ontology_reasoner - confirm there
            bin_package = ontr.Ontology_package('.', list(term_list))
            bin_package.set_lcp(lcp_list)
            bin_package.visualize_terms(f'lexmapr_figures/{bin_name}_terms.png',
                                        show_lcp=True, fill_out=True, trim_nodes=True)