cfsan_lexmapr2: lexmapr/run_summary.py @ changeset 0:f5c39d0447be ("planemo upload")

author:   kkonganti
date:     Wed, 31 Aug 2022 14:32:07 -0400
parents:  (none)
children: (none)

1 """Reports and visualizes results""" | |
2 | |
3 import logging, os, pandas, re, shutil, time | |
4 import matplotlib.pyplot as plt | |
5 import seaborn as sns | |
6 import lexmapr.ontology_reasoner as ontr | |
7 | |
8 logging.getLogger('matplotlib').setLevel(logging.WARNING) | |
9 | |
10 | |
11 def _split_results(pandas_series, x_col, y_col, split_delim=True): | |
12 '''Format a value count series to a dataframe, spliting |-delimited terms''' | |
13 graph_dic = {} | |
14 for x in pandas_series.items(): | |
15 for y in x[0].split('|'): | |
16 try: | |
17 graph_dic[y] += x[1] | |
18 except(KeyError): | |
19 graph_dic[y] = x[1] | |
20 if split_delim: | |
21 graph_pd=pandas.DataFrame({x_col:[':'.join(x.split(':')[:-1]) for x in graph_dic.keys()], | |
22 y_col:list(graph_dic.values())}) | |
23 else: | |
24 graph_pd=pandas.DataFrame({x_col:list(graph_dic.keys()), | |
25 y_col:list(graph_dic.values())}) | |
26 return(graph_pd) | |
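# Illustration (hypothetical data, not part of the module): given a value-count
# series whose index holds |-delimited 'label:accession' strings, e.g.
#   counts = pandas.Series({'apple:FOODON_1|skin:FOODON_2': 3})
#   _split_results(counts, 'x', 'y')
# the result has rows ('apple', 3) and ('skin', 3) once accessions are stripped.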


def _get_ontols(map_res, match_col, bin_col):
    '''Make instances of Ontology_accessions and group as relevant'''
    red_res = map_res[map_res[bin_col].notna()]
    mapped_terms = _split_results(red_res[match_col].value_counts(), 'x', 'y', split_delim=False)
    mapped_bins = _split_results(red_res[bin_col].value_counts(), 'x', 'y', split_delim=False)
    ontol_sets = {}
    lcp_set = set()
    term_set = set()
    for y in list(mapped_bins['x']):
        ontol_sets[ontr.Ontology_accession.make_instance(y)] = set()
        time.sleep(0.05)  # pause briefly between ontology lookups
    for x in list(mapped_terms['x']):
        if x == 'No Match':
            continue
        term_ontol = ontr.Ontology_accession.make_instance(x)
        if term_ontol.ancestors == 'not assigned yet':
            term_ontol.get_family('ancestors')
            time.sleep(0.05)
        if term_ontol.ancestors == ['none found']:
            continue
        for y in ontol_sets:
            if y in term_ontol.ancestors:
                ontol_sets[y].add(term_ontol)
    for y in ontol_sets:
        if ontol_sets[y] != set():
            lcp_set.add(y)
            term_set = term_set | ontol_sets[y]
    if len(term_set) > 100:
        # Log the terms four per row instead of drawing an unreadable graph
        term_list = [x.id for x in term_set]
        terms_string = ''
        for i in range(0, len(term_list), 4):
            terms_string += '\n\t\t' + '\t'.join(term_list[i:i + 4])
        logging.info(f'Not drawing {bin_col} graph with {len(term_list)} '
                     f'child nodes:\n{terms_string}\n')
        return [], []
    return list(lcp_set), list(term_set)
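# Return contract: a list of bin accessions that received at least one child
# term (used as lowest common parents) and the list of those child terms; two
# empty lists signal the caller to skip drawing the ontology graph.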


def report_results(out_file, arg_bins):
    '''Print mapping counts to log'''
    mapping_results = pandas.read_csv(out_file, header=0, delimiter='\t')
    match_status = mapping_results['Match_Status (Macro Level)'].value_counts()
    logging.info(f"\t\tNo. unique terms: {len(mapping_results['Sample_Desc'])}")
    for status, count in match_status.items():
        logging.info(f'\t\tNo. {status}: {count}')
    for x in arg_bins:
        logging.info(f'\t\tNo. mapped under {x}: {mapping_results[x].count()}')
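# The TSV behind out_file is expected to carry at least the columns referenced
# above: 'Sample_Desc', 'Match_Status (Macro Level)', and one column per bin
# named in arg_bins.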


def report_cache(term_cache):
    '''Print mapping counts to log from cache, only count unique terms'''
    # TODO: add counts for bins?
    logging.info(f'\t\tNo. unique terms: {len(term_cache)-1}')
    no_match = 0
    full_match = 0
    syno_match = 0
    comp_match = 0
    for result in term_cache.values():
        if re.search('No Match', result):
            no_match += 1
        if re.search('Full Term Match', result):
            full_match += 1
        if re.search('Synonym Match', result):
            syno_match += 1
        if re.search('Component Match', result):
            comp_match += 1
    logging.info(f'\t\tNo. Unique Full Term Match: {full_match}')
    logging.info(f'\t\tNo. Unique Synonym Match: {syno_match}')
    logging.info(f'\t\tNo. Unique Component Match: {comp_match}')
    logging.info(f'\t\tNo. Unique No Match: {no_match}')
    return {'No Match': no_match, 'Full Term Match': full_match,
            'Synonym Match': syno_match, 'Component Match': comp_match}
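# Cache shape assumed by the searches above (values illustrative only): keys
# are input terms, values are result strings that embed the match status, e.g.
#   {'chicken breast': '...Full Term Match...', 'unknown item': '...No Match...'}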


def figure_folder():
    '''Prepare figures folder'''
    try:
        shutil.rmtree('lexmapr_figures/')
    except FileNotFoundError:
        pass
    os.mkdir('lexmapr_figures/')


def visualize_cache(match_counts):
    '''Generate graph'''
    # TODO: add graphing for bins?
    x_col = 'Match status'
    y_col = 'No. samples matched'
    # ci=None suppresses error bars (newer seaborn versions use errorbar=None)
    ax = sns.barplot(x=list(match_counts.keys()),
                     y=list(match_counts.values()), ci=None)
    ax.set(xlabel=x_col, ylabel=y_col)
    plt.xticks(rotation=90)
    plt.tight_layout()
    ax.get_figure().savefig('lexmapr_figures/mapping_results.png')
    logging.info('Did not attempt to make bin graphs')
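# Hypothetical driver sequence (the real caller lives elsewhere in lexmapr):
#   figure_folder()
#   visualize_cache(report_cache(term_cache))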


def visualize_results(out_file, arg_bins):
    '''Generate graphs'''
    map_res = pandas.read_csv(out_file, delimiter='\t')
    x_col = 'Match status'
    y_col = 'No. samples matched'
    match_status = map_res['Match_Status (Macro Level)'].value_counts()
    match_res = _split_results(match_status, x_col, y_col, False)
    match_res = match_res.sort_values(y_col, ascending=False)
    sns_fig = sns.barplot(x=x_col, y=y_col, data=match_res, ci=None).get_figure()
    plt.xticks(rotation=90)
    plt.tight_layout()
    sns_fig.savefig('lexmapr_figures/mapping_results.png')

    if map_res.shape[0] >= 1000:
        logging.info('Did not attempt to make bin graphs because there are too many rows')
        return

    if arg_bins:
        x_col = 'Bin'
        bin_counts = {}
        for x in arg_bins:
            bin_counts[x] = sum(map_res[x].value_counts())
            bin_res = _split_results(map_res[x].value_counts(), x_col, y_col)
            if not bin_res.empty:
                bin_res = bin_res.sort_values(y_col, ascending=False)
                plt.clf()
                sns_fig = sns.barplot(x=x_col, y=y_col, data=bin_res, ci=None).get_figure()
                plt.xticks(rotation=90)
                plt.tight_layout()
                sns_fig.savefig(f'lexmapr_figures/{x}_binning.png')

        plt.clf()
        bin_pd = pandas.DataFrame({x_col: list(bin_counts.keys()),
                                   y_col: list(bin_counts.values())})
        bin_pd = bin_pd.sort_values(y_col, ascending=False)
        sns_fig = sns.barplot(x=x_col, y=y_col, data=bin_pd, ci=None).get_figure()
        plt.xticks(rotation=90)
        plt.tight_layout()
        sns_fig.savefig('lexmapr_figures/binning_results.png')

        # TODO: make node colors vary with frequency and color ones that are both top and bottom?
        for x in arg_bins:
            print(f'\tMight generate {x} ontology graph...'.ljust(80), end='\r')
            lcp_list, term_list = _get_ontols(map_res, 'Matched_Components', x)
            if lcp_list and term_list:
                bin_package = ontr.Ontology_package('.', term_list)
                bin_package.set_lcp(lcp_list)
                bin_package.visualize_terms(f'lexmapr_figures/{x}_terms.png',
                                            show_lcp=True, fill_out=True, trim_nodes=True)
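# Hypothetical end-to-end use (file name and bin column are illustrative):
#   figure_folder()
#   report_results('lexmapr_out.tsv', ['my_bin'])
#   visualize_results('lexmapr_out.tsv', ['my_bin'])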