lexmapr/pipeline.py @ 0:f5c39d0447be ("planemo upload")
author: kkonganti
date:   Wed, 31 Aug 2022 14:32:07 -0400
"""Pipeline script"""

import csv, datetime, logging, re, sqlite3, sys
import lexmapr.create_databases as cdkp
import lexmapr.ontology_reasoner as ontr
import lexmapr.pipeline_helpers as helpers
import lexmapr.pipeline_resources as pipeline_resources
import lexmapr.run_summary as summarize
from itertools import permutations
from collections import OrderedDict
from nltk.tokenize import word_tokenize
from lexmapr.definitions import not_provided, arg_bins, ontol_db, ontol_interest


# TODO: deal with ambiguous terms: nut milk, dairy (building ENVO_00003862 v dairy food product)
# TODO: combine ScorLex.csv and misspellings.csv and edit _add_predefined_resources function
# TODO: decide what to do with same synonym, ex custom-customize vs custom-tradition
# TODO: make web on database instead of pulling relationships from API?
# TODO: remove synonyms over time from SynLex and resource_synonyms.csv; edit get_synonyms

def run(run_args):
    '''Main text processing and mapping pipeline'''

    # Add information from EMBL and predefined_resources folder
    t0 = datetime.datetime.now()
    global ontol_interest
    if run_args.embl_ontol:
        ontol_interest = run_args.embl_ontol

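    # Build or reuse the cached synonym and resource-ID databases for the ontologies of interest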
    print('\nBuilding databases...')
    cdkp.get_synonyms(run_args.remake_cache, ontol_interest)
    cdkp.get_resource_ids(run_args.remake_cache, ontol_interest)
    lookup_table = pipeline_resources.get_predefined_resources()
    t1 = datetime.datetime.now()
    print(f'\tDone! {t1-t0} passed'.ljust(60) + '\nMapping terms...')
    logging.info(f'Database build/confirm: {t1}')

    # Apply other arguments and initiate mapping cache
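    # term_cache maps each processed sample string to its finished output columns so repeated
    # samples are written without re-matching; the '' entry is the row used for empty samples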
    term_cache = {'':'\t\t\t\t',}
    output_fields = ['Sample_Id',
                     'Sample_Desc',
                     'Processed_Sample',
                     'Annotated_Sample',
                     'Matched_Components']
    if run_args.full:
        output_fields += ['Match_Status (Macro Level)',
                          'Match_Status (Micro Level)',
                          'Sample_Transformations']
        term_cache[''] += '\t\t'
    else:
        output_fields += ['Match_Status (Macro Level)']

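    # Wrap each requested bin in an Ontology_package so matched terms can be classified per bin later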
    if run_args.bin:
        global arg_bins
        if run_args.user_bin is not None:
            arg_bins = run_args.user_bin
        for x in arg_bins:
            arg_bins[x] = ontr.Ontology_package(x, arg_bins[x])
        term_cache[''] += '\t'*len(arg_bins)
        output_fields += list(arg_bins.keys())
    else:
        arg_bins = {}

    OUT_file = open(run_args.output, 'w') if run_args.output else sys.stdout
    if OUT_file is sys.stdout:
        OUT_file.write('\n')
    OUT_file.write('\t'.join(output_fields))

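    # Pick the delimiter from the input extension and skip the header row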
    IN_file = open(run_args.input, 'r')
    if run_args.input[-4:] == '.csv':
        fr_reader = csv.reader(IN_file, delimiter=',')
    elif run_args.input[-4:] == '.tsv':
        fr_reader = csv.reader(IN_file, delimiter='\t')
    next(fr_reader)

    # Connect to primary database
    conn = sqlite3.connect(ontol_db)
    c = conn.cursor()

    # Iterate over samples in input file
    for sample_row in fr_reader:
        sample_id = '\n' + sample_row[0].strip() + '\t' + ','.join(sample_row[1:])
        original_sample = ' '.join(sample_row[1:]).strip()
        cleaned_sample = ''
        cleaned_annotated = ''
        macro_status = 'No Match'
        matched_components = []
        synonym_match = []
        micro_status = []
        bin_class = {x:[] for x in arg_bins}
        ancestors = set()
        sample_conversion_status = {}
        synonym_map = {}
        treated_sample = helpers.punctuation_treatment(original_sample)

        # Determine if sample in predefined list of null values
        if treated_sample in not_provided:
            write_line = '\t' + treated_sample + '\tNot annotated\t\t' + macro_status
            OUT_file.write(sample_id + write_line)
            if run_args.full:
                OUT_file.write('\t' + str(micro_status) + '\t' + str(sample_conversion_status))
            if run_args.bin:
                OUT_file.write('\t'*len(bin_class))
            continue

        # Remove negated words and some compound words, apply corrections
        proc_sample = helpers.further_cleanup(treated_sample)
        proc_sample, micro_status = helpers.process_sample(proc_sample, lookup_table, micro_status)

        # Try finding processed sample in cache
        try:
            OUT_file.write(sample_id + term_cache[proc_sample])
            continue
        except KeyError:
            pass

        # Attempt full term matches with and without suffixes
        if OUT_file is not sys.stdout:
            print('\tMatching ' + sample_row[0].strip() + ' ' +
                  '{:<40}'.format(proc_sample[:40]).ljust(60), end='\r')

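        # helpers.map_term returns a list of {'term', 'id', 'status'} dicts; the trailing True
        # retries the lookup with suffix handling, attempted only when FOODON is of interest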
        full_term_match = helpers.map_term(treated_sample, lookup_table, c)
        if full_term_match == []:
            full_term_match = helpers.map_term(proc_sample, lookup_table, c)
            if full_term_match != []:
                micro_status.insert(0, 'Used Processed Sample')
        if full_term_match == [] and 'FOODON' in ontol_interest:
            full_term_match = helpers.map_term(proc_sample, lookup_table, c, True)
            if full_term_match != []:
                micro_status.insert(0, 'Used Processed Sample')

        # Attempt full term match with cleaned sample using suffixes
        if full_term_match == []:
            for sw_token in word_tokenize(proc_sample):
                if helpers.is_date(sw_token) or helpers.is_number(sw_token):
                    continue
                lemma, micro_status = helpers.singularize_token(sw_token, lookup_table,
                                                                micro_status, c)
                if not sw_token == lemma:
                    sample_conversion_status[sw_token] = lemma
                cleaned_sample = helpers.get_cleaned_sample(cleaned_sample, lemma, lookup_table)
                # Not de-duplicating tokens because can't account for all legitimate double names

            full_term_match = helpers.map_term(cleaned_sample, lookup_table, c)
            if full_term_match == [] and 'FOODON' in ontol_interest:
                full_term_match = helpers.map_term(cleaned_sample, lookup_table, c, True)
            if full_term_match != []:
                micro_status.insert(0, 'Used Cleaned Sample')

        # Combine the matched terms
        if full_term_match != []:
            for x in full_term_match:
                matched_components.append(x['term'] + ':' + x['id'])
                macro_status = 'Full Term Match'
                micro_status += x['status']

        # Try matching permutations if full term match fails
        # Functions mostly retained from v 0.7
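        # Work down from 5-token chunks to single tokens, trying every permutation of each chunk
        # and skipping chunks whose tokens were already covered by an earlier component match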
        if macro_status == 'No Match':
            covered_tokens = set()
            for i in range(5, 0, -1):
                for gram_chunk in helpers.get_gram_chunks(cleaned_sample, i):
                    concat_gram_chunk = ' '.join(gram_chunk)
                    gram_permutations =\
                        list(OrderedDict.fromkeys(permutations(concat_gram_chunk.split())))
                    if set(gram_chunk) <= covered_tokens:
                        continue
                    for gram_permutation in gram_permutations:
                        gram_permutation_str = ' '.join(gram_permutation)
                        component_match = helpers.map_term(gram_permutation_str, lookup_table, c)
                        if not component_match and 'FOODON' in ontol_interest:
                            component_match = helpers.map_term(gram_permutation_str,
                                                               lookup_table, c, True)
                        if component_match:
                            for x in component_match:
                                matched_components.append(x['term'] + ':' + x['id'])
                                macro_status = 'Component Match'
                                micro_status += x['status']
                            covered_tokens.update(gram_chunk)

        # Try matching annotated synonyms if component match fails
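        # Annotate the cleaned tokens with their synonyms, then attempt to map each synonym
        # combination produced from the annotated sample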
        if macro_status == 'No Match':
            for clean_token in cleaned_sample.split():
                cleaned_annotated, s_m = helpers.get_annotated_sample(cleaned_annotated, clean_token)
                synonym_map.update(s_m)
            cleaned_annotated = helpers.annotation_reduce(cleaned_annotated, synonym_map)

            for x in helpers.get_annotated_synonyms(cleaned_annotated):
                synonym_match.extend(helpers.map_term(x, lookup_table, c))
            if synonym_match == [] and 'FOODON' in ontol_interest:
                for x in helpers.get_annotated_synonyms(cleaned_annotated):
                    synonym_match.extend(helpers.map_term(x, lookup_table, c, True))
            if synonym_match != []:
                macro_status = 'Synonym Match'
                for x in synonym_match:
                    matched_components.append(x['term'] + ':' + x['id'])
                    micro_status += x['status']

        # Remove matches that are ancestral to other matches
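        # Gather ancestor accessions for every matched term, then keep only the matches whose
        # accession is not an ancestor of another match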
        if run_args.no_ancestors:
            for match_term in matched_components:
                match_term = match_term.replace('NCBITAXON', 'NCBITaxon')
                ontol_acc = ontr.Ontology_accession.make_instance(match_term)
                ontol_anc = ontol_acc.get_family('ancestors')
                try:
                    ontol_anc.remove('none found')
                except ValueError:
                    pass
                ancestors |= set([x.id for x in ontol_anc])

            final_matches = []
            for match_term in matched_components:
                if match_term.split(':')[-1].replace('NCBITAXON', 'NCBITaxon') not in ancestors:
                    final_matches.append(match_term)

        # Bin matches
        for x in arg_bins:
            for y in matched_components:
                ontol_y = ontr.Ontology_accession.make_instance(y)
                bin_class[x].extend(ontol_y.bin_term(arg_bins[x]))

        # Write to output
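        # Collapse repeated spaces, cache the finished line under the processed sample, and
        # append the extra columns requested by run_args.full and run_args.bin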
        if cleaned_annotated == '':
            cleaned_annotated = 'Not annotated'
        write_line = '\t' + cleaned_sample + '\t' + cleaned_annotated +\
                     '\t' + '|'.join(sorted(set(matched_components))) + '\t' + macro_status
        while re.search('  ', write_line):
            write_line = write_line.replace('  ', ' ')
        term_cache[proc_sample] = write_line
        OUT_file.write(sample_id + write_line)
        if run_args.full:
            OUT_file.write('\t' + str(micro_status) + '\t' + str(sample_conversion_status))
            term_cache[proc_sample] += '\t'+str(micro_status)+'\t'+str(sample_conversion_status)
        if run_args.bin:
            for x in list(bin_class):
                OUT_file.write('\t' + '|'.join(sorted(set(bin_class[x]))).replace('[]',''))
                term_cache[proc_sample] += '\t' + '|'.join(sorted(set(bin_class[x])))


    IN_file.close()
    conn.close()
    if OUT_file is not sys.stdout:
        OUT_file.close()
    else:
        OUT_file.write('\n\n')

    # Report results to log and generate graphs
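    # Summarize and optionally graph the written output file, or the in-memory cache when writing to stdout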
    t2 = datetime.datetime.now()
    print(f'\tDone! {t2-t1} passed'.ljust(60) + '\nReporting results...')
    if run_args.output:
        summarize.report_results(run_args.output, list(arg_bins.keys()))
        if run_args.graph == True:
            summarize.figure_folder()
            summarize.visualize_results(run_args.output, list(arg_bins.keys()))
    else:
        match_counts = summarize.report_cache(term_cache)
        if run_args.graph == True:
            summarize.figure_folder()
            summarize.visualize_cache(match_counts)

    print('\t'+f'Done! {datetime.datetime.now()-t2} passed'.ljust(60)+'\n')