comparison lexmapr/pipeline.py @ 0:f5c39d0447be

"planemo upload"
author kkonganti
date Wed, 31 Aug 2022 14:32:07 -0400
1 """Pipeline script"""
2
3 import csv, datetime, logging, re, sqlite3, sys
4 import lexmapr.create_databases as cdkp
5 import lexmapr.ontology_reasoner as ontr
6 import lexmapr.pipeline_helpers as helpers
7 import lexmapr.pipeline_resources as pipeline_resources
8 import lexmapr.run_summary as summarize
9 from itertools import permutations
10 from collections import OrderedDict
11 from nltk.tokenize import word_tokenize
12 from lexmapr.definitions import not_provided, arg_bins, ontol_db, ontol_interest
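# not_provided, arg_bins, ontol_db and ontol_interest are module-level defaults from
# lexmapr.definitions; run() may override arg_bins and ontol_interest based on the CLI arguments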


# TODO: deal with ambiguous terms: nut milk, dairy (building ENVO_00003862 v dairy food product)
# TODO: combine ScorLex.csv and misspellings.csv and edit _add_predefined_resources function
# TODO: decide what to do with same synonym, ex custom-customize vs custom-tradition
# TODO: make web on database instead of pulling relationships from API?
# TODO: remove synonyms over time from SynLex and resource_synonyms.csv; edit get_synonyms

def run(run_args):
    '''Main text processing and mapping pipeline'''

    # Add information from EMBL and predefined_resources folder
    t0 = datetime.datetime.now()
    global ontol_interest
    if run_args.embl_ontol:
        ontol_interest = run_args.embl_ontol

    print('\nBuilding databases...')
    cdkp.get_synonyms(run_args.remake_cache, ontol_interest)
    cdkp.get_resource_ids(run_args.remake_cache, ontol_interest)
    lookup_table = pipeline_resources.get_predefined_resources()
    t1 = datetime.datetime.now()
    print(f'\tDone! {t1-t0} passed'.ljust(60) + '\nMapping terms...')
    logging.info(f'Database build/confirm: {t1}')

    # Apply other arguments and initiate mapping cache
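    # term_cache maps each processed sample string to its finished output columns so duplicate
    # samples are written without re-mapping; the empty-string key covers blank sample descriptions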
    term_cache = {'': '\t\t\t\t'}
    output_fields = ['Sample_Id',
                     'Sample_Desc',
                     'Processed_Sample',
                     'Annotated_Sample',
                     'Matched_Components']
    if run_args.full:
        output_fields += ['Match_Status (Macro Level)',
                          'Match_Status (Micro Level)',
                          'Sample_Transformations']
        term_cache[''] += '\t\t'
    else:
        output_fields += ['Match_Status (Macro Level)']

    if run_args.bin:
        global arg_bins
        if run_args.user_bin is not None:
            arg_bins = run_args.user_bin
        for x in arg_bins:
            arg_bins[x] = ontr.Ontology_package(x, arg_bins[x])
        term_cache[''] += '\t'*len(arg_bins)
        output_fields += list(arg_bins.keys())
    else:
        arg_bins = {}

    OUT_file = open(run_args.output, 'w') if run_args.output else sys.stdout
    if OUT_file is sys.stdout:
        OUT_file.write('\n')
    OUT_file.write('\t'.join(output_fields))

    IN_file = open(run_args.input, 'r')
    if run_args.input[-4:] == '.csv':
        fr_reader = csv.reader(IN_file, delimiter=',')
    elif run_args.input[-4:] == '.tsv':
        fr_reader = csv.reader(IN_file, delimiter='\t')
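    # Skip the header row; inputs are expected to end in .csv or .tsv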
    next(fr_reader)

    # Connect to primary database
    conn = sqlite3.connect(ontol_db)
    c = conn.cursor()
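    # The cursor is handed to the mapping helpers for SQLite term lookups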

    # Iterate over samples in input file
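    # For each sample, macro_status records the overall match category (full term, component,
    # synonym or no match) and micro_status accumulates the individual transformations applied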
    for sample_row in fr_reader:
        sample_id = '\n' + sample_row[0].strip() + '\t' + ','.join(sample_row[1:])
        original_sample = ' '.join(sample_row[1:]).strip()
        cleaned_sample = ''
        cleaned_annotated = ''
        macro_status = 'No Match'
        matched_components = []
        synonym_match = []
        micro_status = []
        bin_class = {x: [] for x in arg_bins}
        ancestors = set()
        sample_conversion_status = {}
        synonym_map = {}
        treated_sample = helpers.punctuation_treatment(original_sample)

        # Determine if sample in predefined list of null values
        if treated_sample in not_provided:
            write_line = '\t' + treated_sample + '\tNot annotated\t\t' + macro_status
            OUT_file.write(sample_id + write_line)
            if run_args.full:
                OUT_file.write('\t' + str(micro_status) + '\t' + str(sample_conversion_status))
            if run_args.bin:
                OUT_file.write('\t'*len(bin_class))
            continue

        # Remove negated words and some compound words, apply corrections
        proc_sample = helpers.further_cleanup(treated_sample)
        proc_sample, micro_status = helpers.process_sample(proc_sample, lookup_table, micro_status)

        # Try finding processed sample in cache
        try:
            OUT_file.write(sample_id + term_cache[proc_sample])
            continue
        except KeyError:
            pass

        # Attempt full term matches with and without suffixes
        if OUT_file is not sys.stdout:
            print('\tMatching ' + sample_row[0].strip() + ' ' +
                  '{:<40}'.format(proc_sample[:40]).ljust(60), end='\r')

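        # Matching cascade: try the punctuation-treated sample first, then the processed sample;
        # the final positional flag passed to map_term below appears to enable suffix-based
        # matching and is only used when FOODON is among the ontologies of interest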
        full_term_match = helpers.map_term(treated_sample, lookup_table, c)
        if full_term_match == []:
            full_term_match = helpers.map_term(proc_sample, lookup_table, c)
            if full_term_match != []:
                micro_status.insert(0, 'Used Processed Sample')
        if full_term_match == [] and 'FOODON' in ontol_interest:
            full_term_match = helpers.map_term(proc_sample, lookup_table, c, True)
            if full_term_match != []:
                micro_status.insert(0, 'Used Processed Sample')

        # Attempt full term match with cleaned sample using suffixes
        if full_term_match == []:
            for sw_token in word_tokenize(proc_sample):
                if helpers.is_date(sw_token) or helpers.is_number(sw_token):
                    continue
                lemma, micro_status = helpers.singularize_token(sw_token, lookup_table,
                                                                micro_status, c)
                if sw_token != lemma:
                    sample_conversion_status[sw_token] = lemma
                cleaned_sample = helpers.get_cleaned_sample(cleaned_sample, lemma, lookup_table)
                # Not de-duplicating tokens because can't account for all legitimate double names

            full_term_match = helpers.map_term(cleaned_sample, lookup_table, c)
            if full_term_match == [] and 'FOODON' in ontol_interest:
                full_term_match = helpers.map_term(cleaned_sample, lookup_table, c, True)
            if full_term_match != []:
                micro_status.insert(0, 'Used Cleaned Sample')

        # Combine the matched terms
        if full_term_match != []:
            for x in full_term_match:
                matched_components.append(x['term'] + ':' + x['id'])
                macro_status = 'Full Term Match'
                micro_status += x['status']

        # Try matching permutations if full term match fails
        # Functions mostly retained from v 0.7
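        # e.g. for a cleaned sample like 'breast chicken raw', chunks of 5 down to 1 tokens are
        # permuted ('raw chicken breast', 'chicken breast', ...) and each permutation is looked up
        # until every token is covered by some match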
        if macro_status == 'No Match':
            covered_tokens = set()
            for i in range(5, 0, -1):
                for gram_chunk in helpers.get_gram_chunks(cleaned_sample, i):
                    concat_gram_chunk = ' '.join(gram_chunk)
                    gram_permutations = \
                        list(OrderedDict.fromkeys(permutations(concat_gram_chunk.split())))
                    if set(gram_chunk) <= covered_tokens:
                        continue
                    for gram_permutation in gram_permutations:
                        gram_permutation_str = ' '.join(gram_permutation)
                        component_match = helpers.map_term(gram_permutation_str, lookup_table, c)
                        if not component_match and 'FOODON' in ontol_interest:
                            component_match = helpers.map_term(gram_permutation_str,
                                                               lookup_table, c, True)
                        if component_match:
                            for x in component_match:
                                matched_components.append(x['term'] + ':' + x['id'])
                                macro_status = 'Component Match'
                                micro_status += x['status']
                            covered_tokens.update(gram_chunk)

        # Try matching annotated synonyms if component match fails
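        # Each cleaned token is annotated with its known synonyms, and every synonym expansion
        # of the annotated sample is then run through the same mapping lookups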
        if macro_status == 'No Match':
            for clean_token in cleaned_sample.split():
                cleaned_annotated, s_m = helpers.get_annotated_sample(cleaned_annotated, clean_token)
                synonym_map.update(s_m)
            cleaned_annotated = helpers.annotation_reduce(cleaned_annotated, synonym_map)

            for x in helpers.get_annotated_synonyms(cleaned_annotated):
                synonym_match.extend(helpers.map_term(x, lookup_table, c))
            if synonym_match == [] and 'FOODON' in ontol_interest:
                for x in helpers.get_annotated_synonyms(cleaned_annotated):
                    synonym_match.extend(helpers.map_term(x, lookup_table, c, True))
            if synonym_match != []:
                macro_status = 'Synonym Match'
            for x in synonym_match:
                matched_components.append(x['term'] + ':' + x['id'])
                micro_status += x['status']

        # Remove matches that are ancestral to other matches
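        # e.g. if both a specific product and one of its parent terms matched, keep only the
        # more specific term when run_args.no_ancestors is set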
        if run_args.no_ancestors:
            for match_term in matched_components:
                match_term = match_term.replace('NCBITAXON', 'NCBITaxon')
                ontol_acc = ontr.Ontology_accession.make_instance(match_term)
                ontol_anc = ontol_acc.get_family('ancestors')
                try:
                    ontol_anc.remove('none found')
                except ValueError:
                    pass
                ancestors |= set([x.id for x in ontol_anc])

            final_matches = []
            for match_term in matched_components:
                if match_term.split(':')[-1].replace('NCBITAXON', 'NCBITaxon') not in ancestors:
                    final_matches.append(match_term)
            # Keep only the non-ancestral matches for binning and output
            matched_components = final_matches

        # Bin matches
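        # Classify each matched accession into the user-requested ontology bins via
        # Ontology_accession.bin_term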
        for x in arg_bins:
            for y in matched_components:
                ontol_y = ontr.Ontology_accession.make_instance(y)
                bin_class[x].extend(ontol_y.bin_term(arg_bins[x]))

        # Write to output
        if cleaned_annotated == '':
            cleaned_annotated = 'Not annotated'
        write_line = '\t' + cleaned_sample + '\t' + cleaned_annotated + \
                     '\t' + '|'.join(sorted(set(matched_components))) + '\t' + macro_status
        # Collapse repeated spaces left over from token processing
        while re.search('  ', write_line):
            write_line = write_line.replace('  ', ' ')
        term_cache[proc_sample] = write_line
        OUT_file.write(sample_id + write_line)
        if run_args.full:
            OUT_file.write('\t' + str(micro_status) + '\t' + str(sample_conversion_status))
            term_cache[proc_sample] += '\t' + str(micro_status) + '\t' + str(sample_conversion_status)
        if run_args.bin:
            for x in list(bin_class):
                OUT_file.write('\t' + '|'.join(sorted(set(bin_class[x]))).replace('[]', ''))
                term_cache[proc_sample] += '\t' + '|'.join(sorted(set(bin_class[x])))


    IN_file.close()
    conn.close()
    if OUT_file is not sys.stdout:
        OUT_file.close()
    else:
        OUT_file.write('\n\n')

    # Report results to log and generate graphs
    t2 = datetime.datetime.now()
    print(f'\tDone! {t2-t1} passed'.ljust(60) + '\nReporting results...')
    if run_args.output:
        summarize.report_results(run_args.output, list(arg_bins.keys()))
        if run_args.graph:
            summarize.figure_folder()
            summarize.visualize_results(run_args.output, list(arg_bins.keys()))
    else:
        match_counts = summarize.report_cache(term_cache)
        if run_args.graph:
            summarize.figure_folder()
            summarize.visualize_cache(match_counts)

    print('\t' + f'Done! {datetime.datetime.now()-t2} passed'.ljust(60) + '\n')