"""Pipeline script"""

import csv, datetime, logging, re, sqlite3, sys
import lexmapr.create_databases as cdkp
import lexmapr.ontology_reasoner as ontr
import lexmapr.pipeline_helpers as helpers
import lexmapr.pipeline_resources as pipeline_resources
import lexmapr.run_summary as summarize
from itertools import permutations
from collections import OrderedDict
from nltk.tokenize import word_tokenize
from lexmapr.definitions import not_provided, arg_bins, ontol_db, ontol_interest


# TODO: deal with ambiguous terms: nut milk, dairy (building ENVO_00003862 v dairy food product)
# TODO: combine ScorLex.csv and misspellings.csv and edit _add_predefined_resources function
# TODO: decide what to do with same synonym, ex custom-customize vs custom-tradition
# TODO: make web on database instead of pulling relationships from API?
# TODO: remove synonyms over time from SynLex and resource_synonyms.csv; edit get_synonyms

def run(run_args):
    '''Main text processing and mapping pipeline'''

    # Add information from EMBL and predefined_resources folder
    t0 = datetime.datetime.now()
    global ontol_interest
    if run_args.embl_ontol:
        ontol_interest = run_args.embl_ontol

    print('\nBuilding databases...')
    cdkp.get_synonyms(run_args.remake_cache, ontol_interest)
    cdkp.get_resource_ids(run_args.remake_cache, ontol_interest)
    lookup_table = pipeline_resources.get_predefined_resources()
    t1 = datetime.datetime.now()
    print(f'\tDone! {t1-t0} passed'.ljust(60) + '\nMapping terms...')
    logging.info(f'Database build/confirm: {t1}')

    # Apply other arguments and initiate mapping cache
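    # (the empty-string key holds the blank output columns written when a sample cleans down to nothing)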
    term_cache = {'': '\t\t\t\t'}
    output_fields = ['Sample_Id',
                     'Sample_Desc',
                     'Processed_Sample',
                     'Annotated_Sample',
                     'Matched_Components']
    if run_args.full:
        output_fields += ['Match_Status (Macro Level)',
                          'Match_Status (Micro Level)',
                          'Sample_Transformations']
        term_cache[''] += '\t\t'
    else:
        output_fields += ['Match_Status (Macro Level)']

    if run_args.bin:
        global arg_bins
        if run_args.user_bin is not None:
            arg_bins = run_args.user_bin
        for x in arg_bins:
            arg_bins[x] = ontr.Ontology_package(x, arg_bins[x])
        term_cache[''] += '\t'*len(arg_bins)
        output_fields += list(arg_bins.keys())
    else:
        arg_bins = {}

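    # Output goes to the requested file, or to stdout when no output path was given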
    OUT_file = open(run_args.output, 'w') if run_args.output else sys.stdout
    if OUT_file is sys.stdout:
        OUT_file.write('\n')
    OUT_file.write('\t'.join(output_fields))

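    # Read the input file; the delimiter follows the file extension and the header row is skipped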
    IN_file = open(run_args.input, 'r')
    if run_args.input.endswith('.csv'):
        fr_reader = csv.reader(IN_file, delimiter=',')
    elif run_args.input.endswith('.tsv'):
        fr_reader = csv.reader(IN_file, delimiter='\t')
    else:
        sys.exit('\nInput file must be a .csv or .tsv file')
    next(fr_reader)

    # Connect to primary database
    conn = sqlite3.connect(ontol_db)
    c = conn.cursor()

    # Iterate over samples in input file
    for sample_row in fr_reader:
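        # Reset per-sample state: cleaned/annotated text, match statuses, matched terms and bin assignments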
        sample_id = '\n' + sample_row[0].strip() + '\t' + ','.join(sample_row[1:])
        original_sample = ' '.join(sample_row[1:]).strip()
        cleaned_sample = ''
        cleaned_annotated = ''
        macro_status = 'No Match'
        matched_components = []
        synonym_match = []
        micro_status = []
        bin_class = {x: [] for x in arg_bins}
        ancestors = set()
        sample_conversion_status = {}
        synonym_map = {}
        treated_sample = helpers.punctuation_treatment(original_sample)

        # Determine if sample in predefined list of null values
        if treated_sample in not_provided:
            write_line = '\t' + treated_sample + '\tNot annotated\t\t' + macro_status
            OUT_file.write(sample_id + write_line)
            if run_args.full:
                OUT_file.write('\t' + str(micro_status) + '\t' + str(sample_conversion_status))
            if run_args.bin:
                OUT_file.write('\t'*len(bin_class))
            continue

        # Remove negated words and some compound words, apply corrections
        proc_sample = helpers.further_cleanup(treated_sample)
        proc_sample, micro_status = helpers.process_sample(proc_sample, lookup_table, micro_status)

        # Try finding processed sample in cache
        try:
            OUT_file.write(sample_id + term_cache[proc_sample])
            continue
        except KeyError:
            pass

        # Attempt full term matches with and without suffixes
        if OUT_file is not sys.stdout:
            print('\tMatching ' + sample_row[0].strip() + ' ' +
                  '{:<40}'.format(proc_sample[:40]).ljust(60), end='\r')

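        # Map the punctuation-treated sample first, then fall back to the processed sample;
        # the final True argument appears to enable the suffix matching noted above and is only
        # tried when FOODON is an ontology of interest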
        full_term_match = helpers.map_term(treated_sample, lookup_table, c)
        if full_term_match == []:
            full_term_match = helpers.map_term(proc_sample, lookup_table, c)
            if full_term_match != []:
                micro_status.insert(0, 'Used Processed Sample')
        if full_term_match == [] and 'FOODON' in ontol_interest:
            full_term_match = helpers.map_term(proc_sample, lookup_table, c, True)
            if full_term_match != []:
                micro_status.insert(0, 'Used Processed Sample')

        # Attempt full term match with cleaned sample using suffixes
        if full_term_match == []:
            for sw_token in word_tokenize(proc_sample):
                if helpers.is_date(sw_token) or helpers.is_number(sw_token):
                    continue
                lemma, micro_status = helpers.singularize_token(sw_token, lookup_table,
                                                                micro_status, c)
                if sw_token != lemma:
                    sample_conversion_status[sw_token] = lemma
                cleaned_sample = helpers.get_cleaned_sample(cleaned_sample, lemma, lookup_table)
                # Not de-duplicating tokens because can't account for all legitimate double names

            full_term_match = helpers.map_term(cleaned_sample, lookup_table, c)
            if full_term_match == [] and 'FOODON' in ontol_interest:
                full_term_match = helpers.map_term(cleaned_sample, lookup_table, c, True)
            if full_term_match != []:
                micro_status.insert(0, 'Used Cleaned Sample')

        # Combine the matched terms
        if full_term_match != []:
            for x in full_term_match:
                matched_components.append(x['term'] + ':' + x['id'])
                macro_status = 'Full Term Match'
                micro_status += x['status']

        # Try matching permutations if full term match fails
        # Functions mostly retained from v 0.7
        if macro_status == 'No Match':
            covered_tokens = set()
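            # Work from 5-grams down to single tokens, skipping chunks whose tokens were already
            # covered by an earlier (longer) match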
            for i in range(5, 0, -1):
                for gram_chunk in helpers.get_gram_chunks(cleaned_sample, i):
                    concat_gram_chunk = ' '.join(gram_chunk)
                    gram_permutations = \
                        list(OrderedDict.fromkeys(permutations(concat_gram_chunk.split())))
                    if set(gram_chunk) <= covered_tokens:
                        continue
                    for gram_permutation in gram_permutations:
                        gram_permutation_str = ' '.join(gram_permutation)
                        component_match = helpers.map_term(gram_permutation_str, lookup_table, c)
                        if not component_match and 'FOODON' in ontol_interest:
                            component_match = helpers.map_term(gram_permutation_str,
                                                               lookup_table, c, True)
                        if component_match:
                            for x in component_match:
                                matched_components.append(x['term'] + ':' + x['id'])
                                macro_status = 'Component Match'
                                micro_status += x['status']
                            covered_tokens.update(gram_chunk)

        # Try matching annotated synonyms if component match fails
        if macro_status == 'No Match':
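            # Build an annotated version of the cleaned sample token by token, reduce the
            # annotations, then try mapping each annotated synonym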
            for clean_token in cleaned_sample.split():
                cleaned_annotated, s_m = helpers.get_annotated_sample(cleaned_annotated, clean_token)
                synonym_map.update(s_m)
            cleaned_annotated = helpers.annotation_reduce(cleaned_annotated, synonym_map)

            for x in helpers.get_annotated_synonyms(cleaned_annotated):
                synonym_match.extend(helpers.map_term(x, lookup_table, c))
            if synonym_match == [] and 'FOODON' in ontol_interest:
                for x in helpers.get_annotated_synonyms(cleaned_annotated):
                    synonym_match.extend(helpers.map_term(x, lookup_table, c, True))
            if synonym_match != []:
                macro_status = 'Synonym Match'
                for x in synonym_match:
                    matched_components.append(x['term'] + ':' + x['id'])
                    micro_status += x['status']

        # Remove matches that are ancestral to other matches
        if run_args.no_ancestors:
            for match_term in matched_components:
                match_term = match_term.replace('NCBITAXON', 'NCBITaxon')
                ontol_acc = ontr.Ontology_accession.make_instance(match_term)
                ontol_anc = ontol_acc.get_family('ancestors')
                try:
                    ontol_anc.remove('none found')
                except ValueError:
                    pass
                ancestors |= {x.id for x in ontol_anc}

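        # Drop any match whose accession is an ancestor of another matched term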
        final_matches = []
        for match_term in matched_components:
            if match_term.split(':')[-1].replace('NCBITAXON', 'NCBITaxon') not in ancestors:
                final_matches.append(match_term)

        # Bin matches
        for x in arg_bins:
            for y in matched_components:
                ontol_y = ontr.Ontology_accession.make_instance(y)
                bin_class[x].extend(ontol_y.bin_term(arg_bins[x]))

        # Write to output
        if cleaned_annotated == '':
            cleaned_annotated = 'Not annotated'
        write_line = '\t' + cleaned_sample + '\t' + cleaned_annotated + \
                     '\t' + '|'.join(sorted(set(final_matches))) + '\t' + macro_status
        while re.search('  ', write_line):
            write_line = write_line.replace('  ', ' ')
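        # Cache the formatted columns so identical processed samples are written without re-matching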
        term_cache[proc_sample] = write_line
        OUT_file.write(sample_id + write_line)
        if run_args.full:
            OUT_file.write('\t' + str(micro_status) + '\t' + str(sample_conversion_status))
            term_cache[proc_sample] += '\t' + str(micro_status) + '\t' + str(sample_conversion_status)
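        # One output column per bin, with any matched bin terms joined by '|'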
        if run_args.bin:
            for x in list(bin_class):
                OUT_file.write('\t' + '|'.join(sorted(set(bin_class[x]))).replace('[]', ''))
                term_cache[proc_sample] += '\t' + '|'.join(sorted(set(bin_class[x])))


    IN_file.close()
    conn.close()
    if OUT_file is not sys.stdout:
        OUT_file.close()
    else:
        OUT_file.write('\n\n')

    # Report results to log and generate graphs
    t2 = datetime.datetime.now()
    print(f'\tDone! {t2-t1} passed'.ljust(60) + '\nReporting results...')
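    # Summaries are generated from the output file when one was written; otherwise from the in-memory cache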
    if run_args.output:
        summarize.report_results(run_args.output, list(arg_bins.keys()))
        if run_args.graph:
            summarize.figure_folder()
            summarize.visualize_results(run_args.output, list(arg_bins.keys()))
    else:
        match_counts = summarize.report_cache(term_cache)
        if run_args.graph:
            summarize.figure_folder()
            summarize.visualize_cache(match_counts)

    print('\t' + f'Done! {datetime.datetime.now()-t2} passed'.ljust(60) + '\n')