annotate lexmapr/pipeline.py @ 0:f5c39d0447be

"planemo upload"
author kkonganti
date Wed, 31 Aug 2022 14:32:07 -0400
kkonganti@0 1 """Pipeline script"""
kkonganti@0 2
kkonganti@0 3 import csv, datetime, logging, re, sqlite3, sys
kkonganti@0 4 import lexmapr.create_databases as cdkp
kkonganti@0 5 import lexmapr.ontology_reasoner as ontr
kkonganti@0 6 import lexmapr.pipeline_helpers as helpers
kkonganti@0 7 import lexmapr.pipeline_resources as pipeline_resources
kkonganti@0 8 import lexmapr.run_summary as summarize
kkonganti@0 9 from itertools import permutations
kkonganti@0 10 from collections import OrderedDict
kkonganti@0 11 from nltk.tokenize import word_tokenize
kkonganti@0 12 from lexmapr.definitions import not_provided, arg_bins, ontol_db, ontol_interest
kkonganti@0 13
kkonganti@0 14
kkonganti@0 15 # TODO: deal with ambiguous terms: nut milk, dairy (building ENVO_00003862 v dairy food product)
kkonganti@0 16 # TODO: combine ScorLex.csv and misspellings.csv and edit _add_predefined_resources function
kkonganti@0 17 # TODO: decide what to do with same synonym, ex custom-customize vs custom-tradition
kkonganti@0 18 # TODO: make web on database instead of pulling relationships from API?
kkonganti@0 19 # TODO: remove synonyms over time from SynLex and resource_synonyms.csv; edit get_synonyms
kkonganti@0 20
def run(run_args):
    '''Main text processing and mapping pipeline'''

    # Add information from EMBL and predefined_resources folder
    t0 = datetime.datetime.now()
    global ontol_interest
    if run_args.embl_ontol:
        ontol_interest = run_args.embl_ontol

    print('\nBuilding databases...')
    cdkp.get_synonyms(run_args.remake_cache, ontol_interest)
    cdkp.get_resource_ids(run_args.remake_cache, ontol_interest)
    lookup_table = pipeline_resources.get_predefined_resources()
    t1 = datetime.datetime.now()
    print(f'\tDone! {t1-t0} passed'.ljust(60) + '\nMapping terms...')
    logging.info(f'Database build/confirm: {t1}')

    # Apply other arguments and initiate mapping cache
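    # term_cache maps an already-processed sample string to its finished output columns;
    # the empty-string entry supplies blank columns for samples that reduce to nothing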
    term_cache = {'': '\t\t\t\t'}
    output_fields = ['Sample_Id',
                     'Sample_Desc',
                     'Processed_Sample',
                     'Annotated_Sample',
                     'Matched_Components']
    if run_args.full:
        output_fields += ['Match_Status (Macro Level)',
                          'Match_Status (Micro Level)',
                          'Sample_Transformations']
        term_cache[''] += '\t\t'
    else:
        output_fields += ['Match_Status (Macro Level)']

    if run_args.bin:
        global arg_bins
        if run_args.user_bin is not None:
            arg_bins = run_args.user_bin
        for x in arg_bins:
            arg_bins[x] = ontr.Ontology_package(x, arg_bins[x])
        term_cache[''] += '\t'*len(arg_bins)
        output_fields += list(arg_bins.keys())
    else:
        arg_bins = {}

    OUT_file = open(run_args.output, 'w') if run_args.output else sys.stdout
    if OUT_file is sys.stdout:
        OUT_file.write('\n')
    OUT_file.write('\t'.join(output_fields))

    IN_file = open(run_args.input, 'r')
    if run_args.input[-4:] == '.csv':
        fr_reader = csv.reader(IN_file, delimiter=',')
    elif run_args.input[-4:] == '.tsv':
        fr_reader = csv.reader(IN_file, delimiter='\t')
    else:
        raise ValueError('Input file must be a .csv or .tsv file')
    next(fr_reader)

    # Connect to primary database
    conn = sqlite3.connect(ontol_db)
    c = conn.cursor()

    # Iterate over samples in input file
    for sample_row in fr_reader:
        sample_id = '\n' + sample_row[0].strip() + '\t' + ','.join(sample_row[1:])
        original_sample = ' '.join(sample_row[1:]).strip()
        cleaned_sample = ''
        cleaned_annotated = ''
        macro_status = 'No Match'
        matched_components = []
        synonym_match = []
        micro_status = []
        bin_class = {x: [] for x in arg_bins}
        ancestors = set()
        sample_conversion_status = {}
        synonym_map = {}
        treated_sample = helpers.punctuation_treatment(original_sample)

        # Determine if sample in predefined list of null values
        if treated_sample in not_provided:
            write_line = '\t' + treated_sample + '\tNot annotated\t\t' + macro_status
            OUT_file.write(sample_id + write_line)
            if run_args.full:
                OUT_file.write('\t' + str(micro_status) + '\t' + str(sample_conversion_status))
            if run_args.bin:
                OUT_file.write('\t'*len(bin_class))
            continue

        # Remove negated words and some compound words, apply corrections
        proc_sample = helpers.further_cleanup(treated_sample)
        proc_sample, micro_status = helpers.process_sample(proc_sample, lookup_table, micro_status)

        # Try finding processed sample in cache
        try:
            OUT_file.write(sample_id + term_cache[proc_sample])
            continue
        except KeyError:
            pass
        # Attempt full term matches with and without suffixes
        if OUT_file is not sys.stdout:
            print('\tMatching ' + sample_row[0].strip() + ' ' +
                  '{:<40}'.format(proc_sample[:40]).ljust(60), end='\r')

        full_term_match = helpers.map_term(treated_sample, lookup_table, c)
        if full_term_match == []:
            full_term_match = helpers.map_term(proc_sample, lookup_table, c)
            if full_term_match != []:
                micro_status.insert(0, 'Used Processed Sample')
        if full_term_match == [] and 'FOODON' in ontol_interest:
            full_term_match = helpers.map_term(proc_sample, lookup_table, c, True)
            if full_term_match != []:
                micro_status.insert(0, 'Used Processed Sample')

        # Attempt full term match with cleaned sample using suffixes
        if full_term_match == []:
            for sw_token in word_tokenize(proc_sample):
                if helpers.is_date(sw_token) or helpers.is_number(sw_token):
                    continue
                lemma, micro_status = helpers.singularize_token(sw_token, lookup_table,
                                                                micro_status, c)
                if not sw_token == lemma:
                    sample_conversion_status[sw_token] = lemma
                cleaned_sample = helpers.get_cleaned_sample(cleaned_sample, lemma, lookup_table)
                # Not de-duplicating tokens because can't account for all legitimate double names

            full_term_match = helpers.map_term(cleaned_sample, lookup_table, c)
            if full_term_match == [] and 'FOODON' in ontol_interest:
                full_term_match = helpers.map_term(cleaned_sample, lookup_table, c, True)
            if full_term_match != []:
                micro_status.insert(0, 'Used Cleaned Sample')

        # Combine the matched terms
        if full_term_match != []:
            for x in full_term_match:
                matched_components.append(x['term'] + ':' + x['id'])
                macro_status = 'Full Term Match'
                micro_status += x['status']

        # Try matching permutations if full term match fails
        # Functions mostly retained from v 0.7
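        # Work from 5-token chunks down to single tokens; skip any chunk whose tokens
        # are all already covered by an earlier (longer) component match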
        if macro_status == 'No Match':
            covered_tokens = set()
            for i in range(5, 0, -1):
                for gram_chunk in helpers.get_gram_chunks(cleaned_sample, i):
                    concat_gram_chunk = ' '.join(gram_chunk)
                    gram_permutations = \
                        list(OrderedDict.fromkeys(permutations(concat_gram_chunk.split())))
                    if set(gram_chunk) <= covered_tokens:
                        continue
                    for gram_permutation in gram_permutations:
                        gram_permutation_str = ' '.join(gram_permutation)
                        component_match = helpers.map_term(gram_permutation_str, lookup_table, c)
                        if not component_match and 'FOODON' in ontol_interest:
                            component_match = helpers.map_term(gram_permutation_str,
                                                               lookup_table, c, True)
                        if component_match:
                            for x in component_match:
                                matched_components.append(x['term'] + ':' + x['id'])
                                macro_status = 'Component Match'
                                micro_status += x['status']
                            covered_tokens.update(gram_chunk)

        # Try matching annotated synonyms if component match fails
        if macro_status == 'No Match':
            for clean_token in cleaned_sample.split():
                cleaned_annotated, s_m = helpers.get_annotated_sample(cleaned_annotated, clean_token)
                synonym_map.update(s_m)
            cleaned_annotated = helpers.annotation_reduce(cleaned_annotated, synonym_map)

            for x in helpers.get_annotated_synonyms(cleaned_annotated):
                synonym_match.extend(helpers.map_term(x, lookup_table, c))
            if synonym_match == [] and 'FOODON' in ontol_interest:
                for x in helpers.get_annotated_synonyms(cleaned_annotated):
                    synonym_match.extend(helpers.map_term(x, lookup_table, c, True))
            if synonym_match != []:
                macro_status = 'Synonym Match'
            for x in synonym_match:
                matched_components.append(x['term'] + ':' + x['id'])
                micro_status += x['status']

        # Remove matches that are ancestral to other matches
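        # Gather every ancestor accession of the matched terms, then keep only the
        # matches whose own accession is not an ancestor of another match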
        if run_args.no_ancestors:
            for match_term in matched_components:
                match_term = match_term.replace('NCBITAXON', 'NCBITaxon')
                ontol_acc = ontr.Ontology_accession.make_instance(match_term)
                ontol_anc = ontol_acc.get_family('ancestors')
                try:
                    ontol_anc.remove('none found')
                except ValueError:
                    pass
                ancestors |= set([x.id for x in ontol_anc])

            final_matches = []
            for match_term in matched_components:
                if match_term.split(':')[-1].replace('NCBITAXON', 'NCBITaxon') not in ancestors:
                    final_matches.append(match_term)

        # Bin matches
        for x in arg_bins:
            for y in matched_components:
                ontol_y = ontr.Ontology_accession.make_instance(y)
                bin_class[x].extend(ontol_y.bin_term(arg_bins[x]))

        # Write to output
        if cleaned_annotated == '':
            cleaned_annotated = 'Not annotated'
        write_line = '\t' + cleaned_sample + '\t' + cleaned_annotated + \
                     '\t' + '|'.join(sorted(set(matched_components))) + '\t' + macro_status
        while re.search('  ', write_line):
            write_line = write_line.replace('  ', ' ')
        term_cache[proc_sample] = write_line
        OUT_file.write(sample_id + write_line)
        if run_args.full:
            OUT_file.write('\t' + str(micro_status) + '\t' + str(sample_conversion_status))
            term_cache[proc_sample] += '\t' + str(micro_status) + '\t' + str(sample_conversion_status)
        if run_args.bin:
            for x in list(bin_class):
                OUT_file.write('\t' + '|'.join(sorted(set(bin_class[x]))).replace('[]', ''))
                term_cache[proc_sample] += '\t' + '|'.join(sorted(set(bin_class[x])))


    IN_file.close()
    conn.close()
    if OUT_file is not sys.stdout:
        OUT_file.close()
    else:
        OUT_file.write('\n\n')

    # Report results to log and generate graphs
    t2 = datetime.datetime.now()
    print(f'\tDone! {t2-t1} passed'.ljust(60) + '\nReporting results...')
    if run_args.output:
        summarize.report_results(run_args.output, list(arg_bins.keys()))
        if run_args.graph == True:
            summarize.figure_folder()
            summarize.visualize_results(run_args.output, list(arg_bins.keys()))
    else:
        match_counts = summarize.report_cache(term_cache)
        if run_args.graph == True:
            summarize.figure_folder()
            summarize.visualize_cache(match_counts)

    print('\t' + f'Done! {datetime.datetime.now()-t2} passed'.ljust(60) + '\n')