annotate lexmapr/pipeline.py @ 0:f5c39d0447be

"planemo upload"
author kkonganti
date Wed, 31 Aug 2022 14:32:07 -0400
kkonganti@0 1 """Pipeline script"""
kkonganti@0 2
kkonganti@0 3 import csv, datetime, logging, re, sqlite3, sys
kkonganti@0 4 import lexmapr.create_databases as cdkp
kkonganti@0 5 import lexmapr.ontology_reasoner as ontr
kkonganti@0 6 import lexmapr.pipeline_helpers as helpers
kkonganti@0 7 import lexmapr.pipeline_resources as pipeline_resources
kkonganti@0 8 import lexmapr.run_summary as summarize
kkonganti@0 9 from itertools import permutations
kkonganti@0 10 from collections import OrderedDict
kkonganti@0 11 from nltk.tokenize import word_tokenize
kkonganti@0 12 from lexmapr.definitions import not_provided, arg_bins, ontol_db, ontol_interest
kkonganti@0 13
kkonganti@0 14
kkonganti@0 15 # TODO: deal with ambiguous terms: nut milk, dairy (building ENVO_00003862 v dairy food product)
kkonganti@0 16 # TODO: combine ScorLex.csv and misspellings.csv and edit _add_predefined_resources function
kkonganti@0 17 # TODO: decide what to do with same synonym, ex custom-customize vs custom-tradition
kkonganti@0 18 # TODO: make web on database instead of pulling relationships from API?
kkonganti@0 19 # TODO: remove synonyms over time from SynLex and resource_synonyms.csv; edit get_synonyms
kkonganti@0 20
def run(run_args):
    '''Main text processing and mapping pipeline'''

    # Add information from EMBL and predefined_resources folder
    t0 = datetime.datetime.now()
    global ontol_interest
    if run_args.embl_ontol:
        ontol_interest = run_args.embl_ontol

    print('\nBuilding databases...')
    cdkp.get_synonyms(run_args.remake_cache, ontol_interest)
    cdkp.get_resource_ids(run_args.remake_cache, ontol_interest)
    lookup_table = pipeline_resources.get_predefined_resources()
    t1 = datetime.datetime.now()
    print(f'\tDone! {t1-t0} passed'.ljust(60) + '\nMapping terms...')
    logging.info(f'Database build/confirm: {t1}')

    # Apply other arguments and initiate mapping cache
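    # term_cache maps an already-processed sample string to its finished output columns;
    # the empty-string entry supplies blank columns for samples that reduce to nothing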
    term_cache = {'': '\t\t\t\t'}
    output_fields = ['Sample_Id',
                     'Sample_Desc',
                     'Processed_Sample',
                     'Annotated_Sample',
                     'Matched_Components']
    if run_args.full:
        output_fields += ['Match_Status (Macro Level)',
                          'Match_Status (Micro Level)',
                          'Sample_Transformations']
        term_cache[''] += '\t\t'
    else:
        output_fields += ['Match_Status (Macro Level)']

    if run_args.bin:
        global arg_bins
        if run_args.user_bin is not None:
            arg_bins = run_args.user_bin
        for x in arg_bins:
            arg_bins[x] = ontr.Ontology_package(x, arg_bins[x])
        term_cache[''] += '\t'*len(arg_bins)
        output_fields += list(arg_bins.keys())
    else:
        arg_bins = {}

    OUT_file = open(run_args.output, 'w') if run_args.output else sys.stdout
    if OUT_file is sys.stdout:
        OUT_file.write('\n')
    OUT_file.write('\t'.join(output_fields))

    IN_file = open(run_args.input, 'r')
    if run_args.input[-4:] == '.csv':
        fr_reader = csv.reader(IN_file, delimiter=',')
    elif run_args.input[-4:] == '.tsv':
        fr_reader = csv.reader(IN_file, delimiter='\t')
    else:
        raise ValueError('Input file must be a .csv or .tsv file')
    next(fr_reader)

    # Connect to primary database
    conn = sqlite3.connect(ontol_db)
    c = conn.cursor()

    # Iterate over samples in input file
    for sample_row in fr_reader:
        sample_id = '\n' + sample_row[0].strip() + '\t' + ','.join(sample_row[1:])
        original_sample = ' '.join(sample_row[1:]).strip()
        cleaned_sample = ''
        cleaned_annotated = ''
        macro_status = 'No Match'
        matched_components = []
        synonym_match = []
        micro_status = []
        bin_class = {x: [] for x in arg_bins}
        ancestors = set()
        sample_conversion_status = {}
        synonym_map = {}
        treated_sample = helpers.punctuation_treatment(original_sample)

        # Determine if sample in predefined list of null values
        if treated_sample in not_provided:
            write_line = '\t' + treated_sample + '\tNot annotated\t\t' + macro_status
            OUT_file.write(sample_id + write_line)
            if run_args.full:
                OUT_file.write('\t' + str(micro_status) + '\t' + str(sample_conversion_status))
            if run_args.bin:
                OUT_file.write('\t'*len(bin_class))
            continue

        # Remove negated words and some compound words, apply corrections
        proc_sample = helpers.further_cleanup(treated_sample)
        proc_sample, micro_status = helpers.process_sample(proc_sample, lookup_table, micro_status)

        # Try finding processed sample in cache
        try:
            OUT_file.write(sample_id + term_cache[proc_sample])
            continue
        except KeyError:
            pass
        # Attempt full term matches with and without suffixes
        if OUT_file is not sys.stdout:
            print('\tMatching ' + sample_row[0].strip() + ' ' +
                  '{:<40}'.format(proc_sample[:40]).ljust(60), end='\r')

        full_term_match = helpers.map_term(treated_sample, lookup_table, c)
        if full_term_match == []:
            full_term_match = helpers.map_term(proc_sample, lookup_table, c)
            if full_term_match != []:
                micro_status.insert(0, 'Used Processed Sample')
        if full_term_match == [] and 'FOODON' in ontol_interest:
            full_term_match = helpers.map_term(proc_sample, lookup_table, c, True)
            if full_term_match != []:
                micro_status.insert(0, 'Used Processed Sample')

        # Attempt full term match with cleaned sample using suffixes
        if full_term_match == []:
            for sw_token in word_tokenize(proc_sample):
                if helpers.is_date(sw_token) or helpers.is_number(sw_token):
                    continue
                lemma, micro_status = helpers.singularize_token(sw_token, lookup_table,
                                                                micro_status, c)
                if not sw_token == lemma:
                    sample_conversion_status[sw_token] = lemma
                cleaned_sample = helpers.get_cleaned_sample(cleaned_sample, lemma, lookup_table)
                # Not de-duplicating tokens because can't account for all legitimate double names

            full_term_match = helpers.map_term(cleaned_sample, lookup_table, c)
            if full_term_match == [] and 'FOODON' in ontol_interest:
                full_term_match = helpers.map_term(cleaned_sample, lookup_table, c, True)
            if full_term_match != []:
                micro_status.insert(0, 'Used Cleaned Sample')

        # Combine the matched terms
        if full_term_match != []:
            for x in full_term_match:
                matched_components.append(x['term'] + ':' + x['id'])
                macro_status = 'Full Term Match'
                micro_status += x['status']

        # Try matching permutations if full term match fails
        # Functions mostly retained from v 0.7
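        # Work from 5-token chunks down to single tokens; skip any chunk whose tokens
        # are all already covered by an earlier (longer) component match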
        if macro_status == 'No Match':
            covered_tokens = set()
            for i in range(5, 0, -1):
                for gram_chunk in helpers.get_gram_chunks(cleaned_sample, i):
                    concat_gram_chunk = ' '.join(gram_chunk)
                    gram_permutations = \
                        list(OrderedDict.fromkeys(permutations(concat_gram_chunk.split())))
                    if set(gram_chunk) <= covered_tokens:
                        continue
                    for gram_permutation in gram_permutations:
                        gram_permutation_str = ' '.join(gram_permutation)
                        component_match = helpers.map_term(gram_permutation_str, lookup_table, c)
                        if not component_match and 'FOODON' in ontol_interest:
                            component_match = helpers.map_term(gram_permutation_str,
                                                               lookup_table, c, True)
                        if component_match:
                            for x in component_match:
                                matched_components.append(x['term'] + ':' + x['id'])
                                macro_status = 'Component Match'
                                micro_status += x['status']
                            covered_tokens.update(gram_chunk)

        # Try matching annotated synonyms if component match fails
        if macro_status == 'No Match':
            for clean_token in cleaned_sample.split():
                cleaned_annotated, s_m = helpers.get_annotated_sample(cleaned_annotated, clean_token)
                synonym_map.update(s_m)
            cleaned_annotated = helpers.annotation_reduce(cleaned_annotated, synonym_map)

            for x in helpers.get_annotated_synonyms(cleaned_annotated):
                synonym_match.extend(helpers.map_term(x, lookup_table, c))
            if synonym_match == [] and 'FOODON' in ontol_interest:
                for x in helpers.get_annotated_synonyms(cleaned_annotated):
                    synonym_match.extend(helpers.map_term(x, lookup_table, c, True))
            if synonym_match != []:
                macro_status = 'Synonym Match'
            for x in synonym_match:
                matched_components.append(x['term'] + ':' + x['id'])
                micro_status += x['status']

        # Remove matches that are ancestral to other matches
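        # Gather every ancestor accession of the matched terms, then keep only the
        # matches whose own accession is not an ancestor of another match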
        if run_args.no_ancestors:
            for match_term in matched_components:
                match_term = match_term.replace('NCBITAXON', 'NCBITaxon')
                ontol_acc = ontr.Ontology_accession.make_instance(match_term)
                ontol_anc = ontol_acc.get_family('ancestors')
                try:
                    ontol_anc.remove('none found')
                except ValueError:
                    pass
                ancestors |= set([x.id for x in ontol_anc])

            final_matches = []
            for match_term in matched_components:
                if match_term.split(':')[-1].replace('NCBITAXON', 'NCBITaxon') not in ancestors:
                    final_matches.append(match_term)

        # Bin matches
        for x in arg_bins:
            for y in matched_components:
                ontol_y = ontr.Ontology_accession.make_instance(y)
                bin_class[x].extend(ontol_y.bin_term(arg_bins[x]))

        # Write to output
        if cleaned_annotated == '':
            cleaned_annotated = 'Not annotated'
        write_line = '\t' + cleaned_sample + '\t' + cleaned_annotated + \
                     '\t' + '|'.join(sorted(set(matched_components))) + '\t' + macro_status
        while re.search('  ', write_line):
            write_line = write_line.replace('  ', ' ')
        term_cache[proc_sample] = write_line
        OUT_file.write(sample_id + write_line)
        if run_args.full:
            OUT_file.write('\t' + str(micro_status) + '\t' + str(sample_conversion_status))
            term_cache[proc_sample] += '\t' + str(micro_status) + '\t' + str(sample_conversion_status)
        if run_args.bin:
            for x in list(bin_class):
                OUT_file.write('\t' + '|'.join(sorted(set(bin_class[x]))).replace('[]', ''))
                term_cache[proc_sample] += '\t' + '|'.join(sorted(set(bin_class[x])))


    IN_file.close()
    conn.close()
    if OUT_file is not sys.stdout:
        OUT_file.close()
    else:
        OUT_file.write('\n\n')

    # Report results to log and generate graphs
    t2 = datetime.datetime.now()
    print(f'\tDone! {t2-t1} passed'.ljust(60) + '\nReporting results...')
    if run_args.output:
        summarize.report_results(run_args.output, list(arg_bins.keys()))
        if run_args.graph == True:
            summarize.figure_folder()
            summarize.visualize_results(run_args.output, list(arg_bins.keys()))
    else:
        match_counts = summarize.report_cache(term_cache)
        if run_args.graph == True:
            summarize.figure_folder()
            summarize.visualize_cache(match_counts)

    print('\t' + f'Done! {datetime.datetime.now()-t2} passed'.ljust(60) + '\n')