cstrittmatter@0: """Pipeline script""" cstrittmatter@0: cstrittmatter@0: import csv, datetime, logging, re, sqlite3, sys cstrittmatter@0: import lexmapr.create_databases as cdkp cstrittmatter@0: import lexmapr.ontology_reasoner as ontr cstrittmatter@0: import lexmapr.pipeline_helpers as helpers cstrittmatter@0: import lexmapr.pipeline_resources as pipeline_resources cstrittmatter@0: import lexmapr.run_summary as summarize cstrittmatter@0: from itertools import permutations cstrittmatter@0: from collections import OrderedDict cstrittmatter@0: from nltk.tokenize import word_tokenize cstrittmatter@0: from lexmapr.definitions import not_provided, arg_bins, ontol_db, ontol_interest cstrittmatter@0: cstrittmatter@0: cstrittmatter@0: # TODO: deal with ambiguous terms: nut milk, dairy (building ENVO_00003862 v dairy food product) cstrittmatter@0: # TODO: combine ScorLex.csv and misspellings.csv and edit _add_predefined_resources function cstrittmatter@0: # TODO: decide what to do with same synonym, ex custom-customize vs custom-tradition cstrittmatter@0: # TODO: make web on database instead of pulling relationships from API? cstrittmatter@0: # TODO: remove synonyms over time from SynLex and resource_synonyms.csv; edit get_synonyms cstrittmatter@0: cstrittmatter@0: def run(run_args): cstrittmatter@0: '''Main text processing and mapping pipeline''' cstrittmatter@0: cstrittmatter@0: # Add information from EMBL and predefined_resources folder cstrittmatter@0: t0 = datetime.datetime.now() cstrittmatter@0: global ontol_interest cstrittmatter@0: if run_args.embl_ontol: cstrittmatter@0: ontol_interest = run_args.embl_ontol cstrittmatter@0: cstrittmatter@0: print('\nBuilding databases...') cstrittmatter@0: cdkp.get_synonyms(run_args.remake_cache, ontol_interest) cstrittmatter@0: cdkp.get_resource_ids(run_args.remake_cache, ontol_interest) cstrittmatter@0: lookup_table = pipeline_resources.get_predefined_resources() cstrittmatter@0: t1 = datetime.datetime.now() cstrittmatter@0: print(f'\tDone! 
    print(f'\tDone! {t1-t0} passed'.ljust(60) + '\nMapping terms...')
    logging.info(f'Database build/confirm: {t1}')

    # Apply other arguments and initiate mapping cache
    term_cache = {'': '\t\t\t\t'}
    output_fields = ['Sample_Id',
                     'Sample_Desc',
                     'Processed_Sample',
                     'Annotated_Sample',
                     'Matched_Components']
    if run_args.full:
        output_fields += ['Match_Status (Macro Level)',
                          'Match_Status (Micro Level)',
                          'Sample_Transformations']
        term_cache[''] += '\t\t'
    else:
        output_fields += ['Match_Status (Macro Level)']

    if run_args.bin:
        global arg_bins
        if run_args.user_bin is not None:
            arg_bins = run_args.user_bin
        for x in arg_bins:
            arg_bins[x] = ontr.Ontology_package(x, arg_bins[x])
        term_cache[''] += '\t'*len(arg_bins)
        output_fields += list(arg_bins.keys())
    else:
        arg_bins = {}

    OUT_file = open(run_args.output, 'w') if run_args.output else sys.stdout
    if OUT_file is sys.stdout:
        OUT_file.write('\n')
    OUT_file.write('\t'.join(output_fields))

    IN_file = open(run_args.input, 'r')
    if run_args.input[-4:] == '.csv':
        fr_reader = csv.reader(IN_file, delimiter=',')
    elif run_args.input[-4:] == '.tsv':
        fr_reader = csv.reader(IN_file, delimiter='\t')
    else:
        raise ValueError('Input file must be a .csv or .tsv file')
    next(fr_reader)  # skip header row

    # Connect to primary database
    conn = sqlite3.connect(ontol_db)
    c = conn.cursor()

    # Iterate over samples in input file
    for sample_row in fr_reader:
        sample_id = '\n' + sample_row[0].strip() + '\t' + ','.join(sample_row[1:])
        original_sample = ' '.join(sample_row[1:]).strip()
        cleaned_sample = ''
        cleaned_annotated = ''
        macro_status = 'No Match'
        matched_components = []
        synonym_match = []
        micro_status = []
        bin_class = {x: [] for x in arg_bins}
        ancestors = set()
        sample_conversion_status = {}
        synonym_map = {}
        treated_sample = helpers.punctuation_treatment(original_sample)

        # Determine if sample in predefined list of null values
        if treated_sample in not_provided:
            write_line = '\t' + treated_sample + '\tNot annotated\t\t' + macro_status
            OUT_file.write(sample_id + write_line)
            if run_args.full:
                OUT_file.write('\t' + str(micro_status) + '\t' + str(sample_conversion_status))
            if run_args.bin:
                OUT_file.write('\t'*len(bin_class))
            continue

        # Remove negated words and some compound words, apply corrections
        proc_sample = helpers.further_cleanup(treated_sample)
        proc_sample, micro_status = helpers.process_sample(proc_sample, lookup_table, micro_status)

        # Try finding processed sample in cache
        try:
            OUT_file.write(sample_id + term_cache[proc_sample])
            continue
        except KeyError:
            pass

        # Attempt full term matches with and without suffixes
        if OUT_file is not sys.stdout:
            print('\tMatching ' + sample_row[0].strip() + ' ' +
                  '{:<40}'.format(proc_sample[:40]).ljust(60), end='\r')

        full_term_match = helpers.map_term(treated_sample, lookup_table, c)
        if full_term_match == []:
            full_term_match = helpers.map_term(proc_sample, lookup_table, c)
            if full_term_match != []:
                micro_status.insert(0, 'Used Processed Sample')
        if full_term_match == [] and 'FOODON' in ontol_interest:
            full_term_match = helpers.map_term(proc_sample, lookup_table, c, True)
            if full_term_match != []:
                micro_status.insert(0, 'Used Processed Sample')

        # Attempt full term match with cleaned sample using suffixes
        if full_term_match == []:
            for sw_token in word_tokenize(proc_sample):
                if helpers.is_date(sw_token) or helpers.is_number(sw_token):
                    continue
                lemma, micro_status = helpers.singularize_token(sw_token, lookup_table,
                                                                micro_status, c)
                if not sw_token == lemma:
                    sample_conversion_status[sw_token] = lemma
                cleaned_sample = helpers.get_cleaned_sample(cleaned_sample, lemma, lookup_table)
                # Not de-duplicating tokens because can't account for all legitimate double names

            full_term_match = helpers.map_term(cleaned_sample, lookup_table, c)
            if full_term_match == [] and 'FOODON' in ontol_interest:
                full_term_match = helpers.map_term(cleaned_sample, lookup_table, c, True)
            if full_term_match != []:
                micro_status.insert(0, 'Used Cleaned Sample')

        # Combine the matched terms
        if full_term_match != []:
            for x in full_term_match:
                matched_components.append(x['term'] + ':' + x['id'])
                macro_status = 'Full Term Match'
                micro_status += x['status']

        # Try matching permutations if full term match fails
        # Functions mostly retained from v 0.7
        if macro_status == 'No Match':
            covered_tokens = set()
            for i in range(5, 0, -1):
                for gram_chunk in helpers.get_gram_chunks(cleaned_sample, i):
                    concat_gram_chunk = ' '.join(gram_chunk)
                    gram_permutations = \
                        list(OrderedDict.fromkeys(permutations(concat_gram_chunk.split())))
                    if set(gram_chunk) <= covered_tokens:
                        continue
                    for gram_permutation in gram_permutations:
                        gram_permutation_str = ' '.join(gram_permutation)
                        component_match = helpers.map_term(gram_permutation_str, lookup_table, c)
                        if not component_match and 'FOODON' in ontol_interest:
                            component_match = helpers.map_term(gram_permutation_str,
                                                               lookup_table, c, True)
                        if component_match:
                            for x in component_match:
                                matched_components.append(x['term'] + ':' + x['id'])
                                macro_status = 'Component Match'
                                micro_status += x['status']
                            covered_tokens.update(gram_chunk)

        # Try matching annotated synonyms if component match fails
        if macro_status == 'No Match':
            for clean_token in cleaned_sample.split():
                cleaned_annotated, s_m = helpers.get_annotated_sample(cleaned_annotated, clean_token)
                synonym_map.update(s_m)
            cleaned_annotated = helpers.annotation_reduce(cleaned_annotated, synonym_map)

            for x in helpers.get_annotated_synonyms(cleaned_annotated):
                synonym_match.extend(helpers.map_term(x, lookup_table, c))
            if synonym_match == [] and 'FOODON' in ontol_interest:
                for x in helpers.get_annotated_synonyms(cleaned_annotated):
                    synonym_match.extend(helpers.map_term(x, lookup_table, c, True))
            if synonym_match != []:
                macro_status = 'Synonym Match'
                for x in synonym_match:
                    matched_components.append(x['term'] + ':' + x['id'])
                    micro_status += x['status']

        # Remove matches that are ancestral to other matches
        if run_args.no_ancestors:
            for match_term in matched_components:
                match_term = match_term.replace('NCBITAXON', 'NCBITaxon')
                ontol_acc = ontr.Ontology_accession.make_instance(match_term)
                ontol_anc = ontol_acc.get_family('ancestors')
                try:
                    ontol_anc.remove('none found')
                except ValueError:
                    pass
                ancestors |= set([x.id for x in ontol_anc])

            final_matches = []
            for match_term in matched_components:
                if match_term.split(':')[-1].replace('NCBITAXON', 'NCBITaxon') not in ancestors:
                    final_matches.append(match_term)
            matched_components = final_matches  # keep only non-ancestral matches

        # Bin matches
        for x in arg_bins:
            for y in matched_components:
                ontol_y = ontr.Ontology_accession.make_instance(y)
                bin_class[x].extend(ontol_y.bin_term(arg_bins[x]))

        # Write to output
        if cleaned_annotated == '':
            cleaned_annotated = 'Not annotated'
        write_line = '\t' + cleaned_sample + '\t' + cleaned_annotated + \
                     '\t' + '|'.join(sorted(set(matched_components))) + '\t' + macro_status
        while re.search('  ', write_line):
            write_line = write_line.replace('  ', ' ')  # collapse repeated spaces
        term_cache[proc_sample] = write_line
        OUT_file.write(sample_id + write_line)
        if run_args.full:
            OUT_file.write('\t' + str(micro_status) + '\t' + str(sample_conversion_status))
            term_cache[proc_sample] += '\t' + str(micro_status) + '\t' + str(sample_conversion_status)
        if run_args.bin:
            for x in list(bin_class):
                OUT_file.write('\t' + '|'.join(sorted(set(bin_class[x]))).replace('[]', ''))
                term_cache[proc_sample] += '\t' + '|'.join(sorted(set(bin_class[x])))

    IN_file.close()
    conn.close()
    if OUT_file is not sys.stdout:
        OUT_file.close()
    else:
        OUT_file.write('\n\n')

    # Report results to log and generate graphs
    t2 = datetime.datetime.now()
    print(f'\tDone! {t2-t1} passed'.ljust(60) + '\nReporting results...')
    if run_args.output:
        summarize.report_results(run_args.output, list(arg_bins.keys()))
        if run_args.graph:
            summarize.figure_folder()
            summarize.visualize_results(run_args.output, list(arg_bins.keys()))
    else:
        match_counts = summarize.report_cache(term_cache)
        if run_args.graph:
            summarize.figure_folder()
            summarize.visualize_cache(match_counts)

    print('\t' + f'Done! {datetime.datetime.now()-t2} passed'.ljust(60) + '\n')
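

# Example invocation: a minimal, hypothetical sketch assuming run() is called
# directly rather than through LexMapr's own CLI. The attribute names mirror the
# run_args fields read above; the file paths are placeholders only.
if __name__ == '__main__':
    from argparse import Namespace

    run(Namespace(input='samples.csv', output='results.tsv', embl_ontol=None,
                  remake_cache=False, full=True, bin=False, user_bin=None,
                  no_ancestors=False, graph=False))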