diff lexmapr/pipeline_helpers.py @ 3:be95a7ce968a tip
"planemo upload"
| author | kkonganti |
|---|---|
| date | Tue, 13 Sep 2022 11:32:24 -0400 |
| parents | 5244e7465767 |
| children | |
--- a/lexmapr/pipeline_helpers.py Wed Aug 31 14:32:14 2022 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,281 +0,0 @@
-"""Helper functions for main pipeline"""
-
-import inflection, re, unicodedata, sqlite3
-from collections import OrderedDict
-from itertools import combinations
-from dateutil.parser import parse
-from lexmapr.definitions import synonym_db
-from nltk import pos_tag
-from nltk.tokenize import word_tokenize
-from nltk.tokenize.treebank import TreebankWordDetokenizer
-
-
-def _lookup_correction(sample, lookup_table, lookup_x, micro_status, status_title):
-    '''Apply corrections, if available in resource'''
-    sample = ' ' + sample + ' '
-    for x in lookup_table[lookup_x]:
-        find_x = re.findall(' '+x+' ', sample)
-        if find_x != []:
-            micro_status.append(status_title + x)
-            sample = sample.replace(' '+x+' ', ' '+lookup_table[lookup_x][x]+' ')
-    return(' '.join(sample.split()), micro_status)
-
-
-def _remove_annotated_synonyms(input_annotations):
-    '''Remove annotations to see original phrase'''
-    output_sample = ''
-    copy_char = True
-    for x in input_annotations:
-        if x == '{':
-            copy_char = False
-        elif x == '}':
-            copy_char = True
-        else:
-            if copy_char == True:
-                output_sample += x
-    while re.search('  ', output_sample):
-        output_sample = output_sample.replace('  ', ' ')
-    return(output_sample)
-
-
-def _retrieve_map_id(search_results, c):
-    '''Get resource id from database'''
-    return_list = []
-    for x in search_results:
-        c.execute('SELECT * FROM non_standard_resource_ids WHERE key=:key', {'key':x[1]})
-        for y in c.fetchall():
-            result_dic = {'term':y[1], 'id':y[0], 'status':[]}
-            if not result_dic in return_list:
-                return_list.append(result_dic)
-    return(return_list)
-
-
-def _map_term_helper(term, c):
-    '''Maps term to resource or resource permutation'''
-    c.execute('SELECT * FROM standard_resource_labels WHERE key=:key', {'key':term})
-    search_results = c.fetchall()
-    if len(search_results) == 0:
-        c.execute('SELECT * FROM standard_resource_permutations WHERE key=:key', {'key':term})
-        search_results = c.fetchall()
-        if len(search_results) != 0:
-            return(_retrieve_map_id(search_results, c))
-    else:
-        return(_retrieve_map_id(search_results, c))
-    return(None)
-
-
-def _ngrams(input_phrase, gram_value):
-    '''Get ngrams with a given value of gram_value'''
-    input_phrase = input_phrase.split()
-    output = []
-    for i in range(len(input_phrase) - gram_value + 1):
-        output.append(input_phrase[i:i + gram_value])
-    return(output)
-
-
-def process_sample(sample, lookup_table, micro_status):
-    '''Apply corrections to input sample'''
-    sample, micro_status = _lookup_correction(sample, lookup_table, 'spelling_mistakes',
-                                              micro_status, 'Spelling Correction Treatment: ')
-    sample, micro_status = _lookup_correction(sample, lookup_table, 'abbreviations',
-                                              micro_status, 'Abbreviation-Acronym Treatment: ')
-    sample, micro_status = _lookup_correction(sample, lookup_table, 'non_english_words',
-                                              micro_status, 'Non English Language Words Treatment: ')
-    return(sample, micro_status)
-
-
-def punctuation_treatment(untreated_term):
-    '''Remove punctuations from term'''
-    punctuations_regex_char_class = '[~`!@#$%^*()_\|/{}:;,.<>?]'
-    ret_term = ''
-    for word_token in untreated_term.split():
-        if word_token.count('-') > 1:
-            ret_term += word_token.replace('-',' ') + ' '
-        else:
-            ret_term += word_token + ' '
-    ret_term = ret_term.lower().replace('\"','').replace('\'ve','').replace('\'m','')
-    ret_term = ret_term.replace('\'s','').replace('\'t','').replace('\'ll','').replace('\'re','')
-    ret_term = ret_term.replace('\'','').replace('-','').replace('[','').replace(']','')
-    ret_term = ret_term.replace('&',' and ').replace('+',' and ').replace('=',' is ')
-    ret_term = re.sub(punctuations_regex_char_class, ' ', ret_term).lower()
-    return(' '.join(ret_term.split()))
-
-
-def further_cleanup(sample_text):
-    '''Remove terms indicated to not be relevant and some compound words'''
-    new_text = []
-    neg_words = [r'no ',r'non',r'not',r'neither',r'nor',r'without']
-    stt_words = ['animal','cb','chicken','environmental','food','human','large','medium','necropsy',
-                 'organic','other','poultry','product','sausage','small','stool','swab','wild',]
-    end_words = ['aspirate','culture','environmental','fluid','food','intestine','large','meal','medium',
-                 'mixed','necropsy','other','poultry','product','research','sample','sausage','slaughter',
-                 'small','swab','water','wild',]
-    not_replace = ['agriculture','apiculture','aquaculture','aquiculture','aviculture',
-                   'coculture','hemoculture','mariculture','monoculture','sericulture',
-                   'subculture','viniculture','viticulture',
-                   'semifluid','subfluid','superfluid',
-                   'superlarge','reenlarge','enlarge','overlarge','largemouth','larges',
-                   'bonemeal','cornmeal','fishmeal','inchmeal','oatmeal','piecemeal','premeal',
-                   'wholemeal','biosample','ensample','resample','subsample','backwater',
-                   'another','bother','brother','foremother','frother','godmother','grandmother',
-                   'housemother','mother','otherguess','otherness','othernesse','otherwhere',
-                   'otherwhile','otherworld','pother','soother','smoother','smother','stepbrother',
-                   'stepmother','tother',
-                   'byproduct','coproduct','production','productive','subproduct',
-                   'ultrasmall','smaller','smallmouth','smalltime','smallpox','smallpoxe',
-                   'smallsword','smallsholder','mediumship',
-                   'bathwater','bilgewater','blackwater','breakwater','cutwater','deepwater',
-                   'dewater','dishwater','eyewater','firewater','floodwater','freshwater',
-                   'graywater','groundwater','headwater','jerkwater','limewater','meltwater',
-                   'overwater','polywater','rainwater','rosewater','saltwater','seawater',
-                   'shearwater','springwater','tailwater','tidewater','underwater','wastewater',
-                   'semiwild','wildcard','wildcat','wildcatter','wildcatted','wildebeest','wilded',
-                   'wilder','wilderment','wilderness','wildernesse','wildest','wildfire','wildflower',
-                   'wildfowl','wildfowler','wildish','wildland','wildling','wildlife','wildwood',
-                   ]
-
-    found_comp = []
-    for comp_word in stt_words:
-        found_comp.extend(re.findall(f'({comp_word})(\w+)', sample_text))
-    for comp_word in end_words:
-        found_comp.extend(re.findall(f'(\w+)({comp_word})', sample_text))
-    for x in found_comp:
-        if x[0]+x[1] not in not_replace and x[0]+x[1]+'s' not in not_replace:
-            sample_text = sample_text.replace(x[0]+x[1], x[0]+' '+x[1])
-
-    for sample_word in sample_text.split():
-        if len(sample_word) > 1:
-            new_text.append(sample_word.strip())
-
-    if 'nor' in new_text:
-        if 'neither' not in new_text:
-            word_ind = new_text.index('nor')
-            new_text.insert(max(0, word_ind-2), 'neither')
-
-    for neg_word in neg_words:
-        if neg_word in new_text:
-            word_ind = new_text.index(neg_word)
-            del(new_text[word_ind:word_ind+2])
-    return(' '.join(new_text))
-
-
-def is_number(input_string):
-    '''Determine whether a string is a number'''
-    try:
-        unicodedata.numeric(input_string)
-        return(True)
-    except(TypeError, ValueError):
-        return(False)
-
-
-def is_date(input_string):
-    '''Determine whether a string is a date or day'''
-    try:
-        parse(input_string)
-        return(True)
-    except(ValueError, OverflowError):
-        return(False)
-
-
-def singularize_token(token, lookup_table, micro_status, c):
-    '''Singularize the string token, if applicable'''
-    if token in lookup_table['inflection_exceptions']:
-        return(token, micro_status)
-
-    exception_tail_chars_list = ['us', 'ia', 'ta', 'ss']  # TODO: add as, is?
-    for char in exception_tail_chars_list:
-        if token.endswith(char):
-            return(token, micro_status)
-
-    taxon_names = c.execute('''SELECT * FROM standard_resource_labels WHERE key LIKE :key AND
-                            value LIKE :value''',
-                            {'key':'% '+token,'value':'NCBITaxon%'}).fetchall()
-    remove_finds = []
-    for x in taxon_names:
-        if len(x[0].split()) > 2:
-            remove_finds.append(x)
-    for x in remove_finds:
-        taxon_names.remove(x)
-    if taxon_names != []:
-        return(token, micro_status)
-
-    lemma = inflection.singularize(token)
-    micro_status.append('Inflection (Plural) Treatment: ' + token)
-    return(lemma, micro_status)
-
-
-def get_cleaned_sample(input_sample, token, lookup_table):
-    '''Prepare the cleaned sample phrase using the input token'''
-    if input_sample == '' and token not in lookup_table['stop_words']:
-        return(token)
-    elif token not in lookup_table['stop_words']:
-        return(input_sample + ' ' + token)
-    else:
-        return(input_sample)
-
-
-def get_annotated_sample(annotated_sample, lemma):
-    '''Embed synonyms in the sample, if available'''
-    # TODO: able to annotate permutations instead of just left to right?
-    synonym_map = {}
-    if not annotated_sample:
-        annotated_sample = lemma
-    else:
-        annotated_sample = f'{annotated_sample} {lemma}'
-
-    conn_syn = sqlite3.connect(synonym_db)
-    d = conn_syn.cursor()
-    for y in [lemma, _remove_annotated_synonyms(annotated_sample)]:
-        d.execute('SELECT * FROM label_synonyms WHERE key=:key', {'key':y})
-        for x in d.fetchall():
-            if not re.search(x[1], annotated_sample):
-                annotated_sample = annotated_sample+' {'+x[1]+'}'
-                synonym_map[y] = x[1]
-    conn_syn.close()
-    return(annotated_sample, synonym_map)
-
-
-def map_term(term, lookup_table, c, consider_suffixes=False):
-    '''Map term to some resource in database'''
-    if consider_suffixes:
-        for suffix in lookup_table['suffixes']:
-            mapping = _map_term_helper(term+' '+suffix, c)
-            if mapping:
-                for x in mapping:
-                    x['status'].insert(-2, 'Suffix Addition')
-                return(mapping)
-    else:
-        mapping = _map_term_helper(term, c)
-        if mapping:
-            return(mapping)
-    return([])
-
-
-def annotation_reduce(annotated_sample, synonym_map):
-    '''Remove annotations on shorter phrases included in longer phrases with annotations'''
-    remove_list = []
-    for x in list(synonym_map.keys()):
-        for y in list(synonym_map.keys()):
-            if x != y:
-                if x.startswith(y) or x.endswith(y):
-                    remove_list.append(y)
-    for x in remove_list:
-        annotated_sample = annotated_sample.replace('{'+synonym_map[x]+'}',' ')
-    return(' '.join(annotated_sample.split()))
-
-
-def get_annotated_synonyms(input_annotations):
-    '''Get list of the annotations'''
-    synonym_list = []
-    for x in input_annotations.split('{')[1:]:
-        synonym_list.append(x.split('}')[0])
-    return(synonym_list)
-
-
-def get_gram_chunks(input_phrase, num):
-    '''Make num-gram chunks from input'''
-    input_tokens = input_phrase.split()
-    if len(input_tokens) < 15:
-        return(list(combinations(input_tokens, num)))
-    else:
-        return(_ngrams(input_phrase, num))
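
For orientation, a minimal usage sketch of the pure string helpers deleted by this changeset. It is illustrative only and not part of the diff above; it assumes a checkout of the parent revision (5244e7465767), where lexmapr/pipeline_helpers.py still exists, with the python-dateutil and inflection packages installed.

    # Illustrative only; assumes the parent revision 5244e7465767 is checked out.
    from lexmapr.pipeline_helpers import punctuation_treatment, get_gram_chunks, is_date

    print(punctuation_treatment('Chicken Breast -- raw & fresh!'))
    # expected: 'chicken breast raw and fresh'
    # (multi-hyphen tokens split, '&' mapped to 'and', other punctuation dropped)

    print(get_gram_chunks('organic chicken breast', 2))
    # phrases under 15 tokens yield all 2-token combinations, not just
    # contiguous bigrams; longer phrases fall back to _ngrams:
    # [('organic', 'chicken'), ('organic', 'breast'), ('chicken', 'breast')]

    print(is_date('13 Sep 2022'))
    # expected: True (dateutil.parser.parse accepts most common date spellings)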
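The annotation helpers share one convention: candidate synonyms are embedded in the sample phrase inside curly braces. A second short sketch of that round trip, under the same parent-revision assumption as above:

    # Illustrative only; same parent-revision assumption as above.
    from lexmapr.pipeline_helpers import (_remove_annotated_synonyms,
                                          get_annotated_synonyms)

    annotated = 'chicken meat {poultry meat} {gallus meat}'
    print(get_annotated_synonyms(annotated))
    # expected: ['poultry meat', 'gallus meat']
    print(_remove_annotated_synonyms(annotated))
    # expected: 'chicken meat ' (braces stripped and double spaces collapsed;
    # a single trailing space survives)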