annotate lexmapr/pipeline_helpers.py @ 0:f5c39d0447be

"planemo upload"
author kkonganti
date Wed, 31 Aug 2022 14:32:07 -0400
parents
children
rev   line source
kkonganti@0 1 """Helper functions for main pipeline"""
kkonganti@0 2
kkonganti@0 3 import inflection, re, unicodedata, sqlite3
kkonganti@0 4 from collections import OrderedDict
kkonganti@0 5 from itertools import combinations
kkonganti@0 6 from dateutil.parser import parse
kkonganti@0 7 from lexmapr.definitions import synonym_db
kkonganti@0 8 from nltk import pos_tag
kkonganti@0 9 from nltk.tokenize import word_tokenize
kkonganti@0 10 from nltk.tokenize.treebank import TreebankWordDetokenizer
kkonganti@0 11
kkonganti@0 12
kkonganti@0 13 def _lookup_correction(sample, lookup_table, lookup_x, micro_status, status_title):
kkonganti@0 14 '''Apply corections, if available in resource'''
kkonganti@0 15 sample = ' ' + sample + ' '
kkonganti@0 16 for x in lookup_table[lookup_x]:
kkonganti@0 17 find_x = re.findall(' '+x+' ', sample)
kkonganti@0 18 if find_x != []:
kkonganti@0 19 micro_status.append(status_title + x)
kkonganti@0 20 sample = sample.replace(' '+x+' ', ' '+lookup_table[lookup_x][x]+' ')
kkonganti@0 21 return(' '.join(sample.split()), micro_status)
kkonganti@0 22
kkonganti@0 23
kkonganti@0 24 def _remove_annotated_synonyms(input_annotations):
kkonganti@0 25 '''Remove annotations to see original phrase'''
kkonganti@0 26 output_sample = ''
kkonganti@0 27 copy_char = True
kkonganti@0 28 for x in input_annotations:
kkonganti@0 29 if x == '{':
kkonganti@0 30 copy_char = False
kkonganti@0 31 elif x == '}':
kkonganti@0 32 copy_char = True
kkonganti@0 33 else:
kkonganti@0 34 if copy_char == True:
kkonganti@0 35 output_sample += x
kkonganti@0 36 while re.search(' ', output_sample):
kkonganti@0 37 output_sample = output_sample.replace(' ', ' ')
kkonganti@0 38 return(output_sample)
kkonganti@0 39
kkonganti@0 40
kkonganti@0 41 def _retrieve_map_id(search_results, c):
kkonganti@0 42 '''Get resource id from database'''
kkonganti@0 43 return_list = []
kkonganti@0 44 for x in search_results:
kkonganti@0 45 c.execute('SELECT * FROM non_standard_resource_ids WHERE key=:key', {'key':x[1]})
kkonganti@0 46 for y in c.fetchall():
kkonganti@0 47 result_dic = {'term':y[1], 'id':y[0], 'status':[]}
kkonganti@0 48 if not result_dic in return_list:
kkonganti@0 49 return_list.append(result_dic)
kkonganti@0 50 return(return_list)
kkonganti@0 51
kkonganti@0 52
def _map_term_helper(term, c):
    '''Map a term to a standard resource label or label permutation.

    :param str term: cleaned term to look up
    :param c: sqlite3 cursor over the resource database
    :returns: list of {'term', 'id', 'status'} dicts for the hits, or
        None when there is no match at all
    '''
    c.execute('SELECT * FROM standard_resource_labels WHERE key=:key', {'key':term})
    search_results = c.fetchall()
    if len(search_results) == 0:
        # No direct label hit; fall back to word-order permutations
        c.execute('SELECT * FROM standard_resource_permutations WHERE key=:key', {'key':term})
        search_results = c.fetchall()
    if search_results:
        return(_retrieve_map_id(search_results, c))
    # NOTE(review): as uploaded (flat indentation), a direct label hit fell
    # through to "return None", discarding every label match while mapping
    # only permutation hits; both kinds of hit are mapped above instead.
    return(None)
kkonganti@0 65
kkonganti@0 66
kkonganti@0 67 def _ngrams(input_phrase, gram_value):
kkonganti@0 68 '''Get ngrams with a given value of gram_value'''
kkonganti@0 69 input_phrase = input_phrase.split()
kkonganti@0 70 output = []
kkonganti@0 71 for i in range(len(input_phrase) - gram_value + 1):
kkonganti@0 72 output.append(input_phrase[i:i + gram_value])
kkonganti@0 73 return(output)
kkonganti@0 74
kkonganti@0 75
def process_sample(sample, lookup_table, micro_status):
    '''Run every lookup-based correction over the input sample'''
    # Each correction pass: (lookup table resource, status message prefix)
    correction_passes = (
        ('spelling_mistakes', 'Spelling Correction Treatment: '),
        ('abbreviations', 'Abbreviation-Acronym Treatment: '),
        ('non_english_words', 'Non English Language Words Treatment: '),
    )
    for resource, title in correction_passes:
        sample, micro_status = _lookup_correction(sample, lookup_table, resource,
                                                  micro_status, title)
    return(sample, micro_status)
kkonganti@0 85
kkonganti@0 86
def punctuation_treatment(untreated_term):
    '''Remove punctuations from term'''
    strip_chars_regex = '[~`!@#$%^*()_\|/{}:;,.<>?]'
    words = []
    for token in untreated_term.split():
        # Multi-hyphen tokens are split apart here; lone hyphens are simply
        # dropped by the replacement pass below
        words.append(token.replace('-', ' ') if token.count('-') > 1 else token)
    treated = (' '.join(words) + ' ').lower() if words else ''
    # Ordered literal substitutions: quotes and contractions vanish, some
    # symbols become words
    substitutions = (('"', ''), ("'ve", ''), ("'m", ''), ("'s", ''), ("'t", ''),
                     ("'ll", ''), ("'re", ''), ("'", ''), ('-', ''), ('[', ''),
                     (']', ''), ('&', ' and '), ('+', ' and '), ('=', ' is '))
    for old, new in substitutions:
        treated = treated.replace(old, new)
    treated = re.sub(strip_chars_regex, ' ', treated).lower()
    return(' '.join(treated.split()))
kkonganti@0 102
kkonganti@0 103
kkonganti@0 104 def further_cleanup(sample_text):
kkonganti@0 105 '''Remove terms indicated to not be relevant and some compound words'''
kkonganti@0 106 new_text = []
kkonganti@0 107 neg_words = [r'no ',r'non',r'not',r'neither',r'nor',r'without']
kkonganti@0 108 stt_words = ['animal','cb','chicken','environmental','food','human','large','medium','necropsy',
kkonganti@0 109 'organic','other','poultry','product','sausage','small','stool','swab','wild',]
kkonganti@0 110 end_words = ['aspirate','culture','environmental','fluid','food','intestine','large','meal','medium',
kkonganti@0 111 'mixed','necropsy','other','poultry','product','research','sample','sausage','slaughter',
kkonganti@0 112 'small','swab','water','wild',]
kkonganti@0 113 not_replace = ['agriculture','apiculture','aquaculture','aquiculture','aviculture',
kkonganti@0 114 'coculture','hemoculture','mariculture','monoculture','sericulture',
kkonganti@0 115 'subculture','viniculture','viticulture',
kkonganti@0 116 'semifluid','subfluid','superfluid',
kkonganti@0 117 'superlarge','reenlarge','enlarge','overlarge','largemouth','larges',
kkonganti@0 118 'bonemeal','cornmeal','fishmeal','inchmeal','oatmeal','piecemeal','premeal',
kkonganti@0 119 'wholemeal','biosample','ensample','resample','subsample','backwater',
kkonganti@0 120 'another','bother','brother','foremother','frother','godmother','grandmother',
kkonganti@0 121 'housemother','mother','otherguess','otherness','othernesse','otherwhere',
kkonganti@0 122 'otherwhile','otherworld','pother','soother','smoother','smother','stepbrother',
kkonganti@0 123 'stepmother','tother',
kkonganti@0 124 'byproduct','coproduct','production','productive','subproduct',
kkonganti@0 125 'ultrasmall','smaller','smallmouth','smalltime','smallpox','smallpoxe',
kkonganti@0 126 'smallsword','smallsholder','mediumship',
kkonganti@0 127 'bathwater','bilgewater','blackwater','breakwater','cutwater','deepwater',
kkonganti@0 128 'dewater','dishwater','eyewater','firewater','floodwater','freshwater',
kkonganti@0 129 'graywater','groundwater','headwater','jerkwater','limewater','meltwater',
kkonganti@0 130 'overwater','polywater','rainwater','rosewater','saltwater','seawater',
kkonganti@0 131 'shearwater','springwater','tailwater','tidewater','underwater','wastewater',
kkonganti@0 132 'semiwild','wildcard','wildcat','wildcatter','wildcatted','wildebeest','wilded',
kkonganti@0 133 'wilder','wilderment','wilderness','wildernesse','wildest','wildfire','wildflower',
kkonganti@0 134 'wildfowl','wildfowler','wildish','wildland','wildling','wildlife','wildwood',
kkonganti@0 135 ]
kkonganti@0 136
kkonganti@0 137 found_comp = []
kkonganti@0 138 for comp_word in stt_words:
kkonganti@0 139 found_comp.extend(re.findall(f'({comp_word})(\w+)', sample_text))
kkonganti@0 140 for comp_word in end_words:
kkonganti@0 141 found_comp.extend(re.findall(f'(\w+)({comp_word})', sample_text))
kkonganti@0 142 for x in found_comp:
kkonganti@0 143 if x[0]+x[1] not in not_replace and x[0]+x[1]+'s' not in not_replace:
kkonganti@0 144 sample_text = sample_text.replace(x[0]+x[1], x[0]+' '+x[1])
kkonganti@0 145
kkonganti@0 146 for sample_word in sample_text.split():
kkonganti@0 147 if len(sample_word) > 1:
kkonganti@0 148 new_text.append(sample_word.strip())
kkonganti@0 149
kkonganti@0 150 if 'nor' in new_text:
kkonganti@0 151 if 'neither' not in new_text:
kkonganti@0 152 word_ind = new_text.index('nor')
kkonganti@0 153 new_text.insert(max[0,word_ind-2], 'neither')
kkonganti@0 154
kkonganti@0 155 for neg_word in neg_words:
kkonganti@0 156 if neg_word in new_text:
kkonganti@0 157 word_ind = new_text.index(neg_word)
kkonganti@0 158 del(new_text[word_ind:word_ind+2])
kkonganti@0 159 return(' '.join(new_text))
kkonganti@0 160
kkonganti@0 161
def is_number(input_string):
    '''Determine whether a string is a (single-character) numeric value'''
    try:
        unicodedata.numeric(input_string)
    except (TypeError, ValueError):
        return(False)
    return(True)
kkonganti@0 169
kkonganti@0 170
def is_date(input_string):
    '''Determine whether a string parses as a date or day'''
    try:
        parse(input_string)
    except (ValueError, OverflowError):
        return(False)
    return(True)
kkonganti@0 178
kkonganti@0 179
def singularize_token(token, lookup_table, micro_status, c):
    '''Return the singular form of the token, when singularizing is safe'''
    # Explicit exceptions are never singularized
    if token in lookup_table['inflection_exceptions']:
        return(token, micro_status)

    # Tokens with these endings are left alone  # TODO: add as, is?
    if token.endswith(('us', 'ia', 'ta', 'ss')):
        return(token, micro_status)

    # Leave the token alone when it ends a short (<= 2 word) NCBI taxon label
    taxon_names = c.execute('''SELECT * FROM standard_resource_labels WHERE key LIKE :key AND
                               value LIKE :value''',
                            {'key':'% '+token,'value':'NCBITaxon%'}).fetchall()
    if any(len(name[0].split()) <= 2 for name in taxon_names):
        return(token, micro_status)

    lemma = inflection.singularize(token)
    micro_status.append('Inflection (Plural) Treatment: ' + token)
    return(lemma, micro_status)
kkonganti@0 205
kkonganti@0 206
def get_cleaned_sample(input_sample, token, lookup_table):
    '''Append the token to the cleaned phrase unless it is a stop word'''
    if token in lookup_table['stop_words']:
        return(input_sample)
    if input_sample == '':
        return(token)
    return(input_sample + ' ' + token)
kkonganti@0 215
kkonganti@0 216
def get_annotated_sample(annotated_sample, lemma):
    '''Embed synonyms in the sample, if available.

    :param str annotated_sample: sample so far, possibly with {...} annotations
    :param str lemma: next (singularized) token to append
    :returns: (updated annotated sample, {phrase: synonym} map) tuple
    '''
    # TODO: able to annotate permutations instead of just left to right?
    synonym_map = {}
    if not annotated_sample:
        annotated_sample = lemma
    else:
        annotated_sample = f'{annotated_sample} {lemma}'

    conn_syn = sqlite3.connect(synonym_db)
    d = conn_syn.cursor()
    # Look up both the bare lemma and the whole de-annotated phrase so far
    for y in [lemma, _remove_annotated_synonyms(annotated_sample)]:
        d.execute('SELECT * FROM label_synonyms WHERE key=:key', {'key':y})
        for x in d.fetchall():
            # re.escape: synonyms are plain text, so regex metacharacters in
            # them must be matched literally (unescaped they could mis-match
            # or raise re.error)
            if not re.search(re.escape(x[1]), annotated_sample):
                annotated_sample = annotated_sample+' {'+x[1]+'}'
                synonym_map[y] = x[1]
    conn_syn.close()
    return(annotated_sample, synonym_map)
kkonganti@0 236
kkonganti@0 237
def map_term(term, lookup_table, c, consider_suffixes=False):
    '''Map the term to a database resource, optionally trying suffixes'''
    if consider_suffixes:
        # Try the term with each known suffix appended; first hit wins
        for suffix in lookup_table['suffixes']:
            mapping = _map_term_helper(f'{term} {suffix}', c)
            if mapping:
                for entry in mapping:
                    entry['status'].insert(-2, 'Suffix Addition')
                return(mapping)
    else:
        mapping = _map_term_helper(term, c)
        if mapping:
            return(mapping)
    return([])
kkonganti@0 252
kkonganti@0 253
def annotation_reduce(annotated_sample, synonym_map):
    '''Remove annotations on shorter phrases included in longer phrases with annotations'''
    phrases = list(synonym_map.keys())
    # A phrase is redundant when some other phrase starts or ends with it
    redundant = [shorter for longer in phrases for shorter in phrases
                 if longer != shorter
                 and (longer.startswith(shorter) or longer.endswith(shorter))]
    for phrase in redundant:
        annotated_sample = annotated_sample.replace('{'+synonym_map[phrase]+'}', ' ')
    return(' '.join(annotated_sample.split()))
kkonganti@0 265
kkonganti@0 266
def get_annotated_synonyms(input_annotations):
    '''List every {...} annotation embedded in the phrase'''
    return([chunk.split('}')[0] for chunk in input_annotations.split('{')[1:]])
kkonganti@0 273
kkonganti@0 274
def get_gram_chunks(input_phrase, num):
    '''Make num-gram chunks from input'''
    input_tokens = input_phrase.split()
    # Long phrases fall back to contiguous ngrams to cap the combinatorial
    # blow-up of token combinations
    if len(input_tokens) >= 15:
        return(_ngrams(input_phrase, num))
    return(list(combinations(input_tokens, num)))