"""Helper functions for main pipeline"""

import inflection, re, unicodedata, sqlite3
from collections import OrderedDict
from itertools import combinations
from dateutil.parser import parse
from lexmapr.definitions import synonym_db
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

def _lookup_correction(sample, lookup_table, lookup_x, micro_status, status_title):
    '''Apply corrections, if available in resource'''
    sample = ' ' + sample + ' '
    for x in lookup_table[lookup_x]:
        find_x = re.findall(' '+x+' ', sample)
        if find_x != []:
            micro_status.append(status_title + x)
            sample = sample.replace(' '+x+' ', ' '+lookup_table[lookup_x][x]+' ')
    return(' '.join(sample.split()), micro_status)

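# Illustrative sketch (hypothetical lookup data, not shipped with the pipeline):
#   >>> table = {'spelling_mistakes': {'chiken': 'chicken'}}
#   >>> _lookup_correction('chiken soup', table, 'spelling_mistakes', [],
#   ...                    'Spelling Correction Treatment: ')
#   ('chicken soup', ['Spelling Correction Treatment: chiken'])
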
def _remove_annotated_synonyms(input_annotations):
    '''Remove annotations to see original phrase'''
    output_sample = ''
    copy_char = True
    for x in input_annotations:
        if x == '{':
            copy_char = False
        elif x == '}':
            copy_char = True
        else:
            if copy_char == True:
                output_sample += x
    while re.search('  ', output_sample):
        output_sample = output_sample.replace('  ', ' ')
    return(output_sample)

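# Illustrative sketch: annotations are the curly-braced synonyms embedded by
# get_annotated_sample below; stripping them recovers the original phrase
# (a trailing space may remain where an annotation was removed):
#   >>> _remove_annotated_synonyms('chicken broth {stock}')
#   'chicken broth '
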
def _retrieve_map_id(search_results, c):
    '''Get resource id from database'''
    return_list = []
    for x in search_results:
        c.execute('SELECT * FROM non_standard_resource_ids WHERE key=:key', {'key':x[1]})
        for y in c.fetchall():
            result_dic = {'term':y[1], 'id':y[0], 'status':[]}
            if result_dic not in return_list:
                return_list.append(result_dic)
    return(return_list)

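# Assumed schema (inferred from the queries above): non_standard_resource_ids
# rows look like (id, term), keyed on the id carried in each search result, so
# every hit yields a dictionary such as
# {'term': 'chicken', 'id': 'FOODON:00001040', 'status': []}; the id shown
# here is hypothetical.
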
def _map_term_helper(term, c):
    '''Maps term to resource or resource permutation'''
    c.execute('SELECT * FROM standard_resource_labels WHERE key=:key', {'key':term})
    search_results = c.fetchall()
    if len(search_results) == 0:
        c.execute('SELECT * FROM standard_resource_permutations WHERE key=:key', {'key':term})
        search_results = c.fetchall()
        if len(search_results) != 0:
            return(_retrieve_map_id(search_results, c))
    else:
        return(_retrieve_map_id(search_results, c))
    return(None)

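# Lookup order: exact label first, then the precomputed permutation table;
# None signals that neither table matched, and callers treat it as falsy.
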
def _ngrams(input_phrase, gram_value):
    '''Get ngrams with a given value of gram_value'''
    input_phrase = input_phrase.split()
    output = []
    for i in range(len(input_phrase) - gram_value + 1):
        output.append(input_phrase[i:i + gram_value])
    return(output)

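# Illustrative sketch: each ngram is returned as a token list, e.g.
#   >>> _ngrams('chicken broth cube', 2)
#   [['chicken', 'broth'], ['broth', 'cube']]
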
def process_sample(sample, lookup_table, micro_status):
    '''Apply corrections to input sample'''
    sample, micro_status = _lookup_correction(sample, lookup_table, 'spelling_mistakes',
                                              micro_status, 'Spelling Correction Treatment: ')
    sample, micro_status = _lookup_correction(sample, lookup_table, 'abbreviations',
                                              micro_status, 'Abbreviation-Acronym Treatment: ')
    sample, micro_status = _lookup_correction(sample, lookup_table, 'non_english_words',
                                              micro_status, 'Non English Language Words Treatment: ')
    return(sample, micro_status)

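# The three passes share one micro_status list, so the returned status records
# every correction in the order applied; with hypothetical lookup data:
#   >>> table = {'spelling_mistakes': {'chiken': 'chicken'},
#   ...          'abbreviations': {'bf': 'beef'}, 'non_english_words': {}}
#   >>> process_sample('chiken and bf', table, [])[0]
#   'chicken and beef'
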
def punctuation_treatment(untreated_term):
    '''Remove punctuations from term'''
    punctuations_regex_char_class = r'[~`!@#$%^*()_\|/{}:;,.<>?]'
    ret_term = ''
    for word_token in untreated_term.split():
        if word_token.count('-') > 1:
            ret_term += word_token.replace('-',' ') + ' '
        else:
            ret_term += word_token + ' '
    ret_term = ret_term.lower().replace('\"','').replace('\'ve','').replace('\'m','')
    ret_term = ret_term.replace('\'s','').replace('\'t','').replace('\'ll','').replace('\'re','')
    ret_term = ret_term.replace('\'','').replace('-','').replace('[','').replace(']','')
    ret_term = ret_term.replace('&',' and ').replace('+',' and ').replace('=',' is ')
    ret_term = re.sub(punctuations_regex_char_class, ' ', ret_term).lower()
    return(' '.join(ret_term.split()))

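# Illustrative sketch: words with two or more hyphens are split apart, single
# hyphens are simply dropped, and listed punctuation becomes whitespace:
#   >>> punctuation_treatment('Ready-To-Eat; Turkey-Breast & rice')
#   'ready to eat turkeybreast and rice'
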
def further_cleanup(sample_text):
    '''Remove terms indicated to not be relevant and some compound words'''
    new_text = []
    neg_words = ['no','non','not','neither','nor','without']
    stt_words = ['animal','cb','chicken','environmental','food','human','large','medium','necropsy',
                 'organic','other','poultry','product','sausage','small','stool','swab','wild',]
    end_words = ['aspirate','culture','environmental','fluid','food','intestine','large','meal','medium',
                 'mixed','necropsy','other','poultry','product','research','sample','sausage','slaughter',
                 'small','swab','water','wild',]
    not_replace = ['agriculture','apiculture','aquaculture','aquiculture','aviculture',
                   'coculture','hemoculture','mariculture','monoculture','sericulture',
                   'subculture','viniculture','viticulture',
                   'semifluid','subfluid','superfluid',
                   'superlarge','reenlarge','enlarge','overlarge','largemouth','larges',
                   'bonemeal','cornmeal','fishmeal','inchmeal','oatmeal','piecemeal','premeal',
                   'wholemeal','biosample','ensample','resample','subsample','backwater',
                   'another','bother','brother','foremother','frother','godmother','grandmother',
                   'housemother','mother','otherguess','otherness','othernesse','otherwhere',
                   'otherwhile','otherworld','pother','soother','smoother','smother','stepbrother',
                   'stepmother','tother',
                   'byproduct','coproduct','production','productive','subproduct',
                   'ultrasmall','smaller','smallmouth','smalltime','smallpox','smallpoxe',
                   'smallsword','smallsholder','mediumship',
                   'bathwater','bilgewater','blackwater','breakwater','cutwater','deepwater',
                   'dewater','dishwater','eyewater','firewater','floodwater','freshwater',
                   'graywater','groundwater','headwater','jerkwater','limewater','meltwater',
                   'overwater','polywater','rainwater','rosewater','saltwater','seawater',
                   'shearwater','springwater','tailwater','tidewater','underwater','wastewater',
                   'semiwild','wildcard','wildcat','wildcatter','wildcatted','wildebeest','wilded',
                   'wilder','wilderment','wilderness','wildernesse','wildest','wildfire','wildflower',
                   'wildfowl','wildfowler','wildish','wildland','wildling','wildlife','wildwood',
                   ]

    found_comp = []
    for comp_word in stt_words:
        found_comp.extend(re.findall(rf'({comp_word})(\w+)', sample_text))
    for comp_word in end_words:
        found_comp.extend(re.findall(rf'(\w+)({comp_word})', sample_text))
    for x in found_comp:
        if x[0]+x[1] not in not_replace and x[0]+x[1]+'s' not in not_replace:
            sample_text = sample_text.replace(x[0]+x[1], x[0]+' '+x[1])

    for sample_word in sample_text.split():
        if len(sample_word) > 1:
            new_text.append(sample_word.strip())

    if 'nor' in new_text:
        if 'neither' not in new_text:
            word_ind = new_text.index('nor')
            new_text.insert(max(0, word_ind-2), 'neither')

    for neg_word in neg_words:
        if neg_word in new_text:
            word_ind = new_text.index(neg_word)
            del(new_text[word_ind:word_ind+2])
    return(' '.join(new_text))

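# Illustrative sketch: compound splitting plus negation pruning. A compound
# such as 'chickenfeet' splits on the stt_words prefix 'chicken', while
# not_replace entries such as 'wildlife' stay intact; each negation word is
# then deleted together with the word that follows it:
#   >>> further_cleanup('chickenfeet without skin')
#   'chicken feet'
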
def is_number(input_string):
    '''Determine whether a string is a number'''
    try:
        unicodedata.numeric(input_string)
        return(True)
    except(TypeError, ValueError):
        return(False)

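# Note: unicodedata.numeric() accepts only a single character, so this returns
# True for one-character numerics (including forms like '½') and False for
# multi-character strings:
#   >>> is_number('7'), is_number('½'), is_number('12')
#   (True, True, False)
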
def is_date(input_string):
    '''Determine whether a string is a date or day'''
    try:
        parse(input_string)
        return(True)
    except(ValueError, OverflowError):
        return(False)

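# dateutil's parser is permissive, so day names and partial dates also pass:
#   >>> is_date('2021-05-01'), is_date('Monday'), is_date('chicken')
#   (True, True, False)
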
def singularize_token(token, lookup_table, micro_status, c):
    '''Singularize the string token, if applicable'''
    if token in lookup_table['inflection_exceptions']:
        return(token, micro_status)

    exception_tail_chars_list = ['us', 'ia', 'ta', 'ss'] # TODO: add as, is?
    for char in exception_tail_chars_list:
        if token.endswith(char):
            return(token, micro_status)

    taxon_names = c.execute('''SELECT * FROM standard_resource_labels WHERE key LIKE :key AND
                            value LIKE :value''',
                            {'key':'% '+token,'value':'NCBITaxon%'}).fetchall()
    remove_finds = []
    for x in taxon_names:
        if len(x[0].split()) > 2:
            remove_finds.append(x)
    for x in remove_finds:
        taxon_names.remove(x)
    if taxon_names != []:
        return(token, micro_status)

    lemma = inflection.singularize(token)
    micro_status.append('Inflection (Plural) Treatment: ' + token)
    return(lemma, micro_status)

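# The guards above skip singularization for configured exceptions, for tokens
# with Latinate tails ('virus' ends in 'us', 'bacteria' in 'ia'), and for
# tokens that close a two-word NCBITaxon label (species epithets); everything
# else goes through inflection.singularize, e.g. 'cows' -> 'cow'.
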
def get_cleaned_sample(input_sample, token, lookup_table):
    '''Prepare the cleaned sample phrase using the input token'''
    if input_sample == '' and token not in lookup_table['stop_words']:
        return(token)
    elif token not in lookup_table['stop_words']:
        return(input_sample + ' ' + token)
    else:
        return(input_sample)

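# Illustrative sketch (hypothetical stop-word list): non-stop-word tokens are
# appended to the growing phrase, stop words are dropped:
#   >>> table = {'stop_words': ['the', 'of']}
#   >>> get_cleaned_sample('breast', 'of', table)
#   'breast'
#   >>> get_cleaned_sample('breast', 'chicken', table)
#   'breast chicken'
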
def get_annotated_sample(annotated_sample, lemma):
    '''Embed synonyms in the sample, if available'''
    # TODO: able to annotate permutations instead of just left to right?
    synonym_map = {}
    if not annotated_sample:
        annotated_sample = lemma
    else:
        annotated_sample = f'{annotated_sample} {lemma}'

    conn_syn = sqlite3.connect(synonym_db)
    d = conn_syn.cursor()
    for y in [lemma, _remove_annotated_synonyms(annotated_sample)]:
        d.execute('SELECT * FROM label_synonyms WHERE key=:key', {'key':y})
        for x in d.fetchall():
            if not re.search(x[1], annotated_sample):
                annotated_sample = annotated_sample+' {'+x[1]+'}'
                synonym_map[y] = x[1]
    conn_syn.close()
    return(annotated_sample, synonym_map)

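# Synonyms are appended in curly braces so downstream steps can tell them from
# the original tokens; if the synonym database mapped 'broth' to 'soup'
# (hypothetical contents), a sample 'chicken broth' would come back as
# 'chicken broth {soup}' with synonym_map recording {'broth': 'soup'}.
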
def map_term(term, lookup_table, c, consider_suffixes=False):
    '''Map term to some resource in database'''
    if consider_suffixes:
        for suffix in lookup_table['suffixes']:
            mapping = _map_term_helper(term+' '+suffix, c)
            if mapping:
                for x in mapping:
                    x['status'].insert(-2, 'Suffix Addition')
                return(mapping)
    else:
        mapping = _map_term_helper(term, c)
        if mapping:
            return(mapping)
    return([])

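# With consider_suffixes=True the term is retried with each configured suffix
# (e.g. 'chicken breast' -> 'chicken breast (food product)'; the suffix shown
# is illustrative) and the first suffix that maps wins, with 'Suffix Addition'
# noted in each hit's status list.
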
def annotation_reduce(annotated_sample, synonym_map):
    '''Remove annotations on shorter phrases included in longer phrases with annotations'''
    remove_list = []
    for x in list(synonym_map.keys()):
        for y in list(synonym_map.keys()):
            if x != y:
                if x.startswith(y) or x.endswith(y):
                    remove_list.append(y)
    for x in remove_list:
        annotated_sample = annotated_sample.replace('{'+synonym_map[x]+'}',' ')
    return(' '.join(annotated_sample.split()))

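# Illustrative sketch: when one annotated phrase contains another, the shorter
# phrase's annotation is dropped and the longer one kept:
#   >>> annotation_reduce('chicken broth {stock} {chicken stock}',
#   ...                   {'broth': 'stock', 'chicken broth': 'chicken stock'})
#   'chicken broth {chicken stock}'
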
def get_annotated_synonyms(input_annotations):
    '''Get list of the annotations'''
    synonym_list = []
    for x in input_annotations.split('{')[1:]:
        synonym_list.append(x.split('}')[0])
    return(synonym_list)

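# Illustrative sketch: pulls every curly-brace annotation out of the sample:
#   >>> get_annotated_synonyms('chicken broth {stock} {chicken stock}')
#   ['stock', 'chicken stock']
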
def get_gram_chunks(input_phrase, num):
    '''Make num-gram chunks from input'''
    input_tokens = input_phrase.split()
    if len(input_tokens) < 15:
        return(list(combinations(input_tokens, num)))
    else:
        return(_ngrams(input_phrase, num))
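# Short phrases (< 15 tokens) use all token combinations, which preserves
# non-adjacent pairings; longer phrases fall back to contiguous ngrams to keep
# the candidate count manageable:
#   >>> get_gram_chunks('cooked chicken breast', 2)
#   [('cooked', 'chicken'), ('cooked', 'breast'), ('chicken', 'breast')]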