comparison lexmapr/pipeline_helpers.py @ 0:f5c39d0447be
"planemo upload"
| author | kkonganti |
|---|---|
| date | Wed, 31 Aug 2022 14:32:07 -0400 |
| parents | |
| children | |
| comparison | -1:000000000000 to 0:f5c39d0447be |
1 """Helper functions for main pipeline""" | |
2 | |
3 import inflection, re, unicodedata, sqlite3 | |
4 from collections import OrderedDict | |
5 from itertools import combinations | |
6 from dateutil.parser import parse | |
7 from lexmapr.definitions import synonym_db | |
8 from nltk import pos_tag | |
9 from nltk.tokenize import word_tokenize | |
10 from nltk.tokenize.treebank import TreebankWordDetokenizer | |
11 | |
12 | |


def _lookup_correction(sample, lookup_table, lookup_x, micro_status, status_title):
    '''Apply corrections, if available in resource'''
    # Pad with spaces so only whole-word matches are replaced
    sample = ' ' + sample + ' '
    for x in lookup_table[lookup_x]:
        if re.findall(' '+x+' ', sample):
            micro_status.append(status_title + x)
            sample = sample.replace(' '+x+' ', ' '+lookup_table[lookup_x][x]+' ')
    return(' '.join(sample.split()), micro_status)
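
# Usage sketch with a toy lookup_table; real tables are built from lexmapr's
# resource files, so the entry below is an assumption for illustration:
# >>> table = {'spelling_mistakes': {'chiken': 'chicken'}}
# >>> _lookup_correction('chiken broth', table, 'spelling_mistakes', [],
# ...                    'Spelling Correction Treatment: ')
# ('chicken broth', ['Spelling Correction Treatment: chiken'])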


def _remove_annotated_synonyms(input_annotations):
    '''Remove annotations to see original phrase'''
    output_sample = ''
    copy_char = True
    for x in input_annotations:
        if x == '{':
            copy_char = False
        elif x == '}':
            copy_char = True
        elif copy_char:
            output_sample += x
    # Collapse the double spaces left behind by removed annotations
    while re.search('  ', output_sample):
        output_sample = output_sample.replace('  ', ' ')
    return(output_sample)
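
# For example, stripping the curly-brace synonym annotations that
# get_annotated_sample() embeds further below:
# >>> _remove_annotated_synonyms('chicken {poultry} broth')
# 'chicken broth'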


def _retrieve_map_id(search_results, c):
    '''Get resource id from database'''
    return_list = []
    for x in search_results:
        c.execute('SELECT * FROM non_standard_resource_ids WHERE key=:key', {'key':x[1]})
        for y in c.fetchall():
            result_dic = {'term':y[1], 'id':y[0], 'status':[]}
            if result_dic not in return_list:
                return_list.append(result_dic)
    return(return_list)


def _map_term_helper(term, c):
    '''Maps term to resource or resource permutation'''
    c.execute('SELECT * FROM standard_resource_labels WHERE key=:key', {'key':term})
    search_results = c.fetchall()
    if not search_results:
        c.execute('SELECT * FROM standard_resource_permutations WHERE key=:key', {'key':term})
        search_results = c.fetchall()
    if search_results:
        return(_retrieve_map_id(search_results, c))
    return(None)


def _ngrams(input_phrase, gram_value):
    '''Get n-grams with a given value of gram_value'''
    input_phrase = input_phrase.split()
    output = []
    for i in range(len(input_phrase) - gram_value + 1):
        output.append(input_phrase[i:i + gram_value])
    return(output)
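
# Each n-gram is a list of contiguous tokens:
# >>> _ngrams('swab of chicken carcass', 2)
# [['swab', 'of'], ['of', 'chicken'], ['chicken', 'carcass']]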


def process_sample(sample, lookup_table, micro_status):
    '''Apply corrections to input sample'''
    sample, micro_status = _lookup_correction(sample, lookup_table, 'spelling_mistakes',
                                              micro_status, 'Spelling Correction Treatment: ')
    sample, micro_status = _lookup_correction(sample, lookup_table, 'abbreviations',
                                              micro_status, 'Abbreviation-Acronym Treatment: ')
    sample, micro_status = _lookup_correction(sample, lookup_table, 'non_english_words',
                                              micro_status, 'Non English Language Words Treatment: ')
    return(sample, micro_status)
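
# A minimal sketch of the three correction passes; the lookup_table entries
# below are made up for illustration (real tables come from lexmapr's
# resource files):
# >>> table = {'spelling_mistakes': {'chiken': 'chicken'},
# ...          'abbreviations': {'env': 'environmental'},
# ...          'non_english_words': {'pollo': 'chicken'}}
# >>> process_sample('env chiken swab', table, [])
# ('environmental chicken swab', ['Spelling Correction Treatment: chiken', 'Abbreviation-Acronym Treatment: env'])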


def punctuation_treatment(untreated_term):
    '''Remove punctuations from term'''
    punctuations_regex_char_class = r'[~`!@#$%^*()_\|/{}:;,.<>?]'
    ret_term = ''
    for word_token in untreated_term.split():
        # Words joined by more than one hyphen are split apart; single
        # hyphens are deleted outright below
        if word_token.count('-') > 1:
            ret_term += word_token.replace('-',' ') + ' '
        else:
            ret_term += word_token + ' '
    ret_term = ret_term.lower().replace('\"','').replace('\'ve','').replace('\'m','')
    ret_term = ret_term.replace('\'s','').replace('\'t','').replace('\'ll','').replace('\'re','')
    ret_term = ret_term.replace('\'','').replace('-','').replace('[','').replace(']','')
    ret_term = ret_term.replace('&',' and ').replace('+',' and ').replace('=',' is ')
    ret_term = re.sub(punctuations_regex_char_class, ' ', ret_term).lower()
    return(' '.join(ret_term.split()))
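
# For example, a multi-hyphen token is split while single hyphens are simply
# deleted, and '&' becomes 'and':
# >>> punctuation_treatment('Free-Range-Organic chicken & rice, raw')
# 'free range organic chicken and rice raw'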


def further_cleanup(sample_text):
    '''Remove terms indicated to not be relevant and some compound words'''
    new_text = []
    neg_words = ['no','non','not','neither','nor','without']
    stt_words = ['animal','cb','chicken','environmental','food','human','large','medium','necropsy',
                 'organic','other','poultry','product','sausage','small','stool','swab','wild',]
    end_words = ['aspirate','culture','environmental','fluid','food','intestine','large','meal','medium',
                 'mixed','necropsy','other','poultry','product','research','sample','sausage','slaughter',
                 'small','swab','water','wild',]
    not_replace = ['agriculture','apiculture','aquaculture','aquiculture','aviculture',
                   'coculture','hemoculture','mariculture','monoculture','sericulture',
                   'subculture','viniculture','viticulture',
                   'semifluid','subfluid','superfluid',
                   'superlarge','reenlarge','enlarge','overlarge','largemouth','larges',
                   'bonemeal','cornmeal','fishmeal','inchmeal','oatmeal','piecemeal','premeal',
                   'wholemeal','biosample','ensample','resample','subsample','backwater',
                   'another','bother','brother','foremother','frother','godmother','grandmother',
                   'housemother','mother','otherguess','otherness','othernesse','otherwhere',
                   'otherwhile','otherworld','pother','soother','smoother','smother','stepbrother',
                   'stepmother','tother',
                   'byproduct','coproduct','production','productive','subproduct',
                   'ultrasmall','smaller','smallmouth','smalltime','smallpox','smallpoxe',
                   'smallsword','smallsholder','mediumship',
                   'bathwater','bilgewater','blackwater','breakwater','cutwater','deepwater',
                   'dewater','dishwater','eyewater','firewater','floodwater','freshwater',
                   'graywater','groundwater','headwater','jerkwater','limewater','meltwater',
                   'overwater','polywater','rainwater','rosewater','saltwater','seawater',
                   'shearwater','springwater','tailwater','tidewater','underwater','wastewater',
                   'semiwild','wildcard','wildcat','wildcatter','wildcatted','wildebeest','wilded',
                   'wilder','wilderment','wilderness','wildernesse','wildest','wildfire','wildflower',
                   'wildfowl','wildfowler','wildish','wildland','wildling','wildlife','wildwood',
                   ]

    # Find compound words that begin with a stt_word or end with an end_word,
    # then split apart any that are not legitimate words in their own right
    found_comp = []
    for comp_word in stt_words:
        found_comp.extend(re.findall(rf'({comp_word})(\w+)', sample_text))
    for comp_word in end_words:
        found_comp.extend(re.findall(rf'(\w+)({comp_word})', sample_text))
    for x in found_comp:
        if x[0]+x[1] not in not_replace and x[0]+x[1]+'s' not in not_replace:
            sample_text = sample_text.replace(x[0]+x[1], x[0]+' '+x[1])

    for sample_word in sample_text.split():
        if len(sample_word) > 1:
            new_text.append(sample_word.strip())

    # A 'nor' without 'neither' implies an elided 'neither' two words earlier
    if 'nor' in new_text and 'neither' not in new_text:
        word_ind = new_text.index('nor')
        new_text.insert(max(0, word_ind-2), 'neither')

    # Drop each negation word along with the word that follows it
    for neg_word in neg_words:
        if neg_word in new_text:
            word_ind = new_text.index(neg_word)
            del(new_text[word_ind:word_ind+2])
    return(' '.join(new_text))
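
# Two illustrative behaviors: an unlisted compound word is split, and a
# negation word is dropped together with the word that follows it:
# >>> further_cleanup('environmentalswab from wildboar necropsy')
# 'environmental swab from wild boar necropsy'
# >>> further_cleanup('not chicken swab')
# 'swab'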


def is_number(input_string):
    '''Determine whether a string is a number'''
    try:
        unicodedata.numeric(input_string)
        return(True)
    except(TypeError, ValueError):
        return(False)


def is_date(input_string):
    '''Determine whether a string is a date or day'''
    try:
        parse(input_string)
        return(True)
    except(ValueError, OverflowError):
        return(False)
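
# Quick checks of both predicates; note '42' fails because
# unicodedata.numeric() only accepts a single character:
# >>> is_number('4'), is_number('½'), is_number('42')
# (True, True, False)
# >>> is_date('31 Aug 2022'), is_date('Wednesday'), is_date('broth')
# (True, True, False)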


def singularize_token(token, lookup_table, micro_status, c):
    '''Singularize the string token, if applicable'''
    if token in lookup_table['inflection_exceptions']:
        return(token, micro_status)

    exception_tail_chars_list = ['us', 'ia', 'ta', 'ss']  # TODO: add as, is?
    for char in exception_tail_chars_list:
        if token.endswith(char):
            return(token, micro_status)

    # Do not singularize a token that ends a two-word NCBI taxon name
    taxon_names = c.execute('''SELECT * FROM standard_resource_labels
                               WHERE key LIKE :key AND value LIKE :value''',
                            {'key':'% '+token,'value':'NCBITaxon%'}).fetchall()
    remove_finds = []
    for x in taxon_names:
        if len(x[0].split()) > 2:
            remove_finds.append(x)
    for x in remove_finds:
        taxon_names.remove(x)
    if taxon_names != []:
        return(token, micro_status)

    lemma = inflection.singularize(token)
    micro_status.append('Inflection (Plural) Treatment: ' + token)
    return(lemma, micro_status)
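
# Usage sketch with an empty in-memory stand-in for the resource database and
# a toy lookup_table; the real database and table ship with lexmapr, so both
# are assumptions here:
# >>> conn = sqlite3.connect(':memory:')
# >>> cur = conn.cursor()
# >>> _ = cur.execute('CREATE TABLE standard_resource_labels (key TEXT, value TEXT)')
# >>> table = {'inflection_exceptions': ['species']}
# >>> singularize_token('samples', table, [], cur)
# ('sample', ['Inflection (Plural) Treatment: samples'])
# >>> singularize_token('species', table, [], cur)
# ('species', [])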


def get_cleaned_sample(input_sample, token, lookup_table):
    '''Prepare the cleaned sample phrase using the input token'''
    if input_sample == '' and token not in lookup_table['stop_words']:
        return(token)
    elif token not in lookup_table['stop_words']:
        return(input_sample + ' ' + token)
    else:
        return(input_sample)
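
# Tokens accumulate into the cleaned phrase unless they are stop words:
# >>> table = {'stop_words': ['of']}
# >>> get_cleaned_sample('', 'swab', table)
# 'swab'
# >>> get_cleaned_sample('swab', 'of', table)
# 'swab'
# >>> get_cleaned_sample('swab', 'chicken', table)
# 'swab chicken'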


def get_annotated_sample(annotated_sample, lemma):
    '''Embed synonyms in the sample, if available'''
    # TODO: able to annotate permutations instead of just left to right?
    synonym_map = {}
    if not annotated_sample:
        annotated_sample = lemma
    else:
        annotated_sample = f'{annotated_sample} {lemma}'

    conn_syn = sqlite3.connect(synonym_db)
    d = conn_syn.cursor()
    # Look up synonyms for the new lemma and for the whole unannotated phrase
    for y in [lemma, _remove_annotated_synonyms(annotated_sample)]:
        d.execute('SELECT * FROM label_synonyms WHERE key=:key', {'key':y})
        for x in d.fetchall():
            if not re.search(x[1], annotated_sample):
                annotated_sample = annotated_sample+' {'+x[1]+'}'
                synonym_map[y] = x[1]
    conn_syn.close()
    return(annotated_sample, synonym_map)


def map_term(term, lookup_table, c, consider_suffixes=False):
    '''Map term to some resource in database'''
    if consider_suffixes:
        # Try the term with each known suffix appended until one maps
        for suffix in lookup_table['suffixes']:
            mapping = _map_term_helper(term+' '+suffix, c)
            if mapping:
                for x in mapping:
                    x['status'].insert(-2, 'Suffix Addition')
                return(mapping)
    else:
        mapping = _map_term_helper(term, c)
        if mapping:
            return(mapping)
    return([])
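
# Usage sketch against an in-memory database; the table layouts below are
# inferred from the queries in _map_term_helper and _retrieve_map_id, and the
# resource id is made up for illustration:
# >>> conn = sqlite3.connect(':memory:')
# >>> cur = conn.cursor()
# >>> _ = cur.executescript("""
# ...     CREATE TABLE standard_resource_labels (key TEXT, value TEXT);
# ...     CREATE TABLE standard_resource_permutations (key TEXT, value TEXT);
# ...     CREATE TABLE non_standard_resource_ids (key TEXT, value TEXT);
# ...     INSERT INTO standard_resource_labels VALUES ('chicken breast', 'FAKE_0000001');
# ...     INSERT INTO non_standard_resource_ids VALUES ('FAKE_0000001', 'chicken breast');""")
# >>> map_term('chicken breast', {}, cur)
# [{'term': 'chicken breast', 'id': 'FAKE_0000001', 'status': []}]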


def annotation_reduce(annotated_sample, synonym_map):
    '''Remove annotations on shorter phrases included in longer phrases with annotations'''
    remove_list = []
    for x in list(synonym_map.keys()):
        for y in list(synonym_map.keys()):
            if x != y and (x.startswith(y) or x.endswith(y)):
                remove_list.append(y)
    for x in remove_list:
        annotated_sample = annotated_sample.replace('{'+synonym_map[x]+'}',' ')
    return(' '.join(annotated_sample.split()))


def get_annotated_synonyms(input_annotations):
    '''Get list of the annotations'''
    synonym_list = []
    for x in input_annotations.split('{')[1:]:
        synonym_list.append(x.split('}')[0])
    return(synonym_list)
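
# For example, the 'gallus' annotation is dropped because 'gallus domesticus'
# starts with it (toy synonym_map; real entries come from the synonym
# database):
# >>> s = 'gallus {chicken} gallus domesticus {domestic chicken}'
# >>> m = {'gallus': 'chicken', 'gallus domesticus': 'domestic chicken'}
# >>> reduced = annotation_reduce(s, m)
# >>> reduced
# 'gallus gallus domesticus {domestic chicken}'
# >>> get_annotated_synonyms(reduced)
# ['domestic chicken']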


def get_gram_chunks(input_phrase, num):
    '''Make num-gram chunks from input'''
    input_tokens = input_phrase.split()
    # Short phrases get every token combination; long ones only contiguous n-grams
    if len(input_tokens) < 15:
        return(list(combinations(input_tokens, num)))
    else:
        return(_ngrams(input_phrase, num))
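
# Note the mixed return types: combinations() yields tuples, _ngrams lists.
# >>> get_gram_chunks('swab of chicken breast', 2)
# [('swab', 'of'), ('swab', 'chicken'), ('swab', 'breast'), ('of', 'chicken'), ('of', 'breast'), ('chicken', 'breast')]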