annotate lexmapr/pipeline_resources.py @ 0:f5c39d0447be

"planemo upload"
author kkonganti
date Wed, 31 Aug 2022 14:32:07 -0400
parents
children
rev   line source
kkonganti@0 1 """Cache, generate and load resources"""
kkonganti@0 2
kkonganti@0 3 import csv, json, os, sys
kkonganti@0 4 from nltk import word_tokenize
kkonganti@0 5 from itertools import permutations
kkonganti@0 6 from collections import OrderedDict
kkonganti@0 7 from lexmapr.definitions import ROOT
kkonganti@0 8 from lexmapr.pipeline_helpers import punctuation_treatment
kkonganti@0 9
kkonganti@0 10
kkonganti@0 11 def _create_lookup_table_skeleton():
kkonganti@0 12 '''Generate an empty lookup table'''
kkonganti@0 13 return({'abbreviations':{}, 'inflection_exceptions':{}, 'non_english_words':{},
kkonganti@0 14 'spelling_mistakes':{}, 'stop_words':{}, 'suffixes':{}})
kkonganti@0 15
kkonganti@0 16
def _get_resource_dict(resource_file_name):
    '''Get dictionary of resources from CSV file

    Reads ``resource_file_name`` from the ``predefined_resources`` directory
    under ROOT. The first row is treated as a header and skipped. For every
    other row, the first column becomes the key and the second column the
    value; both are stripped of surrounding whitespace and passed through
    ``punctuation_treatment``. Rows with a single column map to ``''``.
    Completely empty rows (e.g. blank lines) are ignored, and an empty file
    yields an empty dictionary.

    Args:
        resource_file_name: name of the CSV file inside ``predefined_resources``.

    Returns:
        dict of treated key -> treated value (later rows win on duplicate keys).

    Raises:
        OSError: if the file cannot be opened.
    '''
    resource_path = os.path.join(ROOT, 'predefined_resources', resource_file_name)
    ret_dic = {}
    # newline='' leaves newline handling to the csv module, as its docs advise.
    with open(resource_path, newline='') as RES_file:
        reader = csv.reader(RES_file, delimiter=',')
        # Skip the header; the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for blank lines; there is no key to store.
            if not row:
                continue
            key = punctuation_treatment(row[0].strip())
            if len(row) > 1:
                ret_dic[key] = punctuation_treatment(row[1].strip())
            else:
                ret_dic[key] = ''
    return ret_dic
kkonganti@0 29
kkonganti@0 30
def _add_predefined_resources_to_lookup_table(lookup_table):
    '''Fill the lookup table from the CSV files in lexmapr/predefined_resources.

    Each category is replaced by the contents of its first CSV file; any
    further files listed for that category are merged on top, so entries from
    later files override earlier ones. The table is modified in place and
    also returned.
    '''
    category_sources = (
        ('abbreviations', ('abbrv.csv',)),
        ('inflection_exceptions', ('inflection_exceptions.csv',)),
        ('non_english_words', ('nengwords.csv',)),
        # ScorLex.csv only from v 0.7
        ('spelling_mistakes', ('ScorLex.csv', 'misspellings.csv')),
        ('stop_words', ('mining_stopwords.csv',)),
        ('suffixes', ('suffixes.csv',)),
    )
    for category, file_names in category_sources:
        primary_file, *extra_files = file_names
        lookup_table[category] = _get_resource_dict(primary_file)
        for extra_file in extra_files:
            lookup_table[category].update(_get_resource_dict(extra_file))
    return lookup_table
kkonganti@0 41
kkonganti@0 42
def get_predefined_resources():
    '''Creates lookup table

    Returns the table cached in ``predefined_resources/lookup_table.json``
    under ROOT when that file exists and parses as JSON. Otherwise the table
    is built from the predefined CSV resources and an attempt is made to
    write it to the cache file for later calls.

    Returns:
        The lookup table (as loaded from the cache, or freshly built).
    '''
    lookup_table_path = os.path.join(ROOT, 'predefined_resources', 'lookup_table.json')
    if os.path.exists(lookup_table_path):
        try:
            with open(lookup_table_path) as LT_file:
                return json.load(LT_file)
        except ValueError:
            # A truncated or otherwise corrupt cache (json.JSONDecodeError is
            # a ValueError) must not break every later run: rebuild instead.
            pass

    lookup_table = _create_lookup_table_skeleton()
    lookup_table = _add_predefined_resources_to_lookup_table(lookup_table)
    try:
        with open(lookup_table_path, 'w') as LT_file:
            json.dump(lookup_table, LT_file)
    except OSError:
        # The cache is only an optimisation. On a read-only installation the
        # table has still been built, so return it rather than failing.
        pass
    return lookup_table
kkonganti@0 55
kkonganti@0 56
def get_resource_label_permutations(resource_label):
    '''List every distinct ordering of the whitespace-separated words of a label.

    Orderings appear in the sequence produced by itertools.permutations, with
    duplicates (caused by repeated words) dropped after their first
    occurrence. Each ordering is re-joined with single spaces.
    '''
    words = resource_label.split()
    # fromkeys() de-duplicates while keeping first-seen order.
    distinct_orderings = OrderedDict.fromkeys(permutations(words))
    return [' '.join(ordering) for ordering in distinct_orderings]