kkonganti@0
|
1 """Cache, generate and load resources"""
|
kkonganti@0
|
2
|
kkonganti@0
|
3 import csv, json, os, sys
|
kkonganti@0
|
4 from nltk import word_tokenize
|
kkonganti@0
|
5 from itertools import permutations
|
kkonganti@0
|
6 from collections import OrderedDict
|
kkonganti@0
|
7 from lexmapr.definitions import ROOT
|
kkonganti@0
|
8 from lexmapr.pipeline_helpers import punctuation_treatment
|
kkonganti@0
|
9
|
kkonganti@0
|
10
|
kkonganti@0
|
11 def _create_lookup_table_skeleton():
|
kkonganti@0
|
12 '''Generate an empty lookup table'''
|
kkonganti@0
|
13 return({'abbreviations':{}, 'inflection_exceptions':{}, 'non_english_words':{},
|
kkonganti@0
|
14 'spelling_mistakes':{}, 'stop_words':{}, 'suffixes':{}})
|
kkonganti@0
|
15
|
kkonganti@0
|
16
|
kkonganti@0
|
def _get_resource_dict(resource_file_name):
    '''Get dictionary of resources from CSV file

    The file is opened from ``ROOT/predefined_resources``. Its first line is
    treated as a header and discarded. For each remaining row the first
    column becomes the key and the second column the value; both are stripped
    of surrounding whitespace and passed through ``punctuation_treatment``.
    A row with only one column maps its key to an empty string, and a key
    that appears more than once keeps the value from its last row.

    Blank lines, which ``csv.reader`` yields as empty rows, are skipped, and
    a file with no lines at all yields an empty dictionary.

    :param resource_file_name: name of a CSV file inside predefined_resources
    :return: dictionary mapping treated keys to treated values
    '''
    ret_dic = {}
    resource_path = os.path.join(ROOT, 'predefined_resources', resource_file_name)
    with open(resource_path) as RES_file:
        # Skip the header line; the default keeps an empty file from
        # raising StopIteration.
        next(RES_file, None)
        for row in csv.reader(RES_file, delimiter=','):
            if not row:
                # Blank line: there is no key column to read.
                continue
            key = punctuation_treatment(row[0].strip())
            if len(row) > 1:
                ret_dic[key] = punctuation_treatment(row[1].strip())
            else:
                # Single-column row: key is known, value is left empty.
                ret_dic[key] = ''
    return ret_dic
|
kkonganti@0
|
29
|
kkonganti@0
|
30
|
kkonganti@0
|
def _add_predefined_resources_to_lookup_table(lookup_table):
    '''Fill each lookup table section from the predefined resource CSV files.

    Every section listed below is replaced by the contents of its first file;
    any further files for that section are merged on top, so entries from
    later files win on duplicate keys.

    :param lookup_table: mapping to populate; it is modified in place
    :return: the same ``lookup_table`` object
    '''
    section_sources = (
        ('abbreviations', ('abbrv.csv',)),
        ('inflection_exceptions', ('inflection_exceptions.csv',)),
        ('non_english_words', ('nengwords.csv',)),
        # ScorLex.csv carries the original note "only from v 0.7"
        ('spelling_mistakes', ('ScorLex.csv', 'misspellings.csv')),
        ('stop_words', ('mining_stopwords.csv',)),
        ('suffixes', ('suffixes.csv',)),
    )
    for section_name, file_names in section_sources:
        lookup_table[section_name] = _get_resource_dict(file_names[0])
        for extra_file_name in file_names[1:]:
            lookup_table[section_name].update(_get_resource_dict(extra_file_name))
    return lookup_table
|
kkonganti@0
|
41
|
kkonganti@0
|
42
|
kkonganti@0
|
def get_predefined_resources():
    '''Creates lookup table

    The table is cached as ``lookup_table.json`` under
    ``ROOT/predefined_resources``. When that file exists and parses as JSON,
    its contents are returned as-is. Otherwise the table is built from the
    predefined resource CSV files, written to the cache file and returned.

    A cache file that exists but cannot be parsed (for example one left
    truncated by an interrupted write) is treated as missing: the table is
    rebuilt and the cache file is overwritten.

    :return: lookup table dictionary
    '''
    lookup_table_path = os.path.join(ROOT, 'predefined_resources', 'lookup_table.json')
    if os.path.exists(lookup_table_path):
        try:
            with open(lookup_table_path) as LT_file:
                return json.load(LT_file)
        except ValueError:
            # json decoding errors are ValueError subclasses. An unreadable
            # cache is not fatal; fall through and regenerate it.
            pass

    lookup_table = _create_lookup_table_skeleton()
    lookup_table = _add_predefined_resources_to_lookup_table(lookup_table)
    with open(lookup_table_path, 'w') as LT_file:
        json.dump(lookup_table, LT_file)
    return lookup_table
|
kkonganti@0
|
55
|
kkonganti@0
|
56
|
kkonganti@0
|
def get_resource_label_permutations(resource_label):
    '''List every distinct word ordering of a label.

    The label is split on whitespace, all orderings of the resulting words
    are generated, and each ordering is re-joined with single spaces.
    Orderings that come out identical (because of repeated words) are
    reported once, at the position of their first occurrence.

    :param resource_label: label string to permute
    :return: list of space-joined orderings
    '''
    words = resource_label.split()
    already_seen = set()
    ordered_labels = []
    for arrangement in permutations(words):
        if arrangement in already_seen:
            continue
        already_seen.add(arrangement)
        ordered_labels.append(' '.join(arrangement))
    return ordered_labels
|