Mercurial > repos > kkonganti > cfsan_lexmapr2
comparison lexmapr/pipeline_resources.py @ 0:f5c39d0447be
"planemo upload"
author | kkonganti |
---|---|
date | Wed, 31 Aug 2022 14:32:07 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f5c39d0447be |
---|---|
1 """Cache, generate and load resources""" | |
2 | |
3 import csv, json, os, sys | |
4 from nltk import word_tokenize | |
5 from itertools import permutations | |
6 from collections import OrderedDict | |
7 from lexmapr.definitions import ROOT | |
8 from lexmapr.pipeline_helpers import punctuation_treatment | |
9 | |
10 | |
11 def _create_lookup_table_skeleton(): | |
12 '''Generate an empty lookup table''' | |
13 return({'abbreviations':{}, 'inflection_exceptions':{}, 'non_english_words':{}, | |
14 'spelling_mistakes':{}, 'stop_words':{}, 'suffixes':{}}) | |
15 | |
16 | |
def _get_resource_dict(resource_file_name):
    '''Get dictionary of resources from CSV file

    Reads the named file from the ``predefined_resources`` directory under
    ROOT, skips its first (header) record, and maps the first column of each
    remaining row to the second column. Both values are stripped of
    surrounding whitespace and passed through ``punctuation_treatment``.
    A row with only one column maps its key to an empty string. Blank rows
    are ignored. When a key occurs more than once, the last row wins.

    :param resource_file_name: name of a CSV file inside the
        ``predefined_resources`` directory
    :return: dict of treated first-column values to treated second-column
        values
    '''
    resource_path = os.path.join(ROOT, 'predefined_resources', resource_file_name)
    ret_dic = {}
    # newline='' lets the csv module do its own line-ending handling, as
    # the csv documentation recommends for files passed to csv.reader.
    with open(resource_path, newline='') as RES_file:
        reader = csv.reader(RES_file, delimiter=',')
        # Skip the header; the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for a blank line; there is no key to
            # store, so skip it instead of failing on row[0].
            if not row:
                continue
            key = punctuation_treatment(row[0].strip())
            if len(row) > 1:
                ret_dic[key] = punctuation_treatment(row[1].strip())
            else:
                ret_dic[key] = ''
    return ret_dic
29 | |
30 | |
def _add_predefined_resources_to_lookup_table(lookup_table):
    '''Fill the lookup table from the CSV files in lexmapr/predefined_resources

    Each category is replaced by the dictionary read from its first listed
    file and then updated with any further files, so entries from a later
    file override entries from an earlier one.

    :param lookup_table: dict to populate; it is modified in place
    :return: the same ``lookup_table`` object
    '''
    category_sources = (
        ('abbreviations', ('abbrv.csv',)),
        ('inflection_exceptions', ('inflection_exceptions.csv',)),
        ('non_english_words', ('nengwords.csv',)),
        # ScorLex.csv is only used from v 0.7; misspellings.csv is merged
        # on top of it.
        ('spelling_mistakes', ('ScorLex.csv', 'misspellings.csv')),
        ('stop_words', ('mining_stopwords.csv',)),
        ('suffixes', ('suffixes.csv',)),
    )
    for category, file_names in category_sources:
        primary_file, *extra_files = file_names
        lookup_table[category] = _get_resource_dict(primary_file)
        for extra_file in extra_files:
            lookup_table[category].update(_get_resource_dict(extra_file))
    return lookup_table
41 | |
42 | |
def get_predefined_resources():
    '''Return the lookup table, using the JSON cache when it exists

    If ``lookup_table.json`` is present in the ``predefined_resources``
    directory under ROOT, its parsed content is returned. Otherwise the
    table is built from the predefined resource files, written to that
    path as JSON, and returned.

    :return: the lookup table dictionary
    '''
    cache_path = os.path.join(ROOT, 'predefined_resources', 'lookup_table.json')

    # No cache yet: build the table and store it for later calls.
    if not os.path.exists(cache_path):
        built_table = _add_predefined_resources_to_lookup_table(
            _create_lookup_table_skeleton()
        )
        with open(cache_path, 'w') as cache_file:
            json.dump(built_table, cache_file)
        return built_table

    with open(cache_path) as cache_file:
        return json.load(cache_file)
55 | |
56 | |
def get_resource_label_permutations(resource_label):
    '''List every distinct word ordering of a label

    The label is split on whitespace and each ordering of its words is
    joined back together with single spaces. Duplicate orderings (from
    repeated words) are dropped, keeping the first occurrence.

    :param resource_label: string to permute
    :return: list of strings, in the order the permutations are generated
    '''
    words = resource_label.split()
    # The OrderedDict keys act as an insertion-ordered set of word tuples.
    distinct_orderings = OrderedDict.fromkeys(permutations(words))
    return [' '.join(ordering) for ordering in distinct_orderings]