"""Cache, generate and load resources"""

import csv
import json
import os
import sys
from nltk import word_tokenize
from itertools import permutations
from collections import OrderedDict
from lexmapr.definitions import ROOT
from lexmapr.pipeline_helpers import punctuation_treatment


def _create_lookup_table_skeleton():
    '''Generate an empty lookup table

    Returns a dict with one empty sub-dictionary per resource category.
    '''
    return {'abbreviations': {}, 'inflection_exceptions': {}, 'non_english_words': {},
            'spelling_mistakes': {}, 'stop_words': {}, 'suffixes': {}}


def _get_resource_dict(resource_file_name):
    '''Get dictionary of resources from CSV file

    Reads ``<ROOT>/predefined_resources/<resource_file_name>``, skips the
    first (header) line and maps the first column of every row to the
    second column. Both columns are stripped and passed through
    ``punctuation_treatment``. Rows with a single column map to ``''``.
    Blank rows are ignored and an empty file yields an empty dictionary.
    '''
    ret_dic = {}
    resource_path = os.path.join(ROOT, 'predefined_resources', resource_file_name)
    # newline='' is what the csv module expects so that it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(resource_path, newline='') as RES_file:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(RES_file, None)
        for row in csv.reader(RES_file, delimiter=','):
            # csv.reader yields [] for blank lines; there is no key to
            # store, so skip them instead of failing on row[0].
            if not row:
                continue
            key = punctuation_treatment(row[0].strip())
            if len(row) > 1:
                ret_dic[key] = punctuation_treatment(row[1].strip())
            else:
                ret_dic[key] = ''
    return ret_dic


def _add_predefined_resources_to_lookup_table(lookup_table):
    '''Adds elements from lexmapr/predefined_resources to lookup table

    Fills every category of ``lookup_table`` in place from its CSV file
    and returns the same dictionary.
    '''
    lookup_table['abbreviations'] = _get_resource_dict('abbrv.csv')
    lookup_table['inflection_exceptions'] = _get_resource_dict('inflection_exceptions.csv')
    lookup_table['non_english_words'] = _get_resource_dict('nengwords.csv')
    lookup_table['spelling_mistakes'] = _get_resource_dict('ScorLex.csv')  # only from v 0.7
    # Entries from misspellings.csv override ScorLex.csv entries with the same key.
    lookup_table['spelling_mistakes'].update(_get_resource_dict('misspellings.csv'))
    lookup_table['stop_words'] = _get_resource_dict('mining_stopwords.csv')
    lookup_table['suffixes'] = _get_resource_dict('suffixes.csv')
    return lookup_table


def get_predefined_resources():
    '''Creates lookup table

    Loads the table from ``<ROOT>/predefined_resources/lookup_table.json``
    when that file exists. Otherwise the table is built from the CSV
    resources and written to that path so later calls can load it.
    '''
    lookup_table_path = os.path.join(ROOT, 'predefined_resources', 'lookup_table.json')
    if os.path.exists(lookup_table_path):
        with open(lookup_table_path) as LT_file:
            lookup_table = json.load(LT_file)
    else:
        lookup_table = _create_lookup_table_skeleton()
        lookup_table = _add_predefined_resources_to_lookup_table(lookup_table)
        with open(lookup_table_path, 'w') as LT_file:
            json.dump(lookup_table, LT_file)
    return lookup_table


def get_resource_label_permutations(resource_label):
    '''Get permutations of some term

    Splits ``resource_label`` on whitespace and returns every distinct
    ordering of its tokens, each joined with single spaces, in the order
    ``itertools.permutations`` produces them. The number of results grows
    factorially with the number of tokens.
    '''
    # Repeated tokens produce identical tuples; fromkeys drops the
    # duplicates while keeping first-seen order.
    unique_orderings = OrderedDict.fromkeys(permutations(resource_label.split()))
    return [' '.join(ordering) for ordering in unique_orderings]