jpayne@7: import importlib
jpayne@7: from codecs import IncrementalDecoder
jpayne@7: from collections import Counter
jpayne@7: from functools import lru_cache
jpayne@7: from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
jpayne@7: 
jpayne@7: from .constant import (
jpayne@7:     FREQUENCIES,
jpayne@7:     KO_NAMES,
jpayne@7:     LANGUAGE_SUPPORTED_COUNT,
jpayne@7:     TOO_SMALL_SEQUENCE,
jpayne@7:     ZH_NAMES,
jpayne@7: )
jpayne@7: from .md import is_suspiciously_successive_range
jpayne@7: from .models import CoherenceMatches
jpayne@7: from .utils import (
jpayne@7:     is_accentuated,
jpayne@7:     is_latin,
jpayne@7:     is_multi_byte_encoding,
jpayne@7:     is_unicode_range_secondary,
jpayne@7:     unicode_range,
jpayne@7: )
jpayne@7: 
jpayne@7: 
jpayne@7: def encoding_unicode_range(iana_name: str) -> List[str]:
jpayne@7:     """
jpayne@7:     Return associated unicode ranges in a single byte code page.
jpayne@7:     """
jpayne@7:     if is_multi_byte_encoding(iana_name):
jpayne@7:         raise IOError("Function not supported on multi-byte code page")
jpayne@7: 
jpayne@7:     decoder = importlib.import_module(
jpayne@7:         "encodings.{}".format(iana_name)
jpayne@7:     ).IncrementalDecoder
jpayne@7: 
jpayne@7:     p: IncrementalDecoder = decoder(errors="ignore")
jpayne@7:     seen_ranges: Dict[str, int] = {}
jpayne@7:     character_count: int = 0
jpayne@7: 
jpayne@7:     for i in range(0x40, 0xFF):
jpayne@7:         chunk: str = p.decode(bytes([i]))
jpayne@7: 
jpayne@7:         if chunk:
jpayne@7:             character_range: Optional[str] = unicode_range(chunk)
jpayne@7: 
jpayne@7:             if character_range is None:
jpayne@7:                 continue
jpayne@7: 
jpayne@7:             if is_unicode_range_secondary(character_range) is False:
jpayne@7:                 if character_range not in seen_ranges:
jpayne@7:                     seen_ranges[character_range] = 0
jpayne@7:                 seen_ranges[character_range] += 1
jpayne@7:                 character_count += 1
jpayne@7: 
jpayne@7:     return sorted(
jpayne@7:         [
jpayne@7:             character_range
jpayne@7:             for character_range in seen_ranges
jpayne@7:             if seen_ranges[character_range] / character_count >= 0.15
jpayne@7:         ]
jpayne@7:     )
jpayne@7: 
jpayne@7: 
jpayne@7: def unicode_range_languages(primary_range: str) -> List[str]:
jpayne@7:     """
jpayne@7:     Return inferred languages used with a unicode range.
jpayne@7:     """
jpayne@7:     languages: List[str] = []
jpayne@7: 
jpayne@7:     for language, characters in FREQUENCIES.items():
jpayne@7:         for character in characters:
jpayne@7:             if unicode_range(character) == primary_range:
jpayne@7:                 languages.append(language)
jpayne@7:                 break
jpayne@7: 
jpayne@7:     return languages
jpayne@7: 
jpayne@7: 
jpayne@7: @lru_cache()
jpayne@7: def encoding_languages(iana_name: str) -> List[str]:
jpayne@7:     """
jpayne@7:     Single-byte encoding language association. Some code page are heavily linked to particular language(s).
jpayne@7:     This function does the correspondence.
jpayne@7:     """
jpayne@7:     unicode_ranges: List[str] = encoding_unicode_range(iana_name)
jpayne@7:     primary_range: Optional[str] = None
jpayne@7: 
jpayne@7:     for specified_range in unicode_ranges:
jpayne@7:         if "Latin" not in specified_range:
jpayne@7:             primary_range = specified_range
jpayne@7:             break
jpayne@7: 
jpayne@7:     if primary_range is None:
jpayne@7:         return ["Latin Based"]
jpayne@7: 
jpayne@7:     return unicode_range_languages(primary_range)
jpayne@7: 
jpayne@7: 
jpayne@7: @lru_cache()
jpayne@7: def mb_encoding_languages(iana_name: str) -> List[str]:
jpayne@7:     """
jpayne@7:     Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
jpayne@7:     This function does the correspondence.
jpayne@7:     """
jpayne@7:     if (
jpayne@7:         iana_name.startswith("shift_")
jpayne@7:         or iana_name.startswith("iso2022_jp")
jpayne@7:         or iana_name.startswith("euc_j")
jpayne@7:         or iana_name == "cp932"
jpayne@7:     ):
jpayne@7:         return ["Japanese"]
jpayne@7:     if iana_name.startswith("gb") or iana_name in ZH_NAMES:
jpayne@7:         return ["Chinese"]
jpayne@7:     if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
jpayne@7:         return ["Korean"]
jpayne@7: 
jpayne@7:     return []
jpayne@7: 
jpayne@7: 
jpayne@7: @lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
jpayne@7: def get_target_features(language: str) -> Tuple[bool, bool]:
jpayne@7:     """
jpayne@7:     Determine main aspects from a supported language if it contains accents and if is pure Latin.
jpayne@7:     """
jpayne@7:     target_have_accents: bool = False
jpayne@7:     target_pure_latin: bool = True
jpayne@7: 
jpayne@7:     for character in FREQUENCIES[language]:
jpayne@7:         if not target_have_accents and is_accentuated(character):
jpayne@7:             target_have_accents = True
jpayne@7:         if target_pure_latin and is_latin(character) is False:
jpayne@7:             target_pure_latin = False
jpayne@7: 
jpayne@7:     return target_have_accents, target_pure_latin
jpayne@7: 
jpayne@7: 
jpayne@7: def alphabet_languages(
jpayne@7:     characters: List[str], ignore_non_latin: bool = False
jpayne@7: ) -> List[str]:
jpayne@7:     """
jpayne@7:     Return associated languages associated to given characters.
jpayne@7:     """
jpayne@7:     languages: List[Tuple[str, float]] = []
jpayne@7: 
jpayne@7:     source_have_accents = any(is_accentuated(character) for character in characters)
jpayne@7: 
jpayne@7:     for language, language_characters in FREQUENCIES.items():
jpayne@7:         target_have_accents, target_pure_latin = get_target_features(language)
jpayne@7: 
jpayne@7:         if ignore_non_latin and target_pure_latin is False:
jpayne@7:             continue
jpayne@7: 
jpayne@7:         if target_have_accents is False and source_have_accents:
jpayne@7:             continue
jpayne@7: 
jpayne@7:         character_count: int = len(language_characters)
jpayne@7: 
jpayne@7:         character_match_count: int = len(
jpayne@7:             [c for c in language_characters if c in characters]
jpayne@7:         )
jpayne@7: 
jpayne@7:         ratio: float = character_match_count / character_count
jpayne@7: 
jpayne@7:         if ratio >= 0.2:
jpayne@7:             languages.append((language, ratio))
jpayne@7: 
jpayne@7:     languages = sorted(languages, key=lambda x: x[1], reverse=True)
jpayne@7: 
jpayne@7:     return [compatible_language[0] for compatible_language in languages]
jpayne@7: 
jpayne@7: 
jpayne@7: def characters_popularity_compare(
jpayne@7:     language: str, ordered_characters: List[str]
jpayne@7: ) -> float:
jpayne@7:     """
jpayne@7:     Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
jpayne@7:     The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
jpayne@7:     Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
jpayne@7:     """
jpayne@7:     if language not in FREQUENCIES:
jpayne@7:         raise ValueError("{} not available".format(language))
jpayne@7: 
jpayne@7:     character_approved_count: int = 0
jpayne@7:     FREQUENCIES_language_set = set(FREQUENCIES[language])
jpayne@7: 
jpayne@7:     ordered_characters_count: int = len(ordered_characters)
jpayne@7:     target_language_characters_count: int = len(FREQUENCIES[language])
jpayne@7: 
jpayne@7:     large_alphabet: bool = target_language_characters_count > 26
jpayne@7: 
jpayne@7:     for character, character_rank in zip(
jpayne@7:         ordered_characters, range(0, ordered_characters_count)
jpayne@7:     ):
jpayne@7:         if character not in FREQUENCIES_language_set:
jpayne@7:             continue
jpayne@7: 
jpayne@7:         character_rank_in_language: int = FREQUENCIES[language].index(character)
jpayne@7:         expected_projection_ratio: float = (
jpayne@7:             target_language_characters_count / ordered_characters_count
jpayne@7:         )
jpayne@7:         character_rank_projection: int = int(character_rank * expected_projection_ratio)
jpayne@7: 
jpayne@7:         if (
jpayne@7:             large_alphabet is False
jpayne@7:             and abs(character_rank_projection - character_rank_in_language) > 4
jpayne@7:         ):
jpayne@7:             continue
jpayne@7: 
jpayne@7:         if (
jpayne@7:             large_alphabet is True
jpayne@7:             and abs(character_rank_projection - character_rank_in_language)
jpayne@7:             < target_language_characters_count / 3
jpayne@7:         ):
jpayne@7:             character_approved_count += 1
jpayne@7:             continue
jpayne@7: 
jpayne@7:         characters_before_source: List[str] = FREQUENCIES[language][
jpayne@7:             0:character_rank_in_language
jpayne@7:         ]
jpayne@7:         characters_after_source: List[str] = FREQUENCIES[language][
jpayne@7:             character_rank_in_language:
jpayne@7:         ]
jpayne@7:         characters_before: List[str] = ordered_characters[0:character_rank]
jpayne@7:         characters_after: List[str] = ordered_characters[character_rank:]
jpayne@7: 
jpayne@7:         before_match_count: int = len(
jpayne@7:             set(characters_before) & set(characters_before_source)
jpayne@7:         )
jpayne@7: 
jpayne@7:         after_match_count: int = len(
jpayne@7:             set(characters_after) & set(characters_after_source)
jpayne@7:         )
jpayne@7: 
jpayne@7:         if len(characters_before_source) == 0 and before_match_count <= 4:
jpayne@7:             character_approved_count += 1
jpayne@7:             continue
jpayne@7: 
jpayne@7:         if len(characters_after_source) == 0 and after_match_count <= 4:
jpayne@7:             character_approved_count += 1
jpayne@7:             continue
jpayne@7: 
jpayne@7:         if (
jpayne@7:             before_match_count / len(characters_before_source) >= 0.4
jpayne@7:             or after_match_count / len(characters_after_source) >= 0.4
jpayne@7:         ):
jpayne@7:             character_approved_count += 1
jpayne@7:             continue
jpayne@7: 
jpayne@7:     return character_approved_count / len(ordered_characters)
jpayne@7: 
jpayne@7: 
jpayne@7: def alpha_unicode_split(decoded_sequence: str) -> List[str]:
jpayne@7:     """
jpayne@7:     Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
jpayne@7:     Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
jpayne@7:     One containing the latin letters and the other hebrew.
jpayne@7:     """
jpayne@7:     layers: Dict[str, str] = {}
jpayne@7: 
jpayne@7:     for character in decoded_sequence:
jpayne@7:         if character.isalpha() is False:
jpayne@7:             continue
jpayne@7: 
jpayne@7:         character_range: Optional[str] = unicode_range(character)
jpayne@7: 
jpayne@7:         if character_range is None:
jpayne@7:             continue
jpayne@7: 
jpayne@7:         layer_target_range: Optional[str] = None
jpayne@7: 
jpayne@7:         for discovered_range in layers:
jpayne@7:             if (
jpayne@7:                 is_suspiciously_successive_range(discovered_range, character_range)
jpayne@7:                 is False
jpayne@7:             ):
jpayne@7:                 layer_target_range = discovered_range
jpayne@7:                 break
jpayne@7: 
jpayne@7:         if layer_target_range is None:
jpayne@7:             layer_target_range = character_range
jpayne@7: 
jpayne@7:         if layer_target_range not in layers:
jpayne@7:             layers[layer_target_range] = character.lower()
jpayne@7:             continue
jpayne@7: 
jpayne@7:         layers[layer_target_range] += character.lower()
jpayne@7: 
jpayne@7:     return list(layers.values())
jpayne@7: 
jpayne@7: 
jpayne@7: def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
jpayne@7:     """
jpayne@7:     This function merge results previously given by the function coherence_ratio.
jpayne@7:     The return type is the same as coherence_ratio.
jpayne@7:     """
jpayne@7:     per_language_ratios: Dict[str, List[float]] = {}
jpayne@7:     for result in results:
jpayne@7:         for sub_result in result:
jpayne@7:             language, ratio = sub_result
jpayne@7:             if language not in per_language_ratios:
jpayne@7:                 per_language_ratios[language] = [ratio]
jpayne@7:                 continue
jpayne@7:             per_language_ratios[language].append(ratio)
jpayne@7: 
jpayne@7:     merge = [
jpayne@7:         (
jpayne@7:             language,
jpayne@7:             round(
jpayne@7:                 sum(per_language_ratios[language]) / len(per_language_ratios[language]),
jpayne@7:                 4,
jpayne@7:             ),
jpayne@7:         )
jpayne@7:         for language in per_language_ratios
jpayne@7:     ]
jpayne@7: 
jpayne@7:     return sorted(merge, key=lambda x: x[1], reverse=True)
jpayne@7: 
jpayne@7: 
jpayne@7: def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
jpayne@7:     """
jpayne@7:     We shall NOT return "English—" in CoherenceMatches because it is an alternative
jpayne@7:     of "English". This function only keeps the best match and remove the em-dash in it.
jpayne@7:     """
jpayne@7:     index_results: Dict[str, List[float]] = dict()
jpayne@7: 
jpayne@7:     for result in results:
jpayne@7:         language, ratio = result
jpayne@7:         no_em_name: str = language.replace("—", "")
jpayne@7: 
jpayne@7:         if no_em_name not in index_results:
jpayne@7:             index_results[no_em_name] = []
jpayne@7: 
jpayne@7:         index_results[no_em_name].append(ratio)
jpayne@7: 
jpayne@7:     if any(len(index_results[e]) > 1 for e in index_results):
jpayne@7:         filtered_results: CoherenceMatches = []
jpayne@7: 
jpayne@7:         for language in index_results:
jpayne@7:             filtered_results.append((language, max(index_results[language])))
jpayne@7: 
jpayne@7:         return filtered_results
jpayne@7: 
jpayne@7:     return results
jpayne@7: 
jpayne@7: 
jpayne@7: @lru_cache(maxsize=2048)
jpayne@7: def coherence_ratio(
jpayne@7:     decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
jpayne@7: ) -> CoherenceMatches:
jpayne@7:     """
jpayne@7:     Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
jpayne@7:     A layer = Character extraction by alphabets/ranges.
jpayne@7:     """
jpayne@7: 
jpayne@7:     results: List[Tuple[str, float]] = []
jpayne@7:     ignore_non_latin: bool = False
jpayne@7: 
jpayne@7:     sufficient_match_count: int = 0
jpayne@7: 
jpayne@7:     lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
jpayne@7:     if "Latin Based" in lg_inclusion_list:
jpayne@7:         ignore_non_latin = True
jpayne@7:         lg_inclusion_list.remove("Latin Based")
jpayne@7: 
jpayne@7:     for layer in alpha_unicode_split(decoded_sequence):
jpayne@7:         sequence_frequencies: TypeCounter[str] = Counter(layer)
jpayne@7:         most_common = sequence_frequencies.most_common()
jpayne@7: 
jpayne@7:         character_count: int = sum(o for c, o in most_common)
jpayne@7: 
jpayne@7:         if character_count <= TOO_SMALL_SEQUENCE:
jpayne@7:             continue
jpayne@7: 
jpayne@7:         popular_character_ordered: List[str] = [c for c, o in most_common]
jpayne@7: 
jpayne@7:         for language in lg_inclusion_list or alphabet_languages(
jpayne@7:             popular_character_ordered, ignore_non_latin
jpayne@7:         ):
jpayne@7:             ratio: float = characters_popularity_compare(
jpayne@7:                 language, popular_character_ordered
jpayne@7:             )
jpayne@7: 
jpayne@7:             if ratio < threshold:
jpayne@7:                 continue
jpayne@7:             elif ratio >= 0.8:
jpayne@7:                 sufficient_match_count += 1
jpayne@7: 
jpayne@7:             results.append((language, round(ratio, 4)))
jpayne@7: 
jpayne@7:             if sufficient_match_count >= 3:
jpayne@7:                 break
jpayne@7: 
jpayne@7:     return sorted(
jpayne@7:         filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
jpayne@7:     )