import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple

from .constant import (
    FREQUENCIES,
    KO_NAMES,
    LANGUAGE_SUPPORTED_COUNT,
    TOO_SMALL_SEQUENCE,
    ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)


def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    Return associated unicode ranges in a single-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(
        "encodings.{}".format(iana_name)
    ).IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: Dict[str, int] = {}
    character_count: int = 0

    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: Optional[str] = unicode_range(chunk)

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )


def unicode_range_languages(primary_range: str) -> List[str]:
    """
    Return inferred languages used with a unicode range.
    """
    languages: List[str] = []

    for language, characters in FREQUENCIES.items():
        for character in characters:
            if unicode_range(character) == primary_range:
                languages.append(language)
                break

    return languages


@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Single-byte encoding language association. Some code pages are heavily linked to particular language(s).
    This function does the correspondence.
    """
    unicode_ranges: List[str] = encoding_unicode_range(iana_name)
    primary_range: Optional[str] = None

    for specified_range in unicode_ranges:
        if "Latin" not in specified_range:
            primary_range = specified_range
            break

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)


@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily linked to particular language(s).
    This function does the correspondence.
    """
    if (
        iana_name.startswith("shift_")
        or iana_name.startswith("iso2022_jp")
        or iana_name.startswith("euc_j")
        or iana_name == "cp932"
    ):
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
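
# Hedged usage sketch for the association helpers above. The exact lists depend
# on the codec tables of the running Python build and on the bundled FREQUENCIES
# data, so the outputs below are indicative, not guaranteed:
#
#   >>> encoding_languages("cp1251")        # single-byte, inferred via ranges
#   ['Russian', ...]
#   >>> mb_encoding_languages("shift_jis")  # multi-byte, resolved by name
#   ['Japanese']
#   >>> mb_encoding_languages("utf_8")      # no strong association
#   []
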
jpayne@7: """ jpayne@7: if ( jpayne@7: iana_name.startswith("shift_") jpayne@7: or iana_name.startswith("iso2022_jp") jpayne@7: or iana_name.startswith("euc_j") jpayne@7: or iana_name == "cp932" jpayne@7: ): jpayne@7: return ["Japanese"] jpayne@7: if iana_name.startswith("gb") or iana_name in ZH_NAMES: jpayne@7: return ["Chinese"] jpayne@7: if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES: jpayne@7: return ["Korean"] jpayne@7: jpayne@7: return [] jpayne@7: jpayne@7: jpayne@7: @lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT) jpayne@7: def get_target_features(language: str) -> Tuple[bool, bool]: jpayne@7: """ jpayne@7: Determine main aspects from a supported language if it contains accents and if is pure Latin. jpayne@7: """ jpayne@7: target_have_accents: bool = False jpayne@7: target_pure_latin: bool = True jpayne@7: jpayne@7: for character in FREQUENCIES[language]: jpayne@7: if not target_have_accents and is_accentuated(character): jpayne@7: target_have_accents = True jpayne@7: if target_pure_latin and is_latin(character) is False: jpayne@7: target_pure_latin = False jpayne@7: jpayne@7: return target_have_accents, target_pure_latin jpayne@7: jpayne@7: jpayne@7: def alphabet_languages( jpayne@7: characters: List[str], ignore_non_latin: bool = False jpayne@7: ) -> List[str]: jpayne@7: """ jpayne@7: Return associated languages associated to given characters. jpayne@7: """ jpayne@7: languages: List[Tuple[str, float]] = [] jpayne@7: jpayne@7: source_have_accents = any(is_accentuated(character) for character in characters) jpayne@7: jpayne@7: for language, language_characters in FREQUENCIES.items(): jpayne@7: target_have_accents, target_pure_latin = get_target_features(language) jpayne@7: jpayne@7: if ignore_non_latin and target_pure_latin is False: jpayne@7: continue jpayne@7: jpayne@7: if target_have_accents is False and source_have_accents: jpayne@7: continue jpayne@7: jpayne@7: character_count: int = len(language_characters) jpayne@7: jpayne@7: character_match_count: int = len( jpayne@7: [c for c in language_characters if c in characters] jpayne@7: ) jpayne@7: jpayne@7: ratio: float = character_match_count / character_count jpayne@7: jpayne@7: if ratio >= 0.2: jpayne@7: languages.append((language, ratio)) jpayne@7: jpayne@7: languages = sorted(languages, key=lambda x: x[1], reverse=True) jpayne@7: jpayne@7: return [compatible_language[0] for compatible_language in languages] jpayne@7: jpayne@7: jpayne@7: def characters_popularity_compare( jpayne@7: language: str, ordered_characters: List[str] jpayne@7: ) -> float: jpayne@7: """ jpayne@7: Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language. jpayne@7: The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit). jpayne@7: Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.) 
jpayne@7: """ jpayne@7: if language not in FREQUENCIES: jpayne@7: raise ValueError("{} not available".format(language)) jpayne@7: jpayne@7: character_approved_count: int = 0 jpayne@7: FREQUENCIES_language_set = set(FREQUENCIES[language]) jpayne@7: jpayne@7: ordered_characters_count: int = len(ordered_characters) jpayne@7: target_language_characters_count: int = len(FREQUENCIES[language]) jpayne@7: jpayne@7: large_alphabet: bool = target_language_characters_count > 26 jpayne@7: jpayne@7: for character, character_rank in zip( jpayne@7: ordered_characters, range(0, ordered_characters_count) jpayne@7: ): jpayne@7: if character not in FREQUENCIES_language_set: jpayne@7: continue jpayne@7: jpayne@7: character_rank_in_language: int = FREQUENCIES[language].index(character) jpayne@7: expected_projection_ratio: float = ( jpayne@7: target_language_characters_count / ordered_characters_count jpayne@7: ) jpayne@7: character_rank_projection: int = int(character_rank * expected_projection_ratio) jpayne@7: jpayne@7: if ( jpayne@7: large_alphabet is False jpayne@7: and abs(character_rank_projection - character_rank_in_language) > 4 jpayne@7: ): jpayne@7: continue jpayne@7: jpayne@7: if ( jpayne@7: large_alphabet is True jpayne@7: and abs(character_rank_projection - character_rank_in_language) jpayne@7: < target_language_characters_count / 3 jpayne@7: ): jpayne@7: character_approved_count += 1 jpayne@7: continue jpayne@7: jpayne@7: characters_before_source: List[str] = FREQUENCIES[language][ jpayne@7: 0:character_rank_in_language jpayne@7: ] jpayne@7: characters_after_source: List[str] = FREQUENCIES[language][ jpayne@7: character_rank_in_language: jpayne@7: ] jpayne@7: characters_before: List[str] = ordered_characters[0:character_rank] jpayne@7: characters_after: List[str] = ordered_characters[character_rank:] jpayne@7: jpayne@7: before_match_count: int = len( jpayne@7: set(characters_before) & set(characters_before_source) jpayne@7: ) jpayne@7: jpayne@7: after_match_count: int = len( jpayne@7: set(characters_after) & set(characters_after_source) jpayne@7: ) jpayne@7: jpayne@7: if len(characters_before_source) == 0 and before_match_count <= 4: jpayne@7: character_approved_count += 1 jpayne@7: continue jpayne@7: jpayne@7: if len(characters_after_source) == 0 and after_match_count <= 4: jpayne@7: character_approved_count += 1 jpayne@7: continue jpayne@7: jpayne@7: if ( jpayne@7: before_match_count / len(characters_before_source) >= 0.4 jpayne@7: or after_match_count / len(characters_after_source) >= 0.4 jpayne@7: ): jpayne@7: character_approved_count += 1 jpayne@7: continue jpayne@7: jpayne@7: return character_approved_count / len(ordered_characters) jpayne@7: jpayne@7: jpayne@7: def alpha_unicode_split(decoded_sequence: str) -> List[str]: jpayne@7: """ jpayne@7: Given a decoded text sequence, return a list of str. Unicode range / alphabet separation. jpayne@7: Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list; jpayne@7: One containing the latin letters and the other hebrew. 
jpayne@7: """ jpayne@7: layers: Dict[str, str] = {} jpayne@7: jpayne@7: for character in decoded_sequence: jpayne@7: if character.isalpha() is False: jpayne@7: continue jpayne@7: jpayne@7: character_range: Optional[str] = unicode_range(character) jpayne@7: jpayne@7: if character_range is None: jpayne@7: continue jpayne@7: jpayne@7: layer_target_range: Optional[str] = None jpayne@7: jpayne@7: for discovered_range in layers: jpayne@7: if ( jpayne@7: is_suspiciously_successive_range(discovered_range, character_range) jpayne@7: is False jpayne@7: ): jpayne@7: layer_target_range = discovered_range jpayne@7: break jpayne@7: jpayne@7: if layer_target_range is None: jpayne@7: layer_target_range = character_range jpayne@7: jpayne@7: if layer_target_range not in layers: jpayne@7: layers[layer_target_range] = character.lower() jpayne@7: continue jpayne@7: jpayne@7: layers[layer_target_range] += character.lower() jpayne@7: jpayne@7: return list(layers.values()) jpayne@7: jpayne@7: jpayne@7: def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches: jpayne@7: """ jpayne@7: This function merge results previously given by the function coherence_ratio. jpayne@7: The return type is the same as coherence_ratio. jpayne@7: """ jpayne@7: per_language_ratios: Dict[str, List[float]] = {} jpayne@7: for result in results: jpayne@7: for sub_result in result: jpayne@7: language, ratio = sub_result jpayne@7: if language not in per_language_ratios: jpayne@7: per_language_ratios[language] = [ratio] jpayne@7: continue jpayne@7: per_language_ratios[language].append(ratio) jpayne@7: jpayne@7: merge = [ jpayne@7: ( jpayne@7: language, jpayne@7: round( jpayne@7: sum(per_language_ratios[language]) / len(per_language_ratios[language]), jpayne@7: 4, jpayne@7: ), jpayne@7: ) jpayne@7: for language in per_language_ratios jpayne@7: ] jpayne@7: jpayne@7: return sorted(merge, key=lambda x: x[1], reverse=True) jpayne@7: jpayne@7: jpayne@7: def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches: jpayne@7: """ jpayne@7: We shall NOT return "English—" in CoherenceMatches because it is an alternative jpayne@7: of "English". This function only keeps the best match and remove the em-dash in it. jpayne@7: """ jpayne@7: index_results: Dict[str, List[float]] = dict() jpayne@7: jpayne@7: for result in results: jpayne@7: language, ratio = result jpayne@7: no_em_name: str = language.replace("—", "") jpayne@7: jpayne@7: if no_em_name not in index_results: jpayne@7: index_results[no_em_name] = [] jpayne@7: jpayne@7: index_results[no_em_name].append(ratio) jpayne@7: jpayne@7: if any(len(index_results[e]) > 1 for e in index_results): jpayne@7: filtered_results: CoherenceMatches = [] jpayne@7: jpayne@7: for language in index_results: jpayne@7: filtered_results.append((language, max(index_results[language]))) jpayne@7: jpayne@7: return filtered_results jpayne@7: jpayne@7: return results jpayne@7: jpayne@7: jpayne@7: @lru_cache(maxsize=2048) jpayne@7: def coherence_ratio( jpayne@7: decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None jpayne@7: ) -> CoherenceMatches: jpayne@7: """ jpayne@7: Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers. jpayne@7: A layer = Character extraction by alphabets/ranges. 
jpayne@7: """ jpayne@7: jpayne@7: results: List[Tuple[str, float]] = [] jpayne@7: ignore_non_latin: bool = False jpayne@7: jpayne@7: sufficient_match_count: int = 0 jpayne@7: jpayne@7: lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else [] jpayne@7: if "Latin Based" in lg_inclusion_list: jpayne@7: ignore_non_latin = True jpayne@7: lg_inclusion_list.remove("Latin Based") jpayne@7: jpayne@7: for layer in alpha_unicode_split(decoded_sequence): jpayne@7: sequence_frequencies: TypeCounter[str] = Counter(layer) jpayne@7: most_common = sequence_frequencies.most_common() jpayne@7: jpayne@7: character_count: int = sum(o for c, o in most_common) jpayne@7: jpayne@7: if character_count <= TOO_SMALL_SEQUENCE: jpayne@7: continue jpayne@7: jpayne@7: popular_character_ordered: List[str] = [c for c, o in most_common] jpayne@7: jpayne@7: for language in lg_inclusion_list or alphabet_languages( jpayne@7: popular_character_ordered, ignore_non_latin jpayne@7: ): jpayne@7: ratio: float = characters_popularity_compare( jpayne@7: language, popular_character_ordered jpayne@7: ) jpayne@7: jpayne@7: if ratio < threshold: jpayne@7: continue jpayne@7: elif ratio >= 0.8: jpayne@7: sufficient_match_count += 1 jpayne@7: jpayne@7: results.append((language, round(ratio, 4))) jpayne@7: jpayne@7: if sufficient_match_count >= 3: jpayne@7: break jpayne@7: jpayne@7: return sorted( jpayne@7: filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True jpayne@7: )