comparison charset_normalizer/cd.py @ 7:5eb2d5e3bf22
planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author: jpayne
date: Sun, 05 May 2024 23:32:17 -0400
parents:
children:
comparing 6:b2745907b1eb with 7:5eb2d5e3bf22
import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple

from .constant import (
    FREQUENCIES,
    KO_NAMES,
    LANGUAGE_SUPPORTED_COUNT,
    TOO_SMALL_SEQUENCE,
    ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)


def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    Return the Unicode ranges associated with a single-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(
        "encodings.{}".format(iana_name)
    ).IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: Dict[str, int] = {}
    character_count: int = 0

    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: Optional[str] = unicode_range(chunk)

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )
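# Illustrative usage, a sketch rather than library documentation: the exact
# ranges depend on the code page tables shipped with the running Python, so
# the cp1251 output shown is an assumption.
#
#   >>> encoding_unicode_range("cp1251")
#   ['Basic Latin', 'Cyrillic']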


def unicode_range_languages(primary_range: str) -> List[str]:
    """
    Return inferred languages used with a unicode range.
    """
    languages: List[str] = []

    for language, characters in FREQUENCIES.items():
        for character in characters:
            if unicode_range(character) == primary_range:
                languages.append(language)
                break

    return languages
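# Illustrative usage, a sketch: the result enumerates every language whose
# FREQUENCIES table contains at least one character in the given range, so
# the exact list below is an assumption.
#
#   >>> unicode_range_languages("Cyrillic")
#   ['Russian', 'Ukrainian', 'Bulgarian', ...]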


@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Single-byte encoding language association. Some code pages are heavily
    linked to particular language(s); this function performs that correspondence.
    """
    unicode_ranges: List[str] = encoding_unicode_range(iana_name)
    primary_range: Optional[str] = None

    for specified_range in unicode_ranges:
        if "Latin" not in specified_range:
            primary_range = specified_range
            break

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
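# Illustrative usage, a sketch; the outputs are assumptions since they follow
# from the decoded ranges and the FREQUENCIES tables at runtime:
#
#   >>> encoding_languages("ascii")    # only Latin ranges found
#   ['Latin Based']
#   >>> encoding_languages("cp1251")   # Cyrillic is the primary non-Latin range
#   ['Russian', ...]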


@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily
    linked to particular language(s); this function performs that correspondence.
    """
    if (
        iana_name.startswith("shift_")
        or iana_name.startswith("iso2022_jp")
        or iana_name.startswith("euc_j")
        or iana_name == "cp932"
    ):
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
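# Illustrative usage; these follow directly from the prefix checks above
# (assuming "utf_8" is absent from ZH_NAMES and KO_NAMES):
#
#   >>> mb_encoding_languages("shift_jis")
#   ['Japanese']
#   >>> mb_encoding_languages("gb18030")
#   ['Chinese']
#   >>> mb_encoding_languages("utf_8")
#   []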


@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Determine the main traits of a supported language: whether it contains
    accented characters and whether it is purely Latin-based.
    """
    target_have_accents: bool = False
    target_pure_latin: bool = True

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False

    return target_have_accents, target_pure_latin
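# Illustrative usage, a sketch; the tuples are assumptions since they depend
# on the bundled FREQUENCIES tables:
#
#   >>> get_target_features("English")   # no accents, Latin only
#   (False, True)
#   >>> get_target_features("French")    # accents, Latin only
#   (True, True)
#   >>> get_target_features("Russian")   # no accents, not Latin
#   (False, False)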


def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages associated with the given characters.
    """
    languages: List[Tuple[str, float]] = []

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = len(
            [c for c in language_characters if c in characters]
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
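# Illustrative usage, a sketch; any language whose frequency table shares at
# least 20% of its characters with the input qualifies, so the exact list and
# ordering are assumptions:
#
#   >>> alphabet_languages(list("etaoinshrdlcumwf"))
#   ['English', 'Dutch', ...]   # ordered by overlap ratio, best first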


def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine whether an ordered character list (from most frequent to rarest)
    matches a particular language. The result is a ratio between 0.0 (no
    correspondence) and 1.0 (near-perfect fit). Beware that this function is
    deliberately lenient in order to ease detection (a close match counts as
    a full match).
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    character_approved_count: int = 0
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    large_alphabet: bool = target_language_characters_count > 26

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in FREQUENCIES_language_set:
            continue

        character_rank_in_language: int = FREQUENCIES[language].index(character)
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        characters_before_source: List[str] = FREQUENCIES[language][
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = FREQUENCIES[language][
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)
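# Illustrative usage, a sketch; the exact score depends on the English
# frequency table, so no concrete value is asserted here:
#
#   observed = ["e", "t", "a", "o", "i", "n"]   # most frequent first
#   score = characters_popularity_compare("English", observed)
#   # 0.0 <= score <= 1.0; higher means the observed ranking tracks English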


def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Given a decoded text sequence, return a list of str: a Unicode-range /
    alphabet separation. E.g. a text containing English/Latin with a bit of
    Hebrew will return two items in the resulting list, one containing the
    Latin letters and the other the Hebrew ones.
    """
    layers: Dict[str, str] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        layer_target_range: Optional[str] = None

        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = character.lower()
            continue

        layers[layer_target_range] += character.lower()

    return list(layers.values())
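# Illustrative usage, a sketch; whether two ranges share a layer is decided by
# is_suspiciously_successive_range, so the exact split is an assumption:
#
#   >>> alpha_unicode_split("Hello שלום")
#   ['hello', 'שלום']   # one lower-cased layer per alphabet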


def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    This function merges results previously produced by coherence_ratio.
    The return type is the same as coherence_ratio.
    """
    per_language_ratios: Dict[str, List[float]] = {}
    for result in results:
        for sub_result in result:
            language, ratio = sub_result
            if language not in per_language_ratios:
                per_language_ratios[language] = [ratio]
                continue
            per_language_ratios[language].append(ratio)

    merge = [
        (
            language,
            round(
                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
                4,
            ),
        )
        for language in per_language_ratios
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)
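# Illustrative usage; the averaging follows directly from the function body
# (per-language ratios are averaged and rounded to 4 decimals):
#
#   >>> merge_coherence_ratios([[("English", 0.8)], [("English", 0.6), ("French", 0.5)]])
#   [('English', 0.7), ('French', 0.5)]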


def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an
    alternative of "English". This function keeps only the best match and
    removes the em dash from its name.
    """
    index_results: Dict[str, List[float]] = dict()

    for result in results:
        language, ratio = result
        no_em_name: str = language.replace("—", "")

        if no_em_name not in index_results:
            index_results[no_em_name] = []

        index_results[no_em_name].append(ratio)

    if any(len(index_results[e]) > 1 for e in index_results):
        filtered_results: CoherenceMatches = []

        for language in index_results:
            filtered_results.append((language, max(index_results[language])))

        return filtered_results

    return results
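# Illustrative usage; this follows directly from the function body, with the
# alternative name collapsing onto "English" and the best ratio winning:
#
#   >>> filter_alt_coherence_matches([("English", 0.88), ("English—", 0.99)])
#   [('English', 0.99)]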


@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence. The
    sequence is analysed by layers, where a layer is a character extraction
    per alphabet/Unicode range.
    """

    results: List[Tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: List[str] = [c for c, o in most_common]

        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
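
# Illustrative usage, a sketch; the ratio values are assumptions since they
# depend on the bundled frequency tables:
#
#   >>> coherence_ratio("Ceci est un texte français tout à fait ordinaire.")
#   [('French', 0.66), ...]   # qualifying languages, best score first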