Mercurial > repos > jpayne > bioproject_to_srr_2
diff charset_normalizer/md.py @ 7:5eb2d5e3bf22
planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author | jpayne |
---|---|
date | Sun, 05 May 2024 23:32:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/charset_normalizer/md.py Sun May 05 23:32:17 2024 -0400 @@ -0,0 +1,615 @@ +from functools import lru_cache +from logging import getLogger +from typing import List, Optional + +from .constant import ( + COMMON_SAFE_ASCII_CHARACTERS, + TRACE, + UNICODE_SECONDARY_RANGE_KEYWORD, +) +from .utils import ( + is_accentuated, + is_arabic, + is_arabic_isolated_form, + is_case_variable, + is_cjk, + is_emoticon, + is_hangul, + is_hiragana, + is_katakana, + is_latin, + is_punctuation, + is_separator, + is_symbol, + is_thai, + is_unprintable, + remove_accent, + unicode_range, +) + + +class MessDetectorPlugin: + """ + Base abstract class used for mess detection plugins. + All detectors MUST extend and implement given methods. + """ + + def eligible(self, character: str) -> bool: + """ + Determine if given character should be fed in. + """ + raise NotImplementedError # pragma: nocover + + def feed(self, character: str) -> None: + """ + The main routine to be executed upon character. + Insert the logic in witch the text would be considered chaotic. + """ + raise NotImplementedError # pragma: nocover + + def reset(self) -> None: # pragma: no cover + """ + Permit to reset the plugin to the initial state. + """ + raise NotImplementedError + + @property + def ratio(self) -> float: + """ + Compute the chaos ratio based on what your feed() has seen. + Must NOT be lower than 0.; No restriction gt 0. + """ + raise NotImplementedError # pragma: nocover + + +class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin): + def __init__(self) -> None: + self._punctuation_count: int = 0 + self._symbol_count: int = 0 + self._character_count: int = 0 + + self._last_printable_char: Optional[str] = None + self._frenzy_symbol_in_word: bool = False + + def eligible(self, character: str) -> bool: + return character.isprintable() + + def feed(self, character: str) -> None: + self._character_count += 1 + + if ( + character != self._last_printable_char + and character not in COMMON_SAFE_ASCII_CHARACTERS + ): + if is_punctuation(character): + self._punctuation_count += 1 + elif ( + character.isdigit() is False + and is_symbol(character) + and is_emoticon(character) is False + ): + self._symbol_count += 2 + + self._last_printable_char = character + + def reset(self) -> None: # pragma: no cover + self._punctuation_count = 0 + self._character_count = 0 + self._symbol_count = 0 + + @property + def ratio(self) -> float: + if self._character_count == 0: + return 0.0 + + ratio_of_punctuation: float = ( + self._punctuation_count + self._symbol_count + ) / self._character_count + + return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0 + + +class TooManyAccentuatedPlugin(MessDetectorPlugin): + def __init__(self) -> None: + self._character_count: int = 0 + self._accentuated_count: int = 0 + + def eligible(self, character: str) -> bool: + return character.isalpha() + + def feed(self, character: str) -> None: + self._character_count += 1 + + if is_accentuated(character): + self._accentuated_count += 1 + + def reset(self) -> None: # pragma: no cover + self._character_count = 0 + self._accentuated_count = 0 + + @property + def ratio(self) -> float: + if self._character_count < 8: + return 0.0 + + ratio_of_accentuation: float = self._accentuated_count / self._character_count + return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0 + + +class UnprintablePlugin(MessDetectorPlugin): + def __init__(self) -> None: + self._unprintable_count: int = 0 + self._character_count: int = 0 + + def eligible(self, character: str) -> bool: + return True + + def feed(self, character: str) -> None: + if is_unprintable(character): + self._unprintable_count += 1 + self._character_count += 1 + + def reset(self) -> None: # pragma: no cover + self._unprintable_count = 0 + + @property + def ratio(self) -> float: + if self._character_count == 0: + return 0.0 + + return (self._unprintable_count * 8) / self._character_count + + +class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin): + def __init__(self) -> None: + self._successive_count: int = 0 + self._character_count: int = 0 + + self._last_latin_character: Optional[str] = None + + def eligible(self, character: str) -> bool: + return character.isalpha() and is_latin(character) + + def feed(self, character: str) -> None: + self._character_count += 1 + if ( + self._last_latin_character is not None + and is_accentuated(character) + and is_accentuated(self._last_latin_character) + ): + if character.isupper() and self._last_latin_character.isupper(): + self._successive_count += 1 + # Worse if its the same char duplicated with different accent. + if remove_accent(character) == remove_accent(self._last_latin_character): + self._successive_count += 1 + self._last_latin_character = character + + def reset(self) -> None: # pragma: no cover + self._successive_count = 0 + self._character_count = 0 + self._last_latin_character = None + + @property + def ratio(self) -> float: + if self._character_count == 0: + return 0.0 + + return (self._successive_count * 2) / self._character_count + + +class SuspiciousRange(MessDetectorPlugin): + def __init__(self) -> None: + self._suspicious_successive_range_count: int = 0 + self._character_count: int = 0 + self._last_printable_seen: Optional[str] = None + + def eligible(self, character: str) -> bool: + return character.isprintable() + + def feed(self, character: str) -> None: + self._character_count += 1 + + if ( + character.isspace() + or is_punctuation(character) + or character in COMMON_SAFE_ASCII_CHARACTERS + ): + self._last_printable_seen = None + return + + if self._last_printable_seen is None: + self._last_printable_seen = character + return + + unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen) + unicode_range_b: Optional[str] = unicode_range(character) + + if is_suspiciously_successive_range(unicode_range_a, unicode_range_b): + self._suspicious_successive_range_count += 1 + + self._last_printable_seen = character + + def reset(self) -> None: # pragma: no cover + self._character_count = 0 + self._suspicious_successive_range_count = 0 + self._last_printable_seen = None + + @property + def ratio(self) -> float: + if self._character_count <= 24: + return 0.0 + + ratio_of_suspicious_range_usage: float = ( + self._suspicious_successive_range_count * 2 + ) / self._character_count + + return ratio_of_suspicious_range_usage + + +class SuperWeirdWordPlugin(MessDetectorPlugin): + def __init__(self) -> None: + self._word_count: int = 0 + self._bad_word_count: int = 0 + self._foreign_long_count: int = 0 + + self._is_current_word_bad: bool = False + self._foreign_long_watch: bool = False + + self._character_count: int = 0 + self._bad_character_count: int = 0 + + self._buffer: str = "" + self._buffer_accent_count: int = 0 + + def eligible(self, character: str) -> bool: + return True + + def feed(self, character: str) -> None: + if character.isalpha(): + self._buffer += character + if is_accentuated(character): + self._buffer_accent_count += 1 + if ( + self._foreign_long_watch is False + and (is_latin(character) is False or is_accentuated(character)) + and is_cjk(character) is False + and is_hangul(character) is False + and is_katakana(character) is False + and is_hiragana(character) is False + and is_thai(character) is False + ): + self._foreign_long_watch = True + return + if not self._buffer: + return + if ( + character.isspace() or is_punctuation(character) or is_separator(character) + ) and self._buffer: + self._word_count += 1 + buffer_length: int = len(self._buffer) + + self._character_count += buffer_length + + if buffer_length >= 4: + if self._buffer_accent_count / buffer_length > 0.34: + self._is_current_word_bad = True + # Word/Buffer ending with an upper case accentuated letter are so rare, + # that we will consider them all as suspicious. Same weight as foreign_long suspicious. + if ( + is_accentuated(self._buffer[-1]) + and self._buffer[-1].isupper() + and all(_.isupper() for _ in self._buffer) is False + ): + self._foreign_long_count += 1 + self._is_current_word_bad = True + if buffer_length >= 24 and self._foreign_long_watch: + camel_case_dst = [ + i + for c, i in zip(self._buffer, range(0, buffer_length)) + if c.isupper() + ] + probable_camel_cased: bool = False + + if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3): + probable_camel_cased = True + + if not probable_camel_cased: + self._foreign_long_count += 1 + self._is_current_word_bad = True + + if self._is_current_word_bad: + self._bad_word_count += 1 + self._bad_character_count += len(self._buffer) + self._is_current_word_bad = False + + self._foreign_long_watch = False + self._buffer = "" + self._buffer_accent_count = 0 + elif ( + character not in {"<", ">", "-", "=", "~", "|", "_"} + and character.isdigit() is False + and is_symbol(character) + ): + self._is_current_word_bad = True + self._buffer += character + + def reset(self) -> None: # pragma: no cover + self._buffer = "" + self._is_current_word_bad = False + self._foreign_long_watch = False + self._bad_word_count = 0 + self._word_count = 0 + self._character_count = 0 + self._bad_character_count = 0 + self._foreign_long_count = 0 + + @property + def ratio(self) -> float: + if self._word_count <= 10 and self._foreign_long_count == 0: + return 0.0 + + return self._bad_character_count / self._character_count + + +class CjkInvalidStopPlugin(MessDetectorPlugin): + """ + GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and + can be easily detected. Searching for the overuse of '丅' and '丄'. + """ + + def __init__(self) -> None: + self._wrong_stop_count: int = 0 + self._cjk_character_count: int = 0 + + def eligible(self, character: str) -> bool: + return True + + def feed(self, character: str) -> None: + if character in {"丅", "丄"}: + self._wrong_stop_count += 1 + return + if is_cjk(character): + self._cjk_character_count += 1 + + def reset(self) -> None: # pragma: no cover + self._wrong_stop_count = 0 + self._cjk_character_count = 0 + + @property + def ratio(self) -> float: + if self._cjk_character_count < 16: + return 0.0 + return self._wrong_stop_count / self._cjk_character_count + + +class ArchaicUpperLowerPlugin(MessDetectorPlugin): + def __init__(self) -> None: + self._buf: bool = False + + self._character_count_since_last_sep: int = 0 + + self._successive_upper_lower_count: int = 0 + self._successive_upper_lower_count_final: int = 0 + + self._character_count: int = 0 + + self._last_alpha_seen: Optional[str] = None + self._current_ascii_only: bool = True + + def eligible(self, character: str) -> bool: + return True + + def feed(self, character: str) -> None: + is_concerned = character.isalpha() and is_case_variable(character) + chunk_sep = is_concerned is False + + if chunk_sep and self._character_count_since_last_sep > 0: + if ( + self._character_count_since_last_sep <= 64 + and character.isdigit() is False + and self._current_ascii_only is False + ): + self._successive_upper_lower_count_final += ( + self._successive_upper_lower_count + ) + + self._successive_upper_lower_count = 0 + self._character_count_since_last_sep = 0 + self._last_alpha_seen = None + self._buf = False + self._character_count += 1 + self._current_ascii_only = True + + return + + if self._current_ascii_only is True and character.isascii() is False: + self._current_ascii_only = False + + if self._last_alpha_seen is not None: + if (character.isupper() and self._last_alpha_seen.islower()) or ( + character.islower() and self._last_alpha_seen.isupper() + ): + if self._buf is True: + self._successive_upper_lower_count += 2 + self._buf = False + else: + self._buf = True + else: + self._buf = False + + self._character_count += 1 + self._character_count_since_last_sep += 1 + self._last_alpha_seen = character + + def reset(self) -> None: # pragma: no cover + self._character_count = 0 + self._character_count_since_last_sep = 0 + self._successive_upper_lower_count = 0 + self._successive_upper_lower_count_final = 0 + self._last_alpha_seen = None + self._buf = False + self._current_ascii_only = True + + @property + def ratio(self) -> float: + if self._character_count == 0: + return 0.0 + + return self._successive_upper_lower_count_final / self._character_count + + +class ArabicIsolatedFormPlugin(MessDetectorPlugin): + def __init__(self) -> None: + self._character_count: int = 0 + self._isolated_form_count: int = 0 + + def reset(self) -> None: # pragma: no cover + self._character_count = 0 + self._isolated_form_count = 0 + + def eligible(self, character: str) -> bool: + return is_arabic(character) + + def feed(self, character: str) -> None: + self._character_count += 1 + + if is_arabic_isolated_form(character): + self._isolated_form_count += 1 + + @property + def ratio(self) -> float: + if self._character_count < 8: + return 0.0 + + isolated_form_usage: float = self._isolated_form_count / self._character_count + + return isolated_form_usage + + +@lru_cache(maxsize=1024) +def is_suspiciously_successive_range( + unicode_range_a: Optional[str], unicode_range_b: Optional[str] +) -> bool: + """ + Determine if two Unicode range seen next to each other can be considered as suspicious. + """ + if unicode_range_a is None or unicode_range_b is None: + return True + + if unicode_range_a == unicode_range_b: + return False + + if "Latin" in unicode_range_a and "Latin" in unicode_range_b: + return False + + if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b: + return False + + # Latin characters can be accompanied with a combining diacritical mark + # eg. Vietnamese. + if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and ( + "Combining" in unicode_range_a or "Combining" in unicode_range_b + ): + return False + + keywords_range_a, keywords_range_b = unicode_range_a.split( + " " + ), unicode_range_b.split(" ") + + for el in keywords_range_a: + if el in UNICODE_SECONDARY_RANGE_KEYWORD: + continue + if el in keywords_range_b: + return False + + # Japanese Exception + range_a_jp_chars, range_b_jp_chars = ( + unicode_range_a + in ( + "Hiragana", + "Katakana", + ), + unicode_range_b in ("Hiragana", "Katakana"), + ) + if (range_a_jp_chars or range_b_jp_chars) and ( + "CJK" in unicode_range_a or "CJK" in unicode_range_b + ): + return False + if range_a_jp_chars and range_b_jp_chars: + return False + + if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b: + if "CJK" in unicode_range_a or "CJK" in unicode_range_b: + return False + if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin": + return False + + # Chinese/Japanese use dedicated range for punctuation and/or separators. + if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or ( + unicode_range_a in ["Katakana", "Hiragana"] + and unicode_range_b in ["Katakana", "Hiragana"] + ): + if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b: + return False + if "Forms" in unicode_range_a or "Forms" in unicode_range_b: + return False + if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin": + return False + + return True + + +@lru_cache(maxsize=2048) +def mess_ratio( + decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False +) -> float: + """ + Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier. + """ + + detectors: List[MessDetectorPlugin] = [ + md_class() for md_class in MessDetectorPlugin.__subclasses__() + ] + + length: int = len(decoded_sequence) + 1 + + mean_mess_ratio: float = 0.0 + + if length < 512: + intermediary_mean_mess_ratio_calc: int = 32 + elif length <= 1024: + intermediary_mean_mess_ratio_calc = 64 + else: + intermediary_mean_mess_ratio_calc = 128 + + for character, index in zip(decoded_sequence + "\n", range(length)): + for detector in detectors: + if detector.eligible(character): + detector.feed(character) + + if ( + index > 0 and index % intermediary_mean_mess_ratio_calc == 0 + ) or index == length - 1: + mean_mess_ratio = sum(dt.ratio for dt in detectors) + + if mean_mess_ratio >= maximum_threshold: + break + + if debug: + logger = getLogger("charset_normalizer") + + logger.log( + TRACE, + "Mess-detector extended-analysis start. " + f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} " + f"maximum_threshold={maximum_threshold}", + ) + + if len(decoded_sequence) > 16: + logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}") + logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}") + + for dt in detectors: # pragma: nocover + logger.log(TRACE, f"{dt.__class__}: {dt.ratio}") + + return round(mean_mess_ratio, 3)