jpayne@7: from functools import lru_cache
jpayne@7: from logging import getLogger
jpayne@7: from typing import List, Optional
jpayne@7: 
jpayne@7: from .constant import (
jpayne@7:     COMMON_SAFE_ASCII_CHARACTERS,
jpayne@7:     TRACE,
jpayne@7:     UNICODE_SECONDARY_RANGE_KEYWORD,
jpayne@7: )
jpayne@7: from .utils import (
jpayne@7:     is_accentuated,
jpayne@7:     is_arabic,
jpayne@7:     is_arabic_isolated_form,
jpayne@7:     is_case_variable,
jpayne@7:     is_cjk,
jpayne@7:     is_emoticon,
jpayne@7:     is_hangul,
jpayne@7:     is_hiragana,
jpayne@7:     is_katakana,
jpayne@7:     is_latin,
jpayne@7:     is_punctuation,
jpayne@7:     is_separator,
jpayne@7:     is_symbol,
jpayne@7:     is_thai,
jpayne@7:     is_unprintable,
jpayne@7:     remove_accent,
jpayne@7:     unicode_range,
jpayne@7: )
jpayne@7: 
jpayne@7: 
class MessDetectorPlugin:
    """
    Common interface shared by every mess detection plugin.
    Each member below raises NotImplementedError; a concrete detector MUST
    subclass this type and override all of them.
    """

    @property
    def ratio(self) -> float:
        """
        Return the chaos ratio derived from what feed() has received so far.
        The value must never be negative; there is no upper bound.
        """
        raise NotImplementedError()  # pragma: nocover

    def eligible(self, character: str) -> bool:
        """
        Tell whether the given character should be handed over to feed().
        """
        raise NotImplementedError()  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        Main routine, executed once per eligible character.
        Put here the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError()  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Bring the plugin back to its initial state.
        """
        raise NotImplementedError()
jpayne@7: 
jpayne@7: 
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Measure how much of the printable content is made of punctuation and symbols.
    Characters listed in COMMON_SAFE_ASCII_CHARACTERS and immediate repetitions of
    the previous character are never counted.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Previous character received by feed(), used to ignore immediate repetitions.
        self._last_printable_char: Optional[str] = None
        # Not read anywhere in this class; kept so the instance layout stays unchanged.
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        """
        Only printable characters are analysed.
        """
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character, and weigh it as punctuation (+1) or symbol (+2) when relevant.
        """
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # A symbol weighs twice as much as a punctuation sign.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        """
        Restore every attribute to the value assigned by __init__.
        """
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Without clearing the previous character, the first character fed after a
        # reset could wrongly be skipped as a repetition of pre-reset content.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """
        Weighted share of punctuation and symbols, or 0.0 when that share is below 0.3
        or when nothing has been fed yet.
        """
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
jpayne@7: 
jpayne@7: 
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Track the share of alphabetic characters that is_accentuated() reports as accentuated.
    """

    def __init__(self) -> None:
        self._accentuated_count: int = 0
        self._character_count: int = 0

    def reset(self) -> None:  # pragma: no cover
        """
        Zero both counters.
        """
        self._accentuated_count = 0
        self._character_count = 0

    def eligible(self, character: str) -> bool:
        """
        Only alphabetic characters are analysed.
        """
        return character.isalpha()

    def feed(self, character: str) -> None:
        """
        Count the character, and count it once more as accentuated when applicable.
        """
        self._character_count += 1
        self._accentuated_count += 1 if is_accentuated(character) else 0

    @property
    def ratio(self) -> float:
        """
        Accentuated share of the fed characters; 0.0 below 8 characters or below a 0.35 share.
        """
        seen: int = self._character_count

        if seen < 8:
            return 0.0

        accentuated_share: float = self._accentuated_count / seen

        if accentuated_share >= 0.35:
            return accentuated_share
        return 0.0
jpayne@7: 
jpayne@7: 
class UnprintablePlugin(MessDetectorPlugin):
    """
    Measure the presence of characters that is_unprintable() reports as unprintable.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Every character is analysed.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Count the character, and count it as unprintable when applicable.
        """
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """
        Restore both counters to the values assigned by __init__.
        """
        self._unprintable_count = 0
        # The total must be cleared too, otherwise the ratio computed after a reset
        # would be diluted by characters seen before it.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Unprintable count weighted by 8 over the total character count (0.0 when nothing was fed).
        """
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
jpayne@7: 
jpayne@7: 
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Look for two accentuated latin letters fed one right after the other.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._successive_count: int = 0

        self._last_latin_character: Optional[str] = None

    def reset(self) -> None:  # pragma: no cover
        """
        Forget the previous letter and zero both counters.
        """
        self._last_latin_character = None
        self._character_count = 0
        self._successive_count = 0

    def eligible(self, character: str) -> bool:
        """
        Only alphabetic characters that is_latin() accepts are analysed.
        """
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        """
        Compare the character with the previously fed one and record suspicious pairs.
        """
        self._character_count += 1

        previous = self._last_latin_character
        both_accentuated = (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        )

        if both_accentuated:
            # Two upper case accentuated letters in a row.
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Worse when the same base letter is repeated with a different accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character

    @property
    def ratio(self) -> float:
        """
        Twice the recorded pair count over the number of fed characters (0.0 when nothing was fed).
        """
        seen: int = self._character_count

        if seen == 0:
            return 0.0

        return (self._successive_count * 2) / seen
jpayne@7: 
jpayne@7: 
class SuspiciousRange(MessDetectorPlugin):
    """
    Count adjacent printable characters whose Unicode ranges are judged suspicious
    by is_suspiciously_successive_range().
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._suspicious_successive_range_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def reset(self) -> None:  # pragma: no cover
        """
        Forget the previous character and zero both counters.
        """
        self._last_printable_seen = None
        self._suspicious_successive_range_count = 0
        self._character_count = 0

    def eligible(self, character: str) -> bool:
        """
        Only printable characters are analysed.
        """
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character and compare its Unicode range with the previous one.
        """
        self._character_count += 1

        breaks_the_chain = (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        )

        if breaks_the_chain:
            # Whitespace, punctuation and safe ASCII characters start a new chain.
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen

        if previous is not None:
            if is_suspiciously_successive_range(
                unicode_range(previous), unicode_range(character)
            ):
                self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    @property
    def ratio(self) -> float:
        """
        Twice the suspicious pair count over the number of fed characters;
        0.0 as long as no more than 24 characters were fed.
        """
        seen: int = self._character_count

        if seen <= 24:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / seen
jpayne@7: 
jpayne@7: 
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Accumulate characters into words and measure how many characters belong to
    words judged suspicious (too many accents, unusual upper case ending,
    very long "foreign" word, or symbol inside the word).
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        # State of the word currently being accumulated.
        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Every character is analysed.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Extend the current word with the character, or close and evaluate the word
        when the character is a space, a punctuation sign or a separator.
        """
        if character.isalpha():
            self._buffer += character
            accentuated = is_accentuated(character)
            if accentuated:
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or accentuated)
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return

        # Nothing accumulated yet: a non alphabetic character cannot close or taint a word.
        if not self._buffer:
            return

        if character.isspace() or is_punctuation(character) or is_separator(character):
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if buffer_length >= 24 and self._foreign_long_watch:
                # A long word with a few (but at least one) upper case letters is
                # assumed to be camel cased and is tolerated.
                upper_case_count: int = sum(1 for c in self._buffer if c.isupper())
                probable_camel_cased: bool = (
                    upper_case_count > 0 and upper_case_count / buffer_length <= 0.3
                )

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += buffer_length
                self._is_current_word_bad = False

            # Start over with a fresh word.
            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # A symbol in the middle of a word taints it and becomes part of it.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        """
        Restore every attribute to the value assigned by __init__.
        """
        self._buffer = ""
        # The accent counter belongs to the buffer; leaving it untouched would leak
        # accents of pre-reset content into the first word evaluated after the reset.
        self._buffer_accent_count = 0
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of characters that belong to bad words; 0.0 while no more than 10 words
        were closed and no "foreign long" word was recorded.
        """
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
jpayne@7: 
jpayne@7: 
class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the stop incorrectly when the content
    does not fit, which is easy to spot: look for an overuse of '丅' and '丄'
    compared to the amount of CJK characters.
    """

    def __init__(self) -> None:
        self._cjk_character_count: int = 0
        self._wrong_stop_count: int = 0

    def reset(self) -> None:  # pragma: no cover
        """
        Zero both counters.
        """
        self._cjk_character_count = 0
        self._wrong_stop_count = 0

    def eligible(self, character: str) -> bool:
        """
        Every character is analysed.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Count the character either as a wrong stop or, failing that, as a CJK character.
        """
        if character in ("丅", "丄"):
            self._wrong_stop_count += 1
        elif is_cjk(character):
            self._cjk_character_count += 1

    @property
    def ratio(self) -> float:
        """
        Wrong stop count over CJK character count; 0.0 below 16 CJK characters.
        """
        cjk_seen: int = self._cjk_character_count

        if cjk_seen < 16:
            return 0.0

        return self._wrong_stop_count / cjk_seen
jpayne@7: 
jpayne@7: 
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Count repeated upper/lower case alternations inside chunks of case-variable
    letters that contain at least one non-ASCII character.
    """

    def __init__(self) -> None:
        # True right after a first case alternation, waiting for a second one.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Running count for the current chunk, and the total kept from closed chunks.
        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def reset(self) -> None:  # pragma: no cover
        """
        Restore every attribute to the value assigned by __init__.
        """
        self._buf = False
        self._last_alpha_seen = None
        self._current_ascii_only = True
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._character_count_since_last_sep = 0
        self._character_count = 0

    def eligible(self, character: str) -> bool:
        """
        Every character is analysed.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Either close the current chunk (character is not a case-variable letter and
        the chunk is not empty) or extend it with the character.
        """
        concerned = character.isalpha() and is_case_variable(character)

        if concerned is False and self._character_count_since_last_sep > 0:
            # The chunk only contributes when it is short enough, is not closed
            # by a digit and held at least one non-ASCII character.
            chunk_contributes = (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            )

            if chunk_contributes:
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._character_count += 1
            self._character_count_since_last_sep = 0
            self._successive_upper_lower_count = 0
            self._last_alpha_seen = None
            self._current_ascii_only = True
            self._buf = False

            return

        if self._current_ascii_only and not character.isascii():
            self._current_ascii_only = False

        previous = self._last_alpha_seen

        if previous is not None:
            case_flipped = (character.isupper() and previous.islower()) or (
                character.islower() and previous.isupper()
            )

            if case_flipped and self._buf:
                # Second alternation in a row: record it and start waiting again.
                self._successive_upper_lower_count += 2
                self._buf = False
            else:
                self._buf = case_flipped

        self._last_alpha_seen = character
        self._character_count_since_last_sep += 1
        self._character_count += 1

    @property
    def ratio(self) -> float:
        """
        Total kept from closed chunks over the number of fed characters (0.0 when nothing was fed).
        """
        seen: int = self._character_count

        if seen == 0:
            return 0.0

        return self._successive_upper_lower_count_final / seen
jpayne@7: 
jpayne@7: 
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """
    Track the share of arabic characters that is_arabic_isolated_form() reports
    as isolated forms.
    """

    def __init__(self) -> None:
        self._isolated_form_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Only characters that is_arabic() accepts are analysed.
        """
        return is_arabic(character)

    def feed(self, character: str) -> None:
        """
        Count the character, and count it once more as isolated form when applicable.
        """
        self._character_count += 1
        self._isolated_form_count += 1 if is_arabic_isolated_form(character) else 0

    def reset(self) -> None:  # pragma: no cover
        """
        Zero both counters.
        """
        self._isolated_form_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Isolated form count over the number of fed characters; 0.0 below 8 characters.
        """
        seen: int = self._character_count

        if seen < 8:
            return 0.0

        return self._isolated_form_count / seen
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Tell whether seeing the two given Unicode ranges next to each other is suspicious.
    Returns True when either range is None, False for every tolerated combination
    listed below, and True for anything else.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    pair = (unicode_range_a, unicode_range_b)

    latin_involved = any("Latin" in name for name in pair)

    if all("Latin" in name for name in pair):
        return False

    if any("Emoticons" in name for name in pair):
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if latin_involved and any("Combining" in name for name in pair):
        return False

    # Two ranges sharing a word that is not a secondary keyword are tolerated.
    words_of_b = unicode_range_b.split(" ")
    shares_primary_word = any(
        word not in UNICODE_SECONDARY_RANGE_KEYWORD and word in words_of_b
        for word in unicode_range_a.split(" ")
    )

    if shares_primary_word:
        return False

    cjk_involved = any("CJK" in name for name in pair)
    basic_latin_involved = "Basic Latin" in pair

    # Japanese Exception
    kana_ranges = ("Hiragana", "Katakana")
    a_is_kana = unicode_range_a in kana_ranges
    b_is_kana = unicode_range_b in kana_ranges

    if cjk_involved and (a_is_kana or b_is_kana):
        return False
    if a_is_kana and b_is_kana:
        return False

    if any("Hangul" in name for name in pair) and (
        cjk_involved or basic_latin_involved
    ):
        return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    if cjk_involved or (a_is_kana and b_is_kana):
        if (
            any("Punctuation" in name for name in pair)
            or any("Forms" in name for name in pair)
            or basic_latin_involved
        ):
            return False

    return True
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio for an already decoded sequence.
    One instance of every direct MessDetectorPlugin subclass is fed character by
    character; the ratios are summed at regular checkpoints and the analysis stops
    early once the sum reaches maximum_threshold. When debug is set, details are
    logged at TRACE level. The result is rounded to 3 decimals.
    """

    plugins: List[MessDetectorPlugin] = [
        plugin_class() for plugin_class in MessDetectorPlugin.__subclasses__()
    ]

    # A trailing line feed is appended to the analysed text, hence the + 1.
    length: int = len(decoded_sequence) + 1
    last_index: int = length - 1

    # Distance, in characters, between two evaluations of the summed ratio.
    checkpoint_step: int = 32 if length < 512 else (64 if length <= 1024 else 128)

    mean_mess_ratio: float = 0.0

    for index, character in enumerate(decoded_sequence + "\n"):
        for plugin in plugins:
            if plugin.eligible(character):
                plugin.feed(character)

        at_checkpoint = index > 0 and index % checkpoint_step == 0

        if not (at_checkpoint or index == last_index):
            continue

        mean_mess_ratio = sum(plugin.ratio for plugin in plugins)

        if mean_mess_ratio >= maximum_threshold:
            break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={checkpoint_step} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for plugin in plugins:  # pragma: nocover
            logger.log(TRACE, f"{plugin.__class__}: {plugin.ratio}")

    return round(mean_mess_ratio, 3)