comparison charset_normalizer/md.py @ 7:5eb2d5e3bf22

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Sun, 05 May 2024 23:32:17 -0400
parents
children
comparison
equal deleted inserted replaced
6:b2745907b1eb 7:5eb2d5e3bf22
1 from functools import lru_cache
2 from logging import getLogger
3 from typing import List, Optional
4
5 from .constant import (
6 COMMON_SAFE_ASCII_CHARACTERS,
7 TRACE,
8 UNICODE_SECONDARY_RANGE_KEYWORD,
9 )
10 from .utils import (
11 is_accentuated,
12 is_arabic,
13 is_arabic_isolated_form,
14 is_case_variable,
15 is_cjk,
16 is_emoticon,
17 is_hangul,
18 is_hiragana,
19 is_katakana,
20 is_latin,
21 is_punctuation,
22 is_separator,
23 is_symbol,
24 is_thai,
25 is_unprintable,
26 remove_accent,
27 unicode_range,
28 )
29
30
class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Permit to reset the plugin to the initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        """
        raise NotImplementedError  # pragma: nocover
63
64
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Detect an over-representation of punctuation and symbol characters.

    Immediately repeated printable characters and characters listed in
    COMMON_SAFE_ASCII_CHARACTERS are not counted against the content.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Last printable character seen; used to skip immediate repeats.
        self._last_printable_char: Optional[str] = None
        # NOTE(review): never read within this class as visible here —
        # kept for compatibility with possible external consumers.
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Symbols weigh twice as much as plain punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Bug fix: also clear per-document tracking state so a reused
        # plugin instance does not carry context from the previous payload.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """Weighted punctuation+symbol share; 0.0 below the 0.3 threshold."""
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
110
111
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Flag text whose alphabetic characters carry accents far more often
    than natural language would; a high accent density usually betrays a
    decoding mistake.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1
        self._accentuated_count += 1 if is_accentuated(character) else 0

    def reset(self) -> None:  # pragma: no cover
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        # Too few letters to draw a statistically meaningful conclusion.
        if self._character_count < 8:
            return 0.0

        accent_density: float = self._accentuated_count / self._character_count
        return accent_density if accent_density >= 0.35 else 0.0
137
138
class UnprintablePlugin(MessDetectorPlugin):
    """
    Count unprintable characters; their presence is a strong signal of a
    wrongly decoded payload.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0
        # Bug fix: the total character counter was previously left
        # untouched, skewing the ratio of any analysis after a reset.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        # Each unprintable character weighs heavily (x8) in the ratio.
        return (self._unprintable_count * 8) / self._character_count
161
162
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Spot improbable runs of accentuated Latin letters, a common artifact
    of mis-decoded single-byte encodings.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        previous = self._last_latin_character
        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            # Two successive accentuated upper-case letters are suspicious.
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Worse if its the same char duplicated with different accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count
198
199
class SuspiciousRange(MessDetectorPlugin):
    """
    Count adjacent printable characters originating from Unicode ranges
    that are unlikely to appear next to each other in legitimate text.
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        # Whitespace, punctuation and safe ASCII break the sequence.
        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        self._last_printable_seen = character

        if previous is None:
            return

        if is_suspiciously_successive_range(
            unicode_range(previous), unicode_range(character)
        ):
            self._suspicious_successive_range_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        # Too little material to be meaningful.
        if self._character_count <= 24:
            return 0.0

        return (
            self._suspicious_successive_range_count * 2
        ) / self._character_count
247
248
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Accumulate alphabetic characters into word buffers and flag "weird"
    words: heavily accentuated words, very long foreign (non-Latin/CJK/
    Kana/Hangul/Thai) words, words ending with an accentuated upper-case
    letter, and words interrupted by symbols. The ratio is the share of
    characters belonging to bad words.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        # Set while the word currently being buffered has been judged bad.
        self._is_current_word_bad: bool = False
        # True once the buffer holds a character that is accentuated or
        # outside the Latin/CJK/Hangul/Katakana/Hiragana/Thai scripts.
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Characters of the word currently being accumulated.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            # Still inside a word: extend the buffer and update watches.
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return
        # Non-alphabetic character with nothing buffered: nothing to judge.
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            # Word terminator reached: evaluate the buffered word.
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                # More than about a third of the letters accentuated is rare
                # in natural language.
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                if buffer_length >= 24 and self._foreign_long_watch:
                    # A very long foreign-looking word may simply be a
                    # camelCased identifier; tolerate it when upper-case
                    # letters are present but sparse (<= 30%).
                    camel_case_dst = [
                        i
                        for c, i in zip(self._buffer, range(0, buffer_length))
                        if c.isupper()
                    ]
                    probable_camel_cased: bool = False

                    if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                        probable_camel_cased = True

                    if not probable_camel_cased:
                        self._foreign_long_count += 1
                        self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            # Start a fresh word buffer.
            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # A symbol embedded in a word marks the word bad; the symbol
            # itself joins the buffer so it is counted with the word.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        # Too few words to judge, unless a long foreign word was observed.
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
352
353
class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the full stop incorrectly
    when the content does not fit, which is easily detected: watch for an
    overuse of '丅' and '丄' relative to genuine CJK content.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character == "丅" or character == "丄":
            self._wrong_stop_count += 1
        elif is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        # Require a minimum amount of CJK content before judging.
        return (
            self._wrong_stop_count / self._cjk_character_count
            if self._cjk_character_count >= 16
            else 0.0
        )
383
384
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Detect abnormal alternation between upper- and lower-case letters
    inside a chunk (e.g. "aBcDeF"), a frequent artifact of mis-decoding.
    A chunk is a run of case-variable alphabetic characters; its score is
    committed only for short non-ASCII chunks.
    """

    def __init__(self) -> None:
        # Set after one case flip; a second consecutive flip scores.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Score accumulated within the current chunk.
        self._successive_upper_lower_count: int = 0
        # Scores committed from completed chunks.
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        # True while the current chunk contains only ASCII characters.
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            # Chunk ended: commit its score only for short chunks that are
            # not pure ASCII and are not terminated by a digit.
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Reset per-chunk state; the separator still counts as a
            # processed character.
            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                # Two flips in a row score 2; a single flip only arms _buf.
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count
459
460
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """
    Measure how often Arabic letters appear in their isolated presentation
    form; a high proportion suggests the text was decoded incorrectly.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._isolated_form_count = 0

    def eligible(self, character: str) -> bool:
        return is_arabic(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if is_arabic_isolated_form(character):
            self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        # Not enough Arabic characters to be meaningful.
        if self._character_count < 8:
            return 0.0

        return self._isolated_form_count / self._character_count
487
488
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Tell whether two Unicode range names, observed on adjacent characters,
    form a pairing that legitimate text is unlikely to produce.
    """
    # An unknown range next to anything is always treated as suspicious.
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    latin_a = "Latin" in unicode_range_a
    latin_b = "Latin" in unicode_range_b

    if latin_a and latin_b:
        return False

    # Emoji may legitimately neighbour any script.
    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied with a combining diacritical mark,
    # eg. Vietnamese.
    if (latin_a or latin_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    keywords_range_a = unicode_range_a.split(" ")
    keywords_range_b = unicode_range_b.split(" ")

    # Ranges sharing a significant keyword ("Arabic", "Cyrillic", ...) are
    # considered compatible; secondary keywords are ignored.
    for keyword in keywords_range_a:
        if keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if keyword in keywords_range_b:
            return False

    # Japanese exception: kana mixes freely with CJK ideographs.
    range_a_jp = unicode_range_a in ("Hiragana", "Katakana")
    range_b_jp = unicode_range_b in ("Hiragana", "Katakana")
    has_cjk = "CJK" in unicode_range_a or "CJK" in unicode_range_b

    if (range_a_jp or range_b_jp) and has_cjk:
        return False
    if range_a_jp and range_b_jp:
        return False

    # Hangul mixes with CJK and with Basic Latin.
    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if has_cjk:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated ranges for punctuation and/or separators.
    if has_cjk or (range_a_jp and range_b_jp):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    return True
560
561
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    """

    # One fresh instance of every registered detector plugin.
    detectors: List[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

    # Checkpoint interval grows with the payload size.
    if length < 512:
        checkpoint_every: int = 32
    elif length <= 1024:
        checkpoint_every = 64
    else:
        checkpoint_every = 128

    # The trailing newline flushes any word/chunk buffers the plugins keep.
    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        at_checkpoint = index > 0 and index % checkpoint_every == 0
        if at_checkpoint or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)
            # Early exit: already messy enough, no need to scan further.
            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={checkpoint_every} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)