annotate charset_normalizer/md.py @ 14:18e1cb6018fd

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Mon, 20 May 2024 02:25:23 -0400
parents 5eb2d5e3bf22
children
rev   line source
jpayne@7 1 from functools import lru_cache
jpayne@7 2 from logging import getLogger
jpayne@7 3 from typing import List, Optional
jpayne@7 4
jpayne@7 5 from .constant import (
jpayne@7 6 COMMON_SAFE_ASCII_CHARACTERS,
jpayne@7 7 TRACE,
jpayne@7 8 UNICODE_SECONDARY_RANGE_KEYWORD,
jpayne@7 9 )
jpayne@7 10 from .utils import (
jpayne@7 11 is_accentuated,
jpayne@7 12 is_arabic,
jpayne@7 13 is_arabic_isolated_form,
jpayne@7 14 is_case_variable,
jpayne@7 15 is_cjk,
jpayne@7 16 is_emoticon,
jpayne@7 17 is_hangul,
jpayne@7 18 is_hiragana,
jpayne@7 19 is_katakana,
jpayne@7 20 is_latin,
jpayne@7 21 is_punctuation,
jpayne@7 22 is_separator,
jpayne@7 23 is_symbol,
jpayne@7 24 is_thai,
jpayne@7 25 is_unprintable,
jpayne@7 26 remove_accent,
jpayne@7 27 unicode_range,
jpayne@7 28 )
jpayne@7 29
jpayne@7 30
class MessDetectorPlugin:
    """
    Common interface shared by every mess detection plugin.
    A concrete detector subclasses it and overrides each member below.
    """

    def eligible(self, character: str) -> bool:
        """
        Tell whether the given character should be passed to feed().
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        Process a single character.
        This is where a detector gathers the evidence that the text is chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Bring the plugin back to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Chaos ratio derived from what feed() has received so far.
        Must never be lower than 0.; there is no upper bound.
        """
        raise NotImplementedError  # pragma: nocover
jpayne@7 63
jpayne@7 64
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Report text in which punctuation and symbols take an unusually large
    share of the printable characters.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Previously fed character; an immediate repetition is not counted again.
        self._last_printable_char: Optional[str] = None
        # Never read inside this class; kept so existing attribute access keeps working.
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        """Only printable characters are inspected."""
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character, and weigh it as punctuation (+1) or symbol (+2)
        unless it repeats the previous character or is a common safe ASCII one.
        """
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Symbols weigh twice as much as punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        """Restore every attribute to the value set by __init__."""
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Without this, the first character fed after a reset could be
        # mistaken for a repetition of the last character seen before it.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """
        Weighted share of punctuation and symbols, or 0.0 when nothing was
        fed or when that share stays below 0.3.
        """
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
jpayne@7 110
jpayne@7 111
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Watch the share of accentuated characters among the alphabetic ones.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        # Only alphabetic characters take part in the statistic.
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of accentuated characters; 0.0 when fewer than 8 characters
        were fed or when the share stays below 0.35.
        """
        total = self._character_count
        if total < 8:
            return 0.0

        share: float = self._accentuated_count / total
        if share >= 0.35:
            return share
        return 0.0
jpayne@7 137
jpayne@7 138
class UnprintablePlugin(MessDetectorPlugin):
    """
    Penalize the presence of unprintable characters; every character is inspected.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        """Count the character, and count it as unprintable when it is one."""
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """Restore both counters to the values set by __init__."""
        self._unprintable_count = 0
        # The total must be cleared too, otherwise ratios computed after a
        # reset are diluted by the characters seen before it.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Unprintable characters, weighted by 8, over every character fed.
        Returns 0.0 when nothing was fed.
        """
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
jpayne@7 161
jpayne@7 162
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Look for two accentuated latin letters standing next to each other.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        previous = self._last_latin_character

        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            # Two upper case accentuated letters in a row.
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Even worse: the same base letter repeated with another accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        """
        Successive accent hits, weighted by 2, over the latin letters fed.
        Returns 0.0 when nothing was fed.
        """
        total = self._character_count
        if total == 0:
            return 0.0

        return (self._successive_count * 2) / total
jpayne@7 198
jpayne@7 199
class SuspiciousRange(MessDetectorPlugin):
    """
    Count neighbouring printable characters whose Unicode ranges make a
    suspicious pair according to is_suspiciously_successive_range().
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        # Whitespace, punctuation and common safe ASCII break the chain:
        # the next character will have no predecessor to be compared with.
        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen

        if previous is not None and is_suspiciously_successive_range(
            unicode_range(previous), unicode_range(character)
        ):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        """
        Suspicious pairs, weighted by 2, over the characters fed.
        Returns 0.0 until more than 24 characters were fed.
        """
        total = self._character_count
        if total <= 24:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / total
jpayne@7 247
jpayne@7 248
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Split the text into words and measure how many characters belong to
    words that look abnormal: heavily accentuated, containing symbols,
    ending with an upper case accentuated letter, or very long and foreign.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        # State of the word currently being buffered.
        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        """
        Buffer letters; on a space, punctuation or separator evaluate the
        buffered word; a symbol met inside a word marks that word as bad.
        """
        if character.isalpha():
            self._buffer += character
            accentuated = is_accentuated(character)
            if accentuated:
                self._buffer_accent_count += 1
            # Start watching the word as soon as it holds a letter that is
            # non-latin or accentuated, and belongs to none of the scripts
            # excluded below.
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or accentuated)
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return

        # Nothing buffered: the character cannot end or spoil a word.
        if not self._buffer:
            return

        if character.isspace() or is_punctuation(character) or is_separator(character):
            self._close_current_word()
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def _close_current_word(self) -> None:
        """Account for the buffered word, then clear the per-word state."""
        buffer_length: int = len(self._buffer)

        self._word_count += 1
        self._character_count += buffer_length

        if buffer_length >= 4:
            if self._buffer_accent_count / buffer_length > 0.34:
                self._is_current_word_bad = True
            # Word/Buffer ending with an upper case accentuated letter are so rare,
            # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
            last_character = self._buffer[-1]
            if (
                is_accentuated(last_character)
                and last_character.isupper()
                and not all(c.isupper() for c in self._buffer)
            ):
                self._foreign_long_count += 1
                self._is_current_word_bad = True

        if buffer_length >= 24 and self._foreign_long_watch:
            upper_count = sum(1 for c in self._buffer if c.isupper())
            # A long word with a few upper case letters (at most 30%) is
            # assumed to be camel cased and is not reported.
            probable_camel_cased: bool = (
                upper_count > 0 and upper_count / buffer_length <= 0.3
            )

            if not probable_camel_cased:
                self._foreign_long_count += 1
                self._is_current_word_bad = True

        if self._is_current_word_bad:
            self._bad_word_count += 1
            self._bad_character_count += buffer_length
            self._is_current_word_bad = False

        self._foreign_long_watch = False
        self._buffer = ""
        self._buffer_accent_count = 0

    def reset(self) -> None:  # pragma: no cover
        """Restore every attribute to the value set by __init__."""
        self._buffer = ""
        # Cleared together with the buffer, otherwise accents of a discarded
        # partial word would be charged to the next word.
        self._buffer_accent_count = 0
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        """
        Characters of bad words over characters of all closed words.
        Returns 0.0 while at most 10 words were closed and no foreign long
        word was counted.
        """
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
jpayne@7 352
jpayne@7 353
class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        # The two wrong stops are counted apart and never as CJK characters.
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
        elif is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        """
        Wrong stops over CJK characters; 0.0 until 16 CJK characters were fed.
        """
        cjk_total = self._cjk_character_count
        if cjk_total < 16:
            return 0.0
        return self._wrong_stop_count / cjk_total
jpayne@7 383
jpayne@7 384
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Track upper/lower case alternation inside chunks of cased letters and
    report how often it happens in chunks holding non-ASCII characters.
    """

    def __init__(self) -> None:
        # True when the previous letter already flipped case once.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Count for the chunk in progress / total of the accepted chunks.
        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_cased_letter = character.isalpha() and is_case_variable(character)

        if is_cased_letter is False and self._character_count_since_last_sep > 0:
            # A chunk just ended. Its alternations only count when it was
            # short enough, not ended by a digit, and not pure ASCII.
            chunk_is_accepted = (
                self._character_count_since_last_sep <= 64
                and not character.isdigit()
                and not self._current_ascii_only
            )
            if chunk_is_accepted:
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True
            return

        # NOTE: a character that is not a cased letter also reaches this
        # point when no chunk is open, and is then handled like a letter.
        if not character.isascii():
            self._current_ascii_only = False

        previous = self._last_alpha_seen
        if previous is not None:
            case_flipped = (character.isupper() and previous.islower()) or (
                character.islower() and previous.isupper()
            )
            if case_flipped and self._buf:
                # Second flip in a row: record it and start over.
                self._successive_upper_lower_count += 2
                self._buf = False
            else:
                self._buf = case_flipped

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        """
        Accepted alternation count over every character fed.
        Returns 0.0 when nothing was fed.
        """
        total = self._character_count
        if total == 0:
            return 0.0

        return self._successive_upper_lower_count_final / total
jpayne@7 459
jpayne@7 460
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """
    Measure the share of isolated forms among the arabic characters.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # pragma: no cover
        self._character_count = self._isolated_form_count = 0

    def eligible(self, character: str) -> bool:
        # Only arabic characters are inspected.
        return is_arabic(character)

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_arabic_isolated_form(character):
            self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        """
        Isolated forms over arabic characters; 0.0 when fewer than 8 were fed.
        """
        total = self._character_count
        if total < 8:
            return 0.0

        return self._isolated_form_count / total
jpayne@7 487
jpayne@7 488
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Tell whether meeting two Unicode ranges side by side should be seen as suspicious.
    An unknown range (None) on either side is always suspicious.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    both = (unicode_range_a, unicode_range_b)

    def either_has(keyword: str) -> bool:
        # True when the keyword appears in at least one of the two range names.
        return any(keyword in range_name for range_name in both)

    if all("Latin" in range_name for range_name in both):
        return False

    if either_has("Emoticons"):
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if either_has("Latin") and either_has("Combining"):
        return False

    # Two ranges sharing a primary keyword are considered related.
    words_of_b = unicode_range_b.split(" ")
    for word in unicode_range_a.split(" "):
        if word in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if word in words_of_b:
            return False

    kana = ("Hiragana", "Katakana")
    a_is_kana = unicode_range_a in kana
    b_is_kana = unicode_range_b in kana
    has_cjk = either_has("CJK")
    has_basic_latin = "Basic Latin" in both

    # Japanese Exception
    if has_cjk and (a_is_kana or b_is_kana):
        return False
    if a_is_kana and b_is_kana:
        return False

    if either_has("Hangul") and (has_cjk or has_basic_latin):
        return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    if has_cjk or (a_is_kana and b_is_kana):
        if either_has("Punctuation") or either_has("Forms") or has_basic_latin:
            return False

    return True
jpayne@7 560
jpayne@7 561
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio for a decoded sequence by feeding it, followed by a
    trailing newline, to one instance of every MessDetectorPlugin subclass.
    The returned value is the sum of the plugin ratios rounded to 3 digits;
    the scan stops early once that sum reaches maximum_threshold.
    """
    detectors: List[MessDetectorPlugin] = [
        plugin_class() for plugin_class in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1
    last_index: int = length - 1

    # How often (in characters) the intermediate ratio is evaluated.
    if length < 512:
        checkpoint_step: int = 32
    elif length <= 1024:
        checkpoint_step = 64
    else:
        checkpoint_step = 128

    mean_mess_ratio: float = 0.0

    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        on_checkpoint = index > 0 and index % checkpoint_step == 0
        if not (on_checkpoint or index == last_index):
            continue

        mean_mess_ratio = sum(detector.ratio for detector in detectors)

        if mean_mess_ratio >= maximum_threshold:
            break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={checkpoint_step} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for detector in detectors:  # pragma: nocover
            logger.log(TRACE, f"{detector.__class__}: {detector.ratio}")

    return round(mean_mess_ratio, 3)