bioproject_to_srr_2: charset_normalizer/md.py annotate

annotate charset_normalizer/md.py @ 14:18e1cb6018fd

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538

author	jpayne
date	Mon, 20 May 2024 02:25:23 -0400
parents	5eb2d5e3bf22
children

rev	line source
jpayne@7	1 from functools import lru_cache
jpayne@7	2 from logging import getLogger
jpayne@7	3 from typing import List, Optional
jpayne@7	4
jpayne@7	5 from .constant import (
jpayne@7	6 COMMON_SAFE_ASCII_CHARACTERS,
jpayne@7	7 TRACE,
jpayne@7	8 UNICODE_SECONDARY_RANGE_KEYWORD,
jpayne@7	9 )
jpayne@7	10 from .utils import (
jpayne@7	11 is_accentuated,
jpayne@7	12 is_arabic,
jpayne@7	13 is_arabic_isolated_form,
jpayne@7	14 is_case_variable,
jpayne@7	15 is_cjk,
jpayne@7	16 is_emoticon,
jpayne@7	17 is_hangul,
jpayne@7	18 is_hiragana,
jpayne@7	19 is_katakana,
jpayne@7	20 is_latin,
jpayne@7	21 is_punctuation,
jpayne@7	22 is_separator,
jpayne@7	23 is_symbol,
jpayne@7	24 is_thai,
jpayne@7	25 is_unprintable,
jpayne@7	26 remove_accent,
jpayne@7	27 unicode_range,
jpayne@7	28 )
jpayne@7	29
jpayne@7	30
class MessDetectorPlugin:
    """
    Common interface shared by every mess detection plugin.

    mess_ratio() instantiates each direct subclass, offers it every character
    through eligible()/feed(), and adds up the ratio of all plugins.
    Subclasses are expected to override every member declared here.
    """

    def eligible(self, character: str) -> bool:
        """
        Tell whether the given character is relevant to this plugin and
        should therefore be handed to feed().
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        Consume one character.
        This is where the logic in which the text would be considered
        chaotic belongs.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Bring the plugin back to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Chaos ratio derived from everything feed() has seen so far.
        Must never be lower than 0.; there is no upper bound.
        """
        raise NotImplementedError  # pragma: nocover
jpayne@7	63
jpayne@7	64
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Flag content where punctuation and symbols take an abnormal share of the
    printable characters.

    A character is only counted when it differs from the previous printable
    character and is not part of COMMON_SAFE_ASCII_CHARACTERS. Symbols weigh
    twice as much as punctuation.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: Optional[str] = None
        # Never read by this class; kept so the attribute set stays unchanged.
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        """
        Only printable characters are inspected.
        """
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character, then weigh it as punctuation (+1) or as a
        non-digit, non-emoticon symbol (+2) unless it merely repeats the
        previous character or is a common safe ASCII character.
        """
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        """
        Restore the state produced by __init__.
        """
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Without clearing the memory of the previous character, the first
        # character fed after a reset could be wrongly skipped as a repeat.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """
        Weighted punctuation/symbol share, reported only from 0.3 upward.
        """
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
jpayne@7	110
jpayne@7	111
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Flag content where accentuated letters take too large a share of the
    alphabetic characters.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Only alphabetic characters are inspected.
        """
        return character.isalpha()

    def feed(self, character: str) -> None:
        """
        Count the letter and remember whether it is accentuated.
        """
        self._character_count += 1

        if not is_accentuated(character):
            return

        self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        """
        Drop both counters.
        """
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of accentuated letters; 0.0 while fewer than 8 letters were
        seen or while the share stays under 0.35.
        """
        letters_seen = self._character_count

        if letters_seen < 8:
            return 0.0

        accentuated_share: float = self._accentuated_count / letters_seen

        if accentuated_share < 0.35:
            return 0.0

        return accentuated_share
jpayne@7	137
jpayne@7	138
class UnprintablePlugin(MessDetectorPlugin):
    """
    Flag content that contains unprintable characters.
    Each unprintable character weighs eight times in the ratio.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Every character is inspected.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Count the character and, separately, whether it is unprintable.
        """
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """
        Restore the state produced by __init__.
        """
        self._unprintable_count = 0
        # The denominator must be cleared too, otherwise characters fed
        # before the reset keep diluting the ratio computed afterwards.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Eight times the unprintable count divided by the number of
        characters seen; 0.0 when nothing was fed.
        """
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
jpayne@7	161
jpayne@7	162
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Flag pairs of consecutive accentuated Latin letters that look unnatural:
    both upper case, and/or the same base letter carrying different accents.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """
        Only alphabetic Latin characters are inspected.
        """
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        """
        Compare the letter with the previously fed one and score the pair
        when both are accentuated.
        """
        self._character_count += 1

        previous = self._last_latin_character

        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Even worse when it is the same letter repeated with another accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        """
        Forget the previous letter and drop both counters.
        """
        self._last_latin_character = None
        self._character_count = 0
        self._successive_count = 0

    @property
    def ratio(self) -> float:
        """
        Twice the pair score divided by the number of letters seen;
        0.0 when nothing was fed.
        """
        letters_seen = self._character_count

        if letters_seen == 0:
            return 0.0

        return (self._successive_count * 2) / letters_seen
jpayne@7	198
jpayne@7	199
class SuspiciousRange(MessDetectorPlugin):
    """
    Flag adjacent printable characters whose Unicode ranges are judged
    suspicious by is_suspiciously_successive_range().
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """
        Only printable characters are inspected.
        """
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character and compare its Unicode range with the one of
        the previous character of the current run.
        """
        self._character_count += 1

        breaks_the_run = (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        )

        if breaks_the_run:
            # Whitespace, punctuation and safe ASCII end the current run.
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen

        if previous is not None:
            range_of_previous: Optional[str] = unicode_range(previous)
            range_of_current: Optional[str] = unicode_range(character)

            if is_suspiciously_successive_range(range_of_previous, range_of_current):
                self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # pragma: no cover
        """
        Forget the previous character and drop both counters.
        """
        self._last_printable_seen = None
        self._suspicious_successive_range_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Twice the number of suspicious pairs divided by the number of
        characters seen; 0.0 until more than 24 characters were fed.
        """
        characters_seen = self._character_count

        if characters_seen <= 24:
            return 0.0

        return (self._suspicious_successive_range_count * 2) / characters_seen
jpayne@7	247
jpayne@7	248
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Flag "words" that look implausible: too many accentuated letters, an
    accentuated upper case last letter in a word that is not fully upper
    case, very long foreign words, or words containing symbols.

    Characters are accumulated in a buffer until a whitespace, punctuation
    or separator closes the word, at which point the word is judged.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Every character is inspected.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Grow the current word with letters and symbols, and judge it once a
        whitespace, punctuation or separator is met.
        """
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return

        # Nothing buffered: a non-letter cannot close or taint a word.
        if not self._buffer:
            return

        if character.isspace() or is_punctuation(character) or is_separator(character):
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if buffer_length >= 24 and self._foreign_long_watch:
                # Positions of the upper case letters inside the word.
                camel_case_dst = [
                    position
                    for position, letter in enumerate(self._buffer)
                    if letter.isupper()
                ]
                # A long word with a few upper case letters is assumed to be
                # camelCased and is therefore not counted as foreign/long.
                probable_camel_cased: bool = bool(camel_case_dst) and (
                    len(camel_case_dst) / buffer_length <= 0.3
                )

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += buffer_length
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            # The pipe must be spelled "|": the former "\|" literal is an
            # invalid escape yielding a two-character string that can never
            # equal a single character, so "|" was not actually excluded.
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        """
        Restore the state produced by __init__.
        """
        self._buffer = ""
        # Cleared together with the buffer it describes, otherwise accents
        # counted before the reset would be attributed to the next word.
        self._buffer_accent_count = 0
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of characters that belong to bad words; 0.0 while at most 10
        words were closed and no foreign/long word was counted.
        """
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
jpayne@7	352
jpayne@7	353
class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the full stop incorrectly when
    the content does not fit, which is easy to spot: look for an overuse of
    '丅' and '丄' relative to the other CJK characters.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Every character is inspected.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Count wrong stops and, separately, every other CJK character.
        """
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
        elif is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """
        Drop both counters.
        """
        self._cjk_character_count = 0
        self._wrong_stop_count = 0

    @property
    def ratio(self) -> float:
        """
        Wrong stops divided by the other CJK characters; 0.0 while fewer
        than 16 CJK characters were seen.
        """
        cjk_seen = self._cjk_character_count

        if cjk_seen < 16:
            return 0.0

        return self._wrong_stop_count / cjk_seen
jpayne@7	383
jpayne@7	384
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Flag chunks of letters in which the case keeps flipping between upper
    and lower. Only chunks of at most 64 characters that contain at least
    one non-ASCII character contribute to the score.
    """

    def __init__(self) -> None:
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        """
        Every character is inspected.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Either close the current chunk (when the character is not a
        case-variable letter and the chunk is not empty) or extend it and
        score case flips.
        """
        is_concerned = character.isalpha() and is_case_variable(character)

        if is_concerned is False and self._character_count_since_last_sep > 0:
            chunk_counts = (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            )

            if chunk_counts:
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Start over with an empty chunk.
            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and not character.isascii():
            self._current_ascii_only = False

        previous = self._last_alpha_seen

        if previous is not None:
            case_flipped = (character.isupper() and previous.islower()) or (
                character.islower() and previous.isupper()
            )

            if not case_flipped:
                self._buf = False
            elif self._buf is True:
                # Second flip in a row: score it and re-arm.
                self._successive_upper_lower_count += 2
                self._buf = False
            else:
                self._buf = True

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        """
        Restore the state produced by __init__.
        """
        self._buf = False
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._character_count = 0
        self._last_alpha_seen = None
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        """
        Accumulated flip score divided by the number of characters seen;
        0.0 when nothing was fed.
        """
        characters_seen = self._character_count

        if characters_seen == 0:
            return 0.0

        return self._successive_upper_lower_count_final / characters_seen
jpayne@7	459
jpayne@7	460
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """
    Measure how many of the Arabic characters seen are isolated forms.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # pragma: no cover
        """
        Drop both counters.
        """
        self._isolated_form_count = 0
        self._character_count = 0

    def eligible(self, character: str) -> bool:
        """
        Only Arabic characters are inspected.
        """
        return is_arabic(character)

    def feed(self, character: str) -> None:
        """
        Count the character and remember whether it is an isolated form.
        """
        self._character_count += 1

        if not is_arabic_isolated_form(character):
            return

        self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        """
        Isolated forms divided by the Arabic characters seen; 0.0 while
        fewer than 8 of them were fed.
        """
        arabic_seen = self._character_count

        if arabic_seen < 8:
            return 0.0

        return self._isolated_form_count / arabic_seen
jpayne@7	487
jpayne@7	488
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Tell whether seeing these two Unicode ranges side by side is suspicious.

    An unknown (None) range is always suspicious. Otherwise a sequence of
    exceptions is checked and the pair is suspicious only when none applies.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    both_ranges = (unicode_range_a, unicode_range_b)

    def either_mentions(keyword: str) -> bool:
        return any(keyword in range_name for range_name in both_ranges)

    if all("Latin" in range_name for range_name in both_ranges):
        return False

    if either_mentions("Emoticons"):
        return False

    # A Latin character may come with a combining diacritical mark,
    # eg. Vietnamese.
    if either_mentions("Latin") and either_mentions("Combining"):
        return False

    # Any shared keyword that is not a secondary one makes the pair legitimate.
    keywords_range_b = unicode_range_b.split(" ")

    if any(
        keyword not in UNICODE_SECONDARY_RANGE_KEYWORD and keyword in keywords_range_b
        for keyword in unicode_range_a.split(" ")
    ):
        return False

    # Japanese exception
    kana_ranges = ("Hiragana", "Katakana")
    range_a_jp_chars = unicode_range_a in kana_ranges
    range_b_jp_chars = unicode_range_b in kana_ranges
    cjk_involved = either_mentions("CJK")
    basic_latin_involved = "Basic Latin" in both_ranges

    if (range_a_jp_chars or range_b_jp_chars) and cjk_involved:
        return False

    if range_a_jp_chars and range_b_jp_chars:
        return False

    if either_mentions("Hangul") and (cjk_involved or basic_latin_involved):
        return False

    # Chinese/Japanese use dedicated ranges for punctuation and/or separators.
    # (A pair made of two kana ranges was already dismissed above, so only the
    # CJK test can still be true here.)
    if cjk_involved and (
        either_mentions("Punctuation")
        or either_mentions("Forms")
        or basic_latin_involved
    ):
        return False

    return True
jpayne@7	560
jpayne@7	561
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute the mess ratio of a decoded sequence, rounded to 3 decimals.

    Every direct subclass of MessDetectorPlugin is instantiated and fed with
    each character it declares eligible. The plugin ratios are summed at
    regular intervals and at the last character; the scan stops as soon as
    that sum reaches maximum_threshold. With debug enabled, details are
    logged at TRACE level on the "charset_normalizer" logger.
    """

    detectors: List[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    # One extra position for the line feed appended below.
    length: int = len(decoded_sequence) + 1
    final_index: int = length - 1

    mean_mess_ratio: float = 0.0

    # How often (in characters) the intermediate sum is refreshed.
    intermediary_mean_mess_ratio_calc: int = (
        32 if length < 512 else 64 if length <= 1024 else 128
    )

    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        at_checkpoint = index > 0 and index % intermediary_mean_mess_ratio_calc == 0

        if at_checkpoint or index == final_index:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)

Mercurial > repos > jpayne > bioproject_to_srr_2

annotate charset_normalizer/md.py @ 14:18e1cb6018fd