comparison charset_normalizer/md.py @ 7:5eb2d5e3bf22

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Sun, 05 May 2024 23:32:17 -0400
parents
children
comparison
equal deleted inserted replaced
6:b2745907b1eb 7:5eb2d5e3bf22
1 from functools import lru_cache
2 from logging import getLogger
3 from typing import List, Optional
4
5 from .constant import (
6 COMMON_SAFE_ASCII_CHARACTERS,
7 TRACE,
8 UNICODE_SECONDARY_RANGE_KEYWORD,
9 )
10 from .utils import (
11 is_accentuated,
12 is_arabic,
13 is_arabic_isolated_form,
14 is_case_variable,
15 is_cjk,
16 is_emoticon,
17 is_hangul,
18 is_hiragana,
19 is_katakana,
20 is_latin,
21 is_punctuation,
22 is_separator,
23 is_symbol,
24 is_thai,
25 is_unprintable,
26 remove_accent,
27 unicode_range,
28 )
29
30
class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Permit to reset the plugin to the initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        """
        raise NotImplementedError  # pragma: nocover
63
64
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Detect an over-representation of punctuation and symbol characters.

    Immediately repeated printable characters and characters listed in
    COMMON_SAFE_ASCII_CHARACTERS are not counted against the content.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Last printable character seen; used to skip immediate repeats.
        self._last_printable_char: Optional[str] = None
        # NOTE(review): never read within this class as visible here —
        # kept for compatibility with possible external consumers.
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Symbols weigh twice as much as plain punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Bug fix: also clear per-document tracking state so a reused
        # plugin instance does not carry context from the previous payload.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """Weighted punctuation+symbol share; 0.0 below the 0.3 threshold."""
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
110
111
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Flag text whose alphabetic characters carry accents far more often
    than natural language would; a high accent density usually betrays a
    decoding mistake.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1
        self._accentuated_count += 1 if is_accentuated(character) else 0

    def reset(self) -> None:  # pragma: no cover
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        # Too few letters to draw a statistically meaningful conclusion.
        if self._character_count < 8:
            return 0.0

        accent_density: float = self._accentuated_count / self._character_count
        return accent_density if accent_density >= 0.35 else 0.0
137
138
class UnprintablePlugin(MessDetectorPlugin):
    """
    Count unprintable characters; their presence is a strong signal of a
    wrongly decoded payload.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0
        # Bug fix: the total character counter was previously left
        # untouched, skewing the ratio of any analysis after a reset.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        # Each unprintable character weighs heavily (x8) in the ratio.
        return (self._unprintable_count * 8) / self._character_count
161
162
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Spot improbable runs of accentuated Latin letters, a common artifact
    of mis-decoded single-byte encodings.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        previous = self._last_latin_character
        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            # Two successive accentuated upper-case letters are suspicious.
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Worse if its the same char duplicated with different accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count
198
199
class SuspiciousRange(MessDetectorPlugin):
    """
    Count adjacent printable characters originating from Unicode ranges
    that are unlikely to appear next to each other in legitimate text.
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        # Whitespace, punctuation and safe ASCII break the sequence.
        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        self._last_printable_seen = character

        if previous is None:
            return

        if is_suspiciously_successive_range(
            unicode_range(previous), unicode_range(character)
        ):
            self._suspicious_successive_range_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        # Too little material to be meaningful.
        if self._character_count <= 24:
            return 0.0

        return (
            self._suspicious_successive_range_count * 2
        ) / self._character_count
247
248
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Accumulate alphabetic characters into word buffers and flag "weird"
    words: heavily accentuated words, very long foreign (non-Latin/CJK/
    Kana/Hangul/Thai) words, words ending with an accentuated upper-case
    letter, and words interrupted by symbols. The ratio is the share of
    characters belonging to bad words.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        # Set while the word currently being buffered has been judged bad.
        self._is_current_word_bad: bool = False
        # True once the buffer holds a character that is accentuated or
        # outside the Latin/CJK/Hangul/Katakana/Hiragana/Thai scripts.
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Characters of the word currently being accumulated.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            # Still inside a word: extend the buffer and update watches.
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return
        # Non-alphabetic character with nothing buffered: nothing to judge.
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            # Word terminator reached: evaluate the buffered word.
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                # More than about a third of the letters accentuated is rare
                # in natural language.
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                if buffer_length >= 24 and self._foreign_long_watch:
                    # A very long foreign-looking word may simply be a
                    # camelCased identifier; tolerate it when upper-case
                    # letters are present but sparse (<= 30%).
                    camel_case_dst = [
                        i
                        for c, i in zip(self._buffer, range(0, buffer_length))
                        if c.isupper()
                    ]
                    probable_camel_cased: bool = False

                    if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                        probable_camel_cased = True

                    if not probable_camel_cased:
                        self._foreign_long_count += 1
                        self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            # Start a fresh word buffer.
            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # A symbol embedded in a word marks the word bad; the symbol
            # itself joins the buffer so it is counted with the word.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        # Too few words to judge, unless a long foreign word was observed.
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
352
353
class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the full stop incorrectly
    when the content does not fit, which is easily detected: watch for an
    overuse of '丅' and '丄' relative to genuine CJK content.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character == "丅" or character == "丄":
            self._wrong_stop_count += 1
        elif is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        # Require a minimum amount of CJK content before judging.
        return (
            self._wrong_stop_count / self._cjk_character_count
            if self._cjk_character_count >= 16
            else 0.0
        )
383
384
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Detect abnormal alternation between upper- and lower-case letters
    inside a chunk (e.g. "aBcDeF"), a frequent artifact of mis-decoding.
    A chunk is a run of case-variable alphabetic characters; its score is
    committed only for short non-ASCII chunks.
    """

    def __init__(self) -> None:
        # Set after one case flip; a second consecutive flip scores.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Score accumulated within the current chunk.
        self._successive_upper_lower_count: int = 0
        # Scores committed from completed chunks.
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        # True while the current chunk contains only ASCII characters.
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            # Chunk ended: commit its score only for short chunks that are
            # not pure ASCII and are not terminated by a digit.
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Reset per-chunk state; the separator still counts as a
            # processed character.
            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                # Two flips in a row score 2; a single flip only arms _buf.
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count
459
460
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """
    Measure how often Arabic letters appear in their isolated presentation
    form; a high proportion suggests the text was decoded incorrectly.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._isolated_form_count = 0

    def eligible(self, character: str) -> bool:
        return is_arabic(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if is_arabic_isolated_form(character):
            self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        # Not enough Arabic characters to be meaningful.
        if self._character_count < 8:
            return 0.0

        return self._isolated_form_count / self._character_count
487
488
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Tell whether two Unicode range names, observed on adjacent characters,
    form a pairing that legitimate text is unlikely to produce.
    """
    # An unknown range next to anything is always treated as suspicious.
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    latin_a = "Latin" in unicode_range_a
    latin_b = "Latin" in unicode_range_b

    if latin_a and latin_b:
        return False

    # Emoji may legitimately neighbour any script.
    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied with a combining diacritical mark,
    # eg. Vietnamese.
    if (latin_a or latin_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    keywords_range_a = unicode_range_a.split(" ")
    keywords_range_b = unicode_range_b.split(" ")

    # Ranges sharing a significant keyword ("Arabic", "Cyrillic", ...) are
    # considered compatible; secondary keywords are ignored.
    for keyword in keywords_range_a:
        if keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if keyword in keywords_range_b:
            return False

    # Japanese exception: kana mixes freely with CJK ideographs.
    range_a_jp = unicode_range_a in ("Hiragana", "Katakana")
    range_b_jp = unicode_range_b in ("Hiragana", "Katakana")
    has_cjk = "CJK" in unicode_range_a or "CJK" in unicode_range_b

    if (range_a_jp or range_b_jp) and has_cjk:
        return False
    if range_a_jp and range_b_jp:
        return False

    # Hangul mixes with CJK and with Basic Latin.
    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if has_cjk:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated ranges for punctuation and/or separators.
    if has_cjk or (range_a_jp and range_b_jp):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    return True
560
561
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    """

    # One fresh instance of every registered detector plugin.
    detectors: List[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

    # Checkpoint interval grows with the payload size.
    if length < 512:
        checkpoint_every: int = 32
    elif length <= 1024:
        checkpoint_every = 64
    else:
        checkpoint_every = 128

    # The trailing newline flushes any word/chunk buffers the plugins keep.
    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        at_checkpoint = index > 0 and index % checkpoint_every == 0
        if at_checkpoint or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)
            # Early exit: already messy enough, no need to scan further.
            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={checkpoint_every} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)