comparison charset_normalizer/md.py @ 7:5eb2d5e3bf22

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538

author: jpayne
date:   Sun, 05 May 2024 23:32:17 -0400
comparing 6:b2745907b1eb with 7:5eb2d5e3bf22

from functools import lru_cache
from logging import getLogger
from typing import List, Optional

from .constant import (
    COMMON_SAFE_ASCII_CHARACTERS,
    TRACE,
    UNICODE_SECONDARY_RANGE_KEYWORD,
)
from .utils import (
    is_accentuated,
    is_arabic,
    is_arabic_isolated_form,
    is_case_variable,
    is_cjk,
    is_emoticon,
    is_hangul,
    is_hiragana,
    is_katakana,
    is_latin,
    is_punctuation,
    is_separator,
    is_symbol,
    is_thai,
    is_unprintable,
    remove_accent,
    unicode_range,
)


class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement the given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if the given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon each character.
        Insert the logic by which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Reset the plugin to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; there is no upper bound.
        """
        raise NotImplementedError  # pragma: nocover

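# Illustrative sketch (not part of the original module): a minimal, hypothetical
# plugin following the contract above. Subclasses are picked up automatically by
# mess_ratio() via MessDetectorPlugin.__subclasses__(), which is why this example
# is left commented out rather than defined for real.
#
#     class TooManyReplacementCharPlugin(MessDetectorPlugin):  # hypothetical name
#         def __init__(self) -> None:
#             self._replacement_count: int = 0
#             self._character_count: int = 0
#
#         def eligible(self, character: str) -> bool:
#             return True
#
#         def feed(self, character: str) -> None:
#             self._character_count += 1
#             if character == "\ufffd":  # U+FFFD REPLACEMENT CHARACTER
#                 self._replacement_count += 1
#
#         def reset(self) -> None:
#             self._replacement_count = 0
#             self._character_count = 0
#
#         @property
#         def ratio(self) -> float:
#             if self._character_count == 0:
#                 return 0.0
#             return self._replacement_count / self._character_count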

class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: Optional[str] = None
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0

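# Worked example (not part of the original module): with the weighting used by
# TooManySymbolOrPunctuationPlugin above, a 10-character chunk containing 2
# punctuation marks and 1 counted symbol (weighted as 2) yields
# (2 + 2) / 10 = 0.4, which clears the 0.3 threshold; below that threshold the
# plugin reports 0.0.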

class TooManyAccentuatedPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        ratio_of_accentuation: float = self._accentuated_count / self._character_count
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0


class UnprintablePlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count

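# Worked example (not part of the original module): UnprintablePlugin weights
# each unprintable character by 8, so a single unprintable character within an
# 80-character sequence already contributes 8 / 80 = 0.1 to the overall ratio.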

class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if (
            self._last_latin_character is not None
            and is_accentuated(character)
            and is_accentuated(self._last_latin_character)
        ):
            if character.isupper() and self._last_latin_character.isupper():
                self._successive_count += 1
            # Worse if it's the same character duplicated with a different accent.
            if remove_accent(character) == remove_accent(self._last_latin_character):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count


class SuspiciousRange(MessDetectorPlugin):
    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        if self._last_printable_seen is None:
            self._last_printable_seen = character
            return

        unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
        unicode_range_b: Optional[str] = unicode_range(character)

        if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        if self._character_count <= 24:
            return 0.0

        ratio_of_suspicious_range_usage: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

        return ratio_of_suspicious_range_usage


class SuperWeirdWordPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Words/buffers ending with an upper-case accentuated letter are so rare
                # that we consider them all suspicious. Same weight as a foreign_long suspicion.
                if (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                if buffer_length >= 24 and self._foreign_long_watch:
                    camel_case_dst = [
                        i
                        for c, i in zip(self._buffer, range(0, buffer_length))
                        if c.isupper()
                    ]
                    probable_camel_cased: bool = False

                    if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                        probable_camel_cased = True

                    if not probable_camel_cased:
                        self._foreign_long_count += 1
                        self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count

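# Summary note (not part of the original module): SuperWeirdWordPlugin marks a
# buffered word as bad when, for example, more than 34% of its letters are
# accentuated, it ends with an upper-case accentuated letter inside an otherwise
# mixed-case word, or it is 24+ characters of watched "foreign" letters without
# a plausible camelCase pattern; the bad word's characters are then tallied into
# the ratio.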

class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the full stop incorrectly when the content
    does not fit, and this can be easily detected. Searching for the overuse of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        if self._cjk_character_count < 16:
            return 0.0
        return self._wrong_stop_count / self._cjk_character_count

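# Worked example (not part of the original module): after CjkInvalidStopPlugin
# has been fed 16 ordinary CJK characters plus 4 occurrences of '丅', its ratio
# is 4 / 16 = 0.25; the wrong-stop characters themselves are not added to the
# CJK character count because feed() returns early for them.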

class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count


class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._isolated_form_count = 0

    def eligible(self, character: str) -> bool:
        return is_arabic(character)

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_arabic_isolated_form(character):
            self._isolated_form_count += 1

    @property
    def ratio(self) -> float:
        if self._character_count < 8:
            return 0.0

        isolated_form_usage: float = self._isolated_form_count / self._character_count

        return isolated_form_usage


@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Determine if two Unicode ranges seen next to each other can be considered suspicious.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied by a combining diacritical mark,
    # e.g. Vietnamese.
    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    keywords_range_a, keywords_range_b = unicode_range_a.split(
        " "
    ), unicode_range_b.split(" ")

    for el in keywords_range_a:
        if el in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if el in keywords_range_b:
            return False

    # Japanese exception
    range_a_jp_chars, range_b_jp_chars = (
        unicode_range_a
        in (
            "Hiragana",
            "Katakana",
        ),
        unicode_range_b in ("Hiragana", "Katakana"),
    )
    if (range_a_jp_chars or range_b_jp_chars) and (
        "CJK" in unicode_range_a or "CJK" in unicode_range_b
    ):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated ranges for punctuation and/or separators.
    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
        unicode_range_a in ["Katakana", "Hiragana"]
        and unicode_range_b in ["Katakana", "Hiragana"]
    ):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    return True

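# Illustrative calls (not part of the original module); expected results given
# the rules above, using standard Unicode block names as returned by
# unicode_range():
#
#     is_suspiciously_successive_range("Basic Latin", "Cyrillic")          # True
#     is_suspiciously_successive_range("Latin Extended-A", "Basic Latin")  # False (both Latin)
#     is_suspiciously_successive_range("Hiragana", "Katakana")             # False (Japanese exception)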

@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded byte sequence. The maximum threshold stops the computation early.
    """

    detectors: List[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    for character, index in zip(decoded_sequence + "\n", range(length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)
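
# Example usage (illustrative, not part of the original module), assuming this
# vendored package is importable as charset_normalizer:
#
#     from charset_normalizer.md import mess_ratio
#
#     mess_ratio("A clean, ordinary English sentence.")  # expected to be ~0.0
#     mess_ratio("ÃƒÂ©Ã¢â‚¬â„¢ mojibake-looking input")   # typical mojibake is expected to score higher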