jpayne@7
|
1 from functools import lru_cache
|
jpayne@7
|
2 from logging import getLogger
|
jpayne@7
|
3 from typing import List, Optional
|
jpayne@7
|
4
|
jpayne@7
|
5 from .constant import (
|
jpayne@7
|
6 COMMON_SAFE_ASCII_CHARACTERS,
|
jpayne@7
|
7 TRACE,
|
jpayne@7
|
8 UNICODE_SECONDARY_RANGE_KEYWORD,
|
jpayne@7
|
9 )
|
jpayne@7
|
10 from .utils import (
|
jpayne@7
|
11 is_accentuated,
|
jpayne@7
|
12 is_arabic,
|
jpayne@7
|
13 is_arabic_isolated_form,
|
jpayne@7
|
14 is_case_variable,
|
jpayne@7
|
15 is_cjk,
|
jpayne@7
|
16 is_emoticon,
|
jpayne@7
|
17 is_hangul,
|
jpayne@7
|
18 is_hiragana,
|
jpayne@7
|
19 is_katakana,
|
jpayne@7
|
20 is_latin,
|
jpayne@7
|
21 is_punctuation,
|
jpayne@7
|
22 is_separator,
|
jpayne@7
|
23 is_symbol,
|
jpayne@7
|
24 is_thai,
|
jpayne@7
|
25 is_unprintable,
|
jpayne@7
|
26 remove_accent,
|
jpayne@7
|
27 unicode_range,
|
jpayne@7
|
28 )
|
jpayne@7
|
29
|
jpayne@7
|
30
|
jpayne@7
|
class MessDetectorPlugin:
    """
    Common interface shared by every mess detection plugin.
    Concrete detectors MUST subclass it and override each member below.
    """

    def eligible(self, character: str) -> bool:
        """
        Tell whether the given character should be handed over to feed().
        """
        raise NotImplementedError()  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        Process a single character.
        This is where a detector records what makes the text look chaotic.
        """
        raise NotImplementedError()  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Bring the plugin back to its initial state.
        """
        raise NotImplementedError()

    @property
    def ratio(self) -> float:
        """
        Chaos ratio derived from everything feed() has seen so far.
        Must NOT be lower than 0.; there is no upper bound.
        """
        raise NotImplementedError()  # pragma: nocover
|
jpayne@7
|
63
|
jpayne@7
|
64
|
jpayne@7
|
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Flag content where punctuation and symbols take up an unusual share
    of the printable characters.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        self._last_printable_char: Optional[str] = None
        # Not read anywhere in this class; kept so the attribute set is unchanged.
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        """Only printable characters are analysed."""
        return character.isprintable()

    def feed(self, character: str) -> None:
        """Count the character and, when relevant, its punctuation/symbol weight."""
        self._character_count += 1

        # An immediate repetition of the previous character, and anything
        # listed in COMMON_SAFE_ASCII_CHARACTERS, never adds to the counters.
        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # A symbol weighs twice as much as a punctuation sign.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        """Restore every piece of state set up by __init__."""
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Without clearing these, the first character fed after a reset could
        # be mistaken for a repetition of the last character of the prior run.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """
        Weighted share of punctuation and symbols among the fed characters.
        Anything below 0.3 is reported as 0.0.
        """
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
jpayne@7
|
110
|
jpayne@7
|
111
|
jpayne@7
|
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Flag content in which accentuated letters make up a large share of
    the alphabetic characters.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        """Only alphabetic characters are analysed."""
        return character.isalpha()

    def feed(self, character: str) -> None:
        """Count the letter, and count it again as accentuated when it is."""
        self._character_count += 1
        self._accentuated_count += 1 if is_accentuated(character) else 0

    def reset(self) -> None:  # pragma: no cover
        """Zero both counters."""
        self._character_count = self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of accentuated letters. Reported as 0.0 when fewer than
        8 letters were fed or when the share is below 0.35.
        """
        if self._character_count < 8:
            return 0.0

        accentuated_share: float = self._accentuated_count / self._character_count
        if accentuated_share >= 0.35:
            return accentuated_share
        return 0.0
|
jpayne@7
|
137
|
jpayne@7
|
138
|
jpayne@7
|
class UnprintablePlugin(MessDetectorPlugin):
    """
    Flag content containing characters reported as unprintable by
    is_unprintable().
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is analysed."""
        return True

    def feed(self, character: str) -> None:
        """Count the character, and count it again as unprintable when it is."""
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """Zero both counters."""
        self._unprintable_count = 0
        # The total must be cleared as well; otherwise the denominator used
        # by ratio keeps growing across resets and dilutes later results.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of unprintable characters, each one weighted eight times.
        Returns 0.0 when nothing has been fed.
        """
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
|
jpayne@7
|
161
|
jpayne@7
|
162
|
jpayne@7
|
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Flag content where two accentuated Latin letters directly follow
    each other in a suspicious way.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """Only alphabetic Latin characters are analysed."""
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        """Compare the letter with the previously fed one and score the pair."""
        self._character_count += 1

        previous = self._last_latin_character
        both_accentuated = (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        )

        if both_accentuated:
            penalty = 0
            if character.isupper() and previous.isupper():
                penalty += 1
            # Even worse when it is the same base letter carrying another accent.
            if remove_accent(character) == remove_accent(previous):
                penalty += 1
            self._successive_count += penalty

        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        """Zero the counters and forget the previous letter."""
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        """
        Suspicious pair score, doubled, relative to the number of fed letters.
        Returns 0.0 when nothing has been fed.
        """
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count
|
jpayne@7
|
198
|
jpayne@7
|
199
|
jpayne@7
|
class SuspiciousRange(MessDetectorPlugin):
    """
    Flag content where neighbouring characters belong to Unicode ranges
    that is_suspiciously_successive_range() considers an unlikely pair.
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """Only printable characters are analysed."""
        return character.isprintable()

    def feed(self, character: str) -> None:
        """Count the character and compare its range with the previous one."""
        self._character_count += 1

        breaks_the_sequence = (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        )
        if breaks_the_sequence:
            # Nothing is compared across such a character.
            self._last_printable_seen = None
            return

        previous = self._last_printable_seen
        if previous is not None:
            range_of_previous: Optional[str] = unicode_range(previous)
            range_of_current: Optional[str] = unicode_range(character)

            if is_suspiciously_successive_range(range_of_previous, range_of_current):
                self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # pragma: no cover
        """Zero the counters and forget the previous character."""
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        """
        Number of suspicious pairs, doubled, relative to the fed characters.
        Reported as 0.0 until more than 24 characters have been fed.
        """
        if self._character_count <= 24:
            return 0.0

        return (
            self._suspicious_successive_range_count * 2
        ) / self._character_count
|
jpayne@7
|
247
|
jpayne@7
|
248
|
jpayne@7
|
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Flag content made of odd-looking words: heavily accentuated ones, words
    ending with an upper case accentuated letter, very long foreign-looking
    ones, and words containing symbols.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Characters of the word being read, and how many are accentuated.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is analysed."""
        return True

    def feed(self, character: str) -> None:
        """
        Accumulate letters into the current word; when a whitespace,
        punctuation or separator character arrives, judge the word and
        start over.
        """
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return

        # A non-letter only matters once a word has been started.
        if not self._buffer:
            return

        if character.isspace() or is_punctuation(character) or is_separator(character):
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Words ending with an upper case accentuated letter are so rare
                # that all of them are treated as suspicious, with the same
                # weight as a suspicious long foreign word.
                if (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if buffer_length >= 24 and self._foreign_long_watch:
                uppercase_count: int = sum(1 for _ in self._buffer if _.isupper())

                # A long word with a few (at most 30%) upper case letters is
                # assumed to be camel cased and is not penalised.
                probable_camel_cased: bool = (
                    uppercase_count > 0 and uppercase_count / buffer_length <= 0.3
                )

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += buffer_length
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # A symbol inside a word makes the whole word bad.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        """Restore every piece of state set up by __init__."""
        self._buffer = ""
        # Must be cleared together with the buffer, otherwise accents of a
        # dropped partial word would be attributed to the next word.
        self._buffer_accent_count = 0
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of characters that belong to bad words. Reported as 0.0 while
        at most 10 words were seen and no foreign-long word was counted.
        """
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
|
jpayne@7
|
352
|
jpayne@7
|
353
|
jpayne@7
|
class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the stop incorrectly when the
    content does not fit, which is easy to spot: look for an overuse of
    '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is analysed."""
        return True

    def feed(self, character: str) -> None:
        """Count wrong stops and, separately, any other CJK character."""
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
        elif is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """Zero both counters."""
        self._wrong_stop_count = self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        """
        Wrong stops relative to the other CJK characters.
        Reported as 0.0 until at least 16 CJK characters were fed.
        """
        enough_cjk_seen = self._cjk_character_count >= 16
        if not enough_cjk_seen:
            return 0.0
        return self._wrong_stop_count / self._cjk_character_count
|
jpayne@7
|
383
|
jpayne@7
|
384
|
jpayne@7
|
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Flag content where the letter case keeps flipping back and forth
    inside chunks of case-variable letters.
    """

    def __init__(self) -> None:
        # True when the previous pair of letters already flipped case once.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Score of the chunk in progress, and the total of accepted chunks.
        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        """Every character is analysed."""
        return True

    def feed(self, character: str) -> None:
        """
        Track case flips between consecutive letters; a character that is
        not a case-variable letter closes the chunk in progress.
        """
        case_variable_letter = character.isalpha() and is_case_variable(character)

        if case_variable_letter is False and self._character_count_since_last_sep > 0:
            # The chunk score is only kept for short chunks that contain a
            # non-ASCII character and are not closed by a digit.
            chunk_is_accepted = (
                self._character_count_since_last_sep <= 64
                and not character.isdigit()
                and not self._current_ascii_only
            )
            if chunk_is_accepted:
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only and not character.isascii():
            self._current_ascii_only = False

        previous = self._last_alpha_seen
        if previous is not None:
            case_flipped = (character.isupper() and previous.islower()) or (
                character.islower() and previous.isupper()
            )
            if case_flipped and self._buf:
                # Second flip in a row: score it and start counting afresh.
                self._successive_upper_lower_count += 2
                self._buf = False
            else:
                self._buf = case_flipped

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        """Restore every piece of state set up by __init__."""
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        """
        Accepted case-flip score relative to the number of fed characters.
        Returns 0.0 when nothing has been fed.
        """
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count
|
jpayne@7
|
459
|
jpayne@7
|
460
|
jpayne@7
|
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
    """
    Measure how many of the Arabic characters are reported by
    is_arabic_isolated_form() as being in isolated form.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._isolated_form_count: int = 0

    def eligible(self, character: str) -> bool:
        """Only Arabic characters are analysed."""
        return is_arabic(character)

    def feed(self, character: str) -> None:
        """Count the character, and count it again when in isolated form."""
        self._character_count += 1
        self._isolated_form_count += 1 if is_arabic_isolated_form(character) else 0

    def reset(self) -> None:  # pragma: no cover
        """Zero both counters."""
        self._character_count = self._isolated_form_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of isolated forms among the fed Arabic characters.
        Reported as 0.0 until at least 8 characters were fed.
        """
        if self._character_count < 8:
            return 0.0

        return self._isolated_form_count / self._character_count
|
jpayne@7
|
487
|
jpayne@7
|
488
|
jpayne@7
|
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Determine if two Unicode ranges seen next to each other can be considered as suspicious.

    :param unicode_range_a: Name of the first range, or None when unknown.
    :param unicode_range_b: Name of the second range, or None when unknown.
    :return: True when the pair is suspicious, False when it is acceptable.
    """
    # An unknown range is always suspicious.
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    def in_either(keyword: str) -> bool:
        """Tell whether the keyword appears in at least one of the range names."""
        return keyword in unicode_range_a or keyword in unicode_range_b

    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
        return False

    if in_either("Emoticons"):
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if in_either("Latin") and in_either("Combining"):
        return False

    # Two ranges sharing a word that is not listed in
    # UNICODE_SECONDARY_RANGE_KEYWORD are not suspicious.
    keywords_range_b = unicode_range_b.split(" ")
    if any(
        keyword not in UNICODE_SECONDARY_RANGE_KEYWORD and keyword in keywords_range_b
        for keyword in unicode_range_a.split(" ")
    ):
        return False

    cjk_involved = in_either("CJK")
    basic_latin_involved = "Basic Latin" in (unicode_range_a, unicode_range_b)

    # Japanese Exception
    kana_ranges = ("Hiragana", "Katakana")
    range_a_jp_chars = unicode_range_a in kana_ranges
    range_b_jp_chars = unicode_range_b in kana_ranges

    if (range_a_jp_chars or range_b_jp_chars) and cjk_involved:
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if in_either("Hangul") and (cjk_involved or basic_latin_involved):
        return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    # The "both ranges are Katakana/Hiragana" case needs no handling here: it
    # has already returned False in the Japanese exception above.
    if cjk_involved and (
        in_either("Punctuation") or in_either("Forms") or basic_latin_involved
    ):
        return False

    return True
|
jpayne@7
|
560
|
jpayne@7
|
561
|
jpayne@7
|
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute the mess ratio of a decoded sequence by summing the ratio of every detector.
    The computation stops early as soon as the maximum threshold is reached.
    """

    # One fresh instance of each direct subclass of MessDetectorPlugin.
    detectors: List[MessDetectorPlugin] = [
        plugin_class() for plugin_class in MessDetectorPlugin.__subclasses__()
    ]

    # A line feed is appended to what gets analysed, hence the extra unit;
    # presumably so word-buffering detectors see a final separator.
    length: int = len(decoded_sequence) + 1
    last_index: int = length - 1

    # How often (in characters) the intermediate ratio is evaluated.
    intermediary_mean_mess_ratio_calc: int = (
        32 if length < 512 else 64 if length <= 1024 else 128
    )

    mean_mess_ratio: float = 0.0

    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        at_checkpoint = (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == last_index
        if not at_checkpoint:
            continue

        mean_mess_ratio = sum(dt.ratio for dt in detectors)

        if mean_mess_ratio >= maximum_threshold:
            break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)
|