Mercurial > repos > jpayne > bioproject_to_srr_2

comparison charset_normalizer/utils.py @ 7:5eb2d5e3bf22

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538

author: jpayne
date:   Sun, 05 May 2024 23:32:17 -0400
comparing 6:b2745907b1eb with 7:5eb2d5e3bf22
import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder

from .constant import (
    ENCODING_MARKS,
    IANA_SUPPORTED_SIMILAR,
    RE_POSSIBLE_ENCODING_INDICATION,
    UNICODE_RANGES_COMBINED,
    UNICODE_SECONDARY_RANGE_KEYWORD,
    UTF8_MAXIMAL_ALLOCATION,
)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return (
        "WITH GRAVE" in description
        or "WITH ACUTE" in description
        or "WITH CEDILLA" in description
        or "WITH DIAERESIS" in description
        or "WITH CIRCUMFLEX" in description
        or "WITH TILDE" in description
        or "WITH MACRON" in description
        or "WITH RING ABOVE" in description
    )

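# Illustrative usage of is_accentuated (a sketch, not part of the upstream
# module): U+00E9 is named "LATIN SMALL LETTER E WITH ACUTE", so it matches.
# >>> is_accentuated("\u00e9")
# True
# >>> is_accentuated("e")
# False
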
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: List[str] = decomposed.split(" ")

    return chr(int(codes[0], 16))

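# Illustrative usage of remove_accent (a sketch, not part of the upstream
# module): U+00E9 decomposes to "0065 0301", and the first code point is kept.
# >>> remove_accent("\u00e9")
# 'e'
# >>> remove_accent("a")  # no decomposition, returned unchanged
# 'a'
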
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Retrieve the official Unicode range name for a single character.
    """
    character_ord: int = ord(character)

    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
        if character_ord in ord_range:
            return range_name

    return None

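# Illustrative usage of unicode_range (a sketch; the exact strings come from
# UNICODE_RANGES_COMBINED, which mirrors the official Unicode block names).
# >>> unicode_range("a")
# 'Basic Latin'
# >>> unicode_range("\u00e9")
# 'Latin-1 Supplement'
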
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return "LATIN" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "P" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Punctuation" in character_range

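# Illustrative usage of is_punctuation (a sketch, not part of the upstream
# module): "!" has Unicode category "Po", so the category check succeeds.
# >>> is_punctuation("!")
# True
# >>> is_punctuation("a")
# False
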
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "S" in character_category or "N" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Forms" in character_range and character_category != "Lo"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range or "Pictographs" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()

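# Illustrative usage of is_case_variable (a sketch): a cased letter reports
# different islower()/isupper() results, while a digit reports False for both.
# >>> is_case_variable("a")
# True
# >>> is_case_variable("5")
# False
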
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "THAI" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "ARABIC" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "ARABIC" in character_name and "ISOLATED FORM" in character_name

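# Illustrative usage of the script detectors above (a sketch): each matches on
# the official character name, e.g. U+3042 is "HIRAGANA LETTER A" and U+4E2D
# is "CJK UNIFIED IDEOGRAPH-4E2D".
# >>> is_hiragana("\u3042")
# True
# >>> is_cjk("\u4e2d")
# True
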
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1A"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python: Zero Width
        # No-Break Space (in Arabic Presentation Forms-B, Unicode 1.1) is
        # not acknowledged as a space.
    )

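# Illustrative usage of is_unprintable (a sketch): NUL is neither whitespace
# nor printable, while a plain space is whitespace and therefore not flagged.
# >>> is_unprintable("\x00")
# True
# >>> is_unprintable(" ")
# False
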
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
    """
    Extract any encoding declared in the first n bytes, using an ASCII-only decoder.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    seq_len: int = len(sequence)

    results: List[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None

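# Illustrative usage of any_specified_encoding (a sketch; the exact matches
# depend on RE_POSSIBLE_ENCODING_INDICATION and encodings.aliases): an HTML
# meta charset declaration is normalized to its Python codec name.
# >>> any_specified_encoding(b'<meta charset="utf-8">')
# 'utf_8'
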
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is a multi-byte one, based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )

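# Illustrative usage of is_multi_byte_encoding (a sketch): the UTF families
# are whitelisted, and CJK codecs are caught via MultibyteIncrementalDecoder.
# >>> is_multi_byte_encoding("utf_8")
# True
# >>> is_multi_byte_encoding("big5")
# True
# >>> is_multi_byte_encoding("latin_1")
# False
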
def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify and extract a SIG/BOM from the given sequence.
    """

    for iana_encoding in ENCODING_MARKS:
        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""

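# Illustrative usage of identify_sig_or_bom (a sketch; the recognized marks
# come from ENCODING_MARKS): a UTF-8 BOM prefix is detected and returned.
# >>> identify_sig_or_bom(b"\xef\xbb\xbfhello")
# ('utf_8', b'\xef\xbb\xbf')
# >>> identify_sig_or_bom(b"hello")
# (None, b'')
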
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}


def iana_name(cp_name: str, strict: bool = True) -> str:
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))

    return cp_name

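# Illustrative usage of iana_name (a sketch): labels are lower-cased, dashes
# become underscores, and encodings.aliases resolves the result.
# >>> iana_name("UTF-8")
# 'utf_8'
# >>> iana_name("windows-1252")
# 'cp1252'
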
def range_scan(decoded_sequence: str) -> List[str]:
    ranges: Set[str] = set()

    for character in decoded_sequence:
        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        ranges.add(character_range)

    return list(ranges)

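# Illustrative usage of range_scan (a sketch; the order of the returned list
# is unspecified because a set is used internally, hence the sorted() call).
# >>> sorted(range_scan("abc\u0414"))
# ['Basic Latin', 'Cyrillic']
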
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    for i in range(255):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 254

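# Illustrative behaviour of cp_similarity (a sketch): cp1252 and latin_1 only
# disagree on the 0x80-0x9F block, so their ratio lands near 0.88, whereas any
# multi-byte codec short-circuits to 0.0.
# >>> round(cp_similarity("cp1252", "latin_1"), 2)  # approximately
# 0.88
# >>> cp_similarity("utf_8", "latin_1")
# 0.0
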
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR
    dict was generated using the function cp_similarity.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )

def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)

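# Illustrative usage of set_logging_handler (a sketch): attach a stream
# handler so the package's internal logging becomes visible on stderr.
# >>> import logging
# >>> set_logging_handler(level=logging.DEBUG)
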
def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # multi-byte bad-cut detector and adjustment:
            # not the cleanest way to perform this fix, but clever enough for now.
            if is_multi_byte_decoder and i > 0:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk
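
# Illustrative usage of cut_sequence_chunks (a sketch; the payload and the
# offsets/chunk_size values below are arbitrary, chosen only to show the
# expected argument shapes):
# >>> raw = "stra\u00dfe \u00e9t\u00e9".encode("utf_8") * 16
# >>> for chunk in cut_sequence_chunks(
# ...     raw,
# ...     "utf_8",
# ...     offsets=range(0, len(raw), 32),
# ...     chunk_size=32,
# ...     bom_or_sig_available=False,
# ...     strip_sig_or_bom=False,
# ...     sig_payload=b"",
# ...     is_multi_byte_decoder=True,
# ... ):
# ...     pass  # each chunk is a decoded str slice of the payload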