comparison charset_normalizer/utils.py @ 7:5eb2d5e3bf22

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Sun, 05 May 2024 23:32:17 -0400
import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder

from .constant import (
    ENCODING_MARKS,
    IANA_SUPPORTED_SIMILAR,
    RE_POSSIBLE_ENCODING_INDICATION,
    UNICODE_RANGES_COMBINED,
    UNICODE_SECONDARY_RANGE_KEYWORD,
    UTF8_MAXIMAL_ALLOCATION,
)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return (
        "WITH GRAVE" in description
        or "WITH ACUTE" in description
        or "WITH CEDILLA" in description
        or "WITH DIAERESIS" in description
        or "WITH CIRCUMFLEX" in description
        or "WITH TILDE" in description
        or "WITH MACRON" in description
        or "WITH RING ABOVE" in description
    )


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: List[str] = decomposed.split(" ")

    return chr(int(codes[0], 16))

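# A minimal usage sketch (illustrative only, not part of the module):
# the first code point of the canonical decomposition is kept, which
# strips a combining accent from precomposed characters.
#
#     >>> remove_accent("é")
#     'e'
#     >>> remove_accent("e")
#     'e'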

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Retrieve the official Unicode range name for a single character.
    """
    character_ord: int = ord(character)

    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
        if character_ord in ord_range:
            return range_name

    return None

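# A minimal usage sketch (illustrative only, not part of the module),
# assuming UNICODE_RANGES_COMBINED uses the official block names:
#
#     >>> unicode_range("a")
#     'Basic Latin'
#     >>> unicode_range("é")
#     'Latin-1 Supplement'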

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return "LATIN" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "P" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Punctuation" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "S" in character_category or "N" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Forms" in character_range and character_category != "Lo"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range or "Pictographs" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "THAI" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "ARABIC" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "ARABIC" in character_name and "ISOLATED FORM" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1A"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python: the Zero Width No-Break Space
        # (located in Arabic Presentation Forms-B, Unicode 1.1) is not acknowledged as a space.
    )


def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
    """
    Extract any specified encoding from the first n bytes (search_zone), using an ASCII-only decoder.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    seq_len: int = len(sequence)

    results: List[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None

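# A minimal usage sketch (illustrative only, not part of the module):
# the helper scans the head of a byte payload for a declared charset,
# e.g. in an XML prolog, and normalizes it to a Python codec name.
#
#     >>> any_specified_encoding(b'<?xml version="1.0" encoding="UTF-8"?>')
#     'utf_8'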

@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is multi-byte, based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )


def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify and extract SIG/BOM in given sequence.
    """

    for iana_encoding in ENCODING_MARKS:
        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""

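# A minimal usage sketch (illustrative only, not part of the module):
# a payload opening with the UTF-8 byte order mark is reported along
# with the matched mark, while an unmarked payload yields (None, b"").
#
#     >>> identify_sig_or_bom(b"\xef\xbb\xbfhello")
#     ('utf_8', b'\xef\xbb\xbf')
#     >>> identify_sig_or_bom(b"hello")
#     (None, b'')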

def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}


def iana_name(cp_name: str, strict: bool = True) -> str:
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))

    return cp_name

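# A minimal usage sketch (illustrative only, not part of the module):
# spellings are lowered, dashes become underscores, and known aliases
# from encodings.aliases resolve to their canonical codec name.
#
#     >>> iana_name("latin-1")
#     'latin_1'
#     >>> iana_name("UTF-8")
#     'utf_8'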

def range_scan(decoded_sequence: str) -> List[str]:
    ranges: Set[str] = set()

    for character in decoded_sequence:
        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        ranges.add(character_range)

    return list(ranges)

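# A minimal usage sketch (illustrative only, not part of the module):
# each distinct Unicode range seen in the text is reported once
# (the result order is unspecified, since it comes from a set).
#
#     >>> range_scan("abc")
#     ['Basic Latin']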

def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    # Compare how bytes 0x00-0xFE decode under each single-byte codec.
    for i in range(255):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 254

def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR dict was
    generated using the cp_similarity function.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )

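# A minimal usage sketch (illustrative only, not part of the module):
# cp1252 and latin_1 decode most single bytes identically (they differ
# only in the 0x80-0x9F window), so their similarity ratio is high.
#
#     >>> cp_similarity("cp1252", "latin_1") > 0.8
#     True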

def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)

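# A minimal usage sketch (illustrative only, not part of the module):
# attach a stream handler with the default format at DEBUG verbosity.
#
#     >>> set_logging_handler(level=logging.DEBUG)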

def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # Detect and adjust a bad multi-byte cut; not the cleanest way
            # to perform that fix, but clever enough for now.
            if is_multi_byte_decoder and i > 0:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk
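# A minimal usage sketch (illustrative only, not part of the module):
# decode a UTF-8 payload in fixed-size windows; with no BOM/SIG and no
# pre-decoded payload, the generator simply yields each decoded slice.
#
#     >>> payload = "hello world".encode("utf_8")
#     >>> list(cut_sequence_chunks(
#     ...     payload, "utf_8", range(0, len(payload), 5), 5,
#     ...     bom_or_sig_available=False, strip_sig_or_bom=False,
#     ...     sig_payload=b"", is_multi_byte_decoder=True,
#     ... ))
#     ['hello', ' worl', 'd']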