charset_normalizer/cd.py @ 7:5eb2d5e3bf22

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Sun, 05 May 2024 23:32:17 -0400

import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple

from .constant import (
    FREQUENCIES,
    KO_NAMES,
    LANGUAGE_SUPPORTED_COUNT,
    TOO_SMALL_SEQUENCE,
    ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)


def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    Return the Unicode ranges associated with a single-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(
        "encodings.{}".format(iana_name)
    ).IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: Dict[str, int] = {}
    character_count: int = 0

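    # Decode each byte in the printable upper region (0x40-0xFF) one at a
    # time to discover which Unicode ranges this code page maps to.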
    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: Optional[str] = unicode_range(chunk)

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

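    # Keep only ranges covering at least 15% of the decoded characters.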
    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )


def unicode_range_languages(primary_range: str) -> List[str]:
    """
    Return the languages inferred to be used with a given Unicode range.
    """
    languages: List[str] = []

    for language, characters in FREQUENCIES.items():
        for character in characters:
            if unicode_range(character) == primary_range:
                languages.append(language)
                break

    return languages


@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Single-byte encoding language association. Some code pages are heavily
    tied to particular language(s). This function maps that correspondence.
    """
    unicode_ranges: List[str] = encoding_unicode_range(iana_name)
    primary_range: Optional[str] = None

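    # Latin ranges appear in most single-byte code pages, so the first
    # non-Latin range found is treated as the discriminating one.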
    for specified_range in unicode_ranges:
        if "Latin" not in specified_range:
            primary_range = specified_range
            break

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)


@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily
    tied to particular language(s). This function maps that correspondence.
    """
    if (
        iana_name.startswith("shift_")
        or iana_name.startswith("iso2022_jp")
        or iana_name.startswith("euc_j")
        or iana_name == "cp932"
    ):
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []


@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Determine the main traits of a supported language: whether it uses
    accentuated characters and whether its alphabet is purely Latin.
    """
    target_have_accents: bool = False
    target_pure_latin: bool = True

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False

    return target_have_accents, target_pure_latin


def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages plausibly associated with the given characters.
    """
    languages: List[Tuple[str, float]] = []

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = len(
            [c for c in language_characters if c in characters]
        )

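        # A language qualifies when at least 20% of its frequent characters
        # appear in the given character set.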
        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]


def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine whether an ordered character list (by occurrence, from most
    frequent to rarest) matches a particular language.
    The result is a ratio between 0.0 (absolutely no correspondence) and 1.0 (a near-perfect fit).
    Beware that this function is not strict on the match, in order to ease detection (meaning a close match counts as 1.0).
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    character_approved_count: int = 0
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    large_alphabet: bool = target_language_characters_count > 26

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in FREQUENCIES_language_set:
            continue

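        # Project the character's rank in the sample onto the scale of the
        # language's frequency table so that the two rankings are comparable.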
        character_rank_in_language: int = FREQUENCIES[language].index(character)
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

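        # Fallback: compare the set of characters ranked before/after this one
        # in the sample with those ranked before/after it in the language's
        # frequency table.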
        characters_before_source: List[str] = FREQUENCIES[language][
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = FREQUENCIES[language][
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)


def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Given a decoded text sequence, return a list of str split by Unicode range / alphabet.
    E.g. a text containing English/Latin with a bit of Hebrew will return two items in the
    resulting list: one containing the Latin letters and the other the Hebrew ones.
    """
    layers: Dict[str, str] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        layer_target_range: Optional[str] = None

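        # Attach the character to the first existing layer whose range can
        # plausibly appear alongside this character's range.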
        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = character.lower()
            continue

        layers[layer_target_range] += character.lower()

    return list(layers.values())


def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge results previously produced by the coherence_ratio function.
    The return type is the same as that of coherence_ratio.
    """
    per_language_ratios: Dict[str, List[float]] = {}
    for result in results:
        for sub_result in result:
            language, ratio = sub_result
            if language not in per_language_ratios:
                per_language_ratios[language] = [ratio]
                continue
            per_language_ratios[language].append(ratio)

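    # Average the ratios collected for each language across all partial results.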
    merge = [
        (
            language,
            round(
                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
                4,
            ),
        )
        for language in per_language_ratios
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)


def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function keeps only the best match and removes the em-dash from it.
    """
    index_results: Dict[str, List[float]] = dict()

    for result in results:
        language, ratio = result
        no_em_name: str = language.replace("—", "")

        if no_em_name not in index_results:
            index_results[no_em_name] = []

        index_results[no_em_name].append(ratio)

    if any(len(index_results[e]) > 1 for e in index_results):
        filtered_results: CoherenceMatches = []

        for language in index_results:
            filtered_results.append((language, max(index_results[language])))

        return filtered_results

    return results


@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence. The sequence is analysed by layers.
    A layer = character extraction by alphabet/range.
    """

    results: List[Tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

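    # Analyse the text one alphabet layer at a time (see alpha_unicode_split).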
    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: List[str] = [c for c, o in most_common]

        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

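            # Stop probing further languages for this layer once three strong
            # matches (ratio >= 0.8) have been recorded.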
            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )