comparison charset_normalizer/cd.py @ 7:5eb2d5e3bf22
planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author: jpayne
date: Sun, 05 May 2024 23:32:17 -0400
parents:
children:
comparing 6:b2745907b1eb with 7:5eb2d5e3bf22
import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple

from .constant import (
    FREQUENCIES,
    KO_NAMES,
    LANGUAGE_SUPPORTED_COUNT,
    TOO_SMALL_SEQUENCE,
    ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)


def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    Return the Unicode ranges associated with a single-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(
        "encodings.{}".format(iana_name)
    ).IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: Dict[str, int] = {}
    character_count: int = 0

    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: Optional[str] = unicode_range(chunk)

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )
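# Illustrative usage, a sketch rather than library documentation: the exact
# ranges depend on the code page tables shipped with the running Python, so
# the cp1251 output shown is an assumption.
#
#   >>> encoding_unicode_range("cp1251")
#   ['Basic Latin', 'Cyrillic']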


def unicode_range_languages(primary_range: str) -> List[str]:
    """
    Return inferred languages used with a unicode range.
    """
    languages: List[str] = []

    for language, characters in FREQUENCIES.items():
        for character in characters:
            if unicode_range(character) == primary_range:
                languages.append(language)
                break

    return languages
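# Illustrative usage, a sketch: the result enumerates every language whose
# FREQUENCIES table contains at least one character in the given range, so
# the exact list below is an assumption.
#
#   >>> unicode_range_languages("Cyrillic")
#   ['Russian', 'Ukrainian', 'Bulgarian', ...]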


@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Single-byte encoding language association. Some code pages are heavily
    linked to particular language(s); this function performs that correspondence.
    """
    unicode_ranges: List[str] = encoding_unicode_range(iana_name)
    primary_range: Optional[str] = None

    for specified_range in unicode_ranges:
        if "Latin" not in specified_range:
            primary_range = specified_range
            break

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
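# Illustrative usage, a sketch; the outputs are assumptions since they follow
# from the decoded ranges and the FREQUENCIES tables at runtime:
#
#   >>> encoding_languages("ascii")    # only Latin ranges found
#   ['Latin Based']
#   >>> encoding_languages("cp1251")   # Cyrillic is the primary non-Latin range
#   ['Russian', ...]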


@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily
    linked to particular language(s); this function performs that correspondence.
    """
    if (
        iana_name.startswith("shift_")
        or iana_name.startswith("iso2022_jp")
        or iana_name.startswith("euc_j")
        or iana_name == "cp932"
    ):
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
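# Illustrative usage; these follow directly from the prefix checks above
# (assuming "utf_8" is absent from ZH_NAMES and KO_NAMES):
#
#   >>> mb_encoding_languages("shift_jis")
#   ['Japanese']
#   >>> mb_encoding_languages("gb18030")
#   ['Chinese']
#   >>> mb_encoding_languages("utf_8")
#   []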


@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Determine the main traits of a supported language: whether it contains
    accented characters and whether it is purely Latin-based.
    """
    target_have_accents: bool = False
    target_pure_latin: bool = True

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False

    return target_have_accents, target_pure_latin
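# Illustrative usage, a sketch; the tuples are assumptions since they depend
# on the bundled FREQUENCIES tables:
#
#   >>> get_target_features("English")   # no accents, Latin only
#   (False, True)
#   >>> get_target_features("French")    # accents, Latin only
#   (True, True)
#   >>> get_target_features("Russian")   # no accents, not Latin
#   (False, False)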


def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages associated with the given characters.
    """
    languages: List[Tuple[str, float]] = []

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = len(
            [c for c in language_characters if c in characters]
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
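# Illustrative usage, a sketch; any language whose frequency table shares at
# least 20% of its characters with the input qualifies, so the exact list and
# ordering are assumptions:
#
#   >>> alphabet_languages(list("etaoinshrdlcumwf"))
#   ['English', 'Dutch', ...]   # ordered by overlap ratio, best first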


def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine whether an ordered character list (from most frequent to rarest)
    matches a particular language. The result is a ratio between 0.0 (no
    correspondence) and 1.0 (near-perfect fit). Beware that this function is
    deliberately lenient in order to ease detection (a close match counts as
    a full match).
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    character_approved_count: int = 0
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    large_alphabet: bool = target_language_characters_count > 26

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in FREQUENCIES_language_set:
            continue

        character_rank_in_language: int = FREQUENCIES[language].index(character)
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        characters_before_source: List[str] = FREQUENCIES[language][
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = FREQUENCIES[language][
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)
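# Illustrative usage, a sketch; the exact score depends on the English
# frequency table, so no concrete value is asserted here:
#
#   observed = ["e", "t", "a", "o", "i", "n"]   # most frequent first
#   score = characters_popularity_compare("English", observed)
#   # 0.0 <= score <= 1.0; higher means the observed ranking tracks English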


def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Given a decoded text sequence, return a list of str: a Unicode-range /
    alphabet separation. E.g. a text containing English/Latin with a bit of
    Hebrew will return two items in the resulting list, one containing the
    Latin letters and the other the Hebrew ones.
    """
    layers: Dict[str, str] = {}

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        layer_target_range: Optional[str] = None

        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = character.lower()
            continue

        layers[layer_target_range] += character.lower()

    return list(layers.values())
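# Illustrative usage, a sketch; whether two ranges share a layer is decided by
# is_suspiciously_successive_range, so the exact split is an assumption:
#
#   >>> alpha_unicode_split("Hello שלום")
#   ['hello', 'שלום']   # one lower-cased layer per alphabet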


def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    This function merges results previously produced by coherence_ratio.
    The return type is the same as coherence_ratio.
    """
    per_language_ratios: Dict[str, List[float]] = {}
    for result in results:
        for sub_result in result:
            language, ratio = sub_result
            if language not in per_language_ratios:
                per_language_ratios[language] = [ratio]
                continue
            per_language_ratios[language].append(ratio)

    merge = [
        (
            language,
            round(
                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
                4,
            ),
        )
        for language in per_language_ratios
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)
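# Illustrative usage; the averaging follows directly from the function body
# (per-language ratios are averaged and rounded to 4 decimals):
#
#   >>> merge_coherence_ratios([[("English", 0.8)], [("English", 0.6), ("French", 0.5)]])
#   [('English', 0.7), ('French', 0.5)]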


def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an
    alternative of "English". This function keeps only the best match and
    removes the em dash from its name.
    """
    index_results: Dict[str, List[float]] = dict()

    for result in results:
        language, ratio = result
        no_em_name: str = language.replace("—", "")

        if no_em_name not in index_results:
            index_results[no_em_name] = []

        index_results[no_em_name].append(ratio)

    if any(len(index_results[e]) > 1 for e in index_results):
        filtered_results: CoherenceMatches = []

        for language in index_results:
            filtered_results.append((language, max(index_results[language])))

        return filtered_results

    return results
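# Illustrative usage; this follows directly from the function body, with the
# alternative name collapsing onto "English" and the best ratio winning:
#
#   >>> filter_alt_coherence_matches([("English", 0.88), ("English—", 0.99)])
#   [('English', 0.99)]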


@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence. The
    sequence is analysed by layers, where a layer is a character extraction
    per alphabet/Unicode range.
    """

    results: List[Tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: List[str] = [c for c, o in most_common]

        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
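
# Illustrative usage, a sketch; the ratio values are assumptions since they
# depend on the bundled frequency tables:
#
#   >>> coherence_ratio("Ceci est un texte français tout à fait ordinaire.")
#   [('French', 0.66), ...]   # qualifying languages, best score first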