jpayne@7
|
1 import importlib
|
jpayne@7
|
2 from codecs import IncrementalDecoder
|
jpayne@7
|
3 from collections import Counter
|
jpayne@7
|
4 from functools import lru_cache
|
jpayne@7
|
5 from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
|
jpayne@7
|
6
|
jpayne@7
|
7 from .constant import (
|
jpayne@7
|
8 FREQUENCIES,
|
jpayne@7
|
9 KO_NAMES,
|
jpayne@7
|
10 LANGUAGE_SUPPORTED_COUNT,
|
jpayne@7
|
11 TOO_SMALL_SEQUENCE,
|
jpayne@7
|
12 ZH_NAMES,
|
jpayne@7
|
13 )
|
jpayne@7
|
14 from .md import is_suspiciously_successive_range
|
jpayne@7
|
15 from .models import CoherenceMatches
|
jpayne@7
|
16 from .utils import (
|
jpayne@7
|
17 is_accentuated,
|
jpayne@7
|
18 is_latin,
|
jpayne@7
|
19 is_multi_byte_encoding,
|
jpayne@7
|
20 is_unicode_range_secondary,
|
jpayne@7
|
21 unicode_range,
|
jpayne@7
|
22 )
|
jpayne@7
|
23
|
jpayne@7
|
24
|
jpayne@7
|
def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    Return associated unicode ranges in a single byte code page.

    :param iana_name: IANA name of a single-byte encoding (e.g. "cp1252").
    :raises IOError: If the given encoding is a multi-byte code page.
    :return: Sorted names of the unicode ranges this code page can express.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(
        "encodings.{}".format(iana_name)
    ).IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: Dict[str, int] = {}
    character_count: int = 0

    # Probe bytes 0x40-0xFE only; the lower half is ASCII-compatible in
    # virtually every single-byte code page and carries no range signal.
    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: Optional[str] = unicode_range(chunk)

            if character_range is None:
                continue

            # Secondary ranges (punctuation, symbols, ...) are ignored:
            # only primary ranges identify a writing system.
            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    # Guard: if nothing decoded into a primary range, report no ranges
    # instead of raising ZeroDivisionError in the ratio below.
    if character_count == 0:
        return []

    # Keep only ranges accounting for at least 15% of decoded characters.
    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )
|
jpayne@7
|
62
|
jpayne@7
|
63
|
jpayne@7
|
def unicode_range_languages(primary_range: str) -> List[str]:
    """
    Return inferred languages used with a unicode range.
    """
    # A language qualifies as soon as one of its frequent characters
    # falls inside the requested range.
    return [
        language
        for language, characters in FREQUENCIES.items()
        if any(
            unicode_range(character) == primary_range for character in characters
        )
    ]
|
jpayne@7
|
77
|
jpayne@7
|
78
|
jpayne@7
|
@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Single-byte encoding language association. Some code page are heavily linked to particular language(s).
    This function does the correspondence.
    """
    # The first non-Latin range found (in sorted order) drives the
    # association; a purely Latin code page gets the generic label.
    primary_range: Optional[str] = next(
        (
            specified_range
            for specified_range in encoding_unicode_range(iana_name)
            if "Latin" not in specified_range
        ),
        None,
    )

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
|
jpayne@7
|
97
|
jpayne@7
|
98
|
jpayne@7
|
99 @lru_cache()
|
jpayne@7
|
100 def mb_encoding_languages(iana_name: str) -> List[str]:
|
jpayne@7
|
101 """
|
jpayne@7
|
102 Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
|
jpayne@7
|
103 This function does the correspondence.
|
jpayne@7
|
104 """
|
jpayne@7
|
105 if (
|
jpayne@7
|
106 iana_name.startswith("shift_")
|
jpayne@7
|
107 or iana_name.startswith("iso2022_jp")
|
jpayne@7
|
108 or iana_name.startswith("euc_j")
|
jpayne@7
|
109 or iana_name == "cp932"
|
jpayne@7
|
110 ):
|
jpayne@7
|
111 return ["Japanese"]
|
jpayne@7
|
112 if iana_name.startswith("gb") or iana_name in ZH_NAMES:
|
jpayne@7
|
113 return ["Chinese"]
|
jpayne@7
|
114 if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
|
jpayne@7
|
115 return ["Korean"]
|
jpayne@7
|
116
|
jpayne@7
|
117 return []
|
jpayne@7
|
118
|
jpayne@7
|
119
|
jpayne@7
|
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Determine main aspects from a supported language if it contains accents and if is pure Latin.
    """
    frequent_characters = FREQUENCIES[language]

    # Accentuated as soon as one frequent character carries an accent.
    target_have_accents: bool = any(
        is_accentuated(character) for character in frequent_characters
    )
    # Pure Latin only when no frequent character is reported non-Latin.
    target_pure_latin: bool = not any(
        is_latin(character) is False for character in frequent_characters
    )

    return target_have_accents, target_pure_latin
|
jpayne@7
|
135
|
jpayne@7
|
136
|
jpayne@7
|
def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return associated languages associated to given characters.

    :param characters: Characters observed in the analysed sequence.
    :param ignore_non_latin: Skip languages that are not purely Latin based.
    :return: Candidate language names, best matching ratio first.
    """
    languages: List[Tuple[str, float]] = []

    # Build the set once: the membership test below runs for every frequent
    # character of every supported language (was O(n*m) against a list).
    character_set = set(characters)

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        # An accentuated source cannot match an accent-free language.
        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = sum(
            1 for c in language_characters if c in character_set
        )

        ratio: float = character_match_count / character_count

        # At least 20% of the language's frequent characters must appear.
        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
|
jpayne@7
|
170
|
jpayne@7
|
171
|
jpayne@7
|
def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    character_approved_count: int = 0
    # Set for O(1) membership tests; FREQUENCIES[language] keeps the rank order.
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    # Alphabets beyond basic-Latin size get a looser rank tolerance below.
    large_alphabet: bool = target_language_characters_count > 26

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        # Characters unknown to this language carry no signal either way.
        if character not in FREQUENCIES_language_set:
            continue

        character_rank_in_language: int = FREQUENCIES[language].index(character)
        # Project the observed rank onto the scale of the language's
        # reference list so the two rankings become comparable.
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        # Small alphabet: reject when the projected rank is off by more
        # than 4 places — fall-through below never rescues it.
        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        # Large alphabet: approve directly when the rank error stays
        # within a third of the alphabet size.
        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        # Fallback: compare the neighborhood of this character (what ranks
        # before/after it) in the source ordering against the language's
        # reference ordering.
        characters_before_source: List[str] = FREQUENCIES[language][
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = FREQUENCIES[language][
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        # Degenerate neighborhoods (empty reference slice) count as approved;
        # these guards also prevent dividing by zero below.
        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        # At least 40% neighborhood overlap on either side is a match.
        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)
|
jpayne@7
|
250
|
jpayne@7
|
251
|
jpayne@7
|
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
    Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
    One containing the latin letters and the other hebrew.
    """
    layers: Dict[str, str] = {}

    for character in decoded_sequence:
        # Only alphabetic characters participate in the layers.
        if not character.isalpha():
            continue

        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        # Merge into the first already-discovered layer whose range is a
        # plausible companion of this character's range.
        layer_target_range: Optional[str] = None
        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        # No compatible layer found: this range opens its own layer.
        if layer_target_range is None:
            layer_target_range = character_range

        layers[layer_target_range] = (
            layers.get(layer_target_range, "") + character.lower()
        )

    return list(layers.values())
|
jpayne@7
|
289
|
jpayne@7
|
290
|
jpayne@7
|
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    This function merge results previously given by the function coherence_ratio.
    The return type is the same as coherence_ratio.
    """
    # Collect every observed ratio per language across all partial results.
    per_language_ratios: Dict[str, List[float]] = {}
    for result in results:
        for language, ratio in result:
            per_language_ratios.setdefault(language, []).append(ratio)

    # Average each language's ratios (4 decimal places).
    merge = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in per_language_ratios.items()
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)
|
jpayne@7
|
317
|
jpayne@7
|
318
|
jpayne@7
|
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function only keeps the best match and remove the em-dash in it.
    """
    # Group every ratio under the em-dash-free language name.
    index_results: Dict[str, List[float]] = dict()

    for language, ratio in results:
        no_em_name: str = language.replace("—", "")

        if no_em_name not in index_results:
            index_results[no_em_name] = []

        index_results[no_em_name].append(ratio)

    # A language collided with its alternative form: keep the best ratio.
    if any(len(ratios) > 1 for ratios in index_results.values()):
        return [
            (language, max(ratios)) for language, ratios in index_results.items()
        ]

    # No collision: hand the input back untouched.
    return results
|
jpayne@7
|
344
|
jpayne@7
|
345
|
jpayne@7
|
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
    A layer = Character extraction by alphabets/ranges.
    """

    results: List[Tuple[str, float]] = []
    sufficient_match_count: int = 0

    lg_inclusion_list: List[str] = (
        lg_inclusion.split(",") if lg_inclusion is not None else []
    )
    # "Latin Based" is a pseudo-entry: it narrows the candidate pool
    # instead of being tested as a language itself.
    ignore_non_latin: bool = "Latin Based" in lg_inclusion_list
    if ignore_non_latin:
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        # Layers with too few characters give meaningless statistics.
        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: List[str] = [c for c, o in most_common]

        # An explicit inclusion list overrides alphabet-based inference.
        candidate_languages = lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        )
        for language in candidate_languages:
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            # Enough strong matches overall: stop testing further
            # languages on this layer (outer loop keeps going).
            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
|