Mercurial > repos > jpayne > bioproject_to_srr_2
comparison charset_normalizer/constant.py @ 7:5eb2d5e3bf22
planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author | jpayne |
---|---|
date | Sun, 05 May 2024 23:32:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:b2745907b1eb | 7:5eb2d5e3bf22 |
---|---|
1 # -*- coding: utf-8 -*- | |
2 from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE | |
3 from encodings.aliases import aliases | |
4 from re import IGNORECASE, compile as re_compile | |
5 from typing import Dict, List, Set, Union | |
6 | |
7 # Maps each eligible encoding to its byte signature (SIG/BOM), or to a list of them | |
8 ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = { | |
9 "utf_8": BOM_UTF8, | |
10 "utf_7": [ | |
11 b"\x2b\x2f\x76\x38", | |
12 b"\x2b\x2f\x76\x39", | |
13 b"\x2b\x2f\x76\x2b", | |
14 b"\x2b\x2f\x76\x2f", | |
15 b"\x2b\x2f\x76\x38\x2d", | |
16 ], | |
17 "gb18030": b"\x84\x31\x95\x33", | |
18 "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE], | |
19 "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE], | |
20 } | |
21 | |
22 TOO_SMALL_SEQUENCE: int = 32 | |
23 TOO_BIG_SEQUENCE: int = int(10e6) | |
24 | |
25 UTF8_MAXIMAL_ALLOCATION: int = 1_112_064 | |
26 | |
27 # Unicode block ranges, up to date with UCD 15.0.0 | |
28 UNICODE_RANGES_COMBINED: Dict[str, range] = { | |
29 "Control character": range(32), | |
30 "Basic Latin": range(32, 128), | |
31 "Latin-1 Supplement": range(128, 256), | |
32 "Latin Extended-A": range(256, 384), | |
33 "Latin Extended-B": range(384, 592), | |
34 "IPA Extensions": range(592, 688), | |
35 "Spacing Modifier Letters": range(688, 768), | |
36 "Combining Diacritical Marks": range(768, 880), | |
37 "Greek and Coptic": range(880, 1024), | |
38 "Cyrillic": range(1024, 1280), | |
39 "Cyrillic Supplement": range(1280, 1328), | |
40 "Armenian": range(1328, 1424), | |
41 "Hebrew": range(1424, 1536), | |
42 "Arabic": range(1536, 1792), | |
43 "Syriac": range(1792, 1872), | |
44 "Arabic Supplement": range(1872, 1920), | |
45 "Thaana": range(1920, 1984), | |
46 "NKo": range(1984, 2048), | |
47 "Samaritan": range(2048, 2112), | |
48 "Mandaic": range(2112, 2144), | |
49 "Syriac Supplement": range(2144, 2160), | |
50 "Arabic Extended-B": range(2160, 2208), | |
51 "Arabic Extended-A": range(2208, 2304), | |
52 "Devanagari": range(2304, 2432), | |
53 "Bengali": range(2432, 2560), | |
54 "Gurmukhi": range(2560, 2688), | |
55 "Gujarati": range(2688, 2816), | |
56 "Oriya": range(2816, 2944), | |
57 "Tamil": range(2944, 3072), | |
58 "Telugu": range(3072, 3200), | |
59 "Kannada": range(3200, 3328), | |
60 "Malayalam": range(3328, 3456), | |
61 "Sinhala": range(3456, 3584), | |
62 "Thai": range(3584, 3712), | |
63 "Lao": range(3712, 3840), | |
64 "Tibetan": range(3840, 4096), | |
65 "Myanmar": range(4096, 4256), | |
66 "Georgian": range(4256, 4352), | |
67 "Hangul Jamo": range(4352, 4608), | |
68 "Ethiopic": range(4608, 4992), | |
69 "Ethiopic Supplement": range(4992, 5024), | |
70 "Cherokee": range(5024, 5120), | |
71 "Unified Canadian Aboriginal Syllabics": range(5120, 5760), | |
72 "Ogham": range(5760, 5792), | |
73 "Runic": range(5792, 5888), | |
74 "Tagalog": range(5888, 5920), | |
75 "Hanunoo": range(5920, 5952), | |
76 "Buhid": range(5952, 5984), | |
77 "Tagbanwa": range(5984, 6016), | |
78 "Khmer": range(6016, 6144), | |
79 "Mongolian": range(6144, 6320), | |
80 "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400), | |
81 "Limbu": range(6400, 6480), | |
82 "Tai Le": range(6480, 6528), | |
83 "New Tai Lue": range(6528, 6624), | |
84 "Khmer Symbols": range(6624, 6656), | |
85 "Buginese": range(6656, 6688), | |
86 "Tai Tham": range(6688, 6832), | |
87 "Combining Diacritical Marks Extended": range(6832, 6912), | |
88 "Balinese": range(6912, 7040), | |
89 "Sundanese": range(7040, 7104), | |
90 "Batak": range(7104, 7168), | |
91 "Lepcha": range(7168, 7248), | |
92 "Ol Chiki": range(7248, 7296), | |
93 "Cyrillic Extended-C": range(7296, 7312), | |
94 "Georgian Extended": range(7312, 7360), | |
95 "Sundanese Supplement": range(7360, 7376), | |
96 "Vedic Extensions": range(7376, 7424), | |
97 "Phonetic Extensions": range(7424, 7552), | |
98 "Phonetic Extensions Supplement": range(7552, 7616), | |
99 "Combining Diacritical Marks Supplement": range(7616, 7680), | |
100 "Latin Extended Additional": range(7680, 7936), | |
101 "Greek Extended": range(7936, 8192), | |
102 "General Punctuation": range(8192, 8304), | |
103 "Superscripts and Subscripts": range(8304, 8352), | |
104 "Currency Symbols": range(8352, 8400), | |
105 "Combining Diacritical Marks for Symbols": range(8400, 8448), | |
106 "Letterlike Symbols": range(8448, 8528), | |
107 "Number Forms": range(8528, 8592), | |
108 "Arrows": range(8592, 8704), | |
109 "Mathematical Operators": range(8704, 8960), | |
110 "Miscellaneous Technical": range(8960, 9216), | |
111 "Control Pictures": range(9216, 9280), | |
112 "Optical Character Recognition": range(9280, 9312), | |
113 "Enclosed Alphanumerics": range(9312, 9472), | |
114 "Box Drawing": range(9472, 9600), | |
115 "Block Elements": range(9600, 9632), | |
116 "Geometric Shapes": range(9632, 9728), | |
117 "Miscellaneous Symbols": range(9728, 9984), | |
118 "Dingbats": range(9984, 10176), | |
119 "Miscellaneous Mathematical Symbols-A": range(10176, 10224), | |
120 "Supplemental Arrows-A": range(10224, 10240), | |
121 "Braille Patterns": range(10240, 10496), | |
122 "Supplemental Arrows-B": range(10496, 10624), | |
123 "Miscellaneous Mathematical Symbols-B": range(10624, 10752), | |
124 "Supplemental Mathematical Operators": range(10752, 11008), | |
125 "Miscellaneous Symbols and Arrows": range(11008, 11264), | |
126 "Glagolitic": range(11264, 11360), | |
127 "Latin Extended-C": range(11360, 11392), | |
128 "Coptic": range(11392, 11520), | |
129 "Georgian Supplement": range(11520, 11568), | |
130 "Tifinagh": range(11568, 11648), | |
131 "Ethiopic Extended": range(11648, 11744), | |
132 "Cyrillic Extended-A": range(11744, 11776), | |
133 "Supplemental Punctuation": range(11776, 11904), | |
134 "CJK Radicals Supplement": range(11904, 12032), | |
135 "Kangxi Radicals": range(12032, 12256), | |
136 "Ideographic Description Characters": range(12272, 12288), | |
137 "CJK Symbols and Punctuation": range(12288, 12352), | |
138 "Hiragana": range(12352, 12448), | |
139 "Katakana": range(12448, 12544), | |
140 "Bopomofo": range(12544, 12592), | |
141 "Hangul Compatibility Jamo": range(12592, 12688), | |
142 "Kanbun": range(12688, 12704), | |
143 "Bopomofo Extended": range(12704, 12736), | |
144 "CJK Strokes": range(12736, 12784), | |
145 "Katakana Phonetic Extensions": range(12784, 12800), | |
146 "Enclosed CJK Letters and Months": range(12800, 13056), | |
147 "CJK Compatibility": range(13056, 13312), | |
148 "CJK Unified Ideographs Extension A": range(13312, 19904), | |
149 "Yijing Hexagram Symbols": range(19904, 19968), | |
150 "CJK Unified Ideographs": range(19968, 40960), | |
151 "Yi Syllables": range(40960, 42128), | |
152 "Yi Radicals": range(42128, 42192), | |
153 "Lisu": range(42192, 42240), | |
154 "Vai": range(42240, 42560), | |
155 "Cyrillic Extended-B": range(42560, 42656), | |
156 "Bamum": range(42656, 42752), | |
157 "Modifier Tone Letters": range(42752, 42784), | |
158 "Latin Extended-D": range(42784, 43008), | |
159 "Syloti Nagri": range(43008, 43056), | |
160 "Common Indic Number Forms": range(43056, 43072), | |
161 "Phags-pa": range(43072, 43136), | |
162 "Saurashtra": range(43136, 43232), | |
163 "Devanagari Extended": range(43232, 43264), | |
164 "Kayah Li": range(43264, 43312), | |
165 "Rejang": range(43312, 43360), | |
166 "Hangul Jamo Extended-A": range(43360, 43392), | |
167 "Javanese": range(43392, 43488), | |
168 "Myanmar Extended-B": range(43488, 43520), | |
169 "Cham": range(43520, 43616), | |
170 "Myanmar Extended-A": range(43616, 43648), | |
171 "Tai Viet": range(43648, 43744), | |
172 "Meetei Mayek Extensions": range(43744, 43776), | |
173 "Ethiopic Extended-A": range(43776, 43824), | |
174 "Latin Extended-E": range(43824, 43888), | |
175 "Cherokee Supplement": range(43888, 43968), | |
176 "Meetei Mayek": range(43968, 44032), | |
177 "Hangul Syllables": range(44032, 55216), | |
178 "Hangul Jamo Extended-B": range(55216, 55296), | |
179 "High Surrogates": range(55296, 56192), | |
180 "High Private Use Surrogates": range(56192, 56320), | |
181 "Low Surrogates": range(56320, 57344), | |
182 "Private Use Area": range(57344, 63744), | |
183 "CJK Compatibility Ideographs": range(63744, 64256), | |
184 "Alphabetic Presentation Forms": range(64256, 64336), | |
185 "Arabic Presentation Forms-A": range(64336, 65024), | |
186 "Variation Selectors": range(65024, 65040), | |
187 "Vertical Forms": range(65040, 65056), | |
188 "Combining Half Marks": range(65056, 65072), | |
189 "CJK Compatibility Forms": range(65072, 65104), | |
190 "Small Form Variants": range(65104, 65136), | |
191 "Arabic Presentation Forms-B": range(65136, 65280), | |
192 "Halfwidth and Fullwidth Forms": range(65280, 65520), | |
193 "Specials": range(65520, 65536), | |
194 "Linear B Syllabary": range(65536, 65664), | |
195 "Linear B Ideograms": range(65664, 65792), | |
196 "Aegean Numbers": range(65792, 65856), | |
197 "Ancient Greek Numbers": range(65856, 65936), | |
198 "Ancient Symbols": range(65936, 66000), | |
199 "Phaistos Disc": range(66000, 66048), | |
200 "Lycian": range(66176, 66208), | |
201 "Carian": range(66208, 66272), | |
202 "Coptic Epact Numbers": range(66272, 66304), | |
203 "Old Italic": range(66304, 66352), | |
204 "Gothic": range(66352, 66384), | |
205 "Old Permic": range(66384, 66432), | |
206 "Ugaritic": range(66432, 66464), | |
207 "Old Persian": range(66464, 66528), | |
208 "Deseret": range(66560, 66640), | |
209 "Shavian": range(66640, 66688), | |
210 "Osmanya": range(66688, 66736), | |
211 "Osage": range(66736, 66816), | |
212 "Elbasan": range(66816, 66864), | |
213 "Caucasian Albanian": range(66864, 66928), | |
214 "Vithkuqi": range(66928, 67008), | |
215 "Linear A": range(67072, 67456), | |
216 "Latin Extended-F": range(67456, 67520), | |
217 "Cypriot Syllabary": range(67584, 67648), | |
218 "Imperial Aramaic": range(67648, 67680), | |
219 "Palmyrene": range(67680, 67712), | |
220 "Nabataean": range(67712, 67760), | |
221 "Hatran": range(67808, 67840), | |
222 "Phoenician": range(67840, 67872), | |
223 "Lydian": range(67872, 67904), | |
224 "Meroitic Hieroglyphs": range(67968, 68000), | |
225 "Meroitic Cursive": range(68000, 68096), | |
226 "Kharoshthi": range(68096, 68192), | |
227 "Old South Arabian": range(68192, 68224), | |
228 "Old North Arabian": range(68224, 68256), | |
229 "Manichaean": range(68288, 68352), | |
230 "Avestan": range(68352, 68416), | |
231 "Inscriptional Parthian": range(68416, 68448), | |
232 "Inscriptional Pahlavi": range(68448, 68480), | |
233 "Psalter Pahlavi": range(68480, 68528), | |
234 "Old Turkic": range(68608, 68688), | |
235 "Old Hungarian": range(68736, 68864), | |
236 "Hanifi Rohingya": range(68864, 68928), | |
237 "Rumi Numeral Symbols": range(69216, 69248), | |
238 "Yezidi": range(69248, 69312), | |
239 "Arabic Extended-C": range(69312, 69376), | |
240 "Old Sogdian": range(69376, 69424), | |
241 "Sogdian": range(69424, 69488), | |
242 "Old Uyghur": range(69488, 69552), | |
243 "Chorasmian": range(69552, 69600), | |
244 "Elymaic": range(69600, 69632), | |
245 "Brahmi": range(69632, 69760), | |
246 "Kaithi": range(69760, 69840), | |
247 "Sora Sompeng": range(69840, 69888), | |
248 "Chakma": range(69888, 69968), | |
249 "Mahajani": range(69968, 70016), | |
250 "Sharada": range(70016, 70112), | |
251 "Sinhala Archaic Numbers": range(70112, 70144), | |
252 "Khojki": range(70144, 70224), | |
253 "Multani": range(70272, 70320), | |
254 "Khudawadi": range(70320, 70400), | |
255 "Grantha": range(70400, 70528), | |
256 "Newa": range(70656, 70784), | |
257 "Tirhuta": range(70784, 70880), | |
258 "Siddham": range(71040, 71168), | |
259 "Modi": range(71168, 71264), | |
260 "Mongolian Supplement": range(71264, 71296), | |
261 "Takri": range(71296, 71376), | |
262 "Ahom": range(71424, 71504), | |
263 "Dogra": range(71680, 71760), | |
264 "Warang Citi": range(71840, 71936), | |
265 "Dives Akuru": range(71936, 72032), | |
266 "Nandinagari": range(72096, 72192), | |
267 "Zanabazar Square": range(72192, 72272), | |
268 "Soyombo": range(72272, 72368), | |
269 "Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384), | |
270 "Pau Cin Hau": range(72384, 72448), | |
271 "Devanagari Extended-A": range(72448, 72544), | |
272 "Bhaiksuki": range(72704, 72816), | |
273 "Marchen": range(72816, 72896), | |
274 "Masaram Gondi": range(72960, 73056), | |
275 "Gunjala Gondi": range(73056, 73136), | |
276 "Makasar": range(73440, 73472), | |
277 "Kawi": range(73472, 73568), | |
278 "Lisu Supplement": range(73648, 73664), | |
279 "Tamil Supplement": range(73664, 73728), | |
280 "Cuneiform": range(73728, 74752), | |
281 "Cuneiform Numbers and Punctuation": range(74752, 74880), | |
282 "Early Dynastic Cuneiform": range(74880, 75088), | |
283 "Cypro-Minoan": range(77712, 77824), | |
284 "Egyptian Hieroglyphs": range(77824, 78896), | |
285 "Egyptian Hieroglyph Format Controls": range(78896, 78944), | |
286 "Anatolian Hieroglyphs": range(82944, 83584), | |
287 "Bamum Supplement": range(92160, 92736), | |
288 "Mro": range(92736, 92784), | |
289 "Tangsa": range(92784, 92880), | |
290 "Bassa Vah": range(92880, 92928), | |
291 "Pahawh Hmong": range(92928, 93072), | |
292 "Medefaidrin": range(93760, 93856), | |
293 "Miao": range(93952, 94112), | |
294 "Ideographic Symbols and Punctuation": range(94176, 94208), | |
295 "Tangut": range(94208, 100352), | |
296 "Tangut Components": range(100352, 101120), | |
297 "Khitan Small Script": range(101120, 101632), | |
298 "Tangut Supplement": range(101632, 101760), | |
299 "Kana Extended-B": range(110576, 110592), | |
300 "Kana Supplement": range(110592, 110848), | |
301 "Kana Extended-A": range(110848, 110896), | |
302 "Small Kana Extension": range(110896, 110960), | |
303 "Nushu": range(110960, 111360), | |
304 "Duployan": range(113664, 113824), | |
305 "Shorthand Format Controls": range(113824, 113840), | |
306 "Znamenny Musical Notation": range(118528, 118736), | |
307 "Byzantine Musical Symbols": range(118784, 119040), | |
308 "Musical Symbols": range(119040, 119296), | |
309 "Ancient Greek Musical Notation": range(119296, 119376), | |
310 "Kaktovik Numerals": range(119488, 119520), | |
311 "Mayan Numerals": range(119520, 119552), | |
312 "Tai Xuan Jing Symbols": range(119552, 119648), | |
313 "Counting Rod Numerals": range(119648, 119680), | |
314 "Mathematical Alphanumeric Symbols": range(119808, 120832), | |
315 "Sutton SignWriting": range(120832, 121520), | |
316 "Latin Extended-G": range(122624, 122880), | |
317 "Glagolitic Supplement": range(122880, 122928), | |
318 "Cyrillic Extended-D": range(122928, 123024), | |
319 "Nyiakeng Puachue Hmong": range(123136, 123216), | |
320 "Toto": range(123536, 123584), | |
321 "Wancho": range(123584, 123648), | |
322 "Nag Mundari": range(124112, 124160), | |
323 "Ethiopic Extended-B": range(124896, 124928), | |
324 "Mende Kikakui": range(124928, 125152), | |
325 "Adlam": range(125184, 125280), | |
326 "Indic Siyaq Numbers": range(126064, 126144), | |
327 "Ottoman Siyaq Numbers": range(126208, 126288), | |
328 "Arabic Mathematical Alphabetic Symbols": range(126464, 126720), | |
329 "Mahjong Tiles": range(126976, 127024), | |
330 "Domino Tiles": range(127024, 127136), | |
331 "Playing Cards": range(127136, 127232), | |
332 "Enclosed Alphanumeric Supplement": range(127232, 127488), | |
333 "Enclosed Ideographic Supplement": range(127488, 127744), | |
334 "Miscellaneous Symbols and Pictographs": range(127744, 128512), | |
335 "Emoticons range(Emoji)": range(128512, 128592), | |
336 "Ornamental Dingbats": range(128592, 128640), | |
337 "Transport and Map Symbols": range(128640, 128768), | |
338 "Alchemical Symbols": range(128768, 128896), | |
339 "Geometric Shapes Extended": range(128896, 129024), | |
340 "Supplemental Arrows-C": range(129024, 129280), | |
341 "Supplemental Symbols and Pictographs": range(129280, 129536), | |
342 "Chess Symbols": range(129536, 129648), | |
343 "Symbols and Pictographs Extended-A": range(129648, 129792), | |
344 "Symbols for Legacy Computing": range(129792, 130048), | |
345 "CJK Unified Ideographs Extension B": range(131072, 173792), | |
346 "CJK Unified Ideographs Extension C": range(173824, 177984), | |
347 "CJK Unified Ideographs Extension D": range(177984, 178208), | |
348 "CJK Unified Ideographs Extension E": range(178208, 183984), | |
349 "CJK Unified Ideographs Extension F": range(183984, 191472), | |
350 "CJK Compatibility Ideographs Supplement": range(194560, 195104), | |
351 "CJK Unified Ideographs Extension G": range(196608, 201552), | |
352 "CJK Unified Ideographs Extension H": range(201552, 205744), | |
353 "Tags": range(917504, 917632), | |
354 "Variation Selectors Supplement": range(917760, 918000), | |
355 "Supplementary Private Use Area-A": range(983040, 1048576), | |
356 "Supplementary Private Use Area-B": range(1048576, 1114112), | |
357 } | |
358 | |
359 | |
360 UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [ | |
361 "Supplement", | |
362 "Extended", | |
363 "Extensions", | |
364 "Modifier", | |
365 "Marks", | |
366 "Punctuation", | |
367 "Symbols", | |
368 "Forms", | |
369 "Operators", | |
370 "Miscellaneous", | |
371 "Drawing", | |
372 "Block", | |
373 "Shapes", | |
374 "Supplemental", | |
375 "Tags", | |
376 ] | |
377 | |
378 RE_POSSIBLE_ENCODING_INDICATION = re_compile( | |
379 r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)", | |
380 IGNORECASE, | |
381 ) | |
382 | |
383 IANA_NO_ALIASES = [ | |
384 "cp720", | |
385 "cp737", | |
386 "cp856", | |
387 "cp874", | |
388 "cp875", | |
389 "cp1006", | |
390 "koi8_r", | |
391 "koi8_t", | |
392 "koi8_u", | |
393 ] | |
394 | |
395 IANA_SUPPORTED: List[str] = sorted( | |
396 filter( | |
397 lambda x: x.endswith("_codec") is False | |
398 and x not in {"rot_13", "tactis", "mbcs"}, | |
399 list(set(aliases.values())) + IANA_NO_ALIASES, | |
400 ) | |
401 ) | |
402 | |
403 IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED) | |
404 | |
405 # Pre-computed lists of similar code pages, obtained using the function cp_similarity. | |
406 IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = { | |
407 "cp037": ["cp1026", "cp1140", "cp273", "cp500"], | |
408 "cp1026": ["cp037", "cp1140", "cp273", "cp500"], | |
409 "cp1125": ["cp866"], | |
410 "cp1140": ["cp037", "cp1026", "cp273", "cp500"], | |
411 "cp1250": ["iso8859_2"], | |
412 "cp1251": ["kz1048", "ptcp154"], | |
413 "cp1252": ["iso8859_15", "iso8859_9", "latin_1"], | |
414 "cp1253": ["iso8859_7"], | |
415 "cp1254": ["iso8859_15", "iso8859_9", "latin_1"], | |
416 "cp1257": ["iso8859_13"], | |
417 "cp273": ["cp037", "cp1026", "cp1140", "cp500"], | |
418 "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"], | |
419 "cp500": ["cp037", "cp1026", "cp1140", "cp273"], | |
420 "cp850": ["cp437", "cp857", "cp858", "cp865"], | |
421 "cp857": ["cp850", "cp858", "cp865"], | |
422 "cp858": ["cp437", "cp850", "cp857", "cp865"], | |
423 "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"], | |
424 "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"], | |
425 "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"], | |
426 "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"], | |
427 "cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"], | |
428 "cp866": ["cp1125"], | |
429 "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"], | |
430 "iso8859_11": ["tis_620"], | |
431 "iso8859_13": ["cp1257"], | |
432 "iso8859_14": [ | |
433 "iso8859_10", | |
434 "iso8859_15", | |
435 "iso8859_16", | |
436 "iso8859_3", | |
437 "iso8859_9", | |
438 "latin_1", | |
439 ], | |
440 "iso8859_15": [ | |
441 "cp1252", | |
442 "cp1254", | |
443 "iso8859_10", | |
444 "iso8859_14", | |
445 "iso8859_16", | |
446 "iso8859_3", | |
447 "iso8859_9", | |
448 "latin_1", | |
449 ], | |
450 "iso8859_16": [ | |
451 "iso8859_14", | |
452 "iso8859_15", | |
453 "iso8859_2", | |
454 "iso8859_3", | |
455 "iso8859_9", | |
456 "latin_1", | |
457 ], | |
458 "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"], | |
459 "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"], | |
460 "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"], | |
461 "iso8859_7": ["cp1253"], | |
462 "iso8859_9": [ | |
463 "cp1252", | |
464 "cp1254", | |
465 "cp1258", | |
466 "iso8859_10", | |
467 "iso8859_14", | |
468 "iso8859_15", | |
469 "iso8859_16", | |
470 "iso8859_3", | |
471 "iso8859_4", | |
472 "latin_1", | |
473 ], | |
474 "kz1048": ["cp1251", "ptcp154"], | |
475 "latin_1": [ | |
476 "cp1252", | |
477 "cp1254", | |
478 "cp1258", | |
479 "iso8859_10", | |
480 "iso8859_14", | |
481 "iso8859_15", | |
482 "iso8859_16", | |
483 "iso8859_3", | |
484 "iso8859_4", | |
485 "iso8859_9", | |
486 ], | |
487 "mac_iceland": ["mac_roman", "mac_turkish"], | |
488 "mac_roman": ["mac_iceland", "mac_turkish"], | |
489 "mac_turkish": ["mac_iceland", "mac_roman"], | |
490 "ptcp154": ["cp1251", "kz1048"], | |
491 "tis_620": ["iso8859_11"], | |
492 } | |
493 | |
494 | |
495 CHARDET_CORRESPONDENCE: Dict[str, str] = { | |
496 "iso2022_kr": "ISO-2022-KR", | |
497 "iso2022_jp": "ISO-2022-JP", | |
498 "euc_kr": "EUC-KR", | |
499 "tis_620": "TIS-620", | |
500 "utf_32": "UTF-32", | |
501 "euc_jp": "EUC-JP", | |
502 "koi8_r": "KOI8-R", | |
503 "iso8859_1": "ISO-8859-1", | |
504 "iso8859_2": "ISO-8859-2", | |
505 "iso8859_5": "ISO-8859-5", | |
506 "iso8859_6": "ISO-8859-6", | |
507 "iso8859_7": "ISO-8859-7", | |
508 "iso8859_8": "ISO-8859-8", | |
509 "utf_16": "UTF-16", | |
510 "cp855": "IBM855", | |
511 "mac_cyrillic": "MacCyrillic", | |
512 "gb2312": "GB2312", | |
513 "gb18030": "GB18030", | |
514 "cp932": "CP932", | |
515 "cp866": "IBM866", | |
516 "utf_8": "utf-8", | |
517 "utf_8_sig": "UTF-8-SIG", | |
518 "shift_jis": "SHIFT_JIS", | |
519 "big5": "Big5", | |
520 "cp1250": "windows-1250", | |
521 "cp1251": "windows-1251", | |
522 "cp1252": "Windows-1252", | |
523 "cp1253": "windows-1253", | |
524 "cp1255": "windows-1255", | |
525 "cp1256": "windows-1256", | |
526 "cp1254": "Windows-1254", | |
527 "cp949": "CP949", | |
528 } | |
529 | |
530 | |
531 COMMON_SAFE_ASCII_CHARACTERS: Set[str] = { | |
532 "<", | |
533 ">", | |
534 "=", | |
535 ":", | |
536 "/", | |
537 "&", | |
538 ";", | |
539 "{", | |
540 "}", | |
541 "[", | |
542 "]", | |
543 ",", | |
544 "|", | |
545 '"', | |
546 "-", | |
547 } | |
548 | |
549 | |
550 KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"} | |
551 ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"} | |
552 | |
553 # Custom logging level below DEBUG (logging.DEBUG is 10) | |
554 TRACE: int = 5 | |
555 | |
556 | |
557 # Language labels that contain the em dash "—" character are to be | |
558 # considered alternative sequences for the original language. | |
559 FREQUENCIES: Dict[str, List[str]] = { | |
560 "English": [ | |
561 "e", | |
562 "a", | |
563 "t", | |
564 "i", | |
565 "o", | |
566 "n", | |
567 "s", | |
568 "r", | |
569 "h", | |
570 "l", | |
571 "d", | |
572 "c", | |
573 "u", | |
574 "m", | |
575 "f", | |
576 "p", | |
577 "g", | |
578 "w", | |
579 "y", | |
580 "b", | |
581 "v", | |
582 "k", | |
583 "x", | |
584 "j", | |
585 "z", | |
586 "q", | |
587 ], | |
588 "English—": [ | |
589 "e", | |
590 "a", | |
591 "t", | |
592 "i", | |
593 "o", | |
594 "n", | |
595 "s", | |
596 "r", | |
597 "h", | |
598 "l", | |
599 "d", | |
600 "c", | |
601 "m", | |
602 "u", | |
603 "f", | |
604 "p", | |
605 "g", | |
606 "w", | |
607 "b", | |
608 "y", | |
609 "v", | |
610 "k", | |
611 "j", | |
612 "x", | |
613 "z", | |
614 "q", | |
615 ], | |
616 "German": [ | |
617 "e", | |
618 "n", | |
619 "i", | |
620 "r", | |
621 "s", | |
622 "t", | |
623 "a", | |
624 "d", | |
625 "h", | |
626 "u", | |
627 "l", | |
628 "g", | |
629 "o", | |
630 "c", | |
631 "m", | |
632 "b", | |
633 "f", | |
634 "k", | |
635 "w", | |
636 "z", | |
637 "p", | |
638 "v", | |
639 "ü", | |
640 "ä", | |
641 "ö", | |
642 "j", | |
643 ], | |
644 "French": [ | |
645 "e", | |
646 "a", | |
647 "s", | |
648 "n", | |
649 "i", | |
650 "t", | |
651 "r", | |
652 "l", | |
653 "u", | |
654 "o", | |
655 "d", | |
656 "c", | |
657 "p", | |
658 "m", | |
659 "é", | |
660 "v", | |
661 "g", | |
662 "f", | |
663 "b", | |
664 "h", | |
665 "q", | |
666 "à", | |
667 "x", | |
668 "è", | |
669 "y", | |
670 "j", | |
671 ], | |
672 "Dutch": [ | |
673 "e", | |
674 "n", | |
675 "a", | |
676 "i", | |
677 "r", | |
678 "t", | |
679 "o", | |
680 "d", | |
681 "s", | |
682 "l", | |
683 "g", | |
684 "h", | |
685 "v", | |
686 "m", | |
687 "u", | |
688 "k", | |
689 "c", | |
690 "p", | |
691 "b", | |
692 "w", | |
693 "j", | |
694 "z", | |
695 "f", | |
696 "y", | |
697 "x", | |
698 "ë", | |
699 ], | |
700 "Italian": [ | |
701 "e", | |
702 "i", | |
703 "a", | |
704 "o", | |
705 "n", | |
706 "l", | |
707 "t", | |
708 "r", | |
709 "s", | |
710 "c", | |
711 "d", | |
712 "u", | |
713 "p", | |
714 "m", | |
715 "g", | |
716 "v", | |
717 "f", | |
718 "b", | |
719 "z", | |
720 "h", | |
721 "q", | |
722 "è", | |
723 "à", | |
724 "k", | |
725 "y", | |
726 "ò", | |
727 ], | |
728 "Polish": [ | |
729 "a", | |
730 "i", | |
731 "o", | |
732 "e", | |
733 "n", | |
734 "r", | |
735 "z", | |
736 "w", | |
737 "s", | |
738 "c", | |
739 "t", | |
740 "k", | |
741 "y", | |
742 "d", | |
743 "p", | |
744 "m", | |
745 "u", | |
746 "l", | |
747 "j", | |
748 "ł", | |
749 "g", | |
750 "b", | |
751 "h", | |
752 "ą", | |
753 "ę", | |
754 "ó", | |
755 ], | |
756 "Spanish": [ | |
757 "e", | |
758 "a", | |
759 "o", | |
760 "n", | |
761 "s", | |
762 "r", | |
763 "i", | |
764 "l", | |
765 "d", | |
766 "t", | |
767 "c", | |
768 "u", | |
769 "m", | |
770 "p", | |
771 "b", | |
772 "g", | |
773 "v", | |
774 "f", | |
775 "y", | |
776 "ó", | |
777 "h", | |
778 "q", | |
779 "í", | |
780 "j", | |
781 "z", | |
782 "á", | |
783 ], | |
784 "Russian": [ | |
785 "о", | |
786 "а", | |
787 "е", | |
788 "и", | |
789 "н", | |
790 "с", | |
791 "т", | |
792 "р", | |
793 "в", | |
794 "л", | |
795 "к", | |
796 "м", | |
797 "д", | |
798 "п", | |
799 "у", | |
800 "г", | |
801 "я", | |
802 "ы", | |
803 "з", | |
804 "б", | |
805 "й", | |
806 "ь", | |
807 "ч", | |
808 "х", | |
809 "ж", | |
810 "ц", | |
811 ], | |
812 # Japanese (Kanji) | |
813 "Japanese": [ | |
814 "人", | |
815 "一", | |
816 "大", | |
817 "亅", | |
818 "丁", | |
819 "丨", | |
820 "竹", | |
821 "笑", | |
822 "口", | |
823 "日", | |
824 "今", | |
825 "二", | |
826 "彳", | |
827 "行", | |
828 "十", | |
829 "土", | |
830 "丶", | |
831 "寸", | |
832 "寺", | |
833 "時", | |
834 "乙", | |
835 "丿", | |
836 "乂", | |
837 "气", | |
838 "気", | |
839 "冂", | |
840 "巾", | |
841 "亠", | |
842 "市", | |
843 "目", | |
844 "儿", | |
845 "見", | |
846 "八", | |
847 "小", | |
848 "凵", | |
849 "県", | |
850 "月", | |
851 "彐", | |
852 "門", | |
853 "間", | |
854 "木", | |
855 "東", | |
856 "山", | |
857 "出", | |
858 "本", | |
859 "中", | |
860 "刀", | |
861 "分", | |
862 "耳", | |
863 "又", | |
864 "取", | |
865 "最", | |
866 "言", | |
867 "田", | |
868 "心", | |
869 "思", | |
870 "刂", | |
871 "前", | |
872 "京", | |
873 "尹", | |
874 "事", | |
875 "生", | |
876 "厶", | |
877 "云", | |
878 "会", | |
879 "未", | |
880 "来", | |
881 "白", | |
882 "冫", | |
883 "楽", | |
884 "灬", | |
885 "馬", | |
886 "尸", | |
887 "尺", | |
888 "駅", | |
889 "明", | |
890 "耂", | |
891 "者", | |
892 "了", | |
893 "阝", | |
894 "都", | |
895 "高", | |
896 "卜", | |
897 "占", | |
898 "厂", | |
899 "广", | |
900 "店", | |
901 "子", | |
902 "申", | |
903 "奄", | |
904 "亻", | |
905 "俺", | |
906 "上", | |
907 "方", | |
908 "冖", | |
909 "学", | |
910 "衣", | |
911 "艮", | |
912 "食", | |
913 "自", | |
914 ], | |
915 # Japanese (Katakana) | |
916 "Japanese—": [ | |
917 "ー", | |
918 "ン", | |
919 "ス", | |
920 "・", | |
921 "ル", | |
922 "ト", | |
923 "リ", | |
924 "イ", | |
925 "ア", | |
926 "ラ", | |
927 "ッ", | |
928 "ク", | |
929 "ド", | |
930 "シ", | |
931 "レ", | |
932 "ジ", | |
933 "タ", | |
934 "フ", | |
935 "ロ", | |
936 "カ", | |
937 "テ", | |
938 "マ", | |
939 "ィ", | |
940 "グ", | |
941 "バ", | |
942 "ム", | |
943 "プ", | |
944 "オ", | |
945 "コ", | |
946 "デ", | |
947 "ニ", | |
948 "ウ", | |
949 "メ", | |
950 "サ", | |
951 "ビ", | |
952 "ナ", | |
953 "ブ", | |
954 "ャ", | |
955 "エ", | |
956 "ュ", | |
957 "チ", | |
958 "キ", | |
959 "ズ", | |
960 "ダ", | |
961 "パ", | |
962 "ミ", | |
963 "ェ", | |
964 "ョ", | |
965 "ハ", | |
966 "セ", | |
967 "ベ", | |
968 "ガ", | |
969 "モ", | |
970 "ツ", | |
971 "ネ", | |
972 "ボ", | |
973 "ソ", | |
974 "ノ", | |
975 "ァ", | |
976 "ヴ", | |
977 "ワ", | |
978 "ポ", | |
979 "ペ", | |
980 "ピ", | |
981 "ケ", | |
982 "ゴ", | |
983 "ギ", | |
984 "ザ", | |
985 "ホ", | |
986 "ゲ", | |
987 "ォ", | |
988 "ヤ", | |
989 "ヒ", | |
990 "ユ", | |
991 "ヨ", | |
992 "ヘ", | |
993 "ゼ", | |
994 "ヌ", | |
995 "ゥ", | |
996 "ゾ", | |
997 "ヶ", | |
998 "ヂ", | |
999 "ヲ", | |
1000 "ヅ", | |
1001 "ヵ", | |
1002 "ヱ", | |
1003 "ヰ", | |
1004 "ヮ", | |
1005 "ヽ", | |
1006 "゠", | |
1007 "ヾ", | |
1008 "ヷ", | |
1009 "ヿ", | |
1010 "ヸ", | |
1011 "ヹ", | |
1012 "ヺ", | |
1013 ], | |
1014 # Japanese (Hiragana) | |
1015 "Japanese——": [ | |
1016 "の", | |
1017 "に", | |
1018 "る", | |
1019 "た", | |
1020 "と", | |
1021 "は", | |
1022 "し", | |
1023 "い", | |
1024 "を", | |
1025 "で", | |
1026 "て", | |
1027 "が", | |
1028 "な", | |
1029 "れ", | |
1030 "か", | |
1031 "ら", | |
1032 "さ", | |
1033 "っ", | |
1034 "り", | |
1035 "す", | |
1036 "あ", | |
1037 "も", | |
1038 "こ", | |
1039 "ま", | |
1040 "う", | |
1041 "く", | |
1042 "よ", | |
1043 "き", | |
1044 "ん", | |
1045 "め", | |
1046 "お", | |
1047 "け", | |
1048 "そ", | |
1049 "つ", | |
1050 "だ", | |
1051 "や", | |
1052 "え", | |
1053 "ど", | |
1054 "わ", | |
1055 "ち", | |
1056 "み", | |
1057 "せ", | |
1058 "じ", | |
1059 "ば", | |
1060 "へ", | |
1061 "び", | |
1062 "ず", | |
1063 "ろ", | |
1064 "ほ", | |
1065 "げ", | |
1066 "む", | |
1067 "べ", | |
1068 "ひ", | |
1069 "ょ", | |
1070 "ゆ", | |
1071 "ぶ", | |
1072 "ご", | |
1073 "ゃ", | |
1074 "ね", | |
1075 "ふ", | |
1076 "ぐ", | |
1077 "ぎ", | |
1078 "ぼ", | |
1079 "ゅ", | |
1080 "づ", | |
1081 "ざ", | |
1082 "ぞ", | |
1083 "ぬ", | |
1084 "ぜ", | |
1085 "ぱ", | |
1086 "ぽ", | |
1087 "ぷ", | |
1088 "ぴ", | |
1089 "ぃ", | |
1090 "ぁ", | |
1091 "ぇ", | |
1092 "ぺ", | |
1093 "ゞ", | |
1094 "ぢ", | |
1095 "ぉ", | |
1096 "ぅ", | |
1097 "ゐ", | |
1098 "ゝ", | |
1099 "ゑ", | |
1100 "゛", | |
1101 "゜", | |
1102 "ゎ", | |
1103 "ゔ", | |
1104 "゚", | |
1105 "ゟ", | |
1106 "゙", | |
1107 "ゕ", | |
1108 "ゖ", | |
1109 ], | |
1110 "Portuguese": [ | |
1111 "a", | |
1112 "e", | |
1113 "o", | |
1114 "s", | |
1115 "i", | |
1116 "r", | |
1117 "d", | |
1118 "n", | |
1119 "t", | |
1120 "m", | |
1121 "u", | |
1122 "c", | |
1123 "l", | |
1124 "p", | |
1125 "g", | |
1126 "v", | |
1127 "b", | |
1128 "f", | |
1129 "h", | |
1130 "ã", | |
1131 "q", | |
1132 "é", | |
1133 "ç", | |
1134 "á", | |
1135 "z", | |
1136 "í", | |
1137 ], | |
1138 "Swedish": [ | |
1139 "e", | |
1140 "a", | |
1141 "n", | |
1142 "r", | |
1143 "t", | |
1144 "s", | |
1145 "i", | |
1146 "l", | |
1147 "d", | |
1148 "o", | |
1149 "m", | |
1150 "k", | |
1151 "g", | |
1152 "v", | |
1153 "h", | |
1154 "f", | |
1155 "u", | |
1156 "p", | |
1157 "ä", | |
1158 "c", | |
1159 "b", | |
1160 "ö", | |
1161 "å", | |
1162 "y", | |
1163 "j", | |
1164 "x", | |
1165 ], | |
1166 "Chinese": [ | |
1167 "的", | |
1168 "一", | |
1169 "是", | |
1170 "不", | |
1171 "了", | |
1172 "在", | |
1173 "人", | |
1174 "有", | |
1175 "我", | |
1176 "他", | |
1177 "这", | |
1178 "个", | |
1179 "们", | |
1180 "中", | |
1181 "来", | |
1182 "上", | |
1183 "大", | |
1184 "为", | |
1185 "和", | |
1186 "国", | |
1187 "地", | |
1188 "到", | |
1189 "以", | |
1190 "说", | |
1191 "时", | |
1192 "要", | |
1193 "就", | |
1194 "出", | |
1195 "会", | |
1196 "可", | |
1197 "也", | |
1198 "你", | |
1199 "对", | |
1200 "生", | |
1201 "能", | |
1202 "而", | |
1203 "子", | |
1204 "那", | |
1205 "得", | |
1206 "于", | |
1207 "着", | |
1208 "下", | |
1209 "自", | |
1210 "之", | |
1211 "年", | |
1212 "过", | |
1213 "发", | |
1214 "后", | |
1215 "作", | |
1216 "里", | |
1217 "用", | |
1218 "道", | |
1219 "行", | |
1220 "所", | |
1221 "然", | |
1222 "家", | |
1223 "种", | |
1224 "事", | |
1225 "成", | |
1226 "方", | |
1227 "多", | |
1228 "经", | |
1229 "么", | |
1230 "去", | |
1231 "法", | |
1232 "学", | |
1233 "如", | |
1234 "都", | |
1235 "同", | |
1236 "现", | |
1237 "当", | |
1238 "没", | |
1239 "动", | |
1240 "面", | |
1241 "起", | |
1242 "看", | |
1243 "定", | |
1244 "天", | |
1245 "分", | |
1246 "还", | |
1247 "进", | |
1248 "好", | |
1249 "小", | |
1250 "部", | |
1251 "其", | |
1252 "些", | |
1253 "主", | |
1254 "样", | |
1255 "理", | |
1256 "心", | |
1257 "她", | |
1258 "本", | |
1259 "前", | |
1260 "开", | |
1261 "但", | |
1262 "因", | |
1263 "只", | |
1264 "从", | |
1265 "想", | |
1266 "实", | |
1267 ], | |
1268 "Ukrainian": [ | |
1269 "о", | |
1270 "а", | |
1271 "н", | |
1272 "і", | |
1273 "и", | |
1274 "р", | |
1275 "в", | |
1276 "т", | |
1277 "е", | |
1278 "с", | |
1279 "к", | |
1280 "л", | |
1281 "у", | |
1282 "д", | |
1283 "м", | |
1284 "п", | |
1285 "з", | |
1286 "я", | |
1287 "ь", | |
1288 "б", | |
1289 "г", | |
1290 "й", | |
1291 "ч", | |
1292 "х", | |
1293 "ц", | |
1294 "ї", | |
1295 ], | |
1296 "Norwegian": [ | |
1297 "e", | |
1298 "r", | |
1299 "n", | |
1300 "t", | |
1301 "a", | |
1302 "s", | |
1303 "i", | |
1304 "o", | |
1305 "l", | |
1306 "d", | |
1307 "g", | |
1308 "k", | |
1309 "m", | |
1310 "v", | |
1311 "f", | |
1312 "p", | |
1313 "u", | |
1314 "b", | |
1315 "h", | |
1316 "å", | |
1317 "y", | |
1318 "j", | |
1319 "ø", | |
1320 "c", | |
1321 "æ", | |
1322 "w", | |
1323 ], | |
1324 "Finnish": [ | |
1325 "a", | |
1326 "i", | |
1327 "n", | |
1328 "t", | |
1329 "e", | |
1330 "s", | |
1331 "l", | |
1332 "o", | |
1333 "u", | |
1334 "k", | |
1335 "ä", | |
1336 "m", | |
1337 "r", | |
1338 "v", | |
1339 "j", | |
1340 "h", | |
1341 "p", | |
1342 "y", | |
1343 "d", | |
1344 "ö", | |
1345 "g", | |
1346 "c", | |
1347 "b", | |
1348 "f", | |
1349 "w", | |
1350 "z", | |
1351 ], | |
1352 "Vietnamese": [ | |
1353 "n", | |
1354 "h", | |
1355 "t", | |
1356 "i", | |
1357 "c", | |
1358 "g", | |
1359 "a", | |
1360 "o", | |
1361 "u", | |
1362 "m", | |
1363 "l", | |
1364 "r", | |
1365 "à", | |
1366 "đ", | |
1367 "s", | |
1368 "e", | |
1369 "v", | |
1370 "p", | |
1371 "b", | |
1372 "y", | |
1373 "ư", | |
1374 "d", | |
1375 "á", | |
1376 "k", | |
1377 "ộ", | |
1378 "ế", | |
1379 ], | |
1380 "Czech": [ | |
1381 "o", | |
1382 "e", | |
1383 "a", | |
1384 "n", | |
1385 "t", | |
1386 "s", | |
1387 "i", | |
1388 "l", | |
1389 "v", | |
1390 "r", | |
1391 "k", | |
1392 "d", | |
1393 "u", | |
1394 "m", | |
1395 "p", | |
1396 "í", | |
1397 "c", | |
1398 "h", | |
1399 "z", | |
1400 "á", | |
1401 "y", | |
1402 "j", | |
1403 "b", | |
1404 "ě", | |
1405 "é", | |
1406 "ř", | |
1407 ], | |
1408 "Hungarian": [ | |
1409 "e", | |
1410 "a", | |
1411 "t", | |
1412 "l", | |
1413 "s", | |
1414 "n", | |
1415 "k", | |
1416 "r", | |
1417 "i", | |
1418 "o", | |
1419 "z", | |
1420 "á", | |
1421 "é", | |
1422 "g", | |
1423 "m", | |
1424 "b", | |
1425 "y", | |
1426 "v", | |
1427 "d", | |
1428 "h", | |
1429 "u", | |
1430 "p", | |
1431 "j", | |
1432 "ö", | |
1433 "f", | |
1434 "c", | |
1435 ], | |
1436 "Korean": [ | |
1437 "이", | |
1438 "다", | |
1439 "에", | |
1440 "의", | |
1441 "는", | |
1442 "로", | |
1443 "하", | |
1444 "을", | |
1445 "가", | |
1446 "고", | |
1447 "지", | |
1448 "서", | |
1449 "한", | |
1450 "은", | |
1451 "기", | |
1452 "으", | |
1453 "년", | |
1454 "대", | |
1455 "사", | |
1456 "시", | |
1457 "를", | |
1458 "리", | |
1459 "도", | |
1460 "인", | |
1461 "스", | |
1462 "일", | |
1463 ], | |
1464 "Indonesian": [ | |
1465 "a", | |
1466 "n", | |
1467 "e", | |
1468 "i", | |
1469 "r", | |
1470 "t", | |
1471 "u", | |
1472 "s", | |
1473 "d", | |
1474 "k", | |
1475 "m", | |
1476 "l", | |
1477 "g", | |
1478 "p", | |
1479 "b", | |
1480 "o", | |
1481 "h", | |
1482 "y", | |
1483 "j", | |
1484 "c", | |
1485 "w", | |
1486 "f", | |
1487 "v", | |
1488 "z", | |
1489 "x", | |
1490 "q", | |
1491 ], | |
1492 "Turkish": [ | |
1493 "a", | |
1494 "e", | |
1495 "i", | |
1496 "n", | |
1497 "r", | |
1498 "l", | |
1499 "ı", | |
1500 "k", | |
1501 "d", | |
1502 "t", | |
1503 "s", | |
1504 "m", | |
1505 "y", | |
1506 "u", | |
1507 "o", | |
1508 "b", | |
1509 "ü", | |
1510 "ş", | |
1511 "v", | |
1512 "g", | |
1513 "z", | |
1514 "h", | |
1515 "c", | |
1516 "p", | |
1517 "ç", | |
1518 "ğ", | |
1519 ], | |
1520 "Romanian": [ | |
1521 "e", | |
1522 "i", | |
1523 "a", | |
1524 "r", | |
1525 "n", | |
1526 "t", | |
1527 "u", | |
1528 "l", | |
1529 "o", | |
1530 "c", | |
1531 "s", | |
1532 "d", | |
1533 "p", | |
1534 "m", | |
1535 "ă", | |
1536 "f", | |
1537 "v", | |
1538 "î", | |
1539 "g", | |
1540 "b", | |
1541 "ș", | |
1542 "ț", | |
1543 "z", | |
1544 "h", | |
1545 "â", | |
1546 "j", | |
1547 ], | |
1548 "Farsi": [ | |
1549 "ا", | |
1550 "ی", | |
1551 "ر", | |
1552 "د", | |
1553 "ن", | |
1554 "ه", | |
1555 "و", | |
1556 "م", | |
1557 "ت", | |
1558 "ب", | |
1559 "س", | |
1560 "ل", | |
1561 "ک", | |
1562 "ش", | |
1563 "ز", | |
1564 "ف", | |
1565 "گ", | |
1566 "ع", | |
1567 "خ", | |
1568 "ق", | |
1569 "ج", | |
1570 "آ", | |
1571 "پ", | |
1572 "ح", | |
1573 "ط", | |
1574 "ص", | |
1575 ], | |
1576 "Arabic": [ | |
1577 "ا", | |
1578 "ل", | |
1579 "ي", | |
1580 "م", | |
1581 "و", | |
1582 "ن", | |
1583 "ر", | |
1584 "ت", | |
1585 "ب", | |
1586 "ة", | |
1587 "ع", | |
1588 "د", | |
1589 "س", | |
1590 "ف", | |
1591 "ه", | |
1592 "ك", | |
1593 "ق", | |
1594 "أ", | |
1595 "ح", | |
1596 "ج", | |
1597 "ش", | |
1598 "ط", | |
1599 "ص", | |
1600 "ى", | |
1601 "خ", | |
1602 "إ", | |
1603 ], | |
1604 "Danish": [ | |
1605 "e", | |
1606 "r", | |
1607 "n", | |
1608 "t", | |
1609 "a", | |
1610 "i", | |
1611 "s", | |
1612 "d", | |
1613 "l", | |
1614 "o", | |
1615 "g", | |
1616 "m", | |
1617 "k", | |
1618 "f", | |
1619 "v", | |
1620 "u", | |
1621 "b", | |
1622 "h", | |
1623 "p", | |
1624 "å", | |
1625 "y", | |
1626 "ø", | |
1627 "æ", | |
1628 "c", | |
1629 "j", | |
1630 "w", | |
1631 ], | |
1632 "Serbian": [ | |
1633 "а", | |
1634 "и", | |
1635 "о", | |
1636 "е", | |
1637 "н", | |
1638 "р", | |
1639 "с", | |
1640 "у", | |
1641 "т", | |
1642 "к", | |
1643 "ј", | |
1644 "в", | |
1645 "д", | |
1646 "м", | |
1647 "п", | |
1648 "л", | |
1649 "г", | |
1650 "з", | |
1651 "б", | |
1652 "a", | |
1653 "i", | |
1654 "e", | |
1655 "o", | |
1656 "n", | |
1657 "ц", | |
1658 "ш", | |
1659 ], | |
1660 "Lithuanian": [ | |
1661 "i", | |
1662 "a", | |
1663 "s", | |
1664 "o", | |
1665 "r", | |
1666 "e", | |
1667 "t", | |
1668 "n", | |
1669 "u", | |
1670 "k", | |
1671 "m", | |
1672 "l", | |
1673 "p", | |
1674 "v", | |
1675 "d", | |
1676 "j", | |
1677 "g", | |
1678 "ė", | |
1679 "b", | |
1680 "y", | |
1681 "ų", | |
1682 "š", | |
1683 "ž", | |
1684 "c", | |
1685 "ą", | |
1686 "į", | |
1687 ], | |
1688 "Slovene": [ | |
1689 "e", | |
1690 "a", | |
1691 "i", | |
1692 "o", | |
1693 "n", | |
1694 "r", | |
1695 "s", | |
1696 "l", | |
1697 "t", | |
1698 "j", | |
1699 "v", | |
1700 "k", | |
1701 "d", | |
1702 "p", | |
1703 "m", | |
1704 "u", | |
1705 "z", | |
1706 "b", | |
1707 "g", | |
1708 "h", | |
1709 "č", | |
1710 "c", | |
1711 "š", | |
1712 "ž", | |
1713 "f", | |
1714 "y", | |
1715 ], | |
1716 "Slovak": [ | |
1717 "o", | |
1718 "a", | |
1719 "e", | |
1720 "n", | |
1721 "i", | |
1722 "r", | |
1723 "v", | |
1724 "t", | |
1725 "s", | |
1726 "l", | |
1727 "k", | |
1728 "d", | |
1729 "m", | |
1730 "p", | |
1731 "u", | |
1732 "c", | |
1733 "h", | |
1734 "j", | |
1735 "b", | |
1736 "z", | |
1737 "á", | |
1738 "y", | |
1739 "ý", | |
1740 "í", | |
1741 "č", | |
1742 "é", | |
1743 ], | |
1744 "Hebrew": [ | |
1745 "י", | |
1746 "ו", | |
1747 "ה", | |
1748 "ל", | |
1749 "ר", | |
1750 "ב", | |
1751 "ת", | |
1752 "מ", | |
1753 "א", | |
1754 "ש", | |
1755 "נ", | |
1756 "ע", | |
1757 "ם", | |
1758 "ד", | |
1759 "ק", | |
1760 "ח", | |
1761 "פ", | |
1762 "ס", | |
1763 "כ", | |
1764 "ג", | |
1765 "ט", | |
1766 "צ", | |
1767 "ן", | |
1768 "ז", | |
1769 "ך", | |
1770 ], | |
1771 "Bulgarian": [ | |
1772 "а", | |
1773 "и", | |
1774 "о", | |
1775 "е", | |
1776 "н", | |
1777 "т", | |
1778 "р", | |
1779 "с", | |
1780 "в", | |
1781 "л", | |
1782 "к", | |
1783 "д", | |
1784 "п", | |
1785 "м", | |
1786 "з", | |
1787 "г", | |
1788 "я", | |
1789 "ъ", | |
1790 "у", | |
1791 "б", | |
1792 "ч", | |
1793 "ц", | |
1794 "й", | |
1795 "ж", | |
1796 "щ", | |
1797 "х", | |
1798 ], | |
1799 "Croatian": [ | |
1800 "a", | |
1801 "i", | |
1802 "o", | |
1803 "e", | |
1804 "n", | |
1805 "r", | |
1806 "j", | |
1807 "s", | |
1808 "t", | |
1809 "u", | |
1810 "k", | |
1811 "l", | |
1812 "v", | |
1813 "d", | |
1814 "m", | |
1815 "p", | |
1816 "g", | |
1817 "z", | |
1818 "b", | |
1819 "c", | |
1820 "č", | |
1821 "h", | |
1822 "š", | |
1823 "ž", | |
1824 "ć", | |
1825 "f", | |
1826 ], | |
1827 "Hindi": [ | |
1828 "क", | |
1829 "र", | |
1830 "स", | |
1831 "न", | |
1832 "त", | |
1833 "म", | |
1834 "ह", | |
1835 "प", | |
1836 "य", | |
1837 "ल", | |
1838 "व", | |
1839 "ज", | |
1840 "द", | |
1841 "ग", | |
1842 "ब", | |
1843 "श", | |
1844 "ट", | |
1845 "अ", | |
1846 "ए", | |
1847 "थ", | |
1848 "भ", | |
1849 "ड", | |
1850 "च", | |
1851 "ध", | |
1852 "ष", | |
1853 "इ", | |
1854 ], | |
1855 "Estonian": [ | |
1856 "a", | |
1857 "i", | |
1858 "e", | |
1859 "s", | |
1860 "t", | |
1861 "l", | |
1862 "u", | |
1863 "n", | |
1864 "o", | |
1865 "k", | |
1866 "r", | |
1867 "d", | |
1868 "m", | |
1869 "v", | |
1870 "g", | |
1871 "p", | |
1872 "j", | |
1873 "h", | |
1874 "ä", | |
1875 "b", | |
1876 "õ", | |
1877 "ü", | |
1878 "f", | |
1879 "c", | |
1880 "ö", | |
1881 "y", | |
1882 ], | |
1883 "Thai": [ | |
1884 "า", | |
1885 "น", | |
1886 "ร", | |
1887 "อ", | |
1888 "ก", | |
1889 "เ", | |
1890 "ง", | |
1891 "ม", | |
1892 "ย", | |
1893 "ล", | |
1894 "ว", | |
1895 "ด", | |
1896 "ท", | |
1897 "ส", | |
1898 "ต", | |
1899 "ะ", | |
1900 "ป", | |
1901 "บ", | |
1902 "ค", | |
1903 "ห", | |
1904 "แ", | |
1905 "จ", | |
1906 "พ", | |
1907 "ช", | |
1908 "ข", | |
1909 "ใ", | |
1910 ], | |
1911 "Greek": [ | |
1912 "α", | |
1913 "τ", | |
1914 "ο", | |
1915 "ι", | |
1916 "ε", | |
1917 "ν", | |
1918 "ρ", | |
1919 "σ", | |
1920 "κ", | |
1921 "η", | |
1922 "π", | |
1923 "ς", | |
1924 "υ", | |
1925 "μ", | |
1926 "λ", | |
1927 "ί", | |
1928 "ό", | |
1929 "ά", | |
1930 "γ", | |
1931 "έ", | |
1932 "δ", | |
1933 "ή", | |
1934 "ω", | |
1935 "χ", | |
1936 "θ", | |
1937 "ύ", | |
1938 ], | |
1939 "Tamil": [ | |
1940 "க", | |
1941 "த", | |
1942 "ப", | |
1943 "ட", | |
1944 "ர", | |
1945 "ம", | |
1946 "ல", | |
1947 "ன", | |
1948 "வ", | |
1949 "ற", | |
1950 "ய", | |
1951 "ள", | |
1952 "ச", | |
1953 "ந", | |
1954 "இ", | |
1955 "ண", | |
1956 "அ", | |
1957 "ஆ", | |
1958 "ழ", | |
1959 "ங", | |
1960 "எ", | |
1961 "உ", | |
1962 "ஒ", | |
1963 "ஸ", | |
1964 ], | |
1965 "Kazakh": [ | |
1966 "а", | |
1967 "ы", | |
1968 "е", | |
1969 "н", | |
1970 "т", | |
1971 "р", | |
1972 "л", | |
1973 "і", | |
1974 "д", | |
1975 "с", | |
1976 "м", | |
1977 "қ", | |
1978 "к", | |
1979 "о", | |
1980 "б", | |
1981 "и", | |
1982 "у", | |
1983 "ғ", | |
1984 "ж", | |
1985 "ң", | |
1986 "з", | |
1987 "ш", | |
1988 "й", | |
1989 "п", | |
1990 "г", | |
1991 "ө", | |
1992 ], | |
1993 } | |
1994 | |
1995 LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES) |