annotate charset_normalizer/constant.py @ 7:5eb2d5e3bf22

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Sun, 05 May 2024 23:32:17 -0400
parents
children
rev   line source
jpayne@7 1 # -*- coding: utf-8 -*-
jpayne@7 2 from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
jpayne@7 3 from encodings.aliases import aliases
jpayne@7 4 from re import IGNORECASE, compile as re_compile
jpayne@7 5 from typing import Dict, List, Set, Union
jpayne@7 6
# Byte-order marks / signatures, keyed by Python codec name.
# A value is either a single byte string or a list of alternative byte
# strings, any one of which identifies the encoding.
ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
    "utf_8": BOM_UTF8,
    # ASCII "+/v8", "+/v9", "+/v+", "+/v/" and "+/v8-".
    "utf_7": [
        b"\x2b\x2f\x76\x38",
        b"\x2b\x2f\x76\x39",
        b"\x2b\x2f\x76\x2b",
        b"\x2b\x2f\x76\x2f",
        b"\x2b\x2f\x76\x38\x2d",
    ],
    "gb18030": b"\x84\x31\x95\x33",
    # BOM_UTF32_LE starts with the same two bytes as BOM_UTF16_LE, so
    # "utf_32" is listed before "utf_16".
    # NOTE(review): presumably consumers probe in insertion order -- confirm
    # against the caller before reordering.
    "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
    "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
}
jpayne@7 21
# Size thresholds, expressed as plain integers.
# NOTE(review): presumably byte counts used to classify an input as too
# small / too big -- confirm against the code that reads them.
TOO_SMALL_SEQUENCE: int = 32
# Exactly ten million; written as an int literal rather than int(10e6) so
# the value never passes through a float.
TOO_BIG_SEQUENCE: int = 10_000_000

# Equals 0x110000 - 2048, i.e. the number of Unicode code points once the
# 2048 surrogates are excluded.
UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
jpayne@7 26
# Unicode block name -> code point range (start inclusive, stop exclusive,
# as with any ``range``). Entries are listed in ascending code point order
# and do not overlap; unassigned gaps between blocks are simply absent.
# Stated by the original author to be up to date with Unicode UCD 15.0.0.
UNICODE_RANGES_COMBINED: Dict[str, range] = {
    "Control character": range(32),
    "Basic Latin": range(32, 128),
    "Latin-1 Supplement": range(128, 256),
    "Latin Extended-A": range(256, 384),
    "Latin Extended-B": range(384, 592),
    "IPA Extensions": range(592, 688),
    "Spacing Modifier Letters": range(688, 768),
    "Combining Diacritical Marks": range(768, 880),
    "Greek and Coptic": range(880, 1024),
    "Cyrillic": range(1024, 1280),
    "Cyrillic Supplement": range(1280, 1328),
    "Armenian": range(1328, 1424),
    "Hebrew": range(1424, 1536),
    "Arabic": range(1536, 1792),
    "Syriac": range(1792, 1872),
    "Arabic Supplement": range(1872, 1920),
    "Thaana": range(1920, 1984),
    "NKo": range(1984, 2048),
    "Samaritan": range(2048, 2112),
    "Mandaic": range(2112, 2144),
    "Syriac Supplement": range(2144, 2160),
    "Arabic Extended-B": range(2160, 2208),
    "Arabic Extended-A": range(2208, 2304),
    "Devanagari": range(2304, 2432),
    "Bengali": range(2432, 2560),
    "Gurmukhi": range(2560, 2688),
    "Gujarati": range(2688, 2816),
    "Oriya": range(2816, 2944),
    "Tamil": range(2944, 3072),
    "Telugu": range(3072, 3200),
    "Kannada": range(3200, 3328),
    "Malayalam": range(3328, 3456),
    "Sinhala": range(3456, 3584),
    "Thai": range(3584, 3712),
    "Lao": range(3712, 3840),
    "Tibetan": range(3840, 4096),
    "Myanmar": range(4096, 4256),
    "Georgian": range(4256, 4352),
    "Hangul Jamo": range(4352, 4608),
    "Ethiopic": range(4608, 4992),
    "Ethiopic Supplement": range(4992, 5024),
    "Cherokee": range(5024, 5120),
    "Unified Canadian Aboriginal Syllabics": range(5120, 5760),
    "Ogham": range(5760, 5792),
    "Runic": range(5792, 5888),
    "Tagalog": range(5888, 5920),
    "Hanunoo": range(5920, 5952),
    "Buhid": range(5952, 5984),
    "Tagbanwa": range(5984, 6016),
    "Khmer": range(6016, 6144),
    "Mongolian": range(6144, 6320),
    "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
    "Limbu": range(6400, 6480),
    "Tai Le": range(6480, 6528),
    "New Tai Lue": range(6528, 6624),
    "Khmer Symbols": range(6624, 6656),
    "Buginese": range(6656, 6688),
    "Tai Tham": range(6688, 6832),
    "Combining Diacritical Marks Extended": range(6832, 6912),
    "Balinese": range(6912, 7040),
    "Sundanese": range(7040, 7104),
    "Batak": range(7104, 7168),
    "Lepcha": range(7168, 7248),
    "Ol Chiki": range(7248, 7296),
    "Cyrillic Extended-C": range(7296, 7312),
    "Georgian Extended": range(7312, 7360),
    "Sundanese Supplement": range(7360, 7376),
    "Vedic Extensions": range(7376, 7424),
    "Phonetic Extensions": range(7424, 7552),
    "Phonetic Extensions Supplement": range(7552, 7616),
    "Combining Diacritical Marks Supplement": range(7616, 7680),
    "Latin Extended Additional": range(7680, 7936),
    "Greek Extended": range(7936, 8192),
    "General Punctuation": range(8192, 8304),
    "Superscripts and Subscripts": range(8304, 8352),
    "Currency Symbols": range(8352, 8400),
    "Combining Diacritical Marks for Symbols": range(8400, 8448),
    "Letterlike Symbols": range(8448, 8528),
    "Number Forms": range(8528, 8592),
    "Arrows": range(8592, 8704),
    "Mathematical Operators": range(8704, 8960),
    "Miscellaneous Technical": range(8960, 9216),
    "Control Pictures": range(9216, 9280),
    "Optical Character Recognition": range(9280, 9312),
    "Enclosed Alphanumerics": range(9312, 9472),
    "Box Drawing": range(9472, 9600),
    "Block Elements": range(9600, 9632),
    "Geometric Shapes": range(9632, 9728),
    "Miscellaneous Symbols": range(9728, 9984),
    "Dingbats": range(9984, 10176),
    "Miscellaneous Mathematical Symbols-A": range(10176, 10224),
    "Supplemental Arrows-A": range(10224, 10240),
    "Braille Patterns": range(10240, 10496),
    "Supplemental Arrows-B": range(10496, 10624),
    "Miscellaneous Mathematical Symbols-B": range(10624, 10752),
    "Supplemental Mathematical Operators": range(10752, 11008),
    "Miscellaneous Symbols and Arrows": range(11008, 11264),
    "Glagolitic": range(11264, 11360),
    "Latin Extended-C": range(11360, 11392),
    "Coptic": range(11392, 11520),
    "Georgian Supplement": range(11520, 11568),
    "Tifinagh": range(11568, 11648),
    "Ethiopic Extended": range(11648, 11744),
    "Cyrillic Extended-A": range(11744, 11776),
    "Supplemental Punctuation": range(11776, 11904),
    "CJK Radicals Supplement": range(11904, 12032),
    "Kangxi Radicals": range(12032, 12256),
    "Ideographic Description Characters": range(12272, 12288),
    "CJK Symbols and Punctuation": range(12288, 12352),
    "Hiragana": range(12352, 12448),
    "Katakana": range(12448, 12544),
    "Bopomofo": range(12544, 12592),
    "Hangul Compatibility Jamo": range(12592, 12688),
    "Kanbun": range(12688, 12704),
    "Bopomofo Extended": range(12704, 12736),
    "CJK Strokes": range(12736, 12784),
    "Katakana Phonetic Extensions": range(12784, 12800),
    "Enclosed CJK Letters and Months": range(12800, 13056),
    "CJK Compatibility": range(13056, 13312),
    "CJK Unified Ideographs Extension A": range(13312, 19904),
    "Yijing Hexagram Symbols": range(19904, 19968),
    "CJK Unified Ideographs": range(19968, 40960),
    "Yi Syllables": range(40960, 42128),
    "Yi Radicals": range(42128, 42192),
    "Lisu": range(42192, 42240),
    "Vai": range(42240, 42560),
    "Cyrillic Extended-B": range(42560, 42656),
    "Bamum": range(42656, 42752),
    "Modifier Tone Letters": range(42752, 42784),
    "Latin Extended-D": range(42784, 43008),
    "Syloti Nagri": range(43008, 43056),
    "Common Indic Number Forms": range(43056, 43072),
    "Phags-pa": range(43072, 43136),
    "Saurashtra": range(43136, 43232),
    "Devanagari Extended": range(43232, 43264),
    "Kayah Li": range(43264, 43312),
    "Rejang": range(43312, 43360),
    "Hangul Jamo Extended-A": range(43360, 43392),
    "Javanese": range(43392, 43488),
    "Myanmar Extended-B": range(43488, 43520),
    "Cham": range(43520, 43616),
    "Myanmar Extended-A": range(43616, 43648),
    "Tai Viet": range(43648, 43744),
    "Meetei Mayek Extensions": range(43744, 43776),
    "Ethiopic Extended-A": range(43776, 43824),
    "Latin Extended-E": range(43824, 43888),
    "Cherokee Supplement": range(43888, 43968),
    "Meetei Mayek": range(43968, 44032),
    "Hangul Syllables": range(44032, 55216),
    "Hangul Jamo Extended-B": range(55216, 55296),
    "High Surrogates": range(55296, 56192),
    "High Private Use Surrogates": range(56192, 56320),
    "Low Surrogates": range(56320, 57344),
    "Private Use Area": range(57344, 63744),
    "CJK Compatibility Ideographs": range(63744, 64256),
    "Alphabetic Presentation Forms": range(64256, 64336),
    "Arabic Presentation Forms-A": range(64336, 65024),
    "Variation Selectors": range(65024, 65040),
    "Vertical Forms": range(65040, 65056),
    "Combining Half Marks": range(65056, 65072),
    "CJK Compatibility Forms": range(65072, 65104),
    "Small Form Variants": range(65104, 65136),
    "Arabic Presentation Forms-B": range(65136, 65280),
    "Halfwidth and Fullwidth Forms": range(65280, 65520),
    "Specials": range(65520, 65536),
    "Linear B Syllabary": range(65536, 65664),
    "Linear B Ideograms": range(65664, 65792),
    "Aegean Numbers": range(65792, 65856),
    "Ancient Greek Numbers": range(65856, 65936),
    "Ancient Symbols": range(65936, 66000),
    "Phaistos Disc": range(66000, 66048),
    "Lycian": range(66176, 66208),
    "Carian": range(66208, 66272),
    "Coptic Epact Numbers": range(66272, 66304),
    "Old Italic": range(66304, 66352),
    "Gothic": range(66352, 66384),
    "Old Permic": range(66384, 66432),
    "Ugaritic": range(66432, 66464),
    "Old Persian": range(66464, 66528),
    "Deseret": range(66560, 66640),
    "Shavian": range(66640, 66688),
    "Osmanya": range(66688, 66736),
    "Osage": range(66736, 66816),
    "Elbasan": range(66816, 66864),
    "Caucasian Albanian": range(66864, 66928),
    "Vithkuqi": range(66928, 67008),
    "Linear A": range(67072, 67456),
    "Latin Extended-F": range(67456, 67520),
    "Cypriot Syllabary": range(67584, 67648),
    "Imperial Aramaic": range(67648, 67680),
    "Palmyrene": range(67680, 67712),
    "Nabataean": range(67712, 67760),
    "Hatran": range(67808, 67840),
    "Phoenician": range(67840, 67872),
    "Lydian": range(67872, 67904),
    "Meroitic Hieroglyphs": range(67968, 68000),
    "Meroitic Cursive": range(68000, 68096),
    "Kharoshthi": range(68096, 68192),
    "Old South Arabian": range(68192, 68224),
    "Old North Arabian": range(68224, 68256),
    "Manichaean": range(68288, 68352),
    "Avestan": range(68352, 68416),
    "Inscriptional Parthian": range(68416, 68448),
    "Inscriptional Pahlavi": range(68448, 68480),
    "Psalter Pahlavi": range(68480, 68528),
    "Old Turkic": range(68608, 68688),
    "Old Hungarian": range(68736, 68864),
    "Hanifi Rohingya": range(68864, 68928),
    "Rumi Numeral Symbols": range(69216, 69248),
    "Yezidi": range(69248, 69312),
    "Arabic Extended-C": range(69312, 69376),
    "Old Sogdian": range(69376, 69424),
    "Sogdian": range(69424, 69488),
    "Old Uyghur": range(69488, 69552),
    "Chorasmian": range(69552, 69600),
    "Elymaic": range(69600, 69632),
    "Brahmi": range(69632, 69760),
    "Kaithi": range(69760, 69840),
    "Sora Sompeng": range(69840, 69888),
    "Chakma": range(69888, 69968),
    "Mahajani": range(69968, 70016),
    "Sharada": range(70016, 70112),
    "Sinhala Archaic Numbers": range(70112, 70144),
    "Khojki": range(70144, 70224),
    "Multani": range(70272, 70320),
    "Khudawadi": range(70320, 70400),
    "Grantha": range(70400, 70528),
    "Newa": range(70656, 70784),
    "Tirhuta": range(70784, 70880),
    "Siddham": range(71040, 71168),
    "Modi": range(71168, 71264),
    "Mongolian Supplement": range(71264, 71296),
    "Takri": range(71296, 71376),
    "Ahom": range(71424, 71504),
    "Dogra": range(71680, 71760),
    "Warang Citi": range(71840, 71936),
    "Dives Akuru": range(71936, 72032),
    "Nandinagari": range(72096, 72192),
    "Zanabazar Square": range(72192, 72272),
    "Soyombo": range(72272, 72368),
    "Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
    "Pau Cin Hau": range(72384, 72448),
    "Devanagari Extended-A": range(72448, 72544),
    "Bhaiksuki": range(72704, 72816),
    "Marchen": range(72816, 72896),
    "Masaram Gondi": range(72960, 73056),
    "Gunjala Gondi": range(73056, 73136),
    "Makasar": range(73440, 73472),
    "Kawi": range(73472, 73568),
    "Lisu Supplement": range(73648, 73664),
    "Tamil Supplement": range(73664, 73728),
    "Cuneiform": range(73728, 74752),
    "Cuneiform Numbers and Punctuation": range(74752, 74880),
    "Early Dynastic Cuneiform": range(74880, 75088),
    "Cypro-Minoan": range(77712, 77824),
    "Egyptian Hieroglyphs": range(77824, 78896),
    "Egyptian Hieroglyph Format Controls": range(78896, 78944),
    "Anatolian Hieroglyphs": range(82944, 83584),
    "Bamum Supplement": range(92160, 92736),
    "Mro": range(92736, 92784),
    "Tangsa": range(92784, 92880),
    "Bassa Vah": range(92880, 92928),
    "Pahawh Hmong": range(92928, 93072),
    "Medefaidrin": range(93760, 93856),
    "Miao": range(93952, 94112),
    "Ideographic Symbols and Punctuation": range(94176, 94208),
    "Tangut": range(94208, 100352),
    "Tangut Components": range(100352, 101120),
    "Khitan Small Script": range(101120, 101632),
    "Tangut Supplement": range(101632, 101760),
    "Kana Extended-B": range(110576, 110592),
    "Kana Supplement": range(110592, 110848),
    "Kana Extended-A": range(110848, 110896),
    "Small Kana Extension": range(110896, 110960),
    "Nushu": range(110960, 111360),
    "Duployan": range(113664, 113824),
    "Shorthand Format Controls": range(113824, 113840),
    "Znamenny Musical Notation": range(118528, 118736),
    "Byzantine Musical Symbols": range(118784, 119040),
    "Musical Symbols": range(119040, 119296),
    "Ancient Greek Musical Notation": range(119296, 119376),
    "Kaktovik Numerals": range(119488, 119520),
    "Mayan Numerals": range(119520, 119552),
    "Tai Xuan Jing Symbols": range(119552, 119648),
    "Counting Rod Numerals": range(119648, 119680),
    "Mathematical Alphanumeric Symbols": range(119808, 120832),
    "Sutton SignWriting": range(120832, 121520),
    "Latin Extended-G": range(122624, 122880),
    "Glagolitic Supplement": range(122880, 122928),
    "Cyrillic Extended-D": range(122928, 123024),
    "Nyiakeng Puachue Hmong": range(123136, 123216),
    "Toto": range(123536, 123584),
    "Wancho": range(123584, 123648),
    "Nag Mundari": range(124112, 124160),
    "Ethiopic Extended-B": range(124896, 124928),
    "Mende Kikakui": range(124928, 125152),
    "Adlam": range(125184, 125280),
    "Indic Siyaq Numbers": range(126064, 126144),
    "Ottoman Siyaq Numbers": range(126208, 126288),
    "Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
    "Mahjong Tiles": range(126976, 127024),
    "Domino Tiles": range(127024, 127136),
    "Playing Cards": range(127136, 127232),
    "Enclosed Alphanumeric Supplement": range(127232, 127488),
    "Enclosed Ideographic Supplement": range(127488, 127744),
    "Miscellaneous Symbols and Pictographs": range(127744, 128512),
    # NOTE(review): the "range(" inside this key looks like a search/replace
    # artifact (likely meant "Emoticons (Emoji)"). Kept verbatim because the
    # key is runtime data that other code may match on -- confirm before
    # renaming.
    "Emoticons range(Emoji)": range(128512, 128592),
    "Ornamental Dingbats": range(128592, 128640),
    "Transport and Map Symbols": range(128640, 128768),
    "Alchemical Symbols": range(128768, 128896),
    "Geometric Shapes Extended": range(128896, 129024),
    "Supplemental Arrows-C": range(129024, 129280),
    "Supplemental Symbols and Pictographs": range(129280, 129536),
    "Chess Symbols": range(129536, 129648),
    "Symbols and Pictographs Extended-A": range(129648, 129792),
    "Symbols for Legacy Computing": range(129792, 130048),
    "CJK Unified Ideographs Extension B": range(131072, 173792),
    "CJK Unified Ideographs Extension C": range(173824, 177984),
    "CJK Unified Ideographs Extension D": range(177984, 178208),
    "CJK Unified Ideographs Extension E": range(178208, 183984),
    "CJK Unified Ideographs Extension F": range(183984, 191472),
    "CJK Compatibility Ideographs Supplement": range(194560, 195104),
    "CJK Unified Ideographs Extension G": range(196608, 201552),
    "CJK Unified Ideographs Extension H": range(201552, 205744),
    "Tags": range(917504, 917632),
    "Variation Selectors Supplement": range(917760, 918000),
    "Supplementary Private Use Area-A": range(983040, 1048576),
    "Supplementary Private Use Area-B": range(1048576, 1114112),
}
jpayne@7 358
jpayne@7 359
# Words that occur in Unicode block names such as "Latin-1 Supplement" or
# "General Punctuation".
# NOTE(review): presumably a block whose name contains one of these is
# treated as secondary to a main script block -- confirm against the caller.
UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
    "Supplement",
    "Extended",
    "Extensions",
    "Modifier",
    "Marks",
    "Punctuation",
    "Symbols",
    "Forms",
    "Operators",
    "Miscellaneous",
    "Drawing",
    "Block",
    "Shapes",
    "Supplemental",
    "Tags",
]
jpayne@7 377
# Case-insensitive pattern for an in-text encoding declaration: the word
# "encoding", "charset" or "coding", then 1 to 10 characters drawn from
# ":", "=" and space, an optional quote, and the encoding label itself.
# Group 1 captures the label (ASCII letters, digits, "-" and "_").
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
    r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
    IGNORECASE,
)
jpayne@7 382
# Codec names listed explicitly because, per this constant's name, they are
# not expected to be reachable as alias targets; they are appended to the
# alias-derived names when the supported-encodings list is built.
IANA_NO_ALIASES: List[str] = [
    "cp720",
    "cp737",
    "cp856",
    "cp874",
    "cp875",
    "cp1006",
    "koi8_r",
    "koi8_t",
    "koi8_u",
]
jpayne@7 394
# Sorted codec names offered as candidates: every alias target known to the
# interpreter plus the names in IANA_NO_ALIASES, minus names ending in
# "_codec" and a few explicitly excluded ones.
# The two sources are merged as sets: a name in IANA_NO_ALIASES may also be
# an alias target in the running interpreter's alias table, and plain list
# concatenation would then list it twice and inflate IANA_SUPPORTED_COUNT.
IANA_SUPPORTED: List[str] = sorted(
    name
    for name in set(aliases.values()).union(IANA_NO_ALIASES)
    if not name.endswith("_codec") and name not in {"rot_13", "tactis", "mbcs"}
)

# Number of distinct supported codec names.
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
jpayne@7 404
# Pre-computed table of similar code pages: each codec name maps to the
# list of codec names considered similar to it. Per the original author,
# the table was generated with the function ``cp_similarity``.
# The relation is not guaranteed to be symmetric (e.g. "cp1258" appears as
# a value but has no entry of its own).
IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
    "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
    "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
    "cp1125": ["cp866"],
    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
    "cp1250": ["iso8859_2"],
    "cp1251": ["kz1048", "ptcp154"],
    "cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1253": ["iso8859_7"],
    "cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1257": ["iso8859_13"],
    "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
    "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
    "cp500": ["cp037", "cp1026", "cp1140", "cp273"],
    "cp850": ["cp437", "cp857", "cp858", "cp865"],
    "cp857": ["cp850", "cp858", "cp865"],
    "cp858": ["cp437", "cp850", "cp857", "cp865"],
    "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
    "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
    "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
    "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
    "cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
    "cp866": ["cp1125"],
    "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
    "iso8859_11": ["tis_620"],
    "iso8859_13": ["cp1257"],
    "iso8859_14": [
        "iso8859_10",
        "iso8859_15",
        "iso8859_16",
        "iso8859_3",
        "iso8859_9",
        "latin_1",
    ],
    "iso8859_15": [
        "cp1252",
        "cp1254",
        "iso8859_10",
        "iso8859_14",
        "iso8859_16",
        "iso8859_3",
        "iso8859_9",
        "latin_1",
    ],
    "iso8859_16": [
        "iso8859_14",
        "iso8859_15",
        "iso8859_2",
        "iso8859_3",
        "iso8859_9",
        "latin_1",
    ],
    "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
    "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
    "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
    "iso8859_7": ["cp1253"],
    "iso8859_9": [
        "cp1252",
        "cp1254",
        "cp1258",
        "iso8859_10",
        "iso8859_14",
        "iso8859_15",
        "iso8859_16",
        "iso8859_3",
        "iso8859_4",
        "latin_1",
    ],
    "kz1048": ["cp1251", "ptcp154"],
    "latin_1": [
        "cp1252",
        "cp1254",
        "cp1258",
        "iso8859_10",
        "iso8859_14",
        "iso8859_15",
        "iso8859_16",
        "iso8859_3",
        "iso8859_4",
        "iso8859_9",
    ],
    "mac_iceland": ["mac_roman", "mac_turkish"],
    "mac_roman": ["mac_iceland", "mac_turkish"],
    "mac_turkish": ["mac_iceland", "mac_roman"],
    "ptcp154": ["cp1251", "kz1048"],
    "tis_620": ["iso8859_11"],
}
jpayne@7 493
jpayne@7 494
# Python codec name -> alternative label for the same encoding.
# NOTE(review): per the constant's name these are presumably the labels
# chardet reports, kept for output compatibility -- confirm against the
# caller. The mixed casing of the values is intentional runtime data and
# must not be normalised.
CHARDET_CORRESPONDENCE: Dict[str, str] = {
    "iso2022_kr": "ISO-2022-KR",
    "iso2022_jp": "ISO-2022-JP",
    "euc_kr": "EUC-KR",
    "tis_620": "TIS-620",
    "utf_32": "UTF-32",
    "euc_jp": "EUC-JP",
    "koi8_r": "KOI8-R",
    "iso8859_1": "ISO-8859-1",
    "iso8859_2": "ISO-8859-2",
    "iso8859_5": "ISO-8859-5",
    "iso8859_6": "ISO-8859-6",
    "iso8859_7": "ISO-8859-7",
    "iso8859_8": "ISO-8859-8",
    "utf_16": "UTF-16",
    "cp855": "IBM855",
    "mac_cyrillic": "MacCyrillic",
    "gb2312": "GB2312",
    "gb18030": "GB18030",
    "cp932": "CP932",
    "cp866": "IBM866",
    "utf_8": "utf-8",
    "utf_8_sig": "UTF-8-SIG",
    "shift_jis": "SHIFT_JIS",
    "big5": "Big5",
    "cp1250": "windows-1250",
    "cp1251": "windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "windows-1253",
    "cp1255": "windows-1255",
    "cp1256": "windows-1256",
    "cp1254": "Windows-1254",
    "cp949": "CP949",
}
jpayne@7 529
jpayne@7 530
# ASCII punctuation characters typical of markup and structured text
# (angle brackets, braces, separators, the double quote, ...).
# NOTE(review): presumably these are exempted from "suspicious character"
# checks elsewhere -- confirm against the caller.
COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
    "<",
    ">",
    "=",
    ":",
    "/",
    "&",
    ";",
    "{",
    "}",
    "[",
    "]",
    ",",
    "|",
    '"',
    "-",
}
jpayne@7 548
jpayne@7 549
# Codec names grouped by language, going by the KO / ZH prefixes
# (presumably Korean and Chinese -- confirm against the caller).
KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
jpayne@7 552
# Custom logging level, numerically below the standard library's
# logging.DEBUG (10), for output more verbose than debug messages.
TRACE: int = 5
jpayne@7 555
jpayne@7 556
jpayne@7 557 # Language label that contain the em dash "—"
jpayne@7 558 # character are to be considered alternative seq to origin
jpayne@7 559 FREQUENCIES: Dict[str, List[str]] = {
jpayne@7 560 "English": [
jpayne@7 561 "e",
jpayne@7 562 "a",
jpayne@7 563 "t",
jpayne@7 564 "i",
jpayne@7 565 "o",
jpayne@7 566 "n",
jpayne@7 567 "s",
jpayne@7 568 "r",
jpayne@7 569 "h",
jpayne@7 570 "l",
jpayne@7 571 "d",
jpayne@7 572 "c",
jpayne@7 573 "u",
jpayne@7 574 "m",
jpayne@7 575 "f",
jpayne@7 576 "p",
jpayne@7 577 "g",
jpayne@7 578 "w",
jpayne@7 579 "y",
jpayne@7 580 "b",
jpayne@7 581 "v",
jpayne@7 582 "k",
jpayne@7 583 "x",
jpayne@7 584 "j",
jpayne@7 585 "z",
jpayne@7 586 "q",
jpayne@7 587 ],
jpayne@7 588 "English—": [
jpayne@7 589 "e",
jpayne@7 590 "a",
jpayne@7 591 "t",
jpayne@7 592 "i",
jpayne@7 593 "o",
jpayne@7 594 "n",
jpayne@7 595 "s",
jpayne@7 596 "r",
jpayne@7 597 "h",
jpayne@7 598 "l",
jpayne@7 599 "d",
jpayne@7 600 "c",
jpayne@7 601 "m",
jpayne@7 602 "u",
jpayne@7 603 "f",
jpayne@7 604 "p",
jpayne@7 605 "g",
jpayne@7 606 "w",
jpayne@7 607 "b",
jpayne@7 608 "y",
jpayne@7 609 "v",
jpayne@7 610 "k",
jpayne@7 611 "j",
jpayne@7 612 "x",
jpayne@7 613 "z",
jpayne@7 614 "q",
jpayne@7 615 ],
jpayne@7 616 "German": [
jpayne@7 617 "e",
jpayne@7 618 "n",
jpayne@7 619 "i",
jpayne@7 620 "r",
jpayne@7 621 "s",
jpayne@7 622 "t",
jpayne@7 623 "a",
jpayne@7 624 "d",
jpayne@7 625 "h",
jpayne@7 626 "u",
jpayne@7 627 "l",
jpayne@7 628 "g",
jpayne@7 629 "o",
jpayne@7 630 "c",
jpayne@7 631 "m",
jpayne@7 632 "b",
jpayne@7 633 "f",
jpayne@7 634 "k",
jpayne@7 635 "w",
jpayne@7 636 "z",
jpayne@7 637 "p",
jpayne@7 638 "v",
jpayne@7 639 "ü",
jpayne@7 640 "ä",
jpayne@7 641 "ö",
jpayne@7 642 "j",
jpayne@7 643 ],
jpayne@7 644 "French": [
jpayne@7 645 "e",
jpayne@7 646 "a",
jpayne@7 647 "s",
jpayne@7 648 "n",
jpayne@7 649 "i",
jpayne@7 650 "t",
jpayne@7 651 "r",
jpayne@7 652 "l",
jpayne@7 653 "u",
jpayne@7 654 "o",
jpayne@7 655 "d",
jpayne@7 656 "c",
jpayne@7 657 "p",
jpayne@7 658 "m",
jpayne@7 659 "é",
jpayne@7 660 "v",
jpayne@7 661 "g",
jpayne@7 662 "f",
jpayne@7 663 "b",
jpayne@7 664 "h",
jpayne@7 665 "q",
jpayne@7 666 "à",
jpayne@7 667 "x",
jpayne@7 668 "è",
jpayne@7 669 "y",
jpayne@7 670 "j",
jpayne@7 671 ],
jpayne@7 672 "Dutch": [
jpayne@7 673 "e",
jpayne@7 674 "n",
jpayne@7 675 "a",
jpayne@7 676 "i",
jpayne@7 677 "r",
jpayne@7 678 "t",
jpayne@7 679 "o",
jpayne@7 680 "d",
jpayne@7 681 "s",
jpayne@7 682 "l",
jpayne@7 683 "g",
jpayne@7 684 "h",
jpayne@7 685 "v",
jpayne@7 686 "m",
jpayne@7 687 "u",
jpayne@7 688 "k",
jpayne@7 689 "c",
jpayne@7 690 "p",
jpayne@7 691 "b",
jpayne@7 692 "w",
jpayne@7 693 "j",
jpayne@7 694 "z",
jpayne@7 695 "f",
jpayne@7 696 "y",
jpayne@7 697 "x",
jpayne@7 698 "ë",
jpayne@7 699 ],
jpayne@7 700 "Italian": [
jpayne@7 701 "e",
jpayne@7 702 "i",
jpayne@7 703 "a",
jpayne@7 704 "o",
jpayne@7 705 "n",
jpayne@7 706 "l",
jpayne@7 707 "t",
jpayne@7 708 "r",
jpayne@7 709 "s",
jpayne@7 710 "c",
jpayne@7 711 "d",
jpayne@7 712 "u",
jpayne@7 713 "p",
jpayne@7 714 "m",
jpayne@7 715 "g",
jpayne@7 716 "v",
jpayne@7 717 "f",
jpayne@7 718 "b",
jpayne@7 719 "z",
jpayne@7 720 "h",
jpayne@7 721 "q",
jpayne@7 722 "è",
jpayne@7 723 "à",
jpayne@7 724 "k",
jpayne@7 725 "y",
jpayne@7 726 "ò",
jpayne@7 727 ],
jpayne@7 728 "Polish": [
jpayne@7 729 "a",
jpayne@7 730 "i",
jpayne@7 731 "o",
jpayne@7 732 "e",
jpayne@7 733 "n",
jpayne@7 734 "r",
jpayne@7 735 "z",
jpayne@7 736 "w",
jpayne@7 737 "s",
jpayne@7 738 "c",
jpayne@7 739 "t",
jpayne@7 740 "k",
jpayne@7 741 "y",
jpayne@7 742 "d",
jpayne@7 743 "p",
jpayne@7 744 "m",
jpayne@7 745 "u",
jpayne@7 746 "l",
jpayne@7 747 "j",
jpayne@7 748 "ł",
jpayne@7 749 "g",
jpayne@7 750 "b",
jpayne@7 751 "h",
jpayne@7 752 "ą",
jpayne@7 753 "ę",
jpayne@7 754 "ó",
jpayne@7 755 ],
jpayne@7 756 "Spanish": [
jpayne@7 757 "e",
jpayne@7 758 "a",
jpayne@7 759 "o",
jpayne@7 760 "n",
jpayne@7 761 "s",
jpayne@7 762 "r",
jpayne@7 763 "i",
jpayne@7 764 "l",
jpayne@7 765 "d",
jpayne@7 766 "t",
jpayne@7 767 "c",
jpayne@7 768 "u",
jpayne@7 769 "m",
jpayne@7 770 "p",
jpayne@7 771 "b",
jpayne@7 772 "g",
jpayne@7 773 "v",
jpayne@7 774 "f",
jpayne@7 775 "y",
jpayne@7 776 "ó",
jpayne@7 777 "h",
jpayne@7 778 "q",
jpayne@7 779 "í",
jpayne@7 780 "j",
jpayne@7 781 "z",
jpayne@7 782 "á",
jpayne@7 783 ],
jpayne@7 784 "Russian": [
jpayne@7 785 "о",
jpayne@7 786 "а",
jpayne@7 787 "е",
jpayne@7 788 "и",
jpayne@7 789 "н",
jpayne@7 790 "с",
jpayne@7 791 "т",
jpayne@7 792 "р",
jpayne@7 793 "в",
jpayne@7 794 "л",
jpayne@7 795 "к",
jpayne@7 796 "м",
jpayne@7 797 "д",
jpayne@7 798 "п",
jpayne@7 799 "у",
jpayne@7 800 "г",
jpayne@7 801 "я",
jpayne@7 802 "ы",
jpayne@7 803 "з",
jpayne@7 804 "б",
jpayne@7 805 "й",
jpayne@7 806 "ь",
jpayne@7 807 "ч",
jpayne@7 808 "х",
jpayne@7 809 "ж",
jpayne@7 810 "ц",
jpayne@7 811 ],
jpayne@7 812 # Jap-Kanji
jpayne@7 813 "Japanese": [
jpayne@7 814 "人",
jpayne@7 815 "一",
jpayne@7 816 "大",
jpayne@7 817 "亅",
jpayne@7 818 "丁",
jpayne@7 819 "丨",
jpayne@7 820 "竹",
jpayne@7 821 "笑",
jpayne@7 822 "口",
jpayne@7 823 "日",
jpayne@7 824 "今",
jpayne@7 825 "二",
jpayne@7 826 "彳",
jpayne@7 827 "行",
jpayne@7 828 "十",
jpayne@7 829 "土",
jpayne@7 830 "丶",
jpayne@7 831 "寸",
jpayne@7 832 "寺",
jpayne@7 833 "時",
jpayne@7 834 "乙",
jpayne@7 835 "丿",
jpayne@7 836 "乂",
jpayne@7 837 "气",
jpayne@7 838 "気",
jpayne@7 839 "冂",
jpayne@7 840 "巾",
jpayne@7 841 "亠",
jpayne@7 842 "市",
jpayne@7 843 "目",
jpayne@7 844 "儿",
jpayne@7 845 "見",
jpayne@7 846 "八",
jpayne@7 847 "小",
jpayne@7 848 "凵",
jpayne@7 849 "県",
jpayne@7 850 "月",
jpayne@7 851 "彐",
jpayne@7 852 "門",
jpayne@7 853 "間",
jpayne@7 854 "木",
jpayne@7 855 "東",
jpayne@7 856 "山",
jpayne@7 857 "出",
jpayne@7 858 "本",
jpayne@7 859 "中",
jpayne@7 860 "刀",
jpayne@7 861 "分",
jpayne@7 862 "耳",
jpayne@7 863 "又",
jpayne@7 864 "取",
jpayne@7 865 "最",
jpayne@7 866 "言",
jpayne@7 867 "田",
jpayne@7 868 "心",
jpayne@7 869 "思",
jpayne@7 870 "刂",
jpayne@7 871 "前",
jpayne@7 872 "京",
jpayne@7 873 "尹",
jpayne@7 874 "事",
jpayne@7 875 "生",
jpayne@7 876 "厶",
jpayne@7 877 "云",
jpayne@7 878 "会",
jpayne@7 879 "未",
jpayne@7 880 "来",
jpayne@7 881 "白",
jpayne@7 882 "冫",
jpayne@7 883 "楽",
jpayne@7 884 "灬",
jpayne@7 885 "馬",
jpayne@7 886 "尸",
jpayne@7 887 "尺",
jpayne@7 888 "駅",
jpayne@7 889 "明",
jpayne@7 890 "耂",
jpayne@7 891 "者",
jpayne@7 892 "了",
jpayne@7 893 "阝",
jpayne@7 894 "都",
jpayne@7 895 "高",
jpayne@7 896 "卜",
jpayne@7 897 "占",
jpayne@7 898 "厂",
jpayne@7 899 "广",
jpayne@7 900 "店",
jpayne@7 901 "子",
jpayne@7 902 "申",
jpayne@7 903 "奄",
jpayne@7 904 "亻",
jpayne@7 905 "俺",
jpayne@7 906 "上",
jpayne@7 907 "方",
jpayne@7 908 "冖",
jpayne@7 909 "学",
jpayne@7 910 "衣",
jpayne@7 911 "艮",
jpayne@7 912 "食",
jpayne@7 913 "自",
jpayne@7 914 ],
jpayne@7 915 # Jap-Katakana
jpayne@7 916 "Japanese—": [
jpayne@7 917 "ー",
jpayne@7 918 "ン",
jpayne@7 919 "ス",
jpayne@7 920 "・",
jpayne@7 921 "ル",
jpayne@7 922 "ト",
jpayne@7 923 "リ",
jpayne@7 924 "イ",
jpayne@7 925 "ア",
jpayne@7 926 "ラ",
jpayne@7 927 "ッ",
jpayne@7 928 "ク",
jpayne@7 929 "ド",
jpayne@7 930 "シ",
jpayne@7 931 "レ",
jpayne@7 932 "ジ",
jpayne@7 933 "タ",
jpayne@7 934 "フ",
jpayne@7 935 "ロ",
jpayne@7 936 "カ",
jpayne@7 937 "テ",