Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/python3.8/cpython/unicodeobject.h @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 #ifndef Py_CPYTHON_UNICODEOBJECT_H | |
2 # error "this header file must not be included directly" | |
3 #endif | |
4 | |
5 #ifdef __cplusplus | |
6 extern "C" { | |
7 #endif | |
8 | |
9 /* Py_UNICODE was the native Unicode storage format (code unit) used by | |
10 Python and represents a single Unicode element in the Unicode type. | |
11 With PEP 393, Py_UNICODE is deprecated and replaced with a | |
12 typedef to wchar_t. */ | |
13 #define PY_UNICODE_TYPE wchar_t | |
14 /* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE; | |
15 | |
16 /* --- Internal Unicode Operations ---------------------------------------- */ | |
17 | |
18 /* Since splitting on whitespace is an important use case, and | |
19 whitespace in most situations is solely ASCII whitespace, we | |
20 optimize for the common case by using a quick look-up table | |
21 _Py_ascii_whitespace (see below) with an inlined check. | |
22 | |
23 */ | |
24 #define Py_UNICODE_ISSPACE(ch) \ | |
25 ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) | |
26 | |
27 #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) | |
28 #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) | |
29 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) | |
30 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) | |
31 | |
32 #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) | |
33 #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) | |
34 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) | |
35 | |
36 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) | |
37 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) | |
38 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) | |
39 #define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) | |
40 | |
41 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) | |
42 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) | |
43 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) | |
44 | |
45 #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) | |
46 | |
47 #define Py_UNICODE_ISALNUM(ch) \ | |
48 (Py_UNICODE_ISALPHA(ch) || \ | |
49 Py_UNICODE_ISDECIMAL(ch) || \ | |
50 Py_UNICODE_ISDIGIT(ch) || \ | |
51 Py_UNICODE_ISNUMERIC(ch)) | |
52 | |
53 #define Py_UNICODE_COPY(target, source, length) \ | |
54 memcpy((target), (source), (length)*sizeof(Py_UNICODE)) | |
55 | |
56 #define Py_UNICODE_FILL(target, value, length) \ | |
57 do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ | |
58 for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ | |
59 } while (0) | |
60 | |
61 /* macros to work with surrogates */ | |
62 #define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF) | |
63 #define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF) | |
64 #define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF) | |
65 /* Join two surrogate characters and return a single Py_UCS4 value. */ | |
66 #define Py_UNICODE_JOIN_SURROGATES(high, low) \ | |
67 (((((Py_UCS4)(high) & 0x03FF) << 10) | \ | |
68 ((Py_UCS4)(low) & 0x03FF)) + 0x10000) | |
69 /* high surrogate = top 10 bits added to D800 */ | |
70 #define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10)) | |
71 /* low surrogate = bottom 10 bits added to DC00 */ | |
72 #define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF)) | |
73 | |
74 /* Check if substring matches at given offset. The offset must be | |
75 valid, and the substring must not be empty. */ | |
76 | |
77 #define Py_UNICODE_MATCH(string, offset, substring) \ | |
78 ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \ | |
79 ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \ | |
80 !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE))) | |
81 | |
82 /* --- Unicode Type ------------------------------------------------------- */ | |
83 | |
84 /* ASCII-only strings created through PyUnicode_New use the PyASCIIObject | |
85 structure. state.ascii and state.compact are set, and the data | |
86 immediately follow the structure. utf8_length and wstr_length can be found | |
87 in the length field; the utf8 pointer is equal to the data pointer. */ | |
88 typedef struct { | |
89 /* There are 4 forms of Unicode strings: | |
90 | |
91 - compact ascii: | |
92 | |
93 * structure = PyASCIIObject | |
94 * test: PyUnicode_IS_COMPACT_ASCII(op) | |
95 * kind = PyUnicode_1BYTE_KIND | |
96 * compact = 1 | |
97 * ascii = 1 | |
98 * ready = 1 | |
99 * (length is the length of the utf8 and wstr strings) | |
100 * (data starts just after the structure) | |
101 * (since ASCII is decoded from UTF-8, the utf8 string is the data) | |
102 | |
103 - compact: | |
104 | |
105 * structure = PyCompactUnicodeObject | |
106 * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op) | |
107 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or | |
108 PyUnicode_4BYTE_KIND | |
109 * compact = 1 | |
110 * ready = 1 | |
111 * ascii = 0 | |
112 * utf8 is not shared with data | |
113 * utf8_length = 0 if utf8 is NULL | |
114 * wstr is shared with data and wstr_length=length | |
115 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 | |
116 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4 | |
117 * wstr_length = 0 if wstr is NULL | |
118 * (data starts just after the structure) | |
119 | |
120 - legacy string, not ready: | |
121 | |
122 * structure = PyUnicodeObject | |
123 * test: kind == PyUnicode_WCHAR_KIND | |
124 * length = 0 (use wstr_length) | |
125 * hash = -1 | |
126 * kind = PyUnicode_WCHAR_KIND | |
127 * compact = 0 | |
128 * ascii = 0 | |
129 * ready = 0 | |
130 * interned = SSTATE_NOT_INTERNED | |
131 * wstr is not NULL | |
132 * data.any is NULL | |
133 * utf8 is NULL | |
134 * utf8_length = 0 | |
135 | |
136 - legacy string, ready: | |
137 | |
138 * structure = PyUnicodeObject | |
139 * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND | |
140 * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or | |
141 PyUnicode_4BYTE_KIND | |
142 * compact = 0 | |
143 * ready = 1 | |
144 * data.any is not NULL | |
145 * utf8 is shared with data.any and utf8_length = length if ascii = 1 | |
146 * utf8_length = 0 if utf8 is NULL | |
147 * wstr is shared with data.any and wstr_length = length | |
148 if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 | |
149 or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4 | |
150 * wstr_length = 0 if wstr is NULL | |
151 | |
152 Compact strings use only one memory block (structure + characters), | |
153 whereas legacy strings use one block for the structure and one block | |
154 for characters. | |
155 | |
156 Legacy strings are created by PyUnicode_FromUnicode() and | |
157 PyUnicode_FromStringAndSize(NULL, size) functions. They become ready | |
158 when PyUnicode_READY() is called. | |
159 | |
160 See also _PyUnicode_CheckConsistency(). | |
161 */ | |
162 PyObject_HEAD | |
163 Py_ssize_t length; /* Number of code points in the string */ | |
164 Py_hash_t hash; /* Hash value; -1 if not set */ | |
165 struct { | |
166 /* | |
167 SSTATE_NOT_INTERNED (0) | |
168 SSTATE_INTERNED_MORTAL (1) | |
169 SSTATE_INTERNED_IMMORTAL (2) | |
170 | |
171 If interned != SSTATE_NOT_INTERNED, the two references from the | |
172 dictionary to this object are *not* counted in ob_refcnt. | |
173 */ | |
174 unsigned int interned:2; | |
175 /* Character size: | |
176 | |
177 - PyUnicode_WCHAR_KIND (0): | |
178 | |
179 * character type = wchar_t (16 or 32 bits, depending on the | |
180 platform) | |
181 | |
182 - PyUnicode_1BYTE_KIND (1): | |
183 | |
184 * character type = Py_UCS1 (8 bits, unsigned) | |
185 * all characters are in the range U+0000-U+00FF (latin1) | |
186 * if ascii is set, all characters are in the range U+0000-U+007F | |
187 (ASCII), otherwise at least one character is in the range | |
188 U+0080-U+00FF | |
189 | |
190 - PyUnicode_2BYTE_KIND (2): | |
191 | |
192 * character type = Py_UCS2 (16 bits, unsigned) | |
193 * all characters are in the range U+0000-U+FFFF (BMP) | |
194 * at least one character is in the range U+0100-U+FFFF | |
195 | |
196 - PyUnicode_4BYTE_KIND (4): | |
197 | |
198 * character type = Py_UCS4 (32 bits, unsigned) | |
199 * all characters are in the range U+0000-U+10FFFF | |
200 * at least one character is in the range U+10000-U+10FFFF | |
201 */ | |
202 unsigned int kind:3; | |
203 /* Compact is with respect to the allocation scheme. Compact unicode | |
204 objects only require one memory block while non-compact objects use | |
205 one block for the PyUnicodeObject struct and another for its data | |
206 buffer. */ | |
207 unsigned int compact:1; | |
208 /* The string only contains characters in the range U+0000-U+007F (ASCII) | |
209 and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is | |
210 set, use the PyASCIIObject structure. */ | |
211 unsigned int ascii:1; | |
212 /* The ready flag indicates whether the object layout is initialized | |
213 completely. This means that this is either a compact object, or | |
214 the data pointer is filled out. The bit is redundant, and helps | |
215 to minimize the test in PyUnicode_IS_READY(). */ | |
216 unsigned int ready:1; | |
217 /* Padding to ensure that PyUnicode_DATA() is always aligned to | |
218 4 bytes (see issue #19537 on m68k). */ | |
219 unsigned int :24; | |
220 } state; | |
221 wchar_t *wstr; /* wchar_t representation (null-terminated) */ | |
222 } PyASCIIObject; | |
223 | |
224 /* Non-ASCII strings allocated through PyUnicode_New use the | |
225 PyCompactUnicodeObject structure. state.compact is set, and the data | |
226 immediately follow the structure. */ | |
227 typedef struct { | |
228 PyASCIIObject _base; | |
229 Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the | |
230 * terminating \0. */ | |
231 char *utf8; /* UTF-8 representation (null-terminated) */ | |
232 Py_ssize_t wstr_length; /* Number of code points in wstr, possible | |
233 * surrogates count as two code points. */ | |
234 } PyCompactUnicodeObject; | |
235 | |
236 /* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the | |
237 PyUnicodeObject structure. The actual string data is initially in the wstr | |
238 block, and copied into the data block using _PyUnicode_Ready. */ | |
239 typedef struct { | |
240 PyCompactUnicodeObject _base; | |
241 union { | |
242 void *any; | |
243 Py_UCS1 *latin1; | |
244 Py_UCS2 *ucs2; | |
245 Py_UCS4 *ucs4; | |
246 } data; /* Canonical, smallest-form Unicode buffer */ | |
247 } PyUnicodeObject; | |
248 | |
249 PyAPI_FUNC(int) _PyUnicode_CheckConsistency( | |
250 PyObject *op, | |
251 int check_content); | |
252 | |
253 /* Fast access macros */ | |
254 #define PyUnicode_WSTR_LENGTH(op) \ | |
255 (PyUnicode_IS_COMPACT_ASCII(op) ? \ | |
256 ((PyASCIIObject*)op)->length : \ | |
257 ((PyCompactUnicodeObject*)op)->wstr_length) | |
258 | |
259 /* Returns the deprecated Py_UNICODE representation's size in code units | |
260 (this includes surrogate pairs as 2 units). | |
261 If the Py_UNICODE representation is not available, it will be computed | |
262 on request. Use PyUnicode_GET_LENGTH() for the length in code points. */ | |
263 | |
264 /* Py_DEPRECATED(3.3) */ | |
265 #define PyUnicode_GET_SIZE(op) \ | |
266 (assert(PyUnicode_Check(op)), \ | |
267 (((PyASCIIObject *)(op))->wstr) ? \ | |
268 PyUnicode_WSTR_LENGTH(op) : \ | |
269 ((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),\ | |
270 assert(((PyASCIIObject *)(op))->wstr), \ | |
271 PyUnicode_WSTR_LENGTH(op))) | |
272 | |
273 /* Py_DEPRECATED(3.3) */ | |
274 #define PyUnicode_GET_DATA_SIZE(op) \ | |
275 (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE) | |
276 | |
277 /* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE | |
278 representation on demand. Using this macro is very inefficient now, | |
279 try to port your code to use the new PyUnicode_*BYTE_DATA() macros or | |
280 use PyUnicode_WRITE() and PyUnicode_READ(). */ | |
281 | |
282 /* Py_DEPRECATED(3.3) */ | |
283 #define PyUnicode_AS_UNICODE(op) \ | |
284 (assert(PyUnicode_Check(op)), \ | |
285 (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \ | |
286 PyUnicode_AsUnicode(_PyObject_CAST(op))) | |
287 | |
288 /* Py_DEPRECATED(3.3) */ | |
289 #define PyUnicode_AS_DATA(op) \ | |
290 ((const char *)(PyUnicode_AS_UNICODE(op))) | |
291 | |
292 | |
293 /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */ | |
294 | |
295 /* Values for PyASCIIObject.state: */ | |
296 | |
297 /* Interning state. */ | |
298 #define SSTATE_NOT_INTERNED 0 | |
299 #define SSTATE_INTERNED_MORTAL 1 | |
300 #define SSTATE_INTERNED_IMMORTAL 2 | |
301 | |
302 /* Return true if the string contains only ASCII characters, or 0 if not. The | |
303 string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be | |
304 ready. */ | |
305 #define PyUnicode_IS_ASCII(op) \ | |
306 (assert(PyUnicode_Check(op)), \ | |
307 assert(PyUnicode_IS_READY(op)), \ | |
308 ((PyASCIIObject*)op)->state.ascii) | |
309 | |
310 /* Return true if the string is compact or 0 if not. | |
311 No type checks or Ready calls are performed. */ | |
312 #define PyUnicode_IS_COMPACT(op) \ | |
313 (((PyASCIIObject*)(op))->state.compact) | |
314 | |
315 /* Return true if the string is a compact ASCII string (use PyASCIIObject | |
316 structure), or 0 if not. No type checks or Ready calls are performed. */ | |
317 #define PyUnicode_IS_COMPACT_ASCII(op) \ | |
318 (((PyASCIIObject*)op)->state.ascii && PyUnicode_IS_COMPACT(op)) | |
319 | |
320 enum PyUnicode_Kind { | |
321 /* String contains only wstr byte characters. This is only possible | |
322 when the string was created with a legacy API and _PyUnicode_Ready() | |
323 has not been called yet. */ | |
324 PyUnicode_WCHAR_KIND = 0, | |
325 /* Return values of the PyUnicode_KIND() macro: */ | |
326 PyUnicode_1BYTE_KIND = 1, | |
327 PyUnicode_2BYTE_KIND = 2, | |
328 PyUnicode_4BYTE_KIND = 4 | |
329 }; | |
330 | |
331 /* Return pointers to the canonical representation cast to unsigned char, | |
332 Py_UCS2, or Py_UCS4 for direct character access. | |
333 No checks are performed, use PyUnicode_KIND() before to ensure | |
334 these will work correctly. */ | |
335 | |
336 #define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op)) | |
337 #define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op)) | |
338 #define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op)) | |
339 | |
340 /* Return one of the PyUnicode_*_KIND values defined above. */ | |
341 #define PyUnicode_KIND(op) \ | |
342 (assert(PyUnicode_Check(op)), \ | |
343 assert(PyUnicode_IS_READY(op)), \ | |
344 ((PyASCIIObject *)(op))->state.kind) | |
345 | |
346 /* Return a void pointer to the raw unicode buffer. */ | |
347 #define _PyUnicode_COMPACT_DATA(op) \ | |
348 (PyUnicode_IS_ASCII(op) ? \ | |
349 ((void*)((PyASCIIObject*)(op) + 1)) : \ | |
350 ((void*)((PyCompactUnicodeObject*)(op) + 1))) | |
351 | |
352 #define _PyUnicode_NONCOMPACT_DATA(op) \ | |
353 (assert(((PyUnicodeObject*)(op))->data.any), \ | |
354 ((((PyUnicodeObject *)(op))->data.any))) | |
355 | |
356 #define PyUnicode_DATA(op) \ | |
357 (assert(PyUnicode_Check(op)), \ | |
358 PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \ | |
359 _PyUnicode_NONCOMPACT_DATA(op)) | |
360 | |
361 /* In the access macros below, "kind" may be evaluated more than once. | |
362 All other macro parameters are evaluated exactly once, so it is safe | |
363 to put side effects into them (such as increasing the index). */ | |
364 | |
365 /* Write into the canonical representation, this macro does not do any sanity | |
366 checks and is intended for usage in loops. The caller should cache the | |
367 kind and data pointers obtained from other macro calls. | |
368 index is the index in the string (starts at 0) and value is the new | |
369 code point value which should be written to that location. */ | |
370 #define PyUnicode_WRITE(kind, data, index, value) \ | |
371 do { \ | |
372 switch ((kind)) { \ | |
373 case PyUnicode_1BYTE_KIND: { \ | |
374 ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \ | |
375 break; \ | |
376 } \ | |
377 case PyUnicode_2BYTE_KIND: { \ | |
378 ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \ | |
379 break; \ | |
380 } \ | |
381 default: { \ | |
382 assert((kind) == PyUnicode_4BYTE_KIND); \ | |
383 ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \ | |
384 } \ | |
385 } \ | |
386 } while (0) | |
387 | |
388 /* Read a code point from the string's canonical representation. No checks | |
389 or ready calls are performed. */ | |
390 #define PyUnicode_READ(kind, data, index) \ | |
391 ((Py_UCS4) \ | |
392 ((kind) == PyUnicode_1BYTE_KIND ? \ | |
393 ((const Py_UCS1 *)(data))[(index)] : \ | |
394 ((kind) == PyUnicode_2BYTE_KIND ? \ | |
395 ((const Py_UCS2 *)(data))[(index)] : \ | |
396 ((const Py_UCS4 *)(data))[(index)] \ | |
397 ) \ | |
398 )) | |
399 | |
400 /* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it | |
401 calls PyUnicode_KIND() and might call it twice. For single reads, use | |
402 PyUnicode_READ_CHAR, for multiple consecutive reads callers should | |
403 cache kind and use PyUnicode_READ instead. */ | |
404 #define PyUnicode_READ_CHAR(unicode, index) \ | |
405 (assert(PyUnicode_Check(unicode)), \ | |
406 assert(PyUnicode_IS_READY(unicode)), \ | |
407 (Py_UCS4) \ | |
408 (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \ | |
409 ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \ | |
410 (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \ | |
411 ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \ | |
412 ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \ | |
413 ) \ | |
414 )) | |
415 | |
416 /* Returns the length of the unicode string. The caller has to make sure that | |
417 the string has its canonical representation set before calling | |
418 this macro. Call PyUnicode_(FAST_)Ready to ensure that. */ | |
419 #define PyUnicode_GET_LENGTH(op) \ | |
420 (assert(PyUnicode_Check(op)), \ | |
421 assert(PyUnicode_IS_READY(op)), \ | |
422 ((PyASCIIObject *)(op))->length) | |
423 | |
424 | |
425 /* Fast check to determine whether an object is ready. Equivalent to | |
426 PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any) */ | |
427 | |
428 #define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready) | |
429 | |
430 /* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best | |
431 case. If the canonical representation is not yet set, it will still call | |
432 _PyUnicode_Ready(). | |
433 Returns 0 on success and -1 on errors. */ | |
434 #define PyUnicode_READY(op) \ | |
435 (assert(PyUnicode_Check(op)), \ | |
436 (PyUnicode_IS_READY(op) ? \ | |
437 0 : _PyUnicode_Ready(_PyObject_CAST(op)))) | |
438 | |
439 /* Return a maximum character value which is suitable for creating another | |
440 string based on op. This is always an approximation but more efficient | |
441 than iterating over the string. */ | |
442 #define PyUnicode_MAX_CHAR_VALUE(op) \ | |
443 (assert(PyUnicode_IS_READY(op)), \ | |
444 (PyUnicode_IS_ASCII(op) ? \ | |
445 (0x7f) : \ | |
446 (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \ | |
447 (0xffU) : \ | |
448 (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \ | |
449 (0xffffU) : \ | |
450 (0x10ffffU))))) | |
451 | |
452 /* === Public API ========================================================= */ | |
453 | |
454 /* --- Plain Py_UNICODE --------------------------------------------------- */ | |
455 | |
456 /* With PEP 393, this is the recommended way to allocate a new unicode object. | |
457 This function will allocate the object and its buffer in a single memory | |
458 block. Objects created using this function are not resizable. */ | |
459 PyAPI_FUNC(PyObject*) PyUnicode_New( | |
460 Py_ssize_t size, /* Number of code points in the new string */ | |
461 Py_UCS4 maxchar /* maximum code point value in the string */ | |
462 ); | |
463 | |
464 /* Initializes the canonical string representation from the deprecated | |
465 wstr/Py_UNICODE representation. This function is used to convert Unicode | |
466 objects which were created using the old API to the new flexible format | |
467 introduced with PEP 393. | |
468 | |
469 Don't call this function directly, use the public PyUnicode_READY() macro | |
470 instead. */ | |
471 PyAPI_FUNC(int) _PyUnicode_Ready( | |
472 PyObject *unicode /* Unicode object */ | |
473 ); | |
474 | |
475 /* Get a copy of a Unicode string. */ | |
476 PyAPI_FUNC(PyObject*) _PyUnicode_Copy( | |
477 PyObject *unicode | |
478 ); | |
479 | |
480 /* Copy characters from one unicode object into another, this function performs | |
481 character conversion when necessary and falls back to memcpy() if possible. | |
482 | |
483 Fail if to is too small (smaller than *how_many* or smaller than | |
484 len(from)-from_start), or if kind(from[from_start:from_start+how_many]) > | |
485 kind(to), or if *to* has more than 1 reference. | |
486 | |
487 Return the number of characters written, or return -1 and raise an exception | |
488 on error. | |
489 | |
490 Pseudo-code: | |
491 | |
492 how_many = min(how_many, len(from) - from_start) | |
493 to[to_start:to_start+how_many] = from[from_start:from_start+how_many] | |
494 return how_many | |
495 | |
496 Note: The function doesn't write a terminating null character. | |
497 */ | |
498 PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters( | |
499 PyObject *to, | |
500 Py_ssize_t to_start, | |
501 PyObject *from, | |
502 Py_ssize_t from_start, | |
503 Py_ssize_t how_many | |
504 ); | |
505 | |
506 /* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so | |
507 may crash if parameters are invalid (e.g. if the output string | |
508 is too short). */ | |
509 PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters( | |
510 PyObject *to, | |
511 Py_ssize_t to_start, | |
512 PyObject *from, | |
513 Py_ssize_t from_start, | |
514 Py_ssize_t how_many | |
515 ); | |
516 | |
517 /* Fill a string with a character: write fill_char into | |
518 unicode[start:start+length]. | |
519 | |
520 Fail if fill_char is bigger than the string maximum character, or if the | |
521 string has more than 1 reference. | |
522 | |
523 Return the number of characters written, or return -1 and raise an exception | |
524 on error. */ | |
525 PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill( | |
526 PyObject *unicode, | |
527 Py_ssize_t start, | |
528 Py_ssize_t length, | |
529 Py_UCS4 fill_char | |
530 ); | |
531 | |
532 /* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash | |
533 if parameters are invalid (e.g. if length is longer than the string). */ | |
534 PyAPI_FUNC(void) _PyUnicode_FastFill( | |
535 PyObject *unicode, | |
536 Py_ssize_t start, | |
537 Py_ssize_t length, | |
538 Py_UCS4 fill_char | |
539 ); | |
540 | |
541 /* Create a Unicode Object from the Py_UNICODE buffer u of the given | |
542 size. | |
543 | |
544 u may be NULL which causes the contents to be undefined. It is the | |
545 user's responsibility to fill in the needed data afterwards. Note | |
546 that modifying the Unicode object contents after construction is | |
547 only allowed if u was set to NULL. | |
548 | |
549 The buffer is copied into the new object. */ | |
550 /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( | |
551 const Py_UNICODE *u, /* Unicode buffer */ | |
552 Py_ssize_t size /* size of buffer */ | |
553 ); | |
554 | |
555 /* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters. | |
556 Scan the string to find the maximum character. */ | |
557 PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( | |
558 int kind, | |
559 const void *buffer, | |
560 Py_ssize_t size); | |
561 | |
562 /* Create a new string from a buffer of ASCII characters. | |
563 WARNING: Don't check if the string contains any non-ASCII character. */ | |
564 PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII( | |
565 const char *buffer, | |
566 Py_ssize_t size); | |
567 | |
568 /* Compute the maximum character of the substring unicode[start:end]. | |
569 Return 127 for an empty string. */ | |
570 PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar ( | |
571 PyObject *unicode, | |
572 Py_ssize_t start, | |
573 Py_ssize_t end); | |
574 | |
575 /* Return a read-only pointer to the Unicode object's internal | |
576 Py_UNICODE buffer. | |
577 If the wchar_t/Py_UNICODE representation is not yet available, this | |
578 function will calculate it. */ | |
579 /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( | |
580 PyObject *unicode /* Unicode object */ | |
581 ); | |
582 | |
583 /* Similar to PyUnicode_AsUnicode(), but raises a ValueError if the string | |
584 contains null characters. */ | |
585 PyAPI_FUNC(const Py_UNICODE *) _PyUnicode_AsUnicode( | |
586 PyObject *unicode /* Unicode object */ | |
587 ); | |
588 | |
589 /* Return a read-only pointer to the Unicode object's internal | |
590 Py_UNICODE buffer and save the length at size. | |
591 If the wchar_t/Py_UNICODE representation is not yet available, this | |
592 function will calculate it. */ | |
593 | |
594 /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize( | |
595 PyObject *unicode, /* Unicode object */ | |
596 Py_ssize_t *size /* location where to save the length */ | |
597 ); | |
598 | |
599 /* Get the maximum ordinal for a Unicode character. */ | |
600 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); | |
601 | |
602 | |
603 /* --- _PyUnicodeWriter API ----------------------------------------------- */ | |
604 | |
605 typedef struct { | |
606 PyObject *buffer; | |
607 void *data; | |
608 enum PyUnicode_Kind kind; | |
609 Py_UCS4 maxchar; | |
610 Py_ssize_t size; | |
611 Py_ssize_t pos; | |
612 | |
613 /* minimum number of allocated characters (default: 0) */ | |
614 Py_ssize_t min_length; | |
615 | |
616 /* minimum character (default: 127, ASCII) */ | |
617 Py_UCS4 min_char; | |
618 | |
619 /* If non-zero, overallocate the buffer (default: 0). */ | |
620 unsigned char overallocate; | |
621 | |
622 /* If readonly is 1, buffer is a shared string (cannot be modified) | |
623 and size is set to 0. */ | |
624 unsigned char readonly; | |
625 } _PyUnicodeWriter ; | |
626 | |
627 /* Initialize a Unicode writer. | |
628 * | |
629 * By default, the minimum buffer size is 0 characters and overallocation is | |
630 * disabled. Set min_length, min_char and overallocate attributes to control | |
631 * the allocation of the buffer. */ | |
632 PyAPI_FUNC(void) | |
633 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer); | |
634 | |
635 /* Prepare the buffer to write 'length' characters | |
636 with the specified maximum character. | |
637 | |
638 Return 0 on success, raise an exception and return -1 on error. */ | |
639 #define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \ | |
640 (((MAXCHAR) <= (WRITER)->maxchar \ | |
641 && (LENGTH) <= (WRITER)->size - (WRITER)->pos) \ | |
642 ? 0 \ | |
643 : (((LENGTH) == 0) \ | |
644 ? 0 \ | |
645 : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR)))) | |
646 | |
647 /* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro | |
648 instead. */ | |
649 PyAPI_FUNC(int) | |
650 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, | |
651 Py_ssize_t length, Py_UCS4 maxchar); | |
652 | |
653 /* Prepare the buffer to have at least the kind KIND. | |
654 For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will | |
655 support characters in range U+0000-U+FFFF. | |
656 | |
657 Return 0 on success, raise an exception and return -1 on error. */ | |
658 #define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \ | |
659 (assert((KIND) != PyUnicode_WCHAR_KIND), \ | |
660 (KIND) <= (WRITER)->kind \ | |
661 ? 0 \ | |
662 : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND))) | |
663 | |
664 /* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind() | |
665 macro instead. */ | |
666 PyAPI_FUNC(int) | |
667 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, | |
668 enum PyUnicode_Kind kind); | |
669 | |
670 /* Append a Unicode character. | |
671 Return 0 on success, raise an exception and return -1 on error. */ | |
672 PyAPI_FUNC(int) | |
673 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, | |
674 Py_UCS4 ch | |
675 ); | |
676 | |
677 /* Append a Unicode string. | |
678 Return 0 on success, raise an exception and return -1 on error. */ | |
679 PyAPI_FUNC(int) | |
680 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, | |
681 PyObject *str /* Unicode string */ | |
682 ); | |
683 | |
684 /* Append a substring of a Unicode string. | |
685 Return 0 on success, raise an exception and return -1 on error. */ | |
686 PyAPI_FUNC(int) | |
687 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, | |
688 PyObject *str, /* Unicode string */ | |
689 Py_ssize_t start, | |
690 Py_ssize_t end | |
691 ); | |
692 | |
693 /* Append an ASCII-encoded byte string. | |
694 Return 0 on success, raise an exception and return -1 on error. */ | |
695 PyAPI_FUNC(int) | |
696 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, | |
697 const char *str, /* ASCII-encoded byte string */ | |
698 Py_ssize_t len /* number of bytes, or -1 if unknown */ | |
699 ); | |
700 | |
701 /* Append a latin1-encoded byte string. | |
702 Return 0 on success, raise an exception and return -1 on error. */ | |
703 PyAPI_FUNC(int) | |
704 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, | |
705 const char *str, /* latin1-encoded byte string */ | |
706 Py_ssize_t len /* length in bytes */ | |
707 ); | |
708 | |
709 /* Get the value of the writer as a Unicode string. Clear the | |
710 buffer of the writer. Raise an exception and return NULL | |
711 on error. */ | |
712 PyAPI_FUNC(PyObject *) | |
713 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer); | |
714 | |
715 /* Deallocate memory of a writer (clear its internal buffer). */ | |
716 PyAPI_FUNC(void) | |
717 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer); | |
718 | |
719 | |
720 /* Format the object based on the format_spec, as defined in PEP 3101 | |
721 (Advanced String Formatting). */ | |
722 PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter( | |
723 _PyUnicodeWriter *writer, | |
724 PyObject *obj, | |
725 PyObject *format_spec, | |
726 Py_ssize_t start, | |
727 Py_ssize_t end); | |
728 | |
729 /* --- wchar_t support for platforms which support it --------------------- */ | |
730 | |
731 #ifdef HAVE_WCHAR_H | |
732 PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind); | |
733 #endif | |
734 | |
735 /* --- Manage the default encoding ---------------------------------------- */ | |
736 | |
737 /* Returns a pointer to the default encoding (UTF-8) of the | |
738 Unicode object unicode and the size of the encoded representation | |
739 in bytes stored in *size. | |
740 | |
741 In case of an error, no *size is set. | |
742 | |
743 This function caches the UTF-8 encoded string in the unicodeobject | |
744 and subsequent calls will return the same string. The memory is released | |
745 when the unicodeobject is deallocated. | |
746 | |
747 _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to | |
748 support the previous internal function with the same behaviour. | |
749 | |
750 *** This API is for interpreter INTERNAL USE ONLY and will likely | |
751 *** be removed or changed in the future. | |
752 | |
753 *** If you need to access the Unicode object as UTF-8 bytes string, | |
754 *** please use PyUnicode_AsUTF8String() instead. | |
755 */ | |
756 | |
757 PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize( | |
758 PyObject *unicode, | |
759 Py_ssize_t *size); | |
760 | |
761 #define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize | |
762 | |
763 /* Returns a pointer to the default encoding (UTF-8) of the | |
764 Unicode object unicode. | |
765 | |
766 Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation | |
767 in the unicodeobject. | |
768 | |
769 _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to | |
770 support the previous internal function with the same behaviour. | |
771 | |
772 Use of this API is DEPRECATED since no size information can be | |
773 extracted from the returned data. | |
774 | |
775 *** This API is for interpreter INTERNAL USE ONLY and will likely | |
776 *** be removed or changed in the future. | |
777 | |
778 *** If you need to access the Unicode object as UTF-8 bytes string, | |
779 *** please use PyUnicode_AsUTF8String() instead. | |
780 | |
781 */ | |
782 | |
783 PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode); | |
784 | |
785 #define _PyUnicode_AsString PyUnicode_AsUTF8 | |
786 | |
787 /* --- Generic Codecs ----------------------------------------------------- */ | |
788 | |
789 /* Encodes a Py_UNICODE buffer of the given size and returns a | |
790 Python string object. */ | |
791 Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_Encode( | |
792 const Py_UNICODE *s, /* Unicode char buffer */ | |
793 Py_ssize_t size, /* number of Py_UNICODE chars to encode */ | |
794 const char *encoding, /* encoding */ | |
795 const char *errors /* error handling */ | |
796 ); | |
797 | |
798 /* --- UTF-7 Codecs ------------------------------------------------------- */ | |
799 | |
800 Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( | |
801 const Py_UNICODE *data, /* Unicode char buffer */ | |
802 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ | |
803 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ | |
804 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ | |
805 const char *errors /* error handling */ | |
806 ); | |
807 | |
808 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7( | |
809 PyObject *unicode, /* Unicode object */ | |
810 int base64SetO, /* Encode RFC2152 Set O characters in base64 */ | |
811 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ | |
812 const char *errors /* error handling */ | |
813 ); | |
814 | |
815 /* --- UTF-8 Codecs ------------------------------------------------------- */ | |
816 | |
817 PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String( | |
818 PyObject *unicode, | |
819 const char *errors); | |
820 | |
821 Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( | |
822 const Py_UNICODE *data, /* Unicode char buffer */ | |
823 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ | |
824 const char *errors /* error handling */ | |
825 ); | |
826 | |
827 /* --- UTF-32 Codecs ------------------------------------------------------ */ | |
828 | |
829 Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( | |
830 const Py_UNICODE *data, /* Unicode char buffer */ | |
831 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ | |
832 const char *errors, /* error handling */ | |
833 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ | |
834 ); | |
835 | |
836 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32( | |
837 PyObject *object, /* Unicode object */ | |
838 const char *errors, /* error handling */ | |
839 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ | |
840 ); | |
841 | |
842 /* --- UTF-16 Codecs ------------------------------------------------------ */ | |
843 | |
844 /* Returns a Python string object holding the UTF-16 encoded value of | |
845 the Unicode data. | |
846 | |
847 The output is written according to the value of byteorder, as | |
848 follows: | |
849 | |
850 byteorder == -1: little endian | |
851 byteorder == 0: native byte order (writes a BOM mark) | |
852 byteorder == 1: big endian | |
853 | |
854 If byteorder is 0, the output string will always start with the | |
855 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is | |
856 prepended. | |
857 | |
858 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to | |
859 UCS-2. This trick makes it possible to add full UTF-16 capabilities | |
860 at a later point without compromising the APIs. | |
861 | |
862 */ | |
863 Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( | |
864 const Py_UNICODE *data, /* Unicode char buffer */ | |
865 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ | |
866 const char *errors, /* error handling */ | |
867 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ | |
868 ); | |
869 | |
870 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16( | |
871 PyObject* unicode, /* Unicode object */ | |
872 const char *errors, /* error handling */ | |
873 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ | |
874 ); | |
875 | |
876 /* --- Unicode-Escape Codecs ---------------------------------------------- */ | |
877 | |
878 /* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape | |
879 chars. */ | |
880 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape( | |
881 const char *string, /* Unicode-Escape encoded string */ | |
882 Py_ssize_t length, /* size of string */ | |
883 const char *errors, /* error handling */ | |
884 const char **first_invalid_escape /* on return, points to first | |
885 invalid escaped char in | |
886 string. */ | |
887 ); | |
888 | |
889 Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( | |
890 const Py_UNICODE *data, /* Unicode char buffer */ | |
891 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ | |
892 ); | |
893 | |
894 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ | |
895 | |
896 Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( | |
897 const Py_UNICODE *data, /* Unicode char buffer */ | |
898 Py_ssize_t length /* Number of Py_UNICODE chars to encode */ | |
899 ); | |
900 | |
901 /* --- Latin-1 Codecs ----------------------------------------------------- */ | |
902 | |
903 PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String( | |
904 PyObject* unicode, | |
905 const char* errors); | |
906 | |
907 Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( | |
908 const Py_UNICODE *data, /* Unicode char buffer */ | |
909 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ | |
910 const char *errors /* error handling */ | |
911 ); | |
912 | |
913 /* --- ASCII Codecs ------------------------------------------------------- */ | |
914 | |
915 PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString( | |
916 PyObject* unicode, | |
917 const char* errors); | |
918 | |
919 Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( | |
920 const Py_UNICODE *data, /* Unicode char buffer */ | |
921 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ | |
922 const char *errors /* error handling */ | |
923 ); | |
924 | |
925 /* --- Character Map Codecs ----------------------------------------------- */ | |
926 | |
927 Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( | |
928 const Py_UNICODE *data, /* Unicode char buffer */ | |
929 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ | |
930 PyObject *mapping, /* encoding mapping */ | |
931 const char *errors /* error handling */ | |
932 ); | |
933 | |
934 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap( | |
935 PyObject *unicode, /* Unicode object */ | |
936 PyObject *mapping, /* encoding mapping */ | |
937 const char *errors /* error handling */ | |
938 ); | |
939 | |
940 /* Translate a Py_UNICODE buffer of the given length by applying a | |
941 character mapping table to it and return the resulting Unicode | |
942 object. | |
943 | |
944 The mapping table must map Unicode ordinal integers to Unicode strings, | |
945 Unicode ordinal integers or None (causing deletion of the character). | |
946 | |
947 Mapping tables may be dictionaries or sequences. Unmapped character | |
948 ordinals (ones which cause a LookupError) are left untouched and | |
949 are copied as-is. | |
950 | |
951 */ | |
952 Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( | |
953 const Py_UNICODE *data, /* Unicode char buffer */ | |
954 Py_ssize_t length, /* Number of Py_UNICODE chars to translate */ | |
955 PyObject *table, /* Translate table */ | |
956 const char *errors /* error handling */ | |
957 ); | |
958 | |
959 /* --- MBCS codecs for Windows -------------------------------------------- */ | |
960 | |
961 #ifdef MS_WINDOWS | |
962 Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( | |
963 const Py_UNICODE *data, /* Unicode char buffer */ | |
964 Py_ssize_t length, /* number of Py_UNICODE chars to encode */ | |
965 const char *errors /* error handling */ | |
966 ); | |
967 #endif | |
968 | |
969 /* --- Decimal Encoder ---------------------------------------------------- */ | |
970 | |
971 /* Takes a Unicode string holding a decimal value and writes it into | |
972 an output buffer using standard ASCII digit codes. | |
973 | |
974 The output buffer has to provide at least length+1 bytes of storage | |
975 area. The output string is 0-terminated. | |
976 | |
977 The encoder converts whitespace to ' ', decimal characters to their | |
978 corresponding ASCII digit and all other Latin-1 characters except | |
979 \0 as-is. Characters outside this range (Unicode ordinals 1-255) | |
980 are treated as errors. This includes embedded NUL characters. | |
981 | |
982 Error handling is defined by the errors argument: | |
983 | |
984 NULL or "strict": raise a ValueError | |
985 "ignore": ignore the wrong characters (these are not copied to the | |
986 output buffer) | |
987 "replace": replaces illegal characters with '?' | |
988 | |
989 Returns 0 on success, -1 on failure. | |
990 | |
991 */ | |
992 | |
993 /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(int) PyUnicode_EncodeDecimal( | |
994 Py_UNICODE *s, /* Unicode buffer */ | |
995 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ | |
996 char *output, /* Output buffer; must have size >= length+1 */ | |
997 const char *errors /* error handling */ | |
998 ); | |
999 | |
1000 /* Transforms code points that have decimal digit property to the | |
1001 corresponding ASCII digit code points. | |
1002 | |
1003 Returns a new Unicode string on success, NULL on failure. | |
1004 */ | |
1005 | |
1006 /* Py_DEPRECATED(3.3) */ | |
1007 PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII( | |
1008 Py_UNICODE *s, /* Unicode buffer */ | |
1009 Py_ssize_t length /* Number of Py_UNICODE chars to transform */ | |
1010 ); | |
1011 | |
1012 /* Converts a Unicode object holding a decimal value to an ASCII string | |
1013 for use in int, float and complex parsers. | |
1014 Transforms code points that have decimal digit property to the | |
1015 corresponding ASCII digit code points. Transforms spaces to ASCII. | |
1016 Transforms code points starting from the first non-ASCII code point that | |
1017 is neither a decimal digit nor a space to the end into '?'. */ | |
1018 | |
1019 PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( | |
1020 PyObject *unicode /* Unicode object */ | |
1021 ); | |
1022 | |
1023 /* --- Methods & Slots ---------------------------------------------------- */ | |
1024 | |
1025 PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray( | |
1026 PyObject *separator, | |
1027 PyObject *const *items, | |
1028 Py_ssize_t seqlen | |
1029 ); | |
1030 | |
1031 /* Test whether a unicode object is equal to an ASCII identifier. Return 1 | |
1032 if true, 0 otherwise. The right argument must be an ASCII identifier. | |
1033 Any error that occurs inside will be cleared before returning. */ | |
1034 PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId( | |
1035 PyObject *left, /* Left string */ | |
1036 _Py_Identifier *right /* Right identifier */ | |
1037 ); | |
1038 | |
1039 /* Test whether a unicode object is equal to an ASCII string. Return 1 | |
1040 if true, 0 otherwise. The right argument must be an ASCII-encoded string. | |
1041 Any error that occurs inside will be cleared before returning. */ | |
1042 PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString( | |
1043 PyObject *left, | |
1044 const char *right /* ASCII-encoded string */ | |
1045 ); | |
1046 | |
1047 /* Externally visible for str.strip(unicode) */ | |
1048 PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( | |
1049 PyObject *self, | |
1050 int striptype, | |
1051 PyObject *sepobj | |
1052 ); | |
1053 | |
1054 /* Using explicit passed-in values, insert the thousands grouping | |
1055 into the string pointed to by buffer. For the argument descriptions, | |
1056 see Objects/stringlib/localeutil.h */ | |
1057 PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping( | |
1058 _PyUnicodeWriter *writer, | |
1059 Py_ssize_t n_buffer, | |
1060 PyObject *digits, | |
1061 Py_ssize_t d_pos, | |
1062 Py_ssize_t n_digits, | |
1063 Py_ssize_t min_width, | |
1064 const char *grouping, | |
1065 PyObject *thousands_sep, | |
1066 Py_UCS4 *maxchar); | |
1067 | |
1068 /* === Characters Type APIs =============================================== */ | |
1069 | |
1070 /* Helper array used by Py_UNICODE_ISSPACE(). */ | |
1071 | |
1072 PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; | |
1073 | |
1074 /* These should not be used directly. Use the Py_UNICODE_IS* and | |
1075 Py_UNICODE_TO* macros instead. | |
1076 | |
1077 These APIs are implemented in Objects/unicodectype.c. | |
1078 | |
1079 */ | |
1080 | |
1081 PyAPI_FUNC(int) _PyUnicode_IsLowercase( | |
1082 Py_UCS4 ch /* Unicode character */ | |
1083 ); | |
1084 | |
1085 PyAPI_FUNC(int) _PyUnicode_IsUppercase( | |
1086 Py_UCS4 ch /* Unicode character */ | |
1087 ); | |
1088 | |
1089 PyAPI_FUNC(int) _PyUnicode_IsTitlecase( | |
1090 Py_UCS4 ch /* Unicode character */ | |
1091 ); | |
1092 | |
1093 PyAPI_FUNC(int) _PyUnicode_IsXidStart( | |
1094 Py_UCS4 ch /* Unicode character */ | |
1095 ); | |
1096 | |
1097 PyAPI_FUNC(int) _PyUnicode_IsXidContinue( | |
1098 Py_UCS4 ch /* Unicode character */ | |
1099 ); | |
1100 | |
1101 PyAPI_FUNC(int) _PyUnicode_IsWhitespace( | |
1102 const Py_UCS4 ch /* Unicode character */ | |
1103 ); | |
1104 | |
1105 PyAPI_FUNC(int) _PyUnicode_IsLinebreak( | |
1106 const Py_UCS4 ch /* Unicode character */ | |
1107 ); | |
1108 | |
1109 /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( | |
1110 Py_UCS4 ch /* Unicode character */ | |
1111 ); | |
1112 | |
1113 /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( | |
1114 Py_UCS4 ch /* Unicode character */ | |
1115 ); | |
1116 | |
1117 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( | |
1118 Py_UCS4 ch /* Unicode character */ | |
1119 ); | |
1120 | |
1121 PyAPI_FUNC(int) _PyUnicode_ToLowerFull( | |
1122 Py_UCS4 ch, /* Unicode character */ | |
1123 Py_UCS4 *res | |
1124 ); | |
1125 | |
1126 PyAPI_FUNC(int) _PyUnicode_ToTitleFull( | |
1127 Py_UCS4 ch, /* Unicode character */ | |
1128 Py_UCS4 *res | |
1129 ); | |
1130 | |
1131 PyAPI_FUNC(int) _PyUnicode_ToUpperFull( | |
1132 Py_UCS4 ch, /* Unicode character */ | |
1133 Py_UCS4 *res | |
1134 ); | |
1135 | |
1136 PyAPI_FUNC(int) _PyUnicode_ToFoldedFull( | |
1137 Py_UCS4 ch, /* Unicode character */ | |
1138 Py_UCS4 *res | |
1139 ); | |
1140 | |
1141 PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable( | |
1142 Py_UCS4 ch /* Unicode character */ | |
1143 ); | |
1144 | |
1145 PyAPI_FUNC(int) _PyUnicode_IsCased( | |
1146 Py_UCS4 ch /* Unicode character */ | |
1147 ); | |
1148 | |
1149 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( | |
1150 Py_UCS4 ch /* Unicode character */ | |
1151 ); | |
1152 | |
1153 PyAPI_FUNC(int) _PyUnicode_ToDigit( | |
1154 Py_UCS4 ch /* Unicode character */ | |
1155 ); | |
1156 | |
1157 PyAPI_FUNC(double) _PyUnicode_ToNumeric( | |
1158 Py_UCS4 ch /* Unicode character */ | |
1159 ); | |
1160 | |
1161 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( | |
1162 Py_UCS4 ch /* Unicode character */ | |
1163 ); | |
1164 | |
1165 PyAPI_FUNC(int) _PyUnicode_IsDigit( | |
1166 Py_UCS4 ch /* Unicode character */ | |
1167 ); | |
1168 | |
1169 PyAPI_FUNC(int) _PyUnicode_IsNumeric( | |
1170 Py_UCS4 ch /* Unicode character */ | |
1171 ); | |
1172 | |
1173 PyAPI_FUNC(int) _PyUnicode_IsPrintable( | |
1174 Py_UCS4 ch /* Unicode character */ | |
1175 ); | |
1176 | |
1177 PyAPI_FUNC(int) _PyUnicode_IsAlpha( | |
1178 Py_UCS4 ch /* Unicode character */ | |
1179 ); | |
1180 | |
1181 Py_DEPRECATED(3.3) PyAPI_FUNC(size_t) Py_UNICODE_strlen( | |
1182 const Py_UNICODE *u | |
1183 ); | |
1184 | |
1185 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( | |
1186 Py_UNICODE *s1, | |
1187 const Py_UNICODE *s2); | |
1188 | |
1189 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat( | |
1190 Py_UNICODE *s1, const Py_UNICODE *s2); | |
1191 | |
1192 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( | |
1193 Py_UNICODE *s1, | |
1194 const Py_UNICODE *s2, | |
1195 size_t n); | |
1196 | |
1197 Py_DEPRECATED(3.3) PyAPI_FUNC(int) Py_UNICODE_strcmp( | |
1198 const Py_UNICODE *s1, | |
1199 const Py_UNICODE *s2 | |
1200 ); | |
1201 | |
1202 Py_DEPRECATED(3.3) PyAPI_FUNC(int) Py_UNICODE_strncmp( | |
1203 const Py_UNICODE *s1, | |
1204 const Py_UNICODE *s2, | |
1205 size_t n | |
1206 ); | |
1207 | |
1208 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( | |
1209 const Py_UNICODE *s, | |
1210 Py_UNICODE c | |
1211 ); | |
1212 | |
1213 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr( | |
1214 const Py_UNICODE *s, | |
1215 Py_UNICODE c | |
1216 ); | |
1217 | |
1218 PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int); | |
1219 | |
1220 /* Create a copy of a unicode string ending with a nul character. Return NULL | |
1221 and raise a MemoryError exception on memory allocation failure, otherwise | |
1222 return a newly allocated buffer (use PyMem_Free() to free the buffer). */ | |
1223 | |
1224 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy( | |
1225 PyObject *unicode | |
1226 ); | |
1227 | |
1228 /* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/ | |
1229 PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*); | |
1230 /* Clear all static strings. */ | |
1231 PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void); | |
1232 | |
1233 /* Fast equality check when the inputs are known to be exact unicode types | |
1234 and where the hash values are equal (i.e. a very probable match) */ | |
1235 PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *); | |
1236 | |
1237 #ifdef __cplusplus | |
1238 } | |
1239 #endif |