comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/python3.8/unicodeobject.h @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 #ifndef Py_UNICODEOBJECT_H
2 #define Py_UNICODEOBJECT_H
3
4 #include <stdarg.h>
5
6 /*
7
8 Unicode implementation based on original code by Fredrik Lundh,
9 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10 Unicode Integration Proposal. (See
11 http://www.egenix.com/files/python/unicode-proposal.txt).
12
13 Copyright (c) Corporation for National Research Initiatives.
14
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python. This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
32 *
33 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
35 *
36 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
39 *
40 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
48 *
49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
58 #include <ctype.h>
59
60 /* === Internal API ======================================================= */
61
62 /* --- Internal Unicode Format -------------------------------------------- */
63
64 /* Python 3.x requires unicode */
65 #define Py_USING_UNICODE
66
67 #ifndef SIZEOF_WCHAR_T
68 #error Must define SIZEOF_WCHAR_T
69 #endif
70
71 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74 Otherwise, Unicode strings are stored as UCS-2 (with limited support
75 for UTF-16) */
76
77 #if Py_UNICODE_SIZE >= 4
78 #define Py_UNICODE_WIDE
79 #endif
80
81 /* Set these flags if the platform has "wchar.h" and the
82 wchar_t type is a 16-bit unsigned type */
83 /* #define HAVE_WCHAR_H */
84 /* #define HAVE_USABLE_WCHAR_T */
85
86 /* If the compiler provides a wchar_t type we try to support it
87 through the interface functions PyUnicode_FromWideChar(),
88 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
89
90 #ifdef HAVE_USABLE_WCHAR_T
91 # ifndef HAVE_WCHAR_H
92 # define HAVE_WCHAR_H
93 # endif
94 #endif
95
96 #ifdef HAVE_WCHAR_H
97 # include <wchar.h>
98 #endif
99
100 /* Py_UCS4 and Py_UCS2 are typedefs for the respective
101 unicode representations. */
102 typedef uint32_t Py_UCS4;
103 typedef uint16_t Py_UCS2;
104 typedef uint8_t Py_UCS1;
105
106 #ifdef __cplusplus
107 extern "C" {
108 #endif
109
110
111 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
112 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
113
114 #define PyUnicode_Check(op) \
115 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) /* type-flag test on Py_TYPE(op); per the C-API docs this accepts str and its subtypes */
116 #define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) /* pointer comparison against PyUnicode_Type: matches the exact type only */
117
118 /* --- Constants ---------------------------------------------------------- */
119
120 /* This Unicode character will be used as replacement character during
121 decoding if the errors argument is set to "replace". Note: the
122 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
123 Unicode 3.0. */
124
125 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
126
127 /* === Public API ========================================================= */
128
129 /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
130 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
131 const char *u, /* UTF-8 encoded string */
132 Py_ssize_t size /* size of buffer */
133 );
134
135 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
136 UTF-8 encoded bytes. The size is determined with strlen(). */
137 PyAPI_FUNC(PyObject*) PyUnicode_FromString(
138 const char *u /* UTF-8 encoded string */
139 );
140
141 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
142 PyAPI_FUNC(PyObject*) PyUnicode_Substring(
143 PyObject *str,
144 Py_ssize_t start,
145 Py_ssize_t end);
146 #endif
147
148 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
149 /* Copy the string into a UCS4 buffer including the null character if copy_null
150 is set. Return NULL and raise an exception on error. Raise a SystemError if
151 the buffer is smaller than the string. Return buffer on success.
152
153 buflen is the length of the buffer in (Py_UCS4) characters. */
154 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
155 PyObject *unicode,
156 Py_UCS4* buffer,
157 Py_ssize_t buflen,
158 int copy_null);
159
160 /* Copy the string into a UCS4 buffer. A new buffer is allocated using
161 PyMem_Malloc; if this fails, NULL is returned with a memory error
162 exception set. */
163 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
164 #endif
165
166 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
167 /* Get the length of the Unicode object. */
168
169 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
170 PyObject *unicode
171 );
172 #endif
173
174 /* Get the number of Py_UNICODE units in the
175 string representation. */
176
177 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
178 PyObject *unicode /* Unicode object */
179 );
180
181 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
182 /* Read a character from the string. */
183
184 PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
185 PyObject *unicode,
186 Py_ssize_t index
187 );
188
189 /* Write a character to the string. The string must have been created through
190 PyUnicode_New, must not be shared, and must not have been hashed yet.
191
192 Return 0 on success, -1 on error. */
193
194 PyAPI_FUNC(int) PyUnicode_WriteChar(
195 PyObject *unicode,
196 Py_ssize_t index,
197 Py_UCS4 character
198 );
199 #endif
200
201 /* Resize a Unicode object. The length is the number of characters, except
202 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
203 is the number of Py_UNICODE characters.
204
205 *unicode is modified to point to the new (resized) object and 0
206 returned on success.
207
208 Try to resize the string in place (which is usually faster than allocating
209 a new string and copy characters), or create a new string.
210
211 Error handling is implemented as follows: an exception is set, -1
212 is returned and *unicode left untouched.
213
214 WARNING: The function doesn't check string content, the result may not be a
215 string in canonical representation. */
216
217 PyAPI_FUNC(int) PyUnicode_Resize(
218 PyObject **unicode, /* Pointer to the Unicode object */
219 Py_ssize_t length /* New length */
220 );
221
222 /* Decode obj to a Unicode object.
223
224 bytes, bytearray and other bytes-like objects are decoded according to the
225 given encoding and error handler. The encoding and error handler can be
226 NULL to have the interface use UTF-8 and "strict".
227
228 All other objects (including Unicode objects) raise an exception.
229
230 The API returns NULL in case of an error. The caller is responsible
231 for decref'ing the returned objects.
232
233 */
234
235 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
236 PyObject *obj, /* Object */
237 const char *encoding, /* encoding */
238 const char *errors /* error handling */
239 );
240
241 /* Copy an instance of a Unicode subtype to a new true Unicode object if
242 necessary. If obj is already a true Unicode object (not a subtype), return
243 the reference with *incremented* refcount.
244
245 The API returns NULL in case of an error. The caller is responsible
246 for decref'ing the returned objects.
247
248 */
249
250 PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
251 PyObject *obj /* Object */
252 );
253
254 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
255 const char *format, /* ASCII-encoded string */
256 va_list vargs
257 );
258 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
259 const char *format, /* ASCII-encoded string */
260 ...
261 );
262
263 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
264 PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
265 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
266 const char *u /* UTF-8 encoded string */
267 );
268
269 /* Use only if you know it's a string */
270 #define PyUnicode_CHECK_INTERNED(op) \
271 (((PyASCIIObject *)(op))->state.interned)
272
273 /* --- wchar_t support for platforms which support it --------------------- */
274
275 #ifdef HAVE_WCHAR_H
276
277 /* Create a Unicode Object from the wchar_t buffer w of the given
278 size.
279
280 The buffer is copied into the new object. */
281
282 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
283 const wchar_t *w, /* wchar_t buffer */
284 Py_ssize_t size /* size of buffer */
285 );
286
287 /* Copies the Unicode Object contents into the wchar_t buffer w. At
288 most size wchar_t characters are copied.
289
290 Note that the resulting wchar_t string may or may not be
291 0-terminated. It is the responsibility of the caller to make sure
292 that the wchar_t string is 0-terminated in case this is required by
293 the application.
294
295 Returns the number of wchar_t characters copied (excluding a
296 possibly trailing 0-termination character) or -1 in case of an
297 error. */
298
299 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
300 PyObject *unicode, /* Unicode object */
301 wchar_t *w, /* wchar_t buffer */
302 Py_ssize_t size /* size of buffer */
303 );
304
305 /* Convert the Unicode object to a wide character string. The output string
306 always ends with a nul character. If size is not NULL, write the number of
307 wide characters (excluding the null character) into *size.
308
309 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
310 on success. On error, returns NULL, *size is undefined and raises a
311 MemoryError. */
312
313 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
314 PyObject *unicode, /* Unicode object */
315 Py_ssize_t *size /* number of characters of the result */
316 );
317
318 #endif
319
320 /* --- Unicode ordinals --------------------------------------------------- */
321
322 /* Create a Unicode Object from the given Unicode code point ordinal.
323
324 The ordinal must be in range(0x110000). A ValueError is
325 raised in case it is not.
326
327 */
328
329 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
330
331 /* --- Free-list management ----------------------------------------------- */
332
333 /* Clear the free list used by the Unicode implementation.
334
335 This can be used to release memory used for objects on the free
336 list back to the Python memory allocator.
337
338 */
339
340 PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
341
342 /* === Builtin Codecs =====================================================
343
344 Many of these APIs take two arguments encoding and errors. These
345 parameters encoding and errors have the same semantics as the ones
346 of the builtin str() API.
347
348 Setting encoding to NULL causes the default encoding (UTF-8) to be used.
349
350 Error handling is set by errors which may also be set to NULL
351 meaning to use the default handling defined for the codec. Default
352 error handling for all builtin codecs is "strict" (ValueErrors are
353 raised).
354
355 The codecs all use a similar interface. Only deviation from the
356 generic ones are documented.
357
358 */
359
360 /* --- Manage the default encoding ---------------------------------------- */
361
362 /* Returns "utf-8". */
363 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
364
365 /* --- Generic Codecs ----------------------------------------------------- */
366
367 /* Create a Unicode object by decoding the encoded string s of the
368 given size. */
369
370 PyAPI_FUNC(PyObject*) PyUnicode_Decode(
371 const char *s, /* encoded string */
372 Py_ssize_t size, /* size of buffer */
373 const char *encoding, /* encoding */
374 const char *errors /* error handling */
375 );
376
377 /* Decode a Unicode object unicode and return the result as Python
378 object.
379
380 This API is DEPRECATED. The only supported standard encoding is rot13.
381 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
382 that decode from str. */
383
384 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
385 PyObject *unicode, /* Unicode object */
386 const char *encoding, /* encoding */
387 const char *errors /* error handling */
388 );
389
390 /* Decode a Unicode object unicode and return the result as Unicode
391 object.
392
393 This API is DEPRECATED. The only supported standard encoding is rot13.
394 Use PyCodec_Decode() to decode with rot13 and non-standard codecs
395 that decode from str to str. */
396
397 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
398 PyObject *unicode, /* Unicode object */
399 const char *encoding, /* encoding */
400 const char *errors /* error handling */
401 );
402
403 /* Encodes a Unicode object and returns the result as Python
404 object.
405
406 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString()
407 since all standard encodings (except rot13) encode str to bytes.
408 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
409 that encode from str to non-bytes. */
410
411 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
412 PyObject *unicode, /* Unicode object */
413 const char *encoding, /* encoding */
414 const char *errors /* error handling */
415 );
416
417 /* Encodes a Unicode object and returns the result as Python bytes
418 object. */
419
420 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
421 PyObject *unicode, /* Unicode object */
422 const char *encoding, /* encoding */
423 const char *errors /* error handling */
424 );
425
426 /* Encodes a Unicode object and returns the result as Unicode
427 object.
428
429 This API is DEPRECATED. The only supported standard encoding is rot13.
430 Use PyCodec_Encode() to encode with rot13 and non-standard codecs
431 that encode from str to str. */
432
433 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
434 PyObject *unicode, /* Unicode object */
435 const char *encoding, /* encoding */
436 const char *errors /* error handling */
437 );
438
439 /* Build an encoding map. */
440
441 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
442 PyObject* string /* 256 character map */
443 );
444
445 /* --- UTF-7 Codecs ------------------------------------------------------- */
446
447 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
448 const char *string, /* UTF-7 encoded string */
449 Py_ssize_t length, /* size of string */
450 const char *errors /* error handling */
451 );
452
453 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
454 const char *string, /* UTF-7 encoded string */
455 Py_ssize_t length, /* size of string */
456 const char *errors, /* error handling */
457 Py_ssize_t *consumed /* bytes consumed */
458 );
459
460 /* --- UTF-8 Codecs ------------------------------------------------------- */
461
462 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
463 const char *string, /* UTF-8 encoded string */
464 Py_ssize_t length, /* size of string */
465 const char *errors /* error handling */
466 );
467
468 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
469 const char *string, /* UTF-8 encoded string */
470 Py_ssize_t length, /* size of string */
471 const char *errors, /* error handling */
472 Py_ssize_t *consumed /* bytes consumed */
473 );
474
475 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
476 PyObject *unicode /* Unicode object */
477 );
478
479 /* --- UTF-32 Codecs ------------------------------------------------------ */
480
481 /* Decodes length bytes from a UTF-32 encoded buffer string and returns
482 the corresponding Unicode object.
483
484 errors (if non-NULL) defines the error handling. It defaults
485 to "strict".
486
487 If byteorder is non-NULL, the decoder starts decoding using the
488 given byte order:
489
490 *byteorder == -1: little endian
491 *byteorder == 0: native order
492 *byteorder == 1: big endian
493
494 In native mode, the first four bytes of the stream are checked for a
495 BOM mark. If found, the BOM mark is analysed, the byte order
496 adjusted and the BOM skipped. In the other modes, no BOM mark
497 interpretation is done. After completion, *byteorder is set to the
498 current byte order at the end of input data.
499
500 If byteorder is NULL, the codec starts in native order mode.
501
502 */
503
504 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
505 const char *string, /* UTF-32 encoded string */
506 Py_ssize_t length, /* size of string */
507 const char *errors, /* error handling */
508 int *byteorder /* pointer to byteorder to use
509 0=native;-1=LE,1=BE; updated on
510 exit */
511 );
512
513 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
514 const char *string, /* UTF-32 encoded string */
515 Py_ssize_t length, /* size of string */
516 const char *errors, /* error handling */
517 int *byteorder, /* pointer to byteorder to use
518 0=native;-1=LE,1=BE; updated on
519 exit */
520 Py_ssize_t *consumed /* bytes consumed */
521 );
522
523 /* Returns a Python bytes object using the UTF-32 encoding in native byte
524 order. The result always starts with a BOM mark. */
525
526 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
527 PyObject *unicode /* Unicode object */
528 );
529
530 /* Returns a Python string object holding the UTF-32 encoded value of
531 the Unicode data.
532
533 If byteorder is not 0, output is written according to the following
534 byte order:
535
536 byteorder == -1: little endian
537 byteorder == 0: native byte order (writes a BOM mark)
538 byteorder == 1: big endian
539
540 If byteorder is 0, the output string will always start with the
541 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
542 prepended.
543
544 */
545
546 /* --- UTF-16 Codecs ------------------------------------------------------ */
547
548 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
549 the corresponding Unicode object.
550
551 errors (if non-NULL) defines the error handling. It defaults
552 to "strict".
553
554 If byteorder is non-NULL, the decoder starts decoding using the
555 given byte order:
556
557 *byteorder == -1: little endian
558 *byteorder == 0: native order
559 *byteorder == 1: big endian
560
561 In native mode, the first two bytes of the stream are checked for a
562 BOM mark. If found, the BOM mark is analysed, the byte order
563 adjusted and the BOM skipped. In the other modes, no BOM mark
564 interpretation is done. After completion, *byteorder is set to the
565 current byte order at the end of input data.
566
567 If byteorder is NULL, the codec starts in native order mode.
568
569 */
570
571 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
572 const char *string, /* UTF-16 encoded string */
573 Py_ssize_t length, /* size of string */
574 const char *errors, /* error handling */
575 int *byteorder /* pointer to byteorder to use
576 0=native;-1=LE,1=BE; updated on
577 exit */
578 );
579
580 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
581 const char *string, /* UTF-16 encoded string */
582 Py_ssize_t length, /* size of string */
583 const char *errors, /* error handling */
584 int *byteorder, /* pointer to byteorder to use
585 0=native;-1=LE,1=BE; updated on
586 exit */
587 Py_ssize_t *consumed /* bytes consumed */
588 );
589
590 /* Returns a Python bytes object using the UTF-16 encoding in native byte
591 order. The result always starts with a BOM mark. */
592
593 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
594 PyObject *unicode /* Unicode object */
595 );
596
597 /* --- Unicode-Escape Codecs ---------------------------------------------- */
598
599 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
600 const char *string, /* Unicode-Escape encoded string */
601 Py_ssize_t length, /* size of string */
602 const char *errors /* error handling */
603 );
604
605 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
606 PyObject *unicode /* Unicode object */
607 );
608
609 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
610
611 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
612 const char *string, /* Raw-Unicode-Escape encoded string */
613 Py_ssize_t length, /* size of string */
614 const char *errors /* error handling */
615 );
616
617 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
618 PyObject *unicode /* Unicode object */
619 );
620
621 /* --- Latin-1 Codecs -----------------------------------------------------
622
623 Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
624
625 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
626 const char *string, /* Latin-1 encoded string */
627 Py_ssize_t length, /* size of string */
628 const char *errors /* error handling */
629 );
630
631 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
632 PyObject *unicode /* Unicode object */
633 );
634
635 /* --- ASCII Codecs -------------------------------------------------------
636
637 Only 7-bit ASCII data is accepted. All other codes generate errors.
638
639 */
640
641 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
642 const char *string, /* ASCII encoded string */
643 Py_ssize_t length, /* size of string */
644 const char *errors /* error handling */
645 );
646
647 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
648 PyObject *unicode /* Unicode object */
649 );
650
651 /* --- Character Map Codecs -----------------------------------------------
652
653 This codec uses mappings to encode and decode characters.
654
655 Decoding mappings must map byte ordinals (integers in the range from 0 to
656 255) to Unicode strings, integers (which are then interpreted as Unicode
657 ordinals) or None. Unmapped data bytes (ones which cause a LookupError)
658 as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
659 mapping" and cause an error.
660
661 Encoding mappings must map Unicode ordinal integers to bytes objects,
662 integers in the range from 0 to 255 or None. Unmapped character
663 ordinals (ones which cause a LookupError) as well as mapped to
664 None are treated as "undefined mapping" and cause an error.
665
666 */
667
668 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
669 const char *string, /* Encoded string */
670 Py_ssize_t length, /* size of string */
671 PyObject *mapping, /* decoding mapping */
672 const char *errors /* error handling */
673 );
674
675 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
676 PyObject *unicode, /* Unicode object */
677 PyObject *mapping /* encoding mapping */
678 );
679
680 /* --- MBCS codecs for Windows -------------------------------------------- */
681
682 #ifdef MS_WINDOWS
683 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
684 const char *string, /* MBCS encoded string */
685 Py_ssize_t length, /* size of string */
686 const char *errors /* error handling */
687 );
688
689 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
690 const char *string, /* MBCS encoded string */
691 Py_ssize_t length, /* size of string */
692 const char *errors, /* error handling */
693 Py_ssize_t *consumed /* bytes consumed */
694 );
695
696 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
697 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
698 int code_page, /* code page number */
699 const char *string, /* encoded string */
700 Py_ssize_t length, /* size of string */
701 const char *errors, /* error handling */
702 Py_ssize_t *consumed /* bytes consumed */
703 );
704 #endif
705
706 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
707 PyObject *unicode /* Unicode object */
708 );
709
710 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
711 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
712 int code_page, /* code page number */
713 PyObject *unicode, /* Unicode object */
714 const char *errors /* error handling */
715 );
716 #endif
717
718 #endif /* MS_WINDOWS */
719
720 /* --- Locale encoding --------------------------------------------------- */
721
722 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
723 /* Decode a string from the current locale encoding. The decoder is strict if
724 *errors* is NULL or "strict"; if *errors* is "surrogateescape" it uses that
725 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
726 be decoded as a surrogate character and the 'surrogateescape' error handler
727 is selected, the byte sequence is escaped using that error handler
728 instead of being decoded. *str* must end with a null character but cannot
729 contain embedded null characters. */
730
731 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
732 const char *str,
733 Py_ssize_t len,
734 const char *errors);
735
736 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
737 length using strlen(). */
738
739 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
740 const char *str,
741 const char *errors);
742
743 /* Encode a Unicode object to the current locale encoding. The encoder is
744 strict if *errors* is NULL or "strict"; the only other supported value is
745 "surrogateescape" (PEP 383). Return a bytes object. The string
746 cannot contain embedded null characters. */
747
748 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
749 PyObject *unicode,
750 const char *errors
751 );
752 #endif
753
754 /* --- File system encoding ---------------------------------------------- */
755
756 /* ParseTuple converter: encode str objects to bytes using
757 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
758
759 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
760
761 /* ParseTuple converter: decode bytes objects to unicode using
762 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
763
764 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
765
766 /* Decode a null-terminated string using Py_FileSystemDefaultEncoding
767 and the "surrogateescape" error handler.
768
769 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
770 encoding.
771
772 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
773 */
774
775 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
776 const char *s /* encoded string */
777 );
778
779 /* Decode a string using Py_FileSystemDefaultEncoding
780 and the "surrogateescape" error handler.
781
782 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
783 encoding.
784 */
785
786 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
787 const char *s, /* encoded string */
788 Py_ssize_t size /* size */
789 );
790
791 /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
792 "surrogateescape" error handler, and return bytes.
793
794 If Py_FileSystemDefaultEncoding is not set, fall back to the locale
795 encoding.
796 */
797
798 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
799 PyObject *unicode
800 );
801
802 /* --- Methods & Slots ----------------------------------------------------
803
804 These are capable of handling Unicode objects and strings on input
805 (we refer to them as strings in the descriptions) and return
806 Unicode objects or integers as appropriate. */
807
808 /* Concat two strings giving a new Unicode string. */
809
810 PyAPI_FUNC(PyObject*) PyUnicode_Concat(
811 PyObject *left, /* Left string */
812 PyObject *right /* Right string */
813 );
814
815 /* Concat two strings and put the result in *pleft
816 (sets *pleft to NULL on error) */
817
818 PyAPI_FUNC(void) PyUnicode_Append(
819 PyObject **pleft, /* Pointer to left string */
820 PyObject *right /* Right string */
821 );
822
823 /* Concat two strings, put the result in *pleft and drop the right object
824 (sets *pleft to NULL on error) */
825
826 PyAPI_FUNC(void) PyUnicode_AppendAndDel(
827 PyObject **pleft, /* Pointer to left string */
828 PyObject *right /* Right string */
829 );
830
831 /* Split a string giving a list of Unicode strings.
832
833 If sep is NULL, splitting will be done at all whitespace
834 substrings. Otherwise, splits occur at the given separator.
835
836 At most maxsplit splits will be done. If negative, no limit is set.
837
838 Separators are not included in the resulting list.
839
840 */
841
842 PyAPI_FUNC(PyObject*) PyUnicode_Split(
843 PyObject *s, /* String to split */
844 PyObject *sep, /* String separator */
845 Py_ssize_t maxsplit /* Maxsplit count */
846 );
847
848 /* Split a string at line breaks, giving a list of Unicode strings.
849
850 CRLF is considered to be one line break. Line breaks are not
851 included in the resulting list unless keepends is true. */
852
853 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
854 PyObject *s, /* String to split */
855 int keepends /* If true, line end markers are included */
856 );
857
858 /* Partition a string using a given separator. */
859
860 PyAPI_FUNC(PyObject*) PyUnicode_Partition(
861 PyObject *s, /* String to partition */
862 PyObject *sep /* String separator */
863 );
864
865 /* Partition a string using a given separator, searching from the end of the
866 string. */
867
868 PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
869 PyObject *s, /* String to partition */
870 PyObject *sep /* String separator */
871 );
872
873 /* Split a string giving a list of Unicode strings.
874
875 If sep is NULL, splitting will be done at all whitespace
876 substrings. Otherwise, splits occur at the given separator.
877
878 At most maxsplit splits will be done. But unlike PyUnicode_Split
879 PyUnicode_RSplit splits from the end of the string. If negative,
880 no limit is set.
881
882 Separators are not included in the resulting list.
883
884 */
885
886 PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
887 PyObject *s, /* String to split */
888 PyObject *sep, /* String separator */
889 Py_ssize_t maxsplit /* Maxsplit count */
890 );
891
892 /* Translate a string by applying a character mapping table to it and
893 return the resulting Unicode object.
894
895 The mapping table must map Unicode ordinal integers to Unicode strings,
896 Unicode ordinal integers or None (causing deletion of the character).
897
898 Mapping tables may be dictionaries or sequences. Unmapped character
899 ordinals (ones which cause a LookupError) are left untouched and
900 are copied as-is.
901
902 */
903
904 PyAPI_FUNC(PyObject *) PyUnicode_Translate(
905 PyObject *str, /* String */
906 PyObject *table, /* Translate table */
907 const char *errors /* error handling */
908 );
909
910 /* Join a sequence of strings using the given separator and return
911 the resulting Unicode string. */
912
913 PyAPI_FUNC(PyObject*) PyUnicode_Join(
914 PyObject *separator, /* Separator string */
915 PyObject *seq /* Sequence object */
916 );
917
918 /* Return 1 if substr matches str[start:end] at the tail end selected by
919 direction (-1: match as prefix, +1: match as suffix), 0 otherwise. */
920
921 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
922 PyObject *str, /* String */
923 PyObject *substr, /* Prefix or Suffix string */
924 Py_ssize_t start, /* Start index */
925 Py_ssize_t end, /* Stop index */
926 int direction /* Tail end: -1 prefix, +1 suffix */
927 );
928
929 /* Return the first position of substr in str[start:end] using the
930 given search direction, or -1 if not found. Return -2 if an error
931 occurred; an exception is set in that case. */
932
933 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
934 PyObject *str, /* String */
935 PyObject *substr, /* Substring to find */
936 Py_ssize_t start, /* Start index */
937 Py_ssize_t end, /* Stop index */
938 int direction /* Find direction: +1 forward, -1 backward */
939 );
940
941 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
942 /* Like PyUnicode_Find, but search for the single character ch only. */
943 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
944 PyObject *str, /* String */
945 Py_UCS4 ch, /* Character to find */
946 Py_ssize_t start, /* Start index */
947 Py_ssize_t end, /* Stop index */
948 int direction /* Find direction: +1 forward, -1 backward */
949 );
950 #endif
951
952 /* Return the number of occurrences of substr in str[start:end]. */
953
954 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
955 PyObject *str, /* String */
956 PyObject *substr, /* Substring to count */
957 Py_ssize_t start, /* Start index */
958 Py_ssize_t end /* Stop index */
959 );
960
961 /* Replace at most maxcount occurrences of substr in str with replstr
962 and return the resulting Unicode object. */
963
964 PyAPI_FUNC(PyObject *) PyUnicode_Replace(
965 PyObject *str, /* String */
966 PyObject *substr, /* Substring to find */
967 PyObject *replstr, /* Replacement string */
968 Py_ssize_t maxcount /* Max. number of replacements to apply;
969 -1 = all */
970 );
971
972 /* Compare two strings and return -1, 0, 1 for less than, equal, and
973 greater than, respectively. Raise an exception and return -1 on error
974 (-1 is thus ambiguous: check whether an exception is set). */
975
976 PyAPI_FUNC(int) PyUnicode_Compare(
977 PyObject *left, /* Left string */
978 PyObject *right /* Right string */
979 );
980
981 /* Compare a Unicode object with a C string and return -1, 0, 1 for less
982 than, equal, and greater than, respectively. It is best to pass only
983 ASCII-encoded strings, but the function interprets the input string as
984 ISO-8859-1 if it contains non-ASCII characters.
985 This function does not raise exceptions. */
986
987 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
988 PyObject *left, /* Unicode object */
989 const char *right /* ASCII-encoded string */
990 );
991
992 /* Rich compare two strings using operation op and return one of:
993
994 - NULL in case an exception was raised
995 - Py_True or Py_False for successful comparisons
996 - Py_NotImplemented in case the type combination is unknown
997
998 Possible values for op:
999
1000 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1001
1002 */
1003
1004 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1005 PyObject *left, /* Left string */
1006 PyObject *right, /* Right string */
1007 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1008 );
1009
1010 /* Apply the argument tuple or dictionary args to the format string
1011 format and return the resulting Unicode string. */
1012
1013 PyAPI_FUNC(PyObject *) PyUnicode_Format(
1014 PyObject *format, /* Format string */
1015 PyObject *args /* Argument tuple or dictionary */
1016 );
1017
1018 /* Check whether element is contained in container and return 1/0
1019 accordingly.
1020
1021 element has to coerce to a one element Unicode string. -1 is
1022 returned in case of an error. */
1023
1024 PyAPI_FUNC(int) PyUnicode_Contains(
1025 PyObject *container, /* Container string */
1026 PyObject *element /* Element string */
1027 );
1028
1029 /* Check whether the string s is a valid identifier. */
1030
1031 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1032
1033 /* === Characters Type APIs =============================================== */
1034
1035 #ifndef Py_LIMITED_API
1036 # define Py_CPYTHON_UNICODEOBJECT_H
1037 # include "cpython/unicodeobject.h"
1038 # undef Py_CPYTHON_UNICODEOBJECT_H
1039 #endif
1040
1041 #ifdef __cplusplus
1042 }
1043 #endif
1044 #endif /* !Py_UNICODEOBJECT_H */