jpayne@69: #ifndef Py_CPYTHON_UNICODEOBJECT_H jpayne@69: # error "this header file must not be included directly" jpayne@69: #endif jpayne@69: jpayne@69: #ifdef __cplusplus jpayne@69: extern "C" { jpayne@69: #endif jpayne@69: jpayne@69: /* Py_UNICODE was the native Unicode storage format (code unit) used by jpayne@69: Python and represents a single Unicode element in the Unicode type. jpayne@69: With PEP 393, Py_UNICODE is deprecated and replaced with a jpayne@69: typedef to wchar_t. */ jpayne@69: #define PY_UNICODE_TYPE wchar_t jpayne@69: /* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE; jpayne@69: jpayne@69: /* --- Internal Unicode Operations ---------------------------------------- */ jpayne@69: jpayne@69: /* Since splitting on whitespace is an important use case, and jpayne@69: whitespace in most situations is solely ASCII whitespace, we jpayne@69: optimize for the common case by using a quick look-up table jpayne@69: _Py_ascii_whitespace (see below) with an inlined check. jpayne@69: jpayne@69: */ jpayne@69: #define Py_UNICODE_ISSPACE(ch) \ jpayne@69: ((ch) < 128U ? 
_Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch)) jpayne@69: jpayne@69: #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch) jpayne@69: #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch) jpayne@69: #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch) jpayne@69: #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch) jpayne@69: jpayne@69: #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch) jpayne@69: #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch) jpayne@69: #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch) jpayne@69: jpayne@69: #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch) jpayne@69: #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch) jpayne@69: #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch) jpayne@69: #define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch) jpayne@69: jpayne@69: #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch) jpayne@69: #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch) jpayne@69: #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch) jpayne@69: jpayne@69: #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) jpayne@69: jpayne@69: #define Py_UNICODE_ISALNUM(ch) \ jpayne@69: (Py_UNICODE_ISALPHA(ch) || \ jpayne@69: Py_UNICODE_ISDECIMAL(ch) || \ jpayne@69: Py_UNICODE_ISDIGIT(ch) || \ jpayne@69: Py_UNICODE_ISNUMERIC(ch)) jpayne@69: jpayne@69: #define Py_UNICODE_COPY(target, source, length) \ jpayne@69: memcpy((target), (source), (length)*sizeof(Py_UNICODE)) jpayne@69: jpayne@69: #define Py_UNICODE_FILL(target, value, length) \ jpayne@69: do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\ jpayne@69: for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\ jpayne@69: } while (0) jpayne@69: jpayne@69: /* macros to work with surrogates */ jpayne@69: #define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF) jpayne@69: #define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF) jpayne@69: #define 
Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF) jpayne@69: /* Join two surrogate characters and return a single Py_UCS4 value. */ jpayne@69: #define Py_UNICODE_JOIN_SURROGATES(high, low) \ jpayne@69: (((((Py_UCS4)(high) & 0x03FF) << 10) | \ jpayne@69: ((Py_UCS4)(low) & 0x03FF)) + 0x10000) jpayne@69: /* high surrogate = top 10 bits added to D800 */ jpayne@69: #define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10)) jpayne@69: /* low surrogate = bottom 10 bits added to DC00 */ jpayne@69: #define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF)) jpayne@69: jpayne@69: /* Check if substring matches at given offset. The offset must be jpayne@69: valid, and the substring must not be empty. */ jpayne@69: jpayne@69: #define Py_UNICODE_MATCH(string, offset, substring) \ jpayne@69: ((*((string)->wstr + (offset)) == *((substring)->wstr)) && \ jpayne@69: ((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \ jpayne@69: !memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE))) jpayne@69: jpayne@69: /* --- Unicode Type ------------------------------------------------------- */ jpayne@69: jpayne@69: /* ASCII-only strings created through PyUnicode_New use the PyASCIIObject jpayne@69: structure. state.ascii and state.compact are set, and the data jpayne@69: immediately follow the structure. utf8_length and wstr_length can be found jpayne@69: in the length field; the utf8 pointer is equal to the data pointer. 
*/ jpayne@69: typedef struct { jpayne@69: /* There are 4 forms of Unicode strings: jpayne@69: jpayne@69: - compact ascii: jpayne@69: jpayne@69: * structure = PyASCIIObject jpayne@69: * test: PyUnicode_IS_COMPACT_ASCII(op) jpayne@69: * kind = PyUnicode_1BYTE_KIND jpayne@69: * compact = 1 jpayne@69: * ascii = 1 jpayne@69: * ready = 1 jpayne@69: * (length is the length of the utf8 and wstr strings) jpayne@69: * (data starts just after the structure) jpayne@69: * (since ASCII is decoded from UTF-8, the utf8 string are the data) jpayne@69: jpayne@69: - compact: jpayne@69: jpayne@69: * structure = PyCompactUnicodeObject jpayne@69: * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op) jpayne@69: * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or jpayne@69: PyUnicode_4BYTE_KIND jpayne@69: * compact = 1 jpayne@69: * ready = 1 jpayne@69: * ascii = 0 jpayne@69: * utf8 is not shared with data jpayne@69: * utf8_length = 0 if utf8 is NULL jpayne@69: * wstr is shared with data and wstr_length=length jpayne@69: if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 jpayne@69: or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4 jpayne@69: * wstr_length = 0 if wstr is NULL jpayne@69: * (data starts just after the structure) jpayne@69: jpayne@69: - legacy string, not ready: jpayne@69: jpayne@69: * structure = PyUnicodeObject jpayne@69: * test: kind == PyUnicode_WCHAR_KIND jpayne@69: * length = 0 (use wstr_length) jpayne@69: * hash = -1 jpayne@69: * kind = PyUnicode_WCHAR_KIND jpayne@69: * compact = 0 jpayne@69: * ascii = 0 jpayne@69: * ready = 0 jpayne@69: * interned = SSTATE_NOT_INTERNED jpayne@69: * wstr is not NULL jpayne@69: * data.any is NULL jpayne@69: * utf8 is NULL jpayne@69: * utf8_length = 0 jpayne@69: jpayne@69: - legacy string, ready: jpayne@69: jpayne@69: * structure = PyUnicodeObject structure jpayne@69: * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND jpayne@69: * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or jpayne@69: 
PyUnicode_4BYTE_KIND jpayne@69: * compact = 0 jpayne@69: * ready = 1 jpayne@69: * data.any is not NULL jpayne@69: * utf8 is shared and utf8_length = length with data.any if ascii = 1 jpayne@69: * utf8_length = 0 if utf8 is NULL jpayne@69: * wstr is shared with data.any and wstr_length = length jpayne@69: if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2 jpayne@69: or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4 jpayne@69: * wstr_length = 0 if wstr is NULL jpayne@69: jpayne@69: Compact strings use only one memory block (structure + characters), jpayne@69: whereas legacy strings use one block for the structure and one block jpayne@69: for characters. jpayne@69: jpayne@69: Legacy strings are created by PyUnicode_FromUnicode() and jpayne@69: PyUnicode_FromStringAndSize(NULL, size) functions. They become ready jpayne@69: when PyUnicode_READY() is called. jpayne@69: jpayne@69: See also _PyUnicode_CheckConsistency(). jpayne@69: */ jpayne@69: PyObject_HEAD jpayne@69: Py_ssize_t length; /* Number of code points in the string */ jpayne@69: Py_hash_t hash; /* Hash value; -1 if not set */ jpayne@69: struct { jpayne@69: /* jpayne@69: SSTATE_NOT_INTERNED (0) jpayne@69: SSTATE_INTERNED_MORTAL (1) jpayne@69: SSTATE_INTERNED_IMMORTAL (2) jpayne@69: jpayne@69: If interned != SSTATE_NOT_INTERNED, the two references from the jpayne@69: dictionary to this object are *not* counted in ob_refcnt. 
jpayne@69: */ jpayne@69: unsigned int interned:2; jpayne@69: /* Character size: jpayne@69: jpayne@69: - PyUnicode_WCHAR_KIND (0): jpayne@69: jpayne@69: * character type = wchar_t (16 or 32 bits, depending on the jpayne@69: platform) jpayne@69: jpayne@69: - PyUnicode_1BYTE_KIND (1): jpayne@69: jpayne@69: * character type = Py_UCS1 (8 bits, unsigned) jpayne@69: * all characters are in the range U+0000-U+00FF (latin1) jpayne@69: * if ascii is set, all characters are in the range U+0000-U+007F jpayne@69: (ASCII), otherwise at least one character is in the range jpayne@69: U+0080-U+00FF jpayne@69: jpayne@69: - PyUnicode_2BYTE_KIND (2): jpayne@69: jpayne@69: * character type = Py_UCS2 (16 bits, unsigned) jpayne@69: * all characters are in the range U+0000-U+FFFF (BMP) jpayne@69: * at least one character is in the range U+0100-U+FFFF jpayne@69: jpayne@69: - PyUnicode_4BYTE_KIND (4): jpayne@69: jpayne@69: * character type = Py_UCS4 (32 bits, unsigned) jpayne@69: * all characters are in the range U+0000-U+10FFFF jpayne@69: * at least one character is in the range U+10000-U+10FFFF jpayne@69: */ jpayne@69: unsigned int kind:3; jpayne@69: /* Compact is with respect to the allocation scheme. Compact unicode jpayne@69: objects only require one memory block while non-compact objects use jpayne@69: one block for the PyUnicodeObject struct and another for its data jpayne@69: buffer. */ jpayne@69: unsigned int compact:1; jpayne@69: /* The string only contains characters in the range U+0000-U+007F (ASCII) jpayne@69: and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is jpayne@69: set, use the PyASCIIObject structure. */ jpayne@69: unsigned int ascii:1; jpayne@69: /* The ready flag indicates whether the object layout is initialized jpayne@69: completely. This means that this is either a compact object, or jpayne@69: the data pointer is filled out. The bit is redundant, and helps jpayne@69: to minimize the test in PyUnicode_IS_READY(). 
*/ jpayne@69: unsigned int ready:1; jpayne@69: /* Padding to ensure that PyUnicode_DATA() is always aligned to jpayne@69: 4 bytes (see issue #19537 on m68k). */ jpayne@69: unsigned int :24; jpayne@69: } state; jpayne@69: wchar_t *wstr; /* wchar_t representation (null-terminated) */ jpayne@69: } PyASCIIObject; jpayne@69: jpayne@69: /* Non-ASCII strings allocated through PyUnicode_New use the jpayne@69: PyCompactUnicodeObject structure. state.compact is set, and the data jpayne@69: immediately follow the structure. */ jpayne@69: typedef struct { jpayne@69: PyASCIIObject _base; jpayne@69: Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the jpayne@69: * terminating \0. */ jpayne@69: char *utf8; /* UTF-8 representation (null-terminated) */ jpayne@69: Py_ssize_t wstr_length; /* Number of code points in wstr, possible jpayne@69: * surrogates count as two code points. */ jpayne@69: } PyCompactUnicodeObject; jpayne@69: jpayne@69: /* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the jpayne@69: PyUnicodeObject structure. The actual string data is initially in the wstr jpayne@69: block, and copied into the data block using _PyUnicode_Ready. */ jpayne@69: typedef struct { jpayne@69: PyCompactUnicodeObject _base; jpayne@69: union { jpayne@69: void *any; jpayne@69: Py_UCS1 *latin1; jpayne@69: Py_UCS2 *ucs2; jpayne@69: Py_UCS4 *ucs4; jpayne@69: } data; /* Canonical, smallest-form Unicode buffer */ jpayne@69: } PyUnicodeObject; jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_CheckConsistency( jpayne@69: PyObject *op, jpayne@69: int check_content); jpayne@69: jpayne@69: /* Fast access macros */ jpayne@69: #define PyUnicode_WSTR_LENGTH(op) \ jpayne@69: (PyUnicode_IS_COMPACT_ASCII(op) ? \ jpayne@69: ((PyASCIIObject*)op)->length : \ jpayne@69: ((PyCompactUnicodeObject*)op)->wstr_length) jpayne@69: jpayne@69: /* Returns the deprecated Py_UNICODE representation's size in code units jpayne@69: (this includes surrogate pairs as 2 units). 
jpayne@69: If the Py_UNICODE representation is not available, it will be computed jpayne@69: on request. Use PyUnicode_GET_LENGTH() for the length in code points. */ jpayne@69: jpayne@69: /* Py_DEPRECATED(3.3) */ jpayne@69: #define PyUnicode_GET_SIZE(op) \ jpayne@69: (assert(PyUnicode_Check(op)), \ jpayne@69: (((PyASCIIObject *)(op))->wstr) ? \ jpayne@69: PyUnicode_WSTR_LENGTH(op) : \ jpayne@69: ((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),\ jpayne@69: assert(((PyASCIIObject *)(op))->wstr), \ jpayne@69: PyUnicode_WSTR_LENGTH(op))) jpayne@69: jpayne@69: /* Py_DEPRECATED(3.3) */ jpayne@69: #define PyUnicode_GET_DATA_SIZE(op) \ jpayne@69: (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE) jpayne@69: jpayne@69: /* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE jpayne@69: representation on demand. Using this macro is very inefficient now, jpayne@69: try to port your code to use the new PyUnicode_*BYTE_DATA() macros or jpayne@69: use PyUnicode_WRITE() and PyUnicode_READ(). */ jpayne@69: jpayne@69: /* Py_DEPRECATED(3.3) */ jpayne@69: #define PyUnicode_AS_UNICODE(op) \ jpayne@69: (assert(PyUnicode_Check(op)), \ jpayne@69: (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \ jpayne@69: PyUnicode_AsUnicode(_PyObject_CAST(op))) jpayne@69: jpayne@69: /* Py_DEPRECATED(3.3) */ jpayne@69: #define PyUnicode_AS_DATA(op) \ jpayne@69: ((const char *)(PyUnicode_AS_UNICODE(op))) jpayne@69: jpayne@69: jpayne@69: /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */ jpayne@69: jpayne@69: /* Values for PyASCIIObject.state: */ jpayne@69: jpayne@69: /* Interning state. */ jpayne@69: #define SSTATE_NOT_INTERNED 0 jpayne@69: #define SSTATE_INTERNED_MORTAL 1 jpayne@69: #define SSTATE_INTERNED_IMMORTAL 2 jpayne@69: jpayne@69: /* Return true if the string contains only ASCII characters, or 0 if not. The jpayne@69: string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be jpayne@69: ready. 
*/ jpayne@69: #define PyUnicode_IS_ASCII(op) \ jpayne@69: (assert(PyUnicode_Check(op)), \ jpayne@69: assert(PyUnicode_IS_READY(op)), \ jpayne@69: ((PyASCIIObject*)op)->state.ascii) jpayne@69: jpayne@69: /* Return true if the string is compact or 0 if not. jpayne@69: No type checks or Ready calls are performed. */ jpayne@69: #define PyUnicode_IS_COMPACT(op) \ jpayne@69: (((PyASCIIObject*)(op))->state.compact) jpayne@69: jpayne@69: /* Return true if the string is a compact ASCII string (use PyASCIIObject jpayne@69: structure), or 0 if not. No type checks or Ready calls are performed. */ jpayne@69: #define PyUnicode_IS_COMPACT_ASCII(op) \ jpayne@69: (((PyASCIIObject*)op)->state.ascii && PyUnicode_IS_COMPACT(op)) jpayne@69: jpayne@69: enum PyUnicode_Kind { jpayne@69: /* String contains only wstr byte characters. This is only possible jpayne@69: when the string was created with a legacy API and _PyUnicode_Ready() jpayne@69: has not been called yet. */ jpayne@69: PyUnicode_WCHAR_KIND = 0, jpayne@69: /* Return values of the PyUnicode_KIND() macro: */ jpayne@69: PyUnicode_1BYTE_KIND = 1, jpayne@69: PyUnicode_2BYTE_KIND = 2, jpayne@69: PyUnicode_4BYTE_KIND = 4 jpayne@69: }; jpayne@69: jpayne@69: /* Return pointers to the canonical representation cast to unsigned char, jpayne@69: Py_UCS2, or Py_UCS4 for direct character access. jpayne@69: No checks are performed, use PyUnicode_KIND() before to ensure jpayne@69: these will work correctly. */ jpayne@69: jpayne@69: #define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op)) jpayne@69: #define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op)) jpayne@69: #define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op)) jpayne@69: jpayne@69: /* Return one of the PyUnicode_*_KIND values defined above. 
*/ jpayne@69: #define PyUnicode_KIND(op) \ jpayne@69: (assert(PyUnicode_Check(op)), \ jpayne@69: assert(PyUnicode_IS_READY(op)), \ jpayne@69: ((PyASCIIObject *)(op))->state.kind) jpayne@69: jpayne@69: /* Return a void pointer to the raw unicode buffer. */ jpayne@69: #define _PyUnicode_COMPACT_DATA(op) \ jpayne@69: (PyUnicode_IS_ASCII(op) ? \ jpayne@69: ((void*)((PyASCIIObject*)(op) + 1)) : \ jpayne@69: ((void*)((PyCompactUnicodeObject*)(op) + 1))) jpayne@69: jpayne@69: #define _PyUnicode_NONCOMPACT_DATA(op) \ jpayne@69: (assert(((PyUnicodeObject*)(op))->data.any), \ jpayne@69: ((((PyUnicodeObject *)(op))->data.any))) jpayne@69: jpayne@69: #define PyUnicode_DATA(op) \ jpayne@69: (assert(PyUnicode_Check(op)), \ jpayne@69: PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \ jpayne@69: _PyUnicode_NONCOMPACT_DATA(op)) jpayne@69: jpayne@69: /* In the access macros below, "kind" may be evaluated more than once. jpayne@69: All other macro parameters are evaluated exactly once, so it is safe jpayne@69: to put side effects into them (such as increasing the index). */ jpayne@69: jpayne@69: /* Write into the canonical representation, this macro does not do any sanity jpayne@69: checks and is intended for usage in loops. The caller should cache the jpayne@69: kind and data pointers obtained from other macro calls. jpayne@69: index is the index in the string (starts at 0) and value is the new jpayne@69: code point value which should be written to that location. 
*/ jpayne@69: #define PyUnicode_WRITE(kind, data, index, value) \ jpayne@69: do { \ jpayne@69: switch ((kind)) { \ jpayne@69: case PyUnicode_1BYTE_KIND: { \ jpayne@69: ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \ jpayne@69: break; \ jpayne@69: } \ jpayne@69: case PyUnicode_2BYTE_KIND: { \ jpayne@69: ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \ jpayne@69: break; \ jpayne@69: } \ jpayne@69: default: { \ jpayne@69: assert((kind) == PyUnicode_4BYTE_KIND); \ jpayne@69: ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \ jpayne@69: } \ jpayne@69: } \ jpayne@69: } while (0) jpayne@69: jpayne@69: /* Read a code point from the string's canonical representation. No checks jpayne@69: or ready calls are performed. */ jpayne@69: #define PyUnicode_READ(kind, data, index) \ jpayne@69: ((Py_UCS4) \ jpayne@69: ((kind) == PyUnicode_1BYTE_KIND ? \ jpayne@69: ((const Py_UCS1 *)(data))[(index)] : \ jpayne@69: ((kind) == PyUnicode_2BYTE_KIND ? \ jpayne@69: ((const Py_UCS2 *)(data))[(index)] : \ jpayne@69: ((const Py_UCS4 *)(data))[(index)] \ jpayne@69: ) \ jpayne@69: )) jpayne@69: jpayne@69: /* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it jpayne@69: calls PyUnicode_KIND() and might call it twice. For single reads, use jpayne@69: PyUnicode_READ_CHAR, for multiple consecutive reads callers should jpayne@69: cache kind and use PyUnicode_READ instead. */ jpayne@69: #define PyUnicode_READ_CHAR(unicode, index) \ jpayne@69: (assert(PyUnicode_Check(unicode)), \ jpayne@69: assert(PyUnicode_IS_READY(unicode)), \ jpayne@69: (Py_UCS4) \ jpayne@69: (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \ jpayne@69: ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \ jpayne@69: (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? 
\ jpayne@69: ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \ jpayne@69: ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \ jpayne@69: ) \ jpayne@69: )) jpayne@69: jpayne@69: /* Returns the length of the unicode string. The caller has to make sure that jpayne@69: the string has it's canonical representation set before calling jpayne@69: this macro. Call PyUnicode_(FAST_)Ready to ensure that. */ jpayne@69: #define PyUnicode_GET_LENGTH(op) \ jpayne@69: (assert(PyUnicode_Check(op)), \ jpayne@69: assert(PyUnicode_IS_READY(op)), \ jpayne@69: ((PyASCIIObject *)(op))->length) jpayne@69: jpayne@69: jpayne@69: /* Fast check to determine whether an object is ready. Equivalent to jpayne@69: PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any) */ jpayne@69: jpayne@69: #define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready) jpayne@69: jpayne@69: /* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best jpayne@69: case. If the canonical representation is not yet set, it will still call jpayne@69: _PyUnicode_Ready(). jpayne@69: Returns 0 on success and -1 on errors. */ jpayne@69: #define PyUnicode_READY(op) \ jpayne@69: (assert(PyUnicode_Check(op)), \ jpayne@69: (PyUnicode_IS_READY(op) ? \ jpayne@69: 0 : _PyUnicode_Ready(_PyObject_CAST(op)))) jpayne@69: jpayne@69: /* Return a maximum character value which is suitable for creating another jpayne@69: string based on op. This is always an approximation but more efficient jpayne@69: than iterating over the string. */ jpayne@69: #define PyUnicode_MAX_CHAR_VALUE(op) \ jpayne@69: (assert(PyUnicode_IS_READY(op)), \ jpayne@69: (PyUnicode_IS_ASCII(op) ? \ jpayne@69: (0x7f) : \ jpayne@69: (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \ jpayne@69: (0xffU) : \ jpayne@69: (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? 
\ jpayne@69: (0xffffU) : \ jpayne@69: (0x10ffffU))))) jpayne@69: jpayne@69: /* === Public API ========================================================= */ jpayne@69: jpayne@69: /* --- Plain Py_UNICODE --------------------------------------------------- */ jpayne@69: jpayne@69: /* With PEP 393, this is the recommended way to allocate a new unicode object. jpayne@69: This function will allocate the object and its buffer in a single memory jpayne@69: block. Objects created using this function are not resizable. */ jpayne@69: PyAPI_FUNC(PyObject*) PyUnicode_New( jpayne@69: Py_ssize_t size, /* Number of code points in the new string */ jpayne@69: Py_UCS4 maxchar /* maximum code point value in the string */ jpayne@69: ); jpayne@69: jpayne@69: /* Initializes the canonical string representation from the deprecated jpayne@69: wstr/Py_UNICODE representation. This function is used to convert Unicode jpayne@69: objects which were created using the old API to the new flexible format jpayne@69: introduced with PEP 393. jpayne@69: jpayne@69: Don't call this function directly, use the public PyUnicode_READY() macro jpayne@69: instead. */ jpayne@69: PyAPI_FUNC(int) _PyUnicode_Ready( jpayne@69: PyObject *unicode /* Unicode object */ jpayne@69: ); jpayne@69: jpayne@69: /* Get a copy of a Unicode string. */ jpayne@69: PyAPI_FUNC(PyObject*) _PyUnicode_Copy( jpayne@69: PyObject *unicode jpayne@69: ); jpayne@69: jpayne@69: /* Copy character from one unicode object into another, this function performs jpayne@69: character conversion when necessary and falls back to memcpy() if possible. jpayne@69: jpayne@69: Fail if to is too small (smaller than *how_many* or smaller than jpayne@69: len(from)-from_start), or if kind(from[from_start:from_start+how_many]) > jpayne@69: kind(to), or if *to* has more than 1 reference. jpayne@69: jpayne@69: Return the number of written character, or return -1 and raise an exception jpayne@69: on error. 
jpayne@69: jpayne@69: Pseudo-code: jpayne@69: jpayne@69: how_many = min(how_many, len(from) - from_start) jpayne@69: to[to_start:to_start+how_many] = from[from_start:from_start+how_many] jpayne@69: return how_many jpayne@69: jpayne@69: Note: The function doesn't write a terminating null character. jpayne@69: */ jpayne@69: PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters( jpayne@69: PyObject *to, jpayne@69: Py_ssize_t to_start, jpayne@69: PyObject *from, jpayne@69: Py_ssize_t from_start, jpayne@69: Py_ssize_t how_many jpayne@69: ); jpayne@69: jpayne@69: /* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so jpayne@69: may crash if parameters are invalid (e.g. if the output string jpayne@69: is too short). */ jpayne@69: PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters( jpayne@69: PyObject *to, jpayne@69: Py_ssize_t to_start, jpayne@69: PyObject *from, jpayne@69: Py_ssize_t from_start, jpayne@69: Py_ssize_t how_many jpayne@69: ); jpayne@69: jpayne@69: /* Fill a string with a character: write fill_char into jpayne@69: unicode[start:start+length]. jpayne@69: jpayne@69: Fail if fill_char is bigger than the string maximum character, or if the jpayne@69: string has more than 1 reference. jpayne@69: jpayne@69: Return the number of written character, or return -1 and raise an exception jpayne@69: on error. */ jpayne@69: PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill( jpayne@69: PyObject *unicode, jpayne@69: Py_ssize_t start, jpayne@69: Py_ssize_t length, jpayne@69: Py_UCS4 fill_char jpayne@69: ); jpayne@69: jpayne@69: /* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash jpayne@69: if parameters are invalid (e.g. if length is longer than the string). */ jpayne@69: PyAPI_FUNC(void) _PyUnicode_FastFill( jpayne@69: PyObject *unicode, jpayne@69: Py_ssize_t start, jpayne@69: Py_ssize_t length, jpayne@69: Py_UCS4 fill_char jpayne@69: ); jpayne@69: jpayne@69: /* Create a Unicode Object from the Py_UNICODE buffer u of the given jpayne@69: size. 
jpayne@69: jpayne@69: u may be NULL which causes the contents to be undefined. It is the jpayne@69: user's responsibility to fill in the needed data afterwards. Note jpayne@69: that modifying the Unicode object contents after construction is jpayne@69: only allowed if u was set to NULL. jpayne@69: jpayne@69: The buffer is copied into the new object. */ jpayne@69: /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( jpayne@69: const Py_UNICODE *u, /* Unicode buffer */ jpayne@69: Py_ssize_t size /* size of buffer */ jpayne@69: ); jpayne@69: jpayne@69: /* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters. jpayne@69: Scan the string to find the maximum character. */ jpayne@69: PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( jpayne@69: int kind, jpayne@69: const void *buffer, jpayne@69: Py_ssize_t size); jpayne@69: jpayne@69: /* Create a new string from a buffer of ASCII characters. jpayne@69: WARNING: Don't check if the string contains any non-ASCII character. */ jpayne@69: PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII( jpayne@69: const char *buffer, jpayne@69: Py_ssize_t size); jpayne@69: jpayne@69: /* Compute the maximum character of the substring unicode[start:end]. jpayne@69: Return 127 for an empty string. */ jpayne@69: PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar ( jpayne@69: PyObject *unicode, jpayne@69: Py_ssize_t start, jpayne@69: Py_ssize_t end); jpayne@69: jpayne@69: /* Return a read-only pointer to the Unicode object's internal jpayne@69: Py_UNICODE buffer. jpayne@69: If the wchar_t/Py_UNICODE representation is not yet available, this jpayne@69: function will calculate it. */ jpayne@69: /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( jpayne@69: PyObject *unicode /* Unicode object */ jpayne@69: ); jpayne@69: jpayne@69: /* Similar to PyUnicode_AsUnicode(), but raises a ValueError if the string jpayne@69: contains null characters. 
*/ jpayne@69: PyAPI_FUNC(const Py_UNICODE *) _PyUnicode_AsUnicode( jpayne@69: PyObject *unicode /* Unicode object */ jpayne@69: ); jpayne@69: jpayne@69: /* Return a read-only pointer to the Unicode object's internal jpayne@69: Py_UNICODE buffer and save the length at size. jpayne@69: If the wchar_t/Py_UNICODE representation is not yet available, this jpayne@69: function will calculate it. */ jpayne@69: jpayne@69: /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize( jpayne@69: PyObject *unicode, /* Unicode object */ jpayne@69: Py_ssize_t *size /* location where to save the length */ jpayne@69: ); jpayne@69: jpayne@69: /* Get the maximum ordinal for a Unicode character. */ jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void); jpayne@69: jpayne@69: jpayne@69: /* --- _PyUnicodeWriter API ----------------------------------------------- */ jpayne@69: jpayne@69: typedef struct { jpayne@69: PyObject *buffer; jpayne@69: void *data; jpayne@69: enum PyUnicode_Kind kind; jpayne@69: Py_UCS4 maxchar; jpayne@69: Py_ssize_t size; jpayne@69: Py_ssize_t pos; jpayne@69: jpayne@69: /* minimum number of allocated characters (default: 0) */ jpayne@69: Py_ssize_t min_length; jpayne@69: jpayne@69: /* minimum character (default: 127, ASCII) */ jpayne@69: Py_UCS4 min_char; jpayne@69: jpayne@69: /* If non-zero, overallocate the buffer (default: 0). */ jpayne@69: unsigned char overallocate; jpayne@69: jpayne@69: /* If readonly is 1, buffer is a shared string (cannot be modified) jpayne@69: and size is set to 0. */ jpayne@69: unsigned char readonly; jpayne@69: } _PyUnicodeWriter ; jpayne@69: jpayne@69: /* Initialize a Unicode writer. jpayne@69: * jpayne@69: * By default, the minimum buffer size is 0 character and overallocation is jpayne@69: * disabled. Set min_length, min_char and overallocate attributes to control jpayne@69: * the allocation of the buffer. 
*/ jpayne@69: PyAPI_FUNC(void) jpayne@69: _PyUnicodeWriter_Init(_PyUnicodeWriter *writer); jpayne@69: jpayne@69: /* Prepare the buffer to write 'length' characters jpayne@69: with the specified maximum character. jpayne@69: jpayne@69: Return 0 on success, raise an exception and return -1 on error. */ jpayne@69: #define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR) \ jpayne@69: (((MAXCHAR) <= (WRITER)->maxchar \ jpayne@69: && (LENGTH) <= (WRITER)->size - (WRITER)->pos) \ jpayne@69: ? 0 \ jpayne@69: : (((LENGTH) == 0) \ jpayne@69: ? 0 \ jpayne@69: : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR)))) jpayne@69: jpayne@69: /* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro jpayne@69: instead. */ jpayne@69: PyAPI_FUNC(int) jpayne@69: _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, jpayne@69: Py_ssize_t length, Py_UCS4 maxchar); jpayne@69: jpayne@69: /* Prepare the buffer to have at least the kind KIND. jpayne@69: For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will jpayne@69: support characters in range U+000-U+FFFF. jpayne@69: jpayne@69: Return 0 on success, raise an exception and return -1 on error. */ jpayne@69: #define _PyUnicodeWriter_PrepareKind(WRITER, KIND) \ jpayne@69: (assert((KIND) != PyUnicode_WCHAR_KIND), \ jpayne@69: (KIND) <= (WRITER)->kind \ jpayne@69: ? 0 \ jpayne@69: : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND))) jpayne@69: jpayne@69: /* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind() jpayne@69: macro instead. */ jpayne@69: PyAPI_FUNC(int) jpayne@69: _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, jpayne@69: enum PyUnicode_Kind kind); jpayne@69: jpayne@69: /* Append a Unicode character. jpayne@69: Return 0 on success, raise an exception and return -1 on error. 
*/ jpayne@69: PyAPI_FUNC(int) jpayne@69: _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, jpayne@69: Py_UCS4 ch jpayne@69: ); jpayne@69: jpayne@69: /* Append a Unicode string. jpayne@69: Return 0 on success, raise an exception and return -1 on error. */ jpayne@69: PyAPI_FUNC(int) jpayne@69: _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, jpayne@69: PyObject *str /* Unicode string */ jpayne@69: ); jpayne@69: jpayne@69: /* Append a substring of a Unicode string. jpayne@69: Return 0 on success, raise an exception and return -1 on error. */ jpayne@69: PyAPI_FUNC(int) jpayne@69: _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, jpayne@69: PyObject *str, /* Unicode string */ jpayne@69: Py_ssize_t start, jpayne@69: Py_ssize_t end jpayne@69: ); jpayne@69: jpayne@69: /* Append an ASCII-encoded byte string. jpayne@69: Return 0 on success, raise an exception and return -1 on error. */ jpayne@69: PyAPI_FUNC(int) jpayne@69: _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, jpayne@69: const char *str, /* ASCII-encoded byte string */ jpayne@69: Py_ssize_t len /* number of bytes, or -1 if unknown */ jpayne@69: ); jpayne@69: jpayne@69: /* Append a latin1-encoded byte string. jpayne@69: Return 0 on success, raise an exception and return -1 on error. */ jpayne@69: PyAPI_FUNC(int) jpayne@69: _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, jpayne@69: const char *str, /* latin1-encoded byte string */ jpayne@69: Py_ssize_t len /* length in bytes */ jpayne@69: ); jpayne@69: jpayne@69: /* Get the value of the writer as a Unicode string. Clear the jpayne@69: buffer of the writer. Raise an exception and return NULL jpayne@69: on error. */ jpayne@69: PyAPI_FUNC(PyObject *) jpayne@69: _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer); jpayne@69: jpayne@69: /* Deallocate memory of a writer (clear its internal buffer). 
*/ jpayne@69: PyAPI_FUNC(void) jpayne@69: _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer); jpayne@69: jpayne@69: jpayne@69: /* Format the object based on the format_spec, as defined in PEP 3101 jpayne@69: (Advanced String Formatting). */ jpayne@69: PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter( jpayne@69: _PyUnicodeWriter *writer, jpayne@69: PyObject *obj, jpayne@69: PyObject *format_spec, jpayne@69: Py_ssize_t start, jpayne@69: Py_ssize_t end); jpayne@69: jpayne@69: /* --- wchar_t support for platforms which support it --------------------- */ jpayne@69: jpayne@69: #ifdef HAVE_WCHAR_H jpayne@69: PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind); jpayne@69: #endif jpayne@69: jpayne@69: /* --- Manage the default encoding ---------------------------------------- */ jpayne@69: jpayne@69: /* Returns a pointer to the default encoding (UTF-8) of the jpayne@69: Unicode object unicode, and stores the size of the encoded jpayne@69: representation in bytes in *size. jpayne@69: jpayne@69: In case of an error, *size is not set. jpayne@69: jpayne@69: This function caches the UTF-8 encoded string in the unicodeobject jpayne@69: and subsequent calls will return the same string. The memory is released jpayne@69: when the unicodeobject is deallocated. jpayne@69: jpayne@69: _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to jpayne@69: support the previous internal function with the same behaviour. jpayne@69: jpayne@69: *** This API is for interpreter INTERNAL USE ONLY and will likely jpayne@69: *** be removed or changed in the future. jpayne@69: jpayne@69: *** If you need to access the Unicode object as a UTF-8 bytes string, jpayne@69: *** please use PyUnicode_AsUTF8String() instead.
jpayne@69: */ jpayne@69: jpayne@69: PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize( jpayne@69: PyObject *unicode, jpayne@69: Py_ssize_t *size); jpayne@69: jpayne@69: #define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize jpayne@69: jpayne@69: /* Returns a pointer to the default encoding (UTF-8) of the jpayne@69: Unicode object unicode. jpayne@69: jpayne@69: Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation jpayne@69: in the unicodeobject. jpayne@69: jpayne@69: _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to jpayne@69: support the previous internal function with the same behaviour. jpayne@69: jpayne@69: Use of this API is DEPRECATED since no size information can be jpayne@69: extracted from the returned data. jpayne@69: jpayne@69: *** This API is for interpreter INTERNAL USE ONLY and will likely jpayne@69: *** be removed or changed in the future. jpayne@69: jpayne@69: *** If you need to access the Unicode object as a UTF-8 bytes string, jpayne@69: *** please use PyUnicode_AsUTF8String() instead. jpayne@69: jpayne@69: */ jpayne@69: jpayne@69: PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode); jpayne@69: jpayne@69: #define _PyUnicode_AsString PyUnicode_AsUTF8 jpayne@69: jpayne@69: /* --- Generic Codecs ----------------------------------------------------- */ jpayne@69: jpayne@69: /* Encodes a Py_UNICODE buffer of the given size and returns a jpayne@69: Python string object.
*/ jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_Encode( jpayne@69: const Py_UNICODE *s, /* Unicode char buffer */ jpayne@69: Py_ssize_t size, /* number of Py_UNICODE chars to encode */ jpayne@69: const char *encoding, /* encoding */ jpayne@69: const char *errors /* error handling */ jpayne@69: ); jpayne@69: jpayne@69: /* --- UTF-7 Codecs ------------------------------------------------------- */ jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7( jpayne@69: const Py_UNICODE *data, /* Unicode char buffer */ jpayne@69: Py_ssize_t length, /* number of Py_UNICODE chars to encode */ jpayne@69: int base64SetO, /* Encode RFC2152 Set O characters in base64 */ jpayne@69: int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ jpayne@69: const char *errors /* error handling */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7( jpayne@69: PyObject *unicode, /* Unicode object */ jpayne@69: int base64SetO, /* Encode RFC2152 Set O characters in base64 */ jpayne@69: int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */ jpayne@69: const char *errors /* error handling */ jpayne@69: ); jpayne@69: jpayne@69: /* --- UTF-8 Codecs ------------------------------------------------------- */ jpayne@69: jpayne@69: PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String( jpayne@69: PyObject *unicode, jpayne@69: const char *errors); jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( jpayne@69: const Py_UNICODE *data, /* Unicode char buffer */ jpayne@69: Py_ssize_t length, /* number of Py_UNICODE chars to encode */ jpayne@69: const char *errors /* error handling */ jpayne@69: ); jpayne@69: jpayne@69: /* --- UTF-32 Codecs ------------------------------------------------------ */ jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( jpayne@69: const Py_UNICODE *data, /* Unicode char buffer */ jpayne@69: Py_ssize_t length, /*
number of Py_UNICODE chars to encode */ jpayne@69: const char *errors, /* error handling */ jpayne@69: int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32( jpayne@69: PyObject *object, /* Unicode object */ jpayne@69: const char *errors, /* error handling */ jpayne@69: int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ jpayne@69: ); jpayne@69: jpayne@69: /* --- UTF-16 Codecs ------------------------------------------------------ */ jpayne@69: jpayne@69: /* Returns a Python string object holding the UTF-16 encoded value of jpayne@69: the Unicode data. jpayne@69: jpayne@69: The output is written according to the following jpayne@69: byte order: jpayne@69: jpayne@69: byteorder == -1: little endian jpayne@69: byteorder == 0: native byte order (writes a BOM mark) jpayne@69: byteorder == 1: big endian jpayne@69: jpayne@69: If byteorder is 0, the output string will always start with the jpayne@69: Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is jpayne@69: prepended. jpayne@69: jpayne@69: Note that Py_UNICODE data is being interpreted as UTF-16 reduced to jpayne@69: UCS-2. This trick makes it possible to add full UTF-16 capabilities jpayne@69: at a later point without compromising the APIs.
jpayne@69: jpayne@69: */ jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16( jpayne@69: const Py_UNICODE *data, /* Unicode char buffer */ jpayne@69: Py_ssize_t length, /* number of Py_UNICODE chars to encode */ jpayne@69: const char *errors, /* error handling */ jpayne@69: int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16( jpayne@69: PyObject* unicode, /* Unicode object */ jpayne@69: const char *errors, /* error handling */ jpayne@69: int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ jpayne@69: ); jpayne@69: jpayne@69: /* --- Unicode-Escape Codecs ---------------------------------------------- */ jpayne@69: jpayne@69: /* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape jpayne@69: chars. */ jpayne@69: PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape( jpayne@69: const char *string, /* Unicode-Escape encoded string */ jpayne@69: Py_ssize_t length, /* size of string */ jpayne@69: const char *errors, /* error handling */ jpayne@69: const char **first_invalid_escape /* on return, points to first jpayne@69: invalid escaped char in jpayne@69: string.
*/ jpayne@69: ); jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape( jpayne@69: const Py_UNICODE *data, /* Unicode char buffer */ jpayne@69: Py_ssize_t length /* Number of Py_UNICODE chars to encode */ jpayne@69: ); jpayne@69: jpayne@69: /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( jpayne@69: const Py_UNICODE *data, /* Unicode char buffer */ jpayne@69: Py_ssize_t length /* Number of Py_UNICODE chars to encode */ jpayne@69: ); jpayne@69: jpayne@69: /* --- Latin-1 Codecs ----------------------------------------------------- */ jpayne@69: jpayne@69: PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String( jpayne@69: PyObject* unicode, jpayne@69: const char* errors); jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1( jpayne@69: const Py_UNICODE *data, /* Unicode char buffer */ jpayne@69: Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ jpayne@69: const char *errors /* error handling */ jpayne@69: ); jpayne@69: jpayne@69: /* --- ASCII Codecs ------------------------------------------------------- */ jpayne@69: jpayne@69: PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString( jpayne@69: PyObject* unicode, jpayne@69: const char* errors); jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII( jpayne@69: const Py_UNICODE *data, /* Unicode char buffer */ jpayne@69: Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ jpayne@69: const char *errors /* error handling */ jpayne@69: ); jpayne@69: jpayne@69: /* --- Character Map Codecs ----------------------------------------------- */ jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap( jpayne@69: const Py_UNICODE *data, /* Unicode char buffer */ jpayne@69: Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ jpayne@69: PyObject *mapping, /*
encoding mapping */ jpayne@69: const char *errors /* error handling */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap( jpayne@69: PyObject *unicode, /* Unicode object */ jpayne@69: PyObject *mapping, /* encoding mapping */ jpayne@69: const char *errors /* error handling */ jpayne@69: ); jpayne@69: jpayne@69: /* Translate a Py_UNICODE buffer of the given length by applying a jpayne@69: character mapping table to it and return the resulting Unicode jpayne@69: object. jpayne@69: jpayne@69: The mapping table must map Unicode ordinal integers to Unicode strings, jpayne@69: Unicode ordinal integers or None (causing deletion of the character). jpayne@69: jpayne@69: Mapping tables may be dictionaries or sequences. Unmapped character jpayne@69: ordinals (ones which cause a LookupError) are left untouched and jpayne@69: are copied as-is. jpayne@69: jpayne@69: */ jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap( jpayne@69: const Py_UNICODE *data, /* Unicode char buffer */ jpayne@69: Py_ssize_t length, /* Number of Py_UNICODE chars to translate */ jpayne@69: PyObject *table, /* Translation table */ jpayne@69: const char *errors /* error handling */ jpayne@69: ); jpayne@69: jpayne@69: /* --- MBCS codecs for Windows -------------------------------------------- */ jpayne@69: jpayne@69: #ifdef MS_WINDOWS jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS( jpayne@69: const Py_UNICODE *data, /* Unicode char buffer */ jpayne@69: Py_ssize_t length, /* number of Py_UNICODE chars to encode */ jpayne@69: const char *errors /* error handling */ jpayne@69: ); jpayne@69: #endif jpayne@69: jpayne@69: /* --- Decimal Encoder ---------------------------------------------------- */ jpayne@69: jpayne@69: /* Takes a Unicode string holding a decimal value and writes it into jpayne@69: an output buffer using standard ASCII digit codes.
jpayne@69: jpayne@69: The output buffer has to provide at least length+1 bytes of storage jpayne@69: area. The output string is 0-terminated. jpayne@69: jpayne@69: The encoder converts whitespace to ' ', decimal characters to their jpayne@69: corresponding ASCII digit and all other Latin-1 characters except jpayne@69: \0 as-is. Characters outside this range (Unicode ordinals 1-256) jpayne@69: are treated as errors. This includes embedded NULL bytes. jpayne@69: jpayne@69: Error handling is defined by the errors argument: jpayne@69: jpayne@69: NULL or "strict": raise a ValueError jpayne@69: "ignore": ignore the wrong characters (these are not copied to the jpayne@69: output buffer) jpayne@69: "replace": replaces illegal characters with '?' jpayne@69: jpayne@69: Returns 0 on success, -1 on failure. jpayne@69: jpayne@69: */ jpayne@69: jpayne@69: /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(int) PyUnicode_EncodeDecimal( jpayne@69: Py_UNICODE *s, /* Unicode buffer */ jpayne@69: Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ jpayne@69: char *output, /* Output buffer; must have size >= length+1 */ jpayne@69: const char *errors /* error handling */ jpayne@69: ); jpayne@69: jpayne@69: /* Transforms code points that have decimal digit property to the jpayne@69: corresponding ASCII digit code points. jpayne@69: jpayne@69: Returns a new Unicode string on success, NULL on failure. jpayne@69: */ jpayne@69: jpayne@69: /* Py_DEPRECATED(3.3) */ jpayne@69: PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII( jpayne@69: Py_UNICODE *s, /* Unicode buffer */ jpayne@69: Py_ssize_t length /* Number of Py_UNICODE chars to transform */ jpayne@69: ); jpayne@69: jpayne@69: /* Converts a Unicode object holding a decimal value to an ASCII string jpayne@69: for use in int, float and complex parsers. jpayne@69: Transforms code points that have decimal digit property to the jpayne@69: corresponding ASCII digit code points. Transforms spaces to ASCII.
jpayne@69: Transforms code points starting from the first non-ASCII code point that jpayne@69: is neither a decimal digit nor a space to the end into '?'. */ jpayne@69: jpayne@69: PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( jpayne@69: PyObject *unicode /* Unicode object */ jpayne@69: ); jpayne@69: jpayne@69: /* --- Methods & Slots ---------------------------------------------------- */ jpayne@69: jpayne@69: PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray( jpayne@69: PyObject *separator, jpayne@69: PyObject *const *items, jpayne@69: Py_ssize_t seqlen jpayne@69: ); jpayne@69: jpayne@69: /* Test whether a unicode is equal to an ASCII identifier. Return 1 if true, jpayne@69: 0 otherwise. The right argument must be an ASCII identifier. jpayne@69: Any error that occurs inside is cleared before returning. */ jpayne@69: PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId( jpayne@69: PyObject *left, /* Left string */ jpayne@69: _Py_Identifier *right /* Right identifier */ jpayne@69: ); jpayne@69: jpayne@69: /* Test whether a unicode is equal to an ASCII string. Return 1 if true, jpayne@69: 0 otherwise. The right argument must be an ASCII-encoded string. jpayne@69: Any error that occurs inside is cleared before returning. */ jpayne@69: PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString( jpayne@69: PyObject *left, jpayne@69: const char *right /* ASCII-encoded string */ jpayne@69: ); jpayne@69: jpayne@69: /* Externally visible for str.strip(unicode) */ jpayne@69: PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( jpayne@69: PyObject *self, jpayne@69: int striptype, jpayne@69: PyObject *sepobj jpayne@69: ); jpayne@69: jpayne@69: /* Using explicit passed-in values, insert the thousands grouping jpayne@69: into the string pointed to by buffer.
For the argument descriptions, jpayne@69: see Objects/stringlib/localeutil.h */ jpayne@69: PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping( jpayne@69: _PyUnicodeWriter *writer, jpayne@69: Py_ssize_t n_buffer, jpayne@69: PyObject *digits, jpayne@69: Py_ssize_t d_pos, jpayne@69: Py_ssize_t n_digits, jpayne@69: Py_ssize_t min_width, jpayne@69: const char *grouping, jpayne@69: PyObject *thousands_sep, jpayne@69: Py_UCS4 *maxchar); jpayne@69: jpayne@69: /* === Characters Type APIs =============================================== */ jpayne@69: jpayne@69: /* Helper array used by Py_UNICODE_ISSPACE(). */ jpayne@69: jpayne@69: PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; jpayne@69: jpayne@69: /* These should not be used directly. Use the Py_UNICODE_IS* and jpayne@69: Py_UNICODE_TO* macros instead. jpayne@69: jpayne@69: These APIs are implemented in Objects/unicodectype.c. jpayne@69: jpayne@69: */ jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_IsLowercase( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_IsUppercase( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_IsTitlecase( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_IsXidStart( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_IsXidContinue( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_IsWhitespace( jpayne@69: const Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_IsLinebreak( jpayne@69: const Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: /* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: /* Py_DEPRECATED(3.3) */
PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_ToLowerFull( jpayne@69: Py_UCS4 ch, /* Unicode character */ jpayne@69: Py_UCS4 *res jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_ToTitleFull( jpayne@69: Py_UCS4 ch, /* Unicode character */ jpayne@69: Py_UCS4 *res jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_ToUpperFull( jpayne@69: Py_UCS4 ch, /* Unicode character */ jpayne@69: Py_UCS4 *res jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_ToFoldedFull( jpayne@69: Py_UCS4 ch, /* Unicode character */ jpayne@69: Py_UCS4 *res jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_IsCased( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_ToDigit( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(double) _PyUnicode_ToNumeric( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_IsDigit( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_IsNumeric( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_IsPrintable( jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(int) _PyUnicode_IsAlpha(
jpayne@69: Py_UCS4 ch /* Unicode character */ jpayne@69: ); jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(size_t) Py_UNICODE_strlen( jpayne@69: const Py_UNICODE *u jpayne@69: ); jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy( jpayne@69: Py_UNICODE *s1, jpayne@69: const Py_UNICODE *s2); jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat( jpayne@69: Py_UNICODE *s1, const Py_UNICODE *s2); jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy( jpayne@69: Py_UNICODE *s1, jpayne@69: const Py_UNICODE *s2, jpayne@69: size_t n); jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(int) Py_UNICODE_strcmp( jpayne@69: const Py_UNICODE *s1, jpayne@69: const Py_UNICODE *s2 jpayne@69: ); jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(int) Py_UNICODE_strncmp( jpayne@69: const Py_UNICODE *s1, jpayne@69: const Py_UNICODE *s2, jpayne@69: size_t n jpayne@69: ); jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( jpayne@69: const Py_UNICODE *s, jpayne@69: Py_UNICODE c jpayne@69: ); jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr( jpayne@69: const Py_UNICODE *s, jpayne@69: Py_UNICODE c jpayne@69: ); jpayne@69: jpayne@69: PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int); jpayne@69: jpayne@69: /* Create a copy of a unicode string ending with a nul character. Return NULL jpayne@69: and raise a MemoryError exception on memory allocation failure, otherwise jpayne@69: return a newly allocated buffer (use PyMem_Free() to free the buffer). */ jpayne@69: jpayne@69: Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy( jpayne@69: PyObject *unicode jpayne@69: ); jpayne@69: jpayne@69: /* Return an interned Unicode object for an Identifier; may fail if there is no memory. */ jpayne@69: PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*); jpayne@69: /* Clear all static strings.
*/ jpayne@69: PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void); jpayne@69: jpayne@69: /* Fast equality check when the inputs are known to be exact unicode types jpayne@69: and where the hash values are equal (i.e. a very probable match) */ jpayne@69: PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *); jpayne@69: jpayne@69: #ifdef __cplusplus jpayne@69: } jpayne@69: #endif