Skip to content

Commit a56ead0

Browse files
authored
gh-129173: Use _PyUnicodeError_GetParams in PyCodec_NameReplaceErrors (GH-129135)
1 parent 167cf3a commit a56ead0

File tree

1 file changed

+123
-111
lines changed

1 file changed

+123
-111
lines changed

Python/codecs.c

Lines changed: 123 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -676,6 +676,60 @@ wrong_exception_type(PyObject *exc)
676676
PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
677677

678678

679+
// --- codecs handlers: utilities ---------------------------------------------
680+
681+
/*
682+
* Return the number of characters (including special prefixes)
683+
* needed to represent 'ch' by codec_handler_write_unicode_hex().
684+
*/
685+
static inline Py_ssize_t
686+
codec_handler_unicode_hex_width(Py_UCS4 ch)
687+
{
688+
if (ch >= 0x10000) {
689+
// format: '\\' + 'U' + 8 hex digits
690+
return 1 + 1 + 8;
691+
}
692+
else if (ch >= 0x100) {
693+
// format: '\\' + 'u' + 4 hex digits
694+
return 1 + 1 + 4;
695+
}
696+
else {
697+
// format: '\\' + 'x' + 2 hex digits
698+
return 1 + 1 + 2;
699+
}
700+
}
701+
702+
703+
/*
704+
* Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
705+
* using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
706+
*/
707+
static inline void
708+
codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
709+
{
710+
*(*p)++ = '\\';
711+
if (ch >= 0x10000) {
712+
*(*p)++ = 'U';
713+
*(*p)++ = Py_hexdigits[(ch >> 28) & 0xf];
714+
*(*p)++ = Py_hexdigits[(ch >> 24) & 0xf];
715+
*(*p)++ = Py_hexdigits[(ch >> 20) & 0xf];
716+
*(*p)++ = Py_hexdigits[(ch >> 16) & 0xf];
717+
*(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
718+
*(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
719+
}
720+
else if (ch >= 0x100) {
721+
*(*p)++ = 'u';
722+
*(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
723+
*(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
724+
}
725+
else {
726+
*(*p)++ = 'x';
727+
}
728+
*(*p)++ = Py_hexdigits[(ch >> 4) & 0xf];
729+
*(*p)++ = Py_hexdigits[ch & 0xf];
730+
}
731+
732+
679733
// --- handler: 'strict' ------------------------------------------------------
680734

681735
PyObject *PyCodec_StrictErrors(PyObject *exc)
@@ -942,17 +996,8 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
942996

943997
Py_ssize_t ressize = 0;
944998
for (Py_ssize_t i = start; i < end; ++i) {
945-
/* object is guaranteed to be "ready" */
946999
Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
947-
if (c >= 0x10000) {
948-
ressize += 1 + 1 + 8;
949-
}
950-
else if (c >= 0x100) {
951-
ressize += 1 + 1 + 4;
952-
}
953-
else {
954-
ressize += 1 + 1 + 2;
955-
}
1000+
ressize += codec_handler_unicode_hex_width(c);
9561001
}
9571002
PyObject *res = PyUnicode_New(ressize, 127);
9581003
if (res == NULL) {
@@ -962,122 +1007,86 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
9621007
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
9631008
for (Py_ssize_t i = start; i < end; ++i) {
9641009
Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
965-
*outp++ = '\\';
966-
if (c >= 0x00010000) {
967-
*outp++ = 'U';
968-
*outp++ = Py_hexdigits[(c >> 28) & 0xf];
969-
*outp++ = Py_hexdigits[(c >> 24) & 0xf];
970-
*outp++ = Py_hexdigits[(c >> 20) & 0xf];
971-
*outp++ = Py_hexdigits[(c >> 16) & 0xf];
972-
*outp++ = Py_hexdigits[(c >> 12) & 0xf];
973-
*outp++ = Py_hexdigits[(c >> 8) & 0xf];
974-
}
975-
else if (c >= 0x100) {
976-
*outp++ = 'u';
977-
*outp++ = Py_hexdigits[(c >> 12) & 0xf];
978-
*outp++ = Py_hexdigits[(c >> 8) & 0xf];
979-
}
980-
else {
981-
*outp++ = 'x';
982-
}
983-
*outp++ = Py_hexdigits[(c >> 4) & 0xf];
984-
*outp++ = Py_hexdigits[c & 0xf];
1010+
codec_handler_write_unicode_hex(&outp, c);
9851011
}
9861012
assert(_PyUnicode_CheckConsistency(res, 1));
9871013
Py_DECREF(obj);
9881014
return Py_BuildValue("(Nn)", res, end);
9891015
}
9901016

1017+
1018+
// --- handler: 'namereplace' -------------------------------------------------
1019+
9911020
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
9921021
{
993-
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
994-
PyObject *restuple;
995-
PyObject *object;
996-
Py_ssize_t i;
997-
Py_ssize_t start;
998-
Py_ssize_t end;
999-
PyObject *res;
1000-
Py_UCS1 *outp;
1001-
Py_ssize_t ressize;
1002-
int replsize;
1003-
Py_UCS4 c;
1004-
char buffer[256]; /* NAME_MAXLEN */
1005-
if (PyUnicodeEncodeError_GetStart(exc, &start))
1006-
return NULL;
1007-
if (PyUnicodeEncodeError_GetEnd(exc, &end))
1008-
return NULL;
1009-
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1010-
return NULL;
1011-
_PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
1012-
if (ucnhash_capi == NULL) {
1013-
return NULL;
1022+
if (!_PyIsUnicodeEncodeError(exc)) {
1023+
wrong_exception_type(exc);
1024+
return NULL;
1025+
}
1026+
1027+
_PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
1028+
if (ucnhash_capi == NULL) {
1029+
return NULL;
1030+
}
1031+
1032+
PyObject *obj;
1033+
Py_ssize_t start, end;
1034+
if (_PyUnicodeError_GetParams(exc,
1035+
&obj, NULL,
1036+
&start, &end, NULL, false) < 0)
1037+
{
1038+
return NULL;
1039+
}
1040+
1041+
char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */
1042+
Py_ssize_t imax = start, ressize = 0, replsize;
1043+
for (; imax < end; ++imax) {
1044+
Py_UCS4 c = PyUnicode_READ_CHAR(obj, imax);
1045+
if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1046+
// If 'c' is recognized by getname(), the corresponding replacement
1047+
// is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1
1048+
// characters. Failures of getname() are ignored by the handler.
1049+
replsize = 1 + 1 + 1 + strlen(buffer) + 1;
10141050
}
1015-
for (i = start, ressize = 0; i < end; ++i) {
1016-
/* object is guaranteed to be "ready" */
1017-
c = PyUnicode_READ_CHAR(object, i);
1018-
if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1019-
replsize = 1+1+1+(int)strlen(buffer)+1;
1020-
}
1021-
else if (c >= 0x10000) {
1022-
replsize = 1+1+8;
1023-
}
1024-
else if (c >= 0x100) {
1025-
replsize = 1+1+4;
1026-
}
1027-
else
1028-
replsize = 1+1+2;
1029-
if (ressize > PY_SSIZE_T_MAX - replsize)
1030-
break;
1031-
ressize += replsize;
1051+
else {
1052+
replsize = codec_handler_unicode_hex_width(c);
10321053
}
1033-
end = i;
1034-
res = PyUnicode_New(ressize, 127);
1035-
if (res==NULL)
1036-
return NULL;
1037-
for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1038-
i < end; ++i) {
1039-
c = PyUnicode_READ_CHAR(object, i);
1040-
*outp++ = '\\';
1041-
if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1042-
*outp++ = 'N';
1043-
*outp++ = '{';
1044-
strcpy((char *)outp, buffer);
1045-
outp += strlen(buffer);
1046-
*outp++ = '}';
1047-
continue;
1048-
}
1049-
if (c >= 0x00010000) {
1050-
*outp++ = 'U';
1051-
*outp++ = Py_hexdigits[(c>>28)&0xf];
1052-
*outp++ = Py_hexdigits[(c>>24)&0xf];
1053-
*outp++ = Py_hexdigits[(c>>20)&0xf];
1054-
*outp++ = Py_hexdigits[(c>>16)&0xf];
1055-
*outp++ = Py_hexdigits[(c>>12)&0xf];
1056-
*outp++ = Py_hexdigits[(c>>8)&0xf];
1057-
}
1058-
else if (c >= 0x100) {
1059-
*outp++ = 'u';
1060-
*outp++ = Py_hexdigits[(c>>12)&0xf];
1061-
*outp++ = Py_hexdigits[(c>>8)&0xf];
1062-
}
1063-
else
1064-
*outp++ = 'x';
1065-
*outp++ = Py_hexdigits[(c>>4)&0xf];
1066-
*outp++ = Py_hexdigits[c&0xf];
1054+
if (ressize > PY_SSIZE_T_MAX - replsize) {
1055+
break;
10671056
}
1068-
1069-
assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1070-
assert(_PyUnicode_CheckConsistency(res, 1));
1071-
restuple = Py_BuildValue("(Nn)", res, end);
1072-
Py_DECREF(object);
1073-
return restuple;
1057+
ressize += replsize;
10741058
}
1075-
else {
1076-
wrong_exception_type(exc);
1059+
1060+
PyObject *res = PyUnicode_New(ressize, 127);
1061+
if (res == NULL) {
1062+
Py_DECREF(obj);
10771063
return NULL;
10781064
}
1065+
1066+
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
1067+
for (Py_ssize_t i = start; i < imax; ++i) {
1068+
Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
1069+
if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
1070+
*outp++ = '\\';
1071+
*outp++ = 'N';
1072+
*outp++ = '{';
1073+
(void)strcpy((char *)outp, buffer);
1074+
outp += strlen(buffer);
1075+
*outp++ = '}';
1076+
}
1077+
else {
1078+
codec_handler_write_unicode_hex(&outp, c);
1079+
}
1080+
}
1081+
1082+
assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1083+
assert(_PyUnicode_CheckConsistency(res, 1));
1084+
PyObject *restuple = Py_BuildValue("(Nn)", res, imax);
1085+
Py_DECREF(obj);
1086+
return restuple;
10791087
}
10801088

1089+
10811090
#define ENC_UNKNOWN -1
10821091
#define ENC_UTF8 0
10831092
#define ENC_UTF16BE 1
@@ -1421,11 +1430,14 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
14211430
return PyCodec_BackslashReplaceErrors(exc);
14221431
}
14231432

1424-
static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1433+
1434+
// Wrapper around PyCodec_NameReplaceErrors(): the 'self' argument is unused
// and 'exc' is passed through unchanged; the callee's result is returned
// as is.
static inline PyObject *
namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
    PyObject *result = PyCodec_NameReplaceErrors(exc);
    return result;
}
14281439

1440+
14291441
static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
14301442
{
14311443
return PyCodec_SurrogatePassErrors(exc);

0 commit comments

Comments
 (0)