@@ -676,6 +676,60 @@ wrong_exception_type(PyObject *exc)
676676 PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
677677
678678
679+ // --- codecs handlers: utilities ---------------------------------------------
680+
681+ /*
682+ * Return the number of characters (including special prefixes)
683+ * needed to represent 'ch' by codec_handler_write_unicode_hex().
684+ */
685+ static inline Py_ssize_t
686+ codec_handler_unicode_hex_width (Py_UCS4 ch )
687+ {
688+ if (ch >= 0x10000 ) {
689+ // format: '\\' + 'U' + 8 hex digits
690+ return 1 + 1 + 8 ;
691+ }
692+ else if (ch >= 0x100 ) {
693+ // format: '\\' + 'u' + 4 hex digits
694+ return 1 + 1 + 4 ;
695+ }
696+ else {
697+ // format: '\\' + 'x' + 2 hex digits
698+ return 1 + 1 + 2 ;
699+ }
700+ }
701+
702+
703+ /*
704+ * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
705+ * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
706+ */
707+ static inline void
708+ codec_handler_write_unicode_hex (Py_UCS1 * * p , Py_UCS4 ch )
709+ {
710+ * (* p )++ = '\\' ;
711+ if (ch >= 0x10000 ) {
712+ * (* p )++ = 'U' ;
713+ * (* p )++ = Py_hexdigits [(ch >> 28 ) & 0xf ];
714+ * (* p )++ = Py_hexdigits [(ch >> 24 ) & 0xf ];
715+ * (* p )++ = Py_hexdigits [(ch >> 20 ) & 0xf ];
716+ * (* p )++ = Py_hexdigits [(ch >> 16 ) & 0xf ];
717+ * (* p )++ = Py_hexdigits [(ch >> 12 ) & 0xf ];
718+ * (* p )++ = Py_hexdigits [(ch >> 8 ) & 0xf ];
719+ }
720+ else if (ch >= 0x100 ) {
721+ * (* p )++ = 'u' ;
722+ * (* p )++ = Py_hexdigits [(ch >> 12 ) & 0xf ];
723+ * (* p )++ = Py_hexdigits [(ch >> 8 ) & 0xf ];
724+ }
725+ else {
726+ * (* p )++ = 'x' ;
727+ }
728+ * (* p )++ = Py_hexdigits [(ch >> 4 ) & 0xf ];
729+ * (* p )++ = Py_hexdigits [ch & 0xf ];
730+ }
731+
732+
679733// --- handler: 'strict' ------------------------------------------------------
680734
681735PyObject * PyCodec_StrictErrors (PyObject * exc )
@@ -942,17 +996,8 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
942996
943997 Py_ssize_t ressize = 0 ;
944998 for (Py_ssize_t i = start ; i < end ; ++ i ) {
945- /* object is guaranteed to be "ready" */
946999 Py_UCS4 c = PyUnicode_READ_CHAR (obj , i );
947- if (c >= 0x10000 ) {
948- ressize += 1 + 1 + 8 ;
949- }
950- else if (c >= 0x100 ) {
951- ressize += 1 + 1 + 4 ;
952- }
953- else {
954- ressize += 1 + 1 + 2 ;
955- }
1000+ ressize += codec_handler_unicode_hex_width (c );
9561001 }
9571002 PyObject * res = PyUnicode_New (ressize , 127 );
9581003 if (res == NULL ) {
@@ -962,122 +1007,86 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
9621007 Py_UCS1 * outp = PyUnicode_1BYTE_DATA (res );
9631008 for (Py_ssize_t i = start ; i < end ; ++ i ) {
9641009 Py_UCS4 c = PyUnicode_READ_CHAR (obj , i );
965- * outp ++ = '\\' ;
966- if (c >= 0x00010000 ) {
967- * outp ++ = 'U' ;
968- * outp ++ = Py_hexdigits [(c >> 28 ) & 0xf ];
969- * outp ++ = Py_hexdigits [(c >> 24 ) & 0xf ];
970- * outp ++ = Py_hexdigits [(c >> 20 ) & 0xf ];
971- * outp ++ = Py_hexdigits [(c >> 16 ) & 0xf ];
972- * outp ++ = Py_hexdigits [(c >> 12 ) & 0xf ];
973- * outp ++ = Py_hexdigits [(c >> 8 ) & 0xf ];
974- }
975- else if (c >= 0x100 ) {
976- * outp ++ = 'u' ;
977- * outp ++ = Py_hexdigits [(c >> 12 ) & 0xf ];
978- * outp ++ = Py_hexdigits [(c >> 8 ) & 0xf ];
979- }
980- else {
981- * outp ++ = 'x' ;
982- }
983- * outp ++ = Py_hexdigits [(c >> 4 ) & 0xf ];
984- * outp ++ = Py_hexdigits [c & 0xf ];
1010+ codec_handler_write_unicode_hex (& outp , c );
9851011 }
9861012 assert (_PyUnicode_CheckConsistency (res , 1 ));
9871013 Py_DECREF (obj );
9881014 return Py_BuildValue ("(Nn)" , res , end );
9891015}
9901016
1017+
1018+ // --- handler: 'namereplace' -------------------------------------------------
1019+
9911020PyObject * PyCodec_NameReplaceErrors (PyObject * exc )
9921021{
993- if (PyObject_TypeCheck (exc , (PyTypeObject * )PyExc_UnicodeEncodeError )) {
994- PyObject * restuple ;
995- PyObject * object ;
996- Py_ssize_t i ;
997- Py_ssize_t start ;
998- Py_ssize_t end ;
999- PyObject * res ;
1000- Py_UCS1 * outp ;
1001- Py_ssize_t ressize ;
1002- int replsize ;
1003- Py_UCS4 c ;
1004- char buffer [256 ]; /* NAME_MAXLEN */
1005- if (PyUnicodeEncodeError_GetStart (exc , & start ))
1006- return NULL ;
1007- if (PyUnicodeEncodeError_GetEnd (exc , & end ))
1008- return NULL ;
1009- if (!(object = PyUnicodeEncodeError_GetObject (exc )))
1010- return NULL ;
1011- _PyUnicode_Name_CAPI * ucnhash_capi = _PyUnicode_GetNameCAPI ();
1012- if (ucnhash_capi == NULL ) {
1013- return NULL ;
1022+ if (!_PyIsUnicodeEncodeError (exc )) {
1023+ wrong_exception_type (exc );
1024+ return NULL ;
1025+ }
1026+
1027+ _PyUnicode_Name_CAPI * ucnhash_capi = _PyUnicode_GetNameCAPI ();
1028+ if (ucnhash_capi == NULL ) {
1029+ return NULL ;
1030+ }
1031+
1032+ PyObject * obj ;
1033+ Py_ssize_t start , end ;
1034+ if (_PyUnicodeError_GetParams (exc ,
1035+ & obj , NULL ,
1036+ & start , & end , NULL , false) < 0 )
1037+ {
1038+ return NULL ;
1039+ }
1040+
1041+ char buffer [256 ]; /* NAME_MAXLEN in unicodename_db.h */
1042+ Py_ssize_t imax = start , ressize = 0 , replsize ;
1043+ for (; imax < end ; ++ imax ) {
1044+ Py_UCS4 c = PyUnicode_READ_CHAR (obj , imax );
1045+ if (ucnhash_capi -> getname (c , buffer , sizeof (buffer ), 1 )) {
1046+ // If 'c' is recognized by getname(), the corresponding replacement
1047+ // is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1
1048+ // characters. Failures of getname() are ignored by the handler.
1049+ replsize = 1 + 1 + 1 + strlen (buffer ) + 1 ;
10141050 }
1015- for (i = start , ressize = 0 ; i < end ; ++ i ) {
1016- /* object is guaranteed to be "ready" */
1017- c = PyUnicode_READ_CHAR (object , i );
1018- if (ucnhash_capi -> getname (c , buffer , sizeof (buffer ), 1 )) {
1019- replsize = 1 + 1 + 1 + (int )strlen (buffer )+ 1 ;
1020- }
1021- else if (c >= 0x10000 ) {
1022- replsize = 1 + 1 + 8 ;
1023- }
1024- else if (c >= 0x100 ) {
1025- replsize = 1 + 1 + 4 ;
1026- }
1027- else
1028- replsize = 1 + 1 + 2 ;
1029- if (ressize > PY_SSIZE_T_MAX - replsize )
1030- break ;
1031- ressize += replsize ;
1051+ else {
1052+ replsize = codec_handler_unicode_hex_width (c );
10321053 }
1033- end = i ;
1034- res = PyUnicode_New (ressize , 127 );
1035- if (res == NULL )
1036- return NULL ;
1037- for (i = start , outp = PyUnicode_1BYTE_DATA (res );
1038- i < end ; ++ i ) {
1039- c = PyUnicode_READ_CHAR (object , i );
1040- * outp ++ = '\\' ;
1041- if (ucnhash_capi -> getname (c , buffer , sizeof (buffer ), 1 )) {
1042- * outp ++ = 'N' ;
1043- * outp ++ = '{' ;
1044- strcpy ((char * )outp , buffer );
1045- outp += strlen (buffer );
1046- * outp ++ = '}' ;
1047- continue ;
1048- }
1049- if (c >= 0x00010000 ) {
1050- * outp ++ = 'U' ;
1051- * outp ++ = Py_hexdigits [(c >>28 )& 0xf ];
1052- * outp ++ = Py_hexdigits [(c >>24 )& 0xf ];
1053- * outp ++ = Py_hexdigits [(c >>20 )& 0xf ];
1054- * outp ++ = Py_hexdigits [(c >>16 )& 0xf ];
1055- * outp ++ = Py_hexdigits [(c >>12 )& 0xf ];
1056- * outp ++ = Py_hexdigits [(c >>8 )& 0xf ];
1057- }
1058- else if (c >= 0x100 ) {
1059- * outp ++ = 'u' ;
1060- * outp ++ = Py_hexdigits [(c >>12 )& 0xf ];
1061- * outp ++ = Py_hexdigits [(c >>8 )& 0xf ];
1062- }
1063- else
1064- * outp ++ = 'x' ;
1065- * outp ++ = Py_hexdigits [(c >>4 )& 0xf ];
1066- * outp ++ = Py_hexdigits [c & 0xf ];
1054+ if (ressize > PY_SSIZE_T_MAX - replsize ) {
1055+ break ;
10671056 }
1068-
1069- assert (outp == PyUnicode_1BYTE_DATA (res ) + ressize );
1070- assert (_PyUnicode_CheckConsistency (res , 1 ));
1071- restuple = Py_BuildValue ("(Nn)" , res , end );
1072- Py_DECREF (object );
1073- return restuple ;
1057+ ressize += replsize ;
10741058 }
1075- else {
1076- wrong_exception_type (exc );
1059+
1060+ PyObject * res = PyUnicode_New (ressize , 127 );
1061+ if (res == NULL ) {
1062+ Py_DECREF (obj );
10771063 return NULL ;
10781064 }
1065+
1066+ Py_UCS1 * outp = PyUnicode_1BYTE_DATA (res );
1067+ for (Py_ssize_t i = start ; i < imax ; ++ i ) {
1068+ Py_UCS4 c = PyUnicode_READ_CHAR (obj , i );
1069+ if (ucnhash_capi -> getname (c , buffer , sizeof (buffer ), 1 )) {
1070+ * outp ++ = '\\' ;
1071+ * outp ++ = 'N' ;
1072+ * outp ++ = '{' ;
1073+ (void )strcpy ((char * )outp , buffer );
1074+ outp += strlen (buffer );
1075+ * outp ++ = '}' ;
1076+ }
1077+ else {
1078+ codec_handler_write_unicode_hex (& outp , c );
1079+ }
1080+ }
1081+
1082+ assert (outp == PyUnicode_1BYTE_DATA (res ) + ressize );
1083+ assert (_PyUnicode_CheckConsistency (res , 1 ));
1084+ PyObject * restuple = Py_BuildValue ("(Nn)" , res , imax );
1085+ Py_DECREF (obj );
1086+ return restuple ;
10791087}
10801088
1089+
10811090#define ENC_UNKNOWN -1
10821091#define ENC_UTF8 0
10831092#define ENC_UTF16BE 1
@@ -1421,11 +1430,14 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
14211430 return PyCodec_BackslashReplaceErrors (exc );
14221431}
14231432
1424- static PyObject * namereplace_errors (PyObject * self , PyObject * exc )
1433+
1434+ static inline PyObject *
1435+ namereplace_errors (PyObject * Py_UNUSED (self ), PyObject * exc )
14251436{
14261437 return PyCodec_NameReplaceErrors (exc );
14271438}
14281439
1440+
14291441static PyObject * surrogatepass_errors (PyObject * self , PyObject * exc )
14301442{
14311443 return PyCodec_SurrogatePassErrors (exc );
0 commit comments