@@ -676,6 +676,60 @@ wrong_exception_type(PyObject *exc)
676
676
PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
677
677
678
678
679
+ // --- codecs handlers: utilities ---------------------------------------------
680
+
681
+ /*
682
+ * Return the number of characters (including special prefixes)
683
+ * needed to represent 'ch' by codec_handler_write_unicode_hex().
684
+ */
685
+ static inline Py_ssize_t
686
+ codec_handler_unicode_hex_width (Py_UCS4 ch )
687
+ {
688
+ if (ch >= 0x10000 ) {
689
+ // format: '\\' + 'U' + 8 hex digits
690
+ return 1 + 1 + 8 ;
691
+ }
692
+ else if (ch >= 0x100 ) {
693
+ // format: '\\' + 'u' + 4 hex digits
694
+ return 1 + 1 + 4 ;
695
+ }
696
+ else {
697
+ // format: '\\' + 'x' + 2 hex digits
698
+ return 1 + 1 + 2 ;
699
+ }
700
+ }
701
+
702
+
703
+ /*
704
+ * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
705
+ * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
706
+ */
707
+ static inline void
708
+ codec_handler_write_unicode_hex (Py_UCS1 * * p , Py_UCS4 ch )
709
+ {
710
+ * (* p )++ = '\\' ;
711
+ if (ch >= 0x10000 ) {
712
+ * (* p )++ = 'U' ;
713
+ * (* p )++ = Py_hexdigits [(ch >> 28 ) & 0xf ];
714
+ * (* p )++ = Py_hexdigits [(ch >> 24 ) & 0xf ];
715
+ * (* p )++ = Py_hexdigits [(ch >> 20 ) & 0xf ];
716
+ * (* p )++ = Py_hexdigits [(ch >> 16 ) & 0xf ];
717
+ * (* p )++ = Py_hexdigits [(ch >> 12 ) & 0xf ];
718
+ * (* p )++ = Py_hexdigits [(ch >> 8 ) & 0xf ];
719
+ }
720
+ else if (ch >= 0x100 ) {
721
+ * (* p )++ = 'u' ;
722
+ * (* p )++ = Py_hexdigits [(ch >> 12 ) & 0xf ];
723
+ * (* p )++ = Py_hexdigits [(ch >> 8 ) & 0xf ];
724
+ }
725
+ else {
726
+ * (* p )++ = 'x' ;
727
+ }
728
+ * (* p )++ = Py_hexdigits [(ch >> 4 ) & 0xf ];
729
+ * (* p )++ = Py_hexdigits [ch & 0xf ];
730
+ }
731
+
732
+
679
733
// --- handler: 'strict' ------------------------------------------------------
680
734
681
735
PyObject * PyCodec_StrictErrors (PyObject * exc )
@@ -942,17 +996,8 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
942
996
943
997
Py_ssize_t ressize = 0 ;
944
998
for (Py_ssize_t i = start ; i < end ; ++ i ) {
945
- /* object is guaranteed to be "ready" */
946
999
Py_UCS4 c = PyUnicode_READ_CHAR (obj , i );
947
- if (c >= 0x10000 ) {
948
- ressize += 1 + 1 + 8 ;
949
- }
950
- else if (c >= 0x100 ) {
951
- ressize += 1 + 1 + 4 ;
952
- }
953
- else {
954
- ressize += 1 + 1 + 2 ;
955
- }
1000
+ ressize += codec_handler_unicode_hex_width (c );
956
1001
}
957
1002
PyObject * res = PyUnicode_New (ressize , 127 );
958
1003
if (res == NULL ) {
@@ -962,122 +1007,86 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
962
1007
Py_UCS1 * outp = PyUnicode_1BYTE_DATA (res );
963
1008
for (Py_ssize_t i = start ; i < end ; ++ i ) {
964
1009
Py_UCS4 c = PyUnicode_READ_CHAR (obj , i );
965
- * outp ++ = '\\' ;
966
- if (c >= 0x00010000 ) {
967
- * outp ++ = 'U' ;
968
- * outp ++ = Py_hexdigits [(c >> 28 ) & 0xf ];
969
- * outp ++ = Py_hexdigits [(c >> 24 ) & 0xf ];
970
- * outp ++ = Py_hexdigits [(c >> 20 ) & 0xf ];
971
- * outp ++ = Py_hexdigits [(c >> 16 ) & 0xf ];
972
- * outp ++ = Py_hexdigits [(c >> 12 ) & 0xf ];
973
- * outp ++ = Py_hexdigits [(c >> 8 ) & 0xf ];
974
- }
975
- else if (c >= 0x100 ) {
976
- * outp ++ = 'u' ;
977
- * outp ++ = Py_hexdigits [(c >> 12 ) & 0xf ];
978
- * outp ++ = Py_hexdigits [(c >> 8 ) & 0xf ];
979
- }
980
- else {
981
- * outp ++ = 'x' ;
982
- }
983
- * outp ++ = Py_hexdigits [(c >> 4 ) & 0xf ];
984
- * outp ++ = Py_hexdigits [c & 0xf ];
1010
+ codec_handler_write_unicode_hex (& outp , c );
985
1011
}
986
1012
assert (_PyUnicode_CheckConsistency (res , 1 ));
987
1013
Py_DECREF (obj );
988
1014
return Py_BuildValue ("(Nn)" , res , end );
989
1015
}
990
1016
1017
+
1018
+ // --- handler: 'namereplace' -------------------------------------------------
1019
+
991
1020
/*
 * 'namereplace' error handler.
 *
 * 'exc' must satisfy _PyIsUnicodeEncodeError(); otherwise
 * wrong_exception_type() is called and NULL is returned.
 *
 * Each character in the [start, end) range reported by the exception is
 * replaced by '\N{NAME}' when the unicode-name C API's getname() recognizes
 * it, and by the hexadecimal escape written by
 * codec_handler_write_unicode_hex() otherwise.
 *
 * On success the result is built with Py_BuildValue("(Nn)", ...) from the
 * replacement string and the index at which processing stopped.  That index
 * is lower than 'end' only when the replacement length would have exceeded
 * PY_SSIZE_T_MAX.  NULL is returned on failure.
 */
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
{
    if (!_PyIsUnicodeEncodeError(exc)) {
        wrong_exception_type(exc);
        return NULL;
    }

    _PyUnicode_Name_CAPI *names = _PyUnicode_GetNameCAPI();
    if (names == NULL) {
        return NULL;
    }

    PyObject *obj;
    Py_ssize_t start, end;
    if (_PyUnicodeError_GetParams(exc,
                                  &obj, NULL,
                                  &start, &end, NULL, false) < 0)
    {
        return NULL;
    }

    char name[256];  /* NAME_MAXLEN in unicodename_db.h */

    // First pass: measure the output and find where to stop.
    Py_ssize_t stop = start;
    Py_ssize_t total = 0;
    while (stop < end) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, stop);
        Py_ssize_t width;
        if (names->getname(ch, name, sizeof(name), 1)) {
            // '\\' + 'N' + '{' + NAME + '}'
            width = 1 + 1 + 1 + strlen(name) + 1;
        }
        else {
            // getname() failures are not errors: fall back to a hex escape.
            width = codec_handler_unicode_hex_width(ch);
        }
        if (total > PY_SSIZE_T_MAX - width) {
            // Adding this replacement would overflow: handle a shorter range.
            break;
        }
        total += width;
        ++stop;
    }

    PyObject *res = PyUnicode_New(total, 127);
    if (res == NULL) {
        Py_DECREF(obj);
        return NULL;
    }

    // Second pass: write the replacements for [start, stop).
    Py_UCS1 *dst = PyUnicode_1BYTE_DATA(res);
    for (Py_ssize_t pos = start; pos < stop; ++pos) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, pos);
        if (!names->getname(ch, name, sizeof(name), 1)) {
            codec_handler_write_unicode_hex(&dst, ch);
            continue;
        }
        size_t namelen = strlen(name);
        *dst++ = '\\';
        *dst++ = 'N';
        *dst++ = '{';
        memcpy(dst, name, namelen);
        dst += namelen;
        *dst++ = '}';
    }

    // Both passes must agree on the number of characters produced.
    assert(dst == PyUnicode_1BYTE_DATA(res) + total);
    assert(_PyUnicode_CheckConsistency(res, 1));
    PyObject *restuple = Py_BuildValue("(Nn)", res, stop);
    Py_DECREF(obj);
    return restuple;
}
1080
1088
1089
+
1081
1090
#define ENC_UNKNOWN -1
1082
1091
#define ENC_UTF8 0
1083
1092
#define ENC_UTF16BE 1
@@ -1421,11 +1430,14 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1421
1430
return PyCodec_BackslashReplaceErrors (exc );
1422
1431
}
1423
1432
1424
- static PyObject * namereplace_errors (PyObject * self , PyObject * exc )
1433
+
1434
+ static inline PyObject *
1435
+ namereplace_errors (PyObject * Py_UNUSED (self ), PyObject * exc )
1425
1436
{
1426
1437
return PyCodec_NameReplaceErrors (exc );
1427
1438
}
1428
1439
1440
+
1429
1441
static PyObject * surrogatepass_errors (PyObject * self , PyObject * exc )
1430
1442
{
1431
1443
return PyCodec_SurrogatePassErrors (exc );
0 commit comments