@@ -202,6 +202,11 @@ static PyObject *
202
202
unicode_decode_utf8 (const char * s , Py_ssize_t size ,
203
203
_Py_error_handler error_handler , const char * errors ,
204
204
Py_ssize_t * consumed );
205
/* Forward declaration: unicode_decode_utf8_writer() is defined further down
   in this file but is already called by unicode_fromformat_write_utf8().
   It takes the same decoding arguments as unicode_decode_utf8() plus the
   _PyUnicodeWriter to append to, and reports success/failure through an
   int status (callers test for a negative value) instead of returning a
   new string object. */
+ static int
206
+ unicode_decode_utf8_writer (_PyUnicodeWriter * writer ,
207
+ const char * s , Py_ssize_t size ,
208
+ _Py_error_handler error_handler , const char * errors ,
209
+ Py_ssize_t * consumed );
205
210
#ifdef Py_DEBUG
206
211
static inline int unicode_is_finalizing (void );
207
212
static int unicode_is_singleton (PyObject * unicode );
@@ -2377,14 +2382,11 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2377
2382
}
2378
2383
2379
2384
/* Append the UTF-8 encoded C string `str` to `writer` on behalf of the
   format machinery (renamed from unicode_fromformat_write_cstr).

   width / precision / flags come from the parsed format specifier; -1 is
   the "not given" value tested for precision below.
   Undecodable bytes are handled with the "replace" error handler.

   Returns the status of the underlying write: -1 when decoding fails,
   otherwise whatever unicode_decode_utf8_writer() or
   unicode_fromformat_write_str() returns. */
static int
2380
- unicode_fromformat_write_cstr (_PyUnicodeWriter * writer , const char * str ,
2385
+ unicode_fromformat_write_utf8 (_PyUnicodeWriter * writer , const char * str ,
2381
2386
Py_ssize_t width , Py_ssize_t precision , int flags )
2382
2387
{
2383
2388
/* UTF-8 */
2384
2389
Py_ssize_t length ;
2385
- PyObject * unicode ;
2386
- int res ;
2387
-
2388
2390
/* No precision given: use the whole NUL-terminated string. */
if (precision == -1 ) {
2389
2391
length = strlen (str );
2390
2392
}
/* NOTE(review): the branch that derives `length` from an explicit precision
   is elided by the diff between these two hunks; only its tail is visible. */
@@ -2394,13 +2396,22 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2394
2396
length ++ ;
2395
2397
}
2396
2398
}
2397
- unicode = PyUnicode_DecodeUTF8Stateful (str , length , "replace" , NULL );
2398
- if (unicode == NULL )
2399
- return -1 ;
2400
2399
2401
- res = unicode_fromformat_write_str (writer , unicode , width , -1 , flags );
2402
- Py_DECREF (unicode );
2403
- return res ;
2400
/* Negative width: decode straight into the writer, so no temporary
   string object has to be created and released. */
+ if (width < 0 ) {
2401
+ return unicode_decode_utf8_writer (writer , str , length ,
2402
+ _Py_ERROR_UNKNOWN , "replace" , NULL );
2403
+ }
2404
/* Otherwise decode into a temporary string first and let
   unicode_fromformat_write_str() apply width and flags to it. */
+ else {
2405
+ PyObject * unicode = PyUnicode_DecodeUTF8Stateful (str , length ,
2406
+ "replace" , NULL );
2407
+ if (unicode == NULL )
2408
+ return -1 ;
2409
+
2410
+ int res = unicode_fromformat_write_str (writer , unicode ,
2411
+ width , -1 , flags );
2412
/* The temporary is no longer needed once it has been handed to the writer. */
+ Py_DECREF (unicode );
2413
+ return res ;
2414
+ }
2404
2415
}
2405
2416
2406
2417
static int
@@ -2700,7 +2711,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
2700
2711
else {
2701
2712
/* UTF-8 */
2702
2713
const char * s = va_arg (* vargs , const char * );
2703
- if (unicode_fromformat_write_cstr (writer , s , width , precision , flags ) < 0 )
2714
+ if (unicode_fromformat_write_utf8 (writer , s , width , precision , flags ) < 0 )
2704
2715
return NULL ;
2705
2716
}
2706
2717
break ;
@@ -2739,7 +2750,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
2739
2750
}
2740
2751
else {
2741
2752
assert (str != NULL );
2742
- if (unicode_fromformat_write_cstr (writer , str , width , precision , flags ) < 0 )
2753
+ if (unicode_fromformat_write_utf8 (writer , str , width , precision , flags ) < 0 )
2743
2754
return NULL ;
2744
2755
}
2745
2756
break ;
@@ -4737,65 +4748,56 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4737
4748
return p - start ;
4738
4749
}
4739
4750
4740
- static PyObject *
4741
- unicode_decode_utf8 (const char * s , Py_ssize_t size ,
4742
- _Py_error_handler error_handler , const char * errors ,
4743
- Py_ssize_t * consumed )
4744
- {
4745
- if (size == 0 ) {
4746
- if (consumed )
4747
- * consumed = 0 ;
4748
- _Py_RETURN_UNICODE_EMPTY ();
4749
- }
4750
-
4751
- /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4752
- if (size == 1 && (unsigned char )s [0 ] < 128 ) {
4753
- if (consumed ) {
4754
- * consumed = 1 ;
4755
- }
4756
- return get_latin1_char ((unsigned char )s [0 ]);
4757
- }
4758
4751
4752
+ static int
4753
+ unicode_decode_utf8_writer (_PyUnicodeWriter * writer ,
4754
+ const char * s , Py_ssize_t size ,
4755
+ _Py_error_handler error_handler , const char * errors ,
4756
+ Py_ssize_t * consumed )
4757
+ {
4759
4758
const char * starts = s ;
4760
4759
const char * end = s + size ;
4761
4760
4762
4761
// fast path: try ASCII string.
4763
- PyObject * u = PyUnicode_New (size , 127 );
4764
- if (u == NULL ) {
4765
- return NULL ;
4762
+ if (_PyUnicodeWriter_Prepare (writer , size , 127 ) < 0 ) {
4763
+ return -1 ;
4766
4764
}
4767
- s += ascii_decode (s , end , PyUnicode_1BYTE_DATA (u ));
4768
- if (s == end ) {
4769
- if (consumed ) {
4770
- * consumed = size ;
4765
+
4766
+ Py_UCS1 * dest = (Py_UCS1 * )writer -> data + writer -> pos * writer -> kind ;
4767
+ if (writer -> kind == PyUnicode_1BYTE_KIND
4768
+ && _Py_IS_ALIGNED (dest , ALIGNOF_SIZE_T ))
4769
+ {
4770
+ Py_ssize_t decoded = ascii_decode (s , end , dest );
4771
+ writer -> pos += decoded ;
4772
+
4773
+ if (decoded == size ) {
4774
+ if (consumed ) {
4775
+ * consumed = size ;
4776
+ }
4777
+ return 0 ;
4771
4778
}
4772
- return u ;
4779
+ s += decoded ;
4773
4780
}
4774
4781
4775
- // Use _PyUnicodeWriter after fast path is failed.
4776
- _PyUnicodeWriter writer ;
4777
- _PyUnicodeWriter_InitWithBuffer (& writer , u );
4778
- writer .pos = s - starts ;
4779
-
4780
4782
Py_ssize_t startinpos , endinpos ;
4781
4783
const char * errmsg = "" ;
4782
4784
PyObject * error_handler_obj = NULL ;
4783
4785
PyObject * exc = NULL ;
4784
4786
4785
4787
while (s < end ) {
4786
4788
Py_UCS4 ch ;
4787
- int kind = writer . kind ;
4789
+ int kind = writer -> kind ;
4788
4790
4789
4791
if (kind == PyUnicode_1BYTE_KIND ) {
4790
- if (PyUnicode_IS_ASCII (writer . buffer ))
4791
- ch = asciilib_utf8_decode (& s , end , writer . data , & writer . pos );
4792
+ if (PyUnicode_IS_ASCII (writer -> buffer ))
4793
+ ch = asciilib_utf8_decode (& s , end , writer -> data , & writer -> pos );
4792
4794
else
4793
- ch = ucs1lib_utf8_decode (& s , end , writer . data , & writer . pos );
4795
+ ch = ucs1lib_utf8_decode (& s , end , writer -> data , & writer -> pos );
4794
4796
} else if (kind == PyUnicode_2BYTE_KIND ) {
4795
- ch = ucs2lib_utf8_decode (& s , end , writer . data , & writer . pos );
4797
+ ch = ucs2lib_utf8_decode (& s , end , writer -> data , & writer -> pos );
4796
4798
} else {
4797
4799
assert (kind == PyUnicode_4BYTE_KIND );
4798
- ch = ucs4lib_utf8_decode (& s , end , writer . data , & writer . pos );
4800
+ ch = ucs4lib_utf8_decode (& s , end , writer -> data , & writer -> pos );
4799
4801
}
4800
4802
4801
4803
switch (ch ) {
@@ -4826,7 +4828,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
4826
4828
endinpos = startinpos + ch - 1 ;
4827
4829
break ;
4828
4830
default :
4829
- if (_PyUnicodeWriter_WriteCharInline (& writer , ch ) < 0 )
4831
+ if (_PyUnicodeWriter_WriteCharInline (writer , ch ) < 0 )
4830
4832
goto onError ;
4831
4833
continue ;
4832
4834
}
@@ -4840,7 +4842,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
4840
4842
break ;
4841
4843
4842
4844
case _Py_ERROR_REPLACE :
4843
- if (_PyUnicodeWriter_WriteCharInline (& writer , 0xfffd ) < 0 )
4845
+ if (_PyUnicodeWriter_WriteCharInline (writer , 0xfffd ) < 0 )
4844
4846
goto onError ;
4845
4847
s += (endinpos - startinpos );
4846
4848
break ;
@@ -4849,13 +4851,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
4849
4851
{
4850
4852
Py_ssize_t i ;
4851
4853
4852
- if (_PyUnicodeWriter_PrepareKind (& writer , PyUnicode_2BYTE_KIND ) < 0 )
4854
+ if (_PyUnicodeWriter_PrepareKind (writer , PyUnicode_2BYTE_KIND ) < 0 )
4853
4855
goto onError ;
4854
4856
for (i = startinpos ; i < endinpos ; i ++ ) {
4855
4857
ch = (Py_UCS4 )(unsigned char )(starts [i ]);
4856
- PyUnicode_WRITE (writer . kind , writer . data , writer . pos ,
4858
+ PyUnicode_WRITE (writer -> kind , writer -> data , writer -> pos ,
4857
4859
ch + 0xdc00 );
4858
- writer . pos ++ ;
4860
+ writer -> pos ++ ;
4859
4861
}
4860
4862
s += (endinpos - startinpos );
4861
4863
break ;
@@ -4866,8 +4868,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
4866
4868
errors , & error_handler_obj ,
4867
4869
"utf-8" , errmsg ,
4868
4870
& starts , & end , & startinpos , & endinpos , & exc , & s ,
4869
- & writer ))
4871
+ writer )) {
4870
4872
goto onError ;
4873
+ }
4874
+
4875
/* After the custom error handler has run, make sure the writer has room
   for the input that is still left (end - s bytes). The handler receives
   starts/end/s by address, so presumably they may have been updated. */
+ if (_PyUnicodeWriter_Prepare (writer , end - s , 127 ) < 0 ) {
4876
+ goto onError ; /* not a bare return: onError releases error_handler_obj and exc, which the handler call above may have set */
4877
+ }
4871
4878
}
4872
4879
}
4873
4880
@@ -4877,13 +4884,44 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
4877
4884
4878
4885
Py_XDECREF (error_handler_obj );
4879
4886
Py_XDECREF (exc );
4880
- return _PyUnicodeWriter_Finish ( & writer ) ;
4887
+ return 0 ;
4881
4888
4882
4889
onError :
4883
4890
Py_XDECREF (error_handler_obj );
4884
4891
Py_XDECREF (exc );
4885
- _PyUnicodeWriter_Dealloc (& writer );
4886
- return NULL ;
4892
+ return -1 ;
4893
+ }
4894
+
4895
+
4896
/* Decode `size` bytes of UTF-8 starting at `s` into a new string object.

   This is now a thin wrapper: it handles the two trivial inputs itself
   (empty input, single ASCII byte) and otherwise delegates the actual
   decoding to unicode_decode_utf8_writer() using a local _PyUnicodeWriter.

   error_handler / errors are passed through unchanged to the writer-based
   decoder. If `consumed` is not NULL it is set to 0 or 1 on the trivial
   paths; on the general path it is forwarded to the decoder.

   Returns the decoded object, or NULL if the decoder reports failure
   (the writer is deallocated in that case). */
+ static PyObject *
4897
+ unicode_decode_utf8 (const char * s , Py_ssize_t size ,
4898
+ _Py_error_handler error_handler , const char * errors ,
4899
+ Py_ssize_t * consumed )
4900
+ {
4901
/* Empty input: nothing to decode, nothing consumed. */
+ if (size == 0 ) {
4902
+ if (consumed )
4903
+ * consumed = 0 ;
4904
+ _Py_RETURN_UNICODE_EMPTY ();
4905
+ }
4906
+
4907
+ /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4908
+ if (size == 1 && (unsigned char )s [0 ] < 128 ) {
4909
+ if (consumed ) {
4910
+ * consumed = 1 ;
4911
+ }
4912
+ return get_latin1_char ((unsigned char )s [0 ]);
4913
+ }
4914
+
4915
/* General case: decode into a fresh writer owned by this function. */
+ _PyUnicodeWriter writer ;
4916
+ _PyUnicodeWriter_Init (& writer );
4917
+
4918
+ if (unicode_decode_utf8_writer (& writer , s , size ,
4919
+ error_handler , errors ,
4920
+ consumed ) < 0 ) {
4921
/* The decoder only reports the error; the writer is released here. */
+ _PyUnicodeWriter_Dealloc (& writer );
4922
+ return NULL ;
4923
+ }
4924
+ return _PyUnicodeWriter_Finish (& writer );
4887
4925
}
4888
4926
4889
4927
0 commit comments