Skip to content

REF: Use PyUnicode_AsUTF8AndSize instead of get_c_string_buf_and_size #58227

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 12 additions & 13 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Template for each `dtype` helper function for hashtable

WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""

from cpython.unicode cimport PyUnicode_AsUTF8

{{py:

Expand Down Expand Up @@ -98,7 +98,6 @@ from pandas._libs.khash cimport (
# VectorData
# ----------------------------------------------------------------------

from pandas._libs.tslibs.util cimport get_c_string
from pandas._libs.missing cimport C_NA


Expand Down Expand Up @@ -998,7 +997,7 @@ cdef class StringHashTable(HashTable):
cdef:
khiter_t k
const char *v
v = get_c_string(val)
v = PyUnicode_AsUTF8(val)

k = kh_get_str(self.table, v)
if k != self.table.n_buckets:
Expand All @@ -1012,7 +1011,7 @@ cdef class StringHashTable(HashTable):
int ret = 0
const char *v

v = get_c_string(key)
v = PyUnicode_AsUTF8(key)

k = kh_put_str(self.table, v, &ret)
if kh_exist_str(self.table, k):
Expand All @@ -1037,7 +1036,7 @@ cdef class StringHashTable(HashTable):
raise MemoryError()
for i in range(n):
val = values[i]
v = get_c_string(val)
v = PyUnicode_AsUTF8(val)
vecs[i] = v

with nogil:
Expand Down Expand Up @@ -1071,11 +1070,11 @@ cdef class StringHashTable(HashTable):
val = values[i]

if isinstance(val, str):
# GH#31499 if we have a np.str_ get_c_string won't recognize
# GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize
# it as a str, even though isinstance does.
v = get_c_string(<str>val)
v = PyUnicode_AsUTF8(<str>val)
else:
v = get_c_string(self.na_string_sentinel)
v = PyUnicode_AsUTF8(self.na_string_sentinel)
vecs[i] = v

with nogil:
Expand Down Expand Up @@ -1109,11 +1108,11 @@ cdef class StringHashTable(HashTable):
val = values[i]

if isinstance(val, str):
# GH#31499 if we have a np.str_ get_c_string won't recognize
# GH#31499 if we have a np.str_ PyUnicode_AsUTF8 won't recognize
# it as a str, even though isinstance does.
v = get_c_string(<str>val)
v = PyUnicode_AsUTF8(<str>val)
else:
v = get_c_string(self.na_string_sentinel)
v = PyUnicode_AsUTF8(self.na_string_sentinel)
vecs[i] = v

with nogil:
Expand Down Expand Up @@ -1195,9 +1194,9 @@ cdef class StringHashTable(HashTable):
else:
# if ignore_na is False, we also stringify NaN/None/etc.
try:
v = get_c_string(<str>val)
v = PyUnicode_AsUTF8(<str>val)
except UnicodeEncodeError:
v = get_c_string(<str>repr(val))
v = PyUnicode_AsUTF8(<str>repr(val))
vecs[i] = v

# compute
Expand Down
6 changes: 3 additions & 3 deletions pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ from cpython.object cimport (
Py_LT,
Py_NE,
)
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
from libc.stdint cimport INT64_MAX

import_datetime()
Expand All @@ -44,7 +45,6 @@ from pandas._libs.tslibs.dtypes cimport (
npy_unit_to_abbrev,
npy_unit_to_attrname,
)
from pandas._libs.tslibs.util cimport get_c_string_buf_and_size


cdef extern from "pandas/datetime/pd_datetime.h":
Expand Down Expand Up @@ -341,13 +341,13 @@ cdef int string_to_dts(
const char* format_buf
FormatRequirement format_requirement

buf = get_c_string_buf_and_size(val, &length)
buf = PyUnicode_AsUTF8AndSize(val, &length)
if format is None:
format_buf = b""
format_length = 0
format_requirement = INFER_FORMAT
else:
format_buf = get_c_string_buf_and_size(format, &format_length)
format_buf = PyUnicode_AsUTF8AndSize(format, &format_length)
format_requirement = <FormatRequirement>exact
return parse_iso_8601_datetime(buf, length, want_exc,
dts, out_bestunit, out_local, out_tzoffset,
Expand Down
14 changes: 6 additions & 8 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ from cpython.datetime cimport (
from datetime import timezone

from cpython.object cimport PyObject_Str
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
from cython cimport Py_ssize_t
from libc.string cimport strchr

Expand Down Expand Up @@ -74,10 +75,7 @@ import_pandas_datetime()

from pandas._libs.tslibs.strptime import array_strptime

from pandas._libs.tslibs.util cimport (
get_c_string_buf_and_size,
is_array,
)
from pandas._libs.tslibs.util cimport is_array


cdef extern from "pandas/portable.h":
Expand Down Expand Up @@ -175,7 +173,7 @@ cdef datetime _parse_delimited_date(
int day = 1, month = 1, year
bint can_swap = 0

buf = get_c_string_buf_and_size(date_string, &length)
buf = PyUnicode_AsUTF8AndSize(date_string, &length)
if length == 10 and _is_delimiter(buf[2]) and _is_delimiter(buf[5]):
# parsing MM?DD?YYYY and DD?MM?YYYY dates
month = _parse_2digit(buf)
Expand Down Expand Up @@ -251,7 +249,7 @@ cdef bint _does_string_look_like_time(str parse_string):
Py_ssize_t length
int hour = -1, minute = -1

buf = get_c_string_buf_and_size(parse_string, &length)
buf = PyUnicode_AsUTF8AndSize(parse_string, &length)
if length >= 4:
if buf[1] == b":":
# h:MM format
Expand Down Expand Up @@ -467,7 +465,7 @@ cpdef bint _does_string_look_like_datetime(str py_string):
char first
int error = 0

buf = get_c_string_buf_and_size(py_string, &length)
buf = PyUnicode_AsUTF8AndSize(py_string, &length)
if length >= 1:
first = buf[0]
if first == b"0":
Expand Down Expand Up @@ -521,7 +519,7 @@ cdef datetime _parse_dateabbr_string(str date_string, datetime default,
pass

if 4 <= date_len <= 7:
buf = get_c_string_buf_and_size(date_string, &date_len)
buf = PyUnicode_AsUTF8AndSize(date_string, &date_len)
try:
i = date_string.index("Q", 1, 6)
if i == 1:
Expand Down
31 changes: 0 additions & 31 deletions pandas/_libs/tslibs/util.pxd
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@

from cpython.object cimport PyTypeObject
from cpython.unicode cimport PyUnicode_AsUTF8AndSize


cdef extern from "Python.h":
Expand Down Expand Up @@ -155,36 +154,6 @@ cdef inline bint is_nan(object val):
return is_complex_object(val) and val != val


cdef inline const char* get_c_string_buf_and_size(str py_string,
Py_ssize_t *length) except NULL:
"""
Return the internal UTF-8 char* buffer of the str object `py_string`,
storing the length of that buffer in `length`.

Thin wrapper around PyUnicode_AsUTF8AndSize().

Notes
-----
The Python object owns the memory, thus the returned char* must not be freed.
`length` can be NULL if the buffer length is not needed.
A NULL return value signals an error (see `except NULL` in the signature).

Parameters
----------
py_string : str
length : Py_ssize_t*
    Output argument receiving the buffer length; may be NULL.

Returns
-------
buf : const char*
"""
# Note: PyUnicode_AsUTF8AndSize() may allocate memory internally in the
# unlikely case that the underlying unicode object is not stored as UTF-8
# and its UTF-8 representation has not been requested before.
return PyUnicode_AsUTF8AndSize(py_string, length)


cdef inline const char* get_c_string(str py_string) except NULL:
"""
Return the char* buffer of `py_string` without reporting its length.

Delegates to get_c_string_buf_and_size() with a NULL `length` pointer.
"""
return get_c_string_buf_and_size(py_string, NULL)


cdef inline bytes string_encode_locale(str py_string):
"""
As opposed to PyUnicode_Encode, use current system locale to encode.

Delegates to PyUnicode_EncodeLocale() with a NULL second argument and
returns the resulting bytes object.
"""
return PyUnicode_EncodeLocale(py_string, NULL)
Expand Down