Skip to content

Commit 0432ae7

Browse files
authored
Changing pybind11::str to exclusively hold PyUnicodeObject (#2409)
* Changing pybind11::str to exclusively hold PyUnicodeObject
1 parent 587d5f8 commit 0432ae7

File tree

8 files changed

+136
-13
lines changed

8 files changed

+136
-13
lines changed

docs/changelog.rst

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,15 @@ Changelog
66
Starting with version 1.8.0, pybind11 releases use a `semantic versioning
77
<http://semver.org>`_ policy.
88

9-
v2.6.3 (TBA, not yet released)
9+
v2.7.0 (TBA, not yet released)
1010
------------------------------
1111

12-
* Details to follow here
12+
* ``py::str`` changed to exclusively hold `PyUnicodeObject`. Previously
13+
``py::str`` could also hold `bytes`, which is probably surprising, was
14+
never documented, and can mask bugs (e.g. accidental use of ``py::str``
15+
instead of ``py::bytes``).
16+
`#2409 <https://github.com/pybind/pybind11/pull/2409>`_
17+
1318

1419
v2.6.2 (Jan 26, 2021)
1520
---------------------

docs/upgrade.rst

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,31 @@ modernization and other useful information.
1010

1111
.. _upgrade-guide-2.6:
1212

13+
v2.7
14+
====
15+
16+
*Before* v2.7, ``py::str`` can hold ``PyUnicodeObject`` or ``PyBytesObject``,
17+
and ``py::isinstance<str>()`` is ``true`` for both ``py::str`` and
18+
``py::bytes``. Starting with v2.7, ``py::str`` exclusively holds
19+
``PyUnicodeObject`` (`#2409 <https://github.com/pybind/pybind11/pull/2409>`_),
20+
and ``py::isinstance<str>()`` is ``true`` only for ``py::str``. To help in
21+
the transition of user code, the ``PYBIND11_STR_LEGACY_PERMISSIVE`` macro
22+
is provided as an escape hatch to go back to the legacy behavior. This macro
23+
will be removed in future releases. Two types of required fixes are expected
24+
to be common:
25+
26+
* Accidental use of ``py::str`` instead of ``py::bytes``, masked by the legacy
27+
behavior. These are probably very easy to fix, by changing from
28+
``py::str`` to ``py::bytes``.
29+
30+
* Reliance on py::isinstance<str>(obj) being ``true`` for
31+
``py::bytes``. This is likely to be easy to fix in most cases by adding
32+
``|| py::isinstance<bytes>(obj)``, but a fix may be more involved, e.g. if
33+
``py::isinstance<T>`` appears in a template. Such situations will require
34+
careful review and custom fixes.
35+
36+
37+
1338
v2.6
1439
====
1540

include/pybind11/cast.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1656,6 +1656,17 @@ struct pyobject_caster {
16561656

16571657
template <typename T = type, enable_if_t<std::is_base_of<object, T>::value, int> = 0>
16581658
bool load(handle src, bool /* convert */) {
1659+
#if PY_MAJOR_VERSION < 3 && !defined(PYBIND11_STR_LEGACY_PERMISSIVE)
1660+
// For Python 2, without this implicit conversion, Python code would
1661+
// need to be cluttered with six.ensure_text() or similar, only to be
1662+
// un-cluttered later after Python 2 support is dropped.
1663+
if (std::is_same<T, str>::value && isinstance<bytes>(src)) {
1664+
PyObject *str_from_bytes = PyUnicode_FromEncodedObject(src.ptr(), "utf-8", nullptr);
1665+
if (!str_from_bytes) throw error_already_set();
1666+
value = reinterpret_steal<type>(str_from_bytes);
1667+
return true;
1668+
}
1669+
#endif
16591670
if (!isinstance<type>(src))
16601671
return false;
16611672
value = reinterpret_borrow<type>(src);

include/pybind11/detail/common.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,19 @@
163163
#include <typeindex>
164164
#include <type_traits>
165165

166+
// #define PYBIND11_STR_LEGACY_PERMISSIVE
167+
// If DEFINED, pybind11::str can hold PyUnicodeObject or PyBytesObject
168+
// (probably surprising and never documented, but this was the
169+
// legacy behavior until and including v2.6.x). As a side-effect,
170+
// pybind11::isinstance<str>() is true for both pybind11::str and
171+
// pybind11::bytes.
172+
// If UNDEFINED, pybind11::str can only hold PyUnicodeObject, and
173+
// pybind11::isinstance<str>() is true only for pybind11::str.
174+
// However, for Python 2 only (!), the pybind11::str caster
175+
// implicitly decodes bytes to PyUnicodeObject. This is to ease
176+
// the transition from the legacy behavior to the non-permissive
177+
// behavior.
178+
166179
#if PY_MAJOR_VERSION >= 3 /// Compatibility macros for various Python versions
167180
#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyInstanceMethod_New(ptr)
168181
#define PYBIND11_INSTANCE_METHOD_CHECK PyInstanceMethod_Check

include/pybind11/pytypes.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -756,7 +756,12 @@ inline bool PyIterable_Check(PyObject *obj) {
756756
inline bool PyNone_Check(PyObject *o) { return o == Py_None; }
757757
inline bool PyEllipsis_Check(PyObject *o) { return o == Py_Ellipsis; }
758758

759+
#ifdef PYBIND11_STR_LEGACY_PERMISSIVE
759760
inline bool PyUnicode_Check_Permissive(PyObject *o) { return PyUnicode_Check(o) || PYBIND11_BYTES_CHECK(o); }
761+
#define PYBIND11_STR_CHECK_FUN detail::PyUnicode_Check_Permissive
762+
#else
763+
#define PYBIND11_STR_CHECK_FUN PyUnicode_Check
764+
#endif
760765

761766
inline bool PyStaticMethod_Check(PyObject *o) { return o->ob_type == &PyStaticMethod_Type; }
762767

@@ -936,7 +941,7 @@ class bytes;
936941

937942
class str : public object {
938943
public:
939-
PYBIND11_OBJECT_CVT(str, object, detail::PyUnicode_Check_Permissive, raw_str)
944+
PYBIND11_OBJECT_CVT(str, object, PYBIND11_STR_CHECK_FUN, raw_str)
940945

941946
str(const char *c, size_t n)
942947
: object(PyUnicode_FromStringAndSize(c, (ssize_t) n), stolen_t{}) {

include/pybind11/stl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ template <typename Type, typename Value> struct list_caster {
144144
using value_conv = make_caster<Value>;
145145

146146
bool load(handle src, bool convert) {
147-
if (!isinstance<sequence>(src) || isinstance<str>(src))
147+
if (!isinstance<sequence>(src) || isinstance<bytes>(src) || isinstance<str>(src))
148148
return false;
149149
auto s = reinterpret_borrow<sequence>(src);
150150
value.clear();

tests/test_pytypes.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,4 +413,15 @@ TEST_SUBMODULE(pytypes, m) {
413413

414414
// test_builtin_functions
415415
m.def("get_len", [](py::handle h) { return py::len(h); });
416+
417+
#ifdef PYBIND11_STR_LEGACY_PERMISSIVE
418+
m.attr("PYBIND11_STR_LEGACY_PERMISSIVE") = true;
419+
#endif
420+
421+
m.def("isinstance_pybind11_bytes", [](py::object o) { return py::isinstance<py::bytes>(o); });
422+
m.def("isinstance_pybind11_str", [](py::object o) { return py::isinstance<py::str>(o); });
423+
424+
m.def("pass_to_pybind11_bytes", [](py::bytes b) { return py::len(b); });
425+
m.def("pass_to_pybind11_str", [](py::str s) { return py::len(s); });
426+
m.def("pass_to_std_string", [](std::string s) { return s.size(); });
416427
}

tests/test_pytypes.py

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -120,14 +120,17 @@ def __repr__(self):
120120
assert s1 == s2
121121

122122
malformed_utf8 = b"\x80"
123-
assert m.str_from_object(malformed_utf8) is malformed_utf8 # To be fixed; see #2380
123+
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
124+
assert m.str_from_object(malformed_utf8) is malformed_utf8
125+
elif env.PY2:
126+
with pytest.raises(UnicodeDecodeError):
127+
m.str_from_object(malformed_utf8)
128+
else:
129+
assert m.str_from_object(malformed_utf8) == "b'\\x80'"
124130
if env.PY2:
125-
# with pytest.raises(UnicodeDecodeError):
126-
# m.str_from_object(malformed_utf8)
127131
with pytest.raises(UnicodeDecodeError):
128132
m.str_from_handle(malformed_utf8)
129133
else:
130-
# assert m.str_from_object(malformed_utf8) == "b'\\x80'"
131134
assert m.str_from_handle(malformed_utf8) == "b'\\x80'"
132135

133136

@@ -303,13 +306,26 @@ def test_pybind11_str_raw_str():
303306
valid_orig = u"DZ"
304307
valid_utf8 = valid_orig.encode("utf-8")
305308
valid_cvt = cvt(valid_utf8)
306-
assert type(valid_cvt) == bytes # Probably surprising.
307-
assert valid_cvt == b"\xc7\xb1"
309+
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
310+
assert valid_cvt is valid_utf8
311+
else:
312+
assert type(valid_cvt) is unicode if env.PY2 else str # noqa: F821
313+
if env.PY2:
314+
assert valid_cvt == valid_orig
315+
else:
316+
assert valid_cvt == "b'\\xc7\\xb1'"
308317

309318
malformed_utf8 = b"\x80"
310-
malformed_cvt = cvt(malformed_utf8)
311-
assert type(malformed_cvt) == bytes # Probably surprising.
312-
assert malformed_cvt == b"\x80"
319+
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
320+
assert cvt(malformed_utf8) is malformed_utf8
321+
else:
322+
if env.PY2:
323+
with pytest.raises(UnicodeDecodeError):
324+
cvt(malformed_utf8)
325+
else:
326+
malformed_cvt = cvt(malformed_utf8)
327+
assert type(malformed_cvt) is str
328+
assert malformed_cvt == "b'\\x80'"
313329

314330

315331
def test_implicit_casting():
@@ -488,3 +504,40 @@ def test_builtin_functions():
488504
"object of type 'generator' has no len()",
489505
"'generator' has no length",
490506
] # PyPy
507+
508+
509+
def test_isinstance_string_types():
510+
assert m.isinstance_pybind11_bytes(b"")
511+
assert not m.isinstance_pybind11_bytes(u"")
512+
513+
assert m.isinstance_pybind11_str(u"")
514+
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
515+
assert m.isinstance_pybind11_str(b"")
516+
else:
517+
assert not m.isinstance_pybind11_str(b"")
518+
519+
520+
def test_pass_bytes_or_unicode_to_string_types():
521+
assert m.pass_to_pybind11_bytes(b"Bytes") == 5
522+
with pytest.raises(TypeError):
523+
m.pass_to_pybind11_bytes(u"Str")
524+
525+
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE") or env.PY2:
526+
assert m.pass_to_pybind11_str(b"Bytes") == 5
527+
else:
528+
with pytest.raises(TypeError):
529+
m.pass_to_pybind11_str(b"Bytes")
530+
assert m.pass_to_pybind11_str(u"Str") == 3
531+
532+
assert m.pass_to_std_string(b"Bytes") == 5
533+
assert m.pass_to_std_string(u"Str") == 3
534+
535+
malformed_utf8 = b"\x80"
536+
if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
537+
assert m.pass_to_pybind11_str(malformed_utf8) == 1
538+
elif env.PY2:
539+
with pytest.raises(UnicodeDecodeError):
540+
m.pass_to_pybind11_str(malformed_utf8)
541+
else:
542+
with pytest.raises(TypeError):
543+
m.pass_to_pybind11_str(malformed_utf8)

0 commit comments

Comments
 (0)