From 139ad73ec01c1f59cd381a3e0a66d4f2c8f666d2 Mon Sep 17 00:00:00 2001 From: "Erlend E. Aasland" Date: Wed, 30 Aug 2023 23:56:31 +0200 Subject: [PATCH 01/14] gh-108590: Add sqlite3 text factory howto Document how to handle table columns with invalid Unicode sequences. --- Doc/library/sqlite3.rst | 92 +++++++++++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 32 deletions(-) diff --git a/Doc/library/sqlite3.rst b/Doc/library/sqlite3.rst index 0abdab52340dfd..a5c7c51f7e54ac 100644 --- a/Doc/library/sqlite3.rst +++ b/Doc/library/sqlite3.rst @@ -1444,39 +1444,8 @@ Connection objects and returns a text representation of it. The callable is invoked for SQLite values with the ``TEXT`` data type. By default, this attribute is set to :class:`str`. - If you want to return ``bytes`` instead, set *text_factory* to ``bytes``. - Example: - - .. testcode:: - - con = sqlite3.connect(":memory:") - cur = con.cursor() - - AUSTRIA = "Österreich" - - # by default, rows are returned as str - cur.execute("SELECT ?", (AUSTRIA,)) - row = cur.fetchone() - assert row[0] == AUSTRIA - - # but we can make sqlite3 always return bytestrings ... - con.text_factory = bytes - cur.execute("SELECT ?", (AUSTRIA,)) - row = cur.fetchone() - assert type(row[0]) is bytes - # the bytestrings will be encoded in UTF-8, unless you stored garbage in the - # database ... - assert row[0] == AUSTRIA.encode("utf-8") - - # we can also implement a custom text_factory ... - # here we implement one that appends "foo" to all strings - con.text_factory = lambda x: x.decode("utf-8") + "foo" - cur.execute("SELECT ?", ("bar",)) - row = cur.fetchone() - assert row[0] == "barfoo" - - con.close() + See :ref:`sqlite3-howto-text-factory` for more details. .. attribute:: total_changes @@ -2614,6 +2583,65 @@ With some adjustments, the above recipe can be adapted to use a instead of a :class:`~collections.namedtuple`. +.. 
_sqlite3-howto-text-factory: + +How to create and use text factories +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, :mod:`!sqlite3` adapts SQLite values with the ``TEXT`` data type +using :class:`str`. +This works well for correctly encoded UTF-8 text, but it will fail for invalid +Unicode sequences and other encodings. +To work around this, you can use a custom :attr:`~Connection.text_factory`. + +Because of SQLites flexible typing, it is not uncommon to encounter table +columns with the ``TEXT`` data type, containing arbitrary data. +Let's create a test database with an invalid Unicode sequence: + +.. testcode:: + + con = sqlite3.connect(":memory:") + con.executescript(""" + CREATE TABLE test (data TEXT); + INSERT INTO test VALUES(CAST(X'619F' AS TEXT)); + """) + +To work with such databases, we can use the following trick, +borrowed from the :ref:`unicode-howto`: + +.. testcode:: + + con.text_factory = lambda data: str(data, errors="surrogateescape") + dump = con.iterdump() + for line in dump: + print(line) + +The dump will now print with Unicode surrogate escapes: + +.. testoutput:: + + BEGIN TRANSACTION; + CREATE TABLE test (data TEXT); + INSERT INTO "test" VALUES('a\udc9f'); + COMMIT; + +Notice that in order to write the invalid Unicode sequence to a file, +you must also use ``errors="surrogateescape"`` as an argument to :func:`open`: + +.. testcode:: + + with open("dump.sql", "w", errors="surrogateescape") as f: + sql = "\n".join(dump) + f.write(sql) + +.. note:: + + Unlike :attr:`~Cursor.row_factory`, which exists as an attribute both on + :class:`Cursor` and :class:`Connection` objects, + :attr:`~Connection.text_factory` only exists as an attribute on + :class:`!Connection` objects. + + .. _sqlite3-explanation: Explanation From 88a75991291fbc7aa5707bb050a3c552e7081b64 Mon Sep 17 00:00:00 2001 From: "Erlend E. 
Aasland" Date: Thu, 31 Aug 2023 00:53:39 +0200 Subject: [PATCH 02/14] Apply suggestions from code review Co-authored-by: Alex Waygood --- Doc/library/sqlite3.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Doc/library/sqlite3.rst b/Doc/library/sqlite3.rst index a5c7c51f7e54ac..6955d593fcd120 100644 --- a/Doc/library/sqlite3.rst +++ b/Doc/library/sqlite3.rst @@ -2594,9 +2594,9 @@ This works well for correctly encoded UTF-8 text, but it will fail for invalid Unicode sequences and other encodings. To work around this, you can use a custom :attr:`~Connection.text_factory`. -Because of SQLites flexible typing, it is not uncommon to encounter table +Because of SQLite's flexible typing, it is not uncommon to encounter table columns with the ``TEXT`` data type, containing arbitrary data. -Let's create a test database with an invalid Unicode sequence: +To demonstrate, let's create a test database with an invalid Unicode sequence: .. testcode:: @@ -2606,7 +2606,7 @@ Let's create a test database with an invalid Unicode sequence: INSERT INTO test VALUES(CAST(X'619F' AS TEXT)); """) -To work with such databases, we can use the following trick, +To work with such databases, we can use the following technique, borrowed from the :ref:`unicode-howto`: .. testcode:: @@ -2625,8 +2625,8 @@ The dump will now print with Unicode surrogate escapes: INSERT INTO "test" VALUES('a\udc9f'); COMMIT; -Notice that in order to write the invalid Unicode sequence to a file, -you must also use ``errors="surrogateescape"`` as an argument to :func:`open`: +Note that in order to write the invalid Unicode sequence to a file, +you will also have to use ``errors="surrogateescape"`` as an argument to :func:`open`: .. testcode:: From f9aac630adcbce8c40bec9db4415ba53c4749a2e Mon Sep 17 00:00:00 2001 From: "Erlend E. 
Aasland" Date: Thu, 31 Aug 2023 09:53:15 +0200 Subject: [PATCH 03/14] Add note to iterdump(); explain why we use CAST --- Doc/library/sqlite3.rst | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/Doc/library/sqlite3.rst b/Doc/library/sqlite3.rst index a5c7c51f7e54ac..9d8cd84051c78b 100644 --- a/Doc/library/sqlite3.rst +++ b/Doc/library/sqlite3.rst @@ -1157,6 +1157,13 @@ Connection objects f.write('%s\n' % line) con.close() + .. note:: + + If your database contains ``TEXT`` values with invalid Unicode + sequences, or encodings incompatible with UTF-8, + you must use a custom :attr:`text_factory`. + See :ref:`sqlite3-howto-text-factory` for more details. + .. method:: backup(target, *, pages=-1, progress=None, name="main", sleep=0.250) @@ -2594,9 +2601,12 @@ This works well for correctly encoded UTF-8 text, but it will fail for invalid Unicode sequences and other encodings. To work around this, you can use a custom :attr:`~Connection.text_factory`. -Because of SQLites flexible typing, it is not uncommon to encounter table +Because of SQLites `flexible typing`_, it is not uncommon to encounter table columns with the ``TEXT`` data type, containing arbitrary data. -Let's create a test database with an invalid Unicode sequence: +Let's create a test database with an invalid Unicode sequence. +We will use a `CAST expression`_ to coerce an invalid Unicode sequence, +represented as a hexadecimal string ``X'619F'``, +into the ``TEXT`` data type: .. testcode:: @@ -2616,7 +2626,7 @@ borrowed from the :ref:`unicode-howto`: for line in dump: print(line) -The dump will now print with Unicode surrogate escapes: +The loop above will print the offending line using Unicode surrogate escapes: .. testoutput:: @@ -2641,6 +2651,8 @@ you must also use ``errors="surrogateescape"`` as an argument to :func:`open`: :attr:`~Connection.text_factory` only exists as an attribute on :class:`!Connection` objects. +.. 
_CAST expression: https://www.sqlite.org/lang_expr.html#castexpr + .. _sqlite3-explanation: From fdc240f9ae40f7419e76984e8069a688e0670c19 Mon Sep 17 00:00:00 2001 From: "Erlend E. Aasland" Date: Thu, 31 Aug 2023 10:05:23 +0200 Subject: [PATCH 04/14] Mention that execute() and friends only accept UTF-8 encoded strings --- Doc/library/sqlite3.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Doc/library/sqlite3.rst b/Doc/library/sqlite3.rst index 7c04b0de426b08..af5e9d747986bd 100644 --- a/Doc/library/sqlite3.rst +++ b/Doc/library/sqlite3.rst @@ -1538,6 +1538,8 @@ Cursor objects Use :meth:`executescript` to execute multiple SQL statements. + :meth:`!execute` only accepts UTF-8 encoded strings. + .. method:: executemany(sql, parameters, /) For every item in *parameters*, @@ -1586,6 +1588,8 @@ Cursor objects Starting with Python 3.14, :exc:`ProgrammingError` will be raised instead. + :meth:`!executemany` only accepts UTF-8 encoded strings. + .. method:: executescript(sql_script, /) Execute the SQL statements in *sql_script*. @@ -1611,6 +1615,7 @@ Cursor objects COMMIT; """) + :meth:`!executescript` only accepts UTF-8 encoded strings. .. method:: fetchone() From 62073e5364c49d6acb85fe6e7dd0d95b22173afd Mon Sep 17 00:00:00 2001 From: "Erlend E. 
Aasland" Date: Thu, 31 Aug 2023 10:48:48 +0200 Subject: [PATCH 05/14] Try to emphasize that strings with surrogate escapes cannot be passed to sqlite3 API --- Doc/library/sqlite3.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Doc/library/sqlite3.rst b/Doc/library/sqlite3.rst index af5e9d747986bd..8b08c6c4f3c320 100644 --- a/Doc/library/sqlite3.rst +++ b/Doc/library/sqlite3.rst @@ -2640,8 +2640,11 @@ The loop above will print the offending line using Unicode surrogate escapes: INSERT INTO "test" VALUES('a\udc9f'); COMMIT; -Note that in order to write the invalid Unicode sequence to a file, -you will also have to use ``errors="surrogateescape"`` as an argument to :func:`open`: +Note that strings containing surrogate escapes must be treated with care. +You cannot pass them back to SQLite, for example using :meth:`~Cursor.execute`, +since the :mod:`!sqlite3` module APIs only accept UTF-8 encoded strings. +In order to write strings containing surrogate escapes to a file, +you will have to use ``errors="surrogateescape"`` as an argument to :func:`open`: .. testcode:: From 35c6e9dd09b2889ba9f223b33067e3c4ef8a6683 Mon Sep 17 00:00:00 2001 From: "Erlend E. Aasland" Date: Thu, 31 Aug 2023 10:52:30 +0200 Subject: [PATCH 06/14] Add seealso for the Unicode HOWTO --- Doc/library/sqlite3.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Doc/library/sqlite3.rst b/Doc/library/sqlite3.rst index 8b08c6c4f3c320..9a3afebdb998ae 100644 --- a/Doc/library/sqlite3.rst +++ b/Doc/library/sqlite3.rst @@ -2659,6 +2659,10 @@ you will have to use ``errors="surrogateescape"`` as an argument to :func:`open` :attr:`~Connection.text_factory` only exists as an attribute on :class:`!Connection` objects. +.. seealso:: + + :ref:`unicode-howto` + .. _CAST expression: https://www.sqlite.org/lang_expr.html#castexpr From 3b9f620aacac66c20344473a218af5d75959736f Mon Sep 17 00:00:00 2001 From: "Erlend E. 
Aasland" Date: Mon, 4 Sep 2023 09:58:21 +0200 Subject: [PATCH 07/14] Try to address reviews: - Use a Latin 2 example; a Czech-English dictionary. We need the _values_ to be encoded in Latin 2, not the _column names_. - Reword the title; the how-to is about the problem, not the means. - Reword UTF-8 passages as requested by Ezio. --- Doc/library/sqlite3.rst | 89 ++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 50 deletions(-) diff --git a/Doc/library/sqlite3.rst b/Doc/library/sqlite3.rst index 9a3afebdb998ae..795f8e474c9077 100644 --- a/Doc/library/sqlite3.rst +++ b/Doc/library/sqlite3.rst @@ -1157,12 +1157,9 @@ Connection objects f.write('%s\n' % line) con.close() - .. note:: + .. seealso:: - If your database contains ``TEXT`` values with invalid Unicode - sequences, or encodings incompatible with UTF-8, - you must use a custom :attr:`text_factory`. - See :ref:`sqlite3-howto-text-factory` for more details. + :ref:`sqlite3-howto-encoding` .. method:: backup(target, *, pages=-1, progress=None, name="main", sleep=0.250) @@ -1230,6 +1227,10 @@ Connection objects .. versionadded:: 3.7 + .. seealso: + + :ref:`sqlite3-howto-encoding` + .. method:: getlimit(category, /) Get a connection runtime limit. @@ -1452,7 +1453,7 @@ Connection objects The callable is invoked for SQLite values with the ``TEXT`` data type. By default, this attribute is set to :class:`str`. - See :ref:`sqlite3-howto-text-factory` for more details. + See :ref:`sqlite3-howto-encoding` for more details. .. attribute:: total_changes @@ -2595,62 +2596,52 @@ With some adjustments, the above recipe can be adapted to use a instead of a :class:`~collections.namedtuple`. -.. _sqlite3-howto-text-factory: +.. _sqlite3-howto-encoding: -How to create and use text factories -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +How to handle non-UTF-8 text encodings +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ By default, :mod:`!sqlite3` adapts SQLite values with the ``TEXT`` data type using :class:`str`. 
-This works well for correctly encoded UTF-8 text, but it will fail for invalid -Unicode sequences and other encodings. -To work around this, you can use a custom :attr:`~Connection.text_factory`. +This works well for UTF-8 encoded text, but it will fail for other encodings +and invalid UTF-8. +You can use a custom :attr:`~Connection.text_factory` to handle such cases. Because of SQLite's `flexible typing`_, it is not uncommon to encounter table -columns with the ``TEXT`` data type, containing arbitrary data. -To demonstrate, let's create a test database with an invalid Unicode sequence. -We will use a `CAST expression`_ to coerce an invalid Unicode sequence, -represented as a hexadecimal string ``X'619F'``, -into the ``TEXT`` data type: +columns with the ``TEXT`` data type, containing non-UTF-8 encodings, +or even arbitrary data. +To demonstrate, let's assume we build a database by importing a ISO-8859-2 +(Latin 2) encoded CSV file, for example a list of Czech-English dictionary +entries, :file:`entries.csv`: + +.. code-block:: shell + + $ python3 + >>> with open("entries.csv", "w", encoding="latin2") as f: + ... f.write("czech,english\n") # header row + ... f.write("položka,entry\n") # data rows + ... f.write("částka,sum\n") + >>> quit() + $ sqlite3 dictionary.db + sqlite> .mode csv + sqlite> .import entries.csv dict + sqlite> .quit + +:file:`dictionary.db` now contains a database that contains Latin 2 encoded text. +Assuming we now have a :class:`Connection` instance :py:data:`!con` +connected to :file:`dictionary.db`, +we can decode the Latin 2 encoded text using this :attr:`~Connection.text_factory`: .. 
testcode:: - con = sqlite3.connect(":memory:") - con.executescript(""" - CREATE TABLE test (data TEXT); - INSERT INTO test VALUES(CAST(X'619F' AS TEXT)); - """) + con.text_factory = lambda data: str(data, encoding="latin2") -To work with such databases, we can use the following technique, -borrowed from the :ref:`unicode-howto`: +For invalid UTF-8 or arbitrary data stored in ``TEXT`` table columns, +you can use the following technique, borrowed from the :ref:`unicode-howto`: .. testcode:: con.text_factory = lambda data: str(data, errors="surrogateescape") - dump = con.iterdump() - for line in dump: - print(line) - -The loop above will print the offending line using Unicode surrogate escapes: - -.. testoutput:: - - BEGIN TRANSACTION; - CREATE TABLE test (data TEXT); - INSERT INTO "test" VALUES('a\udc9f'); - COMMIT; - -Note that strings containing surrogate escapes must be treated with care. -You cannot pass them back to SQLite, for example using :meth:`~Cursor.execute`, -since the :mod:`!sqlite3` module APIs only accept UTF-8 encoded strings. -In order to write strings containing surrogate escapes to a file, -you will have to use ``errors="surrogateescape"`` as an argument to :func:`open`: - -.. testcode:: - - with open("dump.sql", "w", errors="surrogateescape") as f: - sql = "\n".join(dump) - f.write(sql) .. note:: @@ -2663,8 +2654,6 @@ you will have to use ``errors="surrogateescape"`` as an argument to :func:`open` :ref:`unicode-howto` -.. _CAST expression: https://www.sqlite.org/lang_expr.html#castexpr - .. _sqlite3-explanation: From 57f1a5793c2f0c8dd6bc9b5852a3b8d19f1b3c0d Mon Sep 17 00:00:00 2001 From: "Erlend E. 
Aasland" Date: Mon, 4 Sep 2023 10:05:02 +0200 Subject: [PATCH 08/14] Fix note --- Doc/library/sqlite3.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Doc/library/sqlite3.rst b/Doc/library/sqlite3.rst index 795f8e474c9077..fee10a06a487c7 100644 --- a/Doc/library/sqlite3.rst +++ b/Doc/library/sqlite3.rst @@ -2645,10 +2645,10 @@ you can use the following technique, borrowed from the :ref:`unicode-howto`: .. note:: - Unlike :attr:`~Cursor.row_factory`, which exists as an attribute both on - :class:`Cursor` and :class:`Connection` objects, - :attr:`~Connection.text_factory` only exists as an attribute on - :class:`!Connection` objects. + Strings containing surrogate escapes and must be treated with care. + You cannot simply pass them back to SQLite, + for example using :meth:`~Cursor.execute`, + since the :mod:`!sqlite3` module APIs only accept UTF-8 encoded strings. .. seealso:: From c4cbe84d9fa3ea49f16f07c40bc3a3e69ffcc3e7 Mon Sep 17 00:00:00 2001 From: "Erlend E. Aasland" Date: Mon, 4 Sep 2023 11:17:20 +0200 Subject: [PATCH 09/14] Address review: don't need to show how to create the database --- Doc/library/sqlite3.rst | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/Doc/library/sqlite3.rst b/Doc/library/sqlite3.rst index fee10a06a487c7..df77c01a06c2c3 100644 --- a/Doc/library/sqlite3.rst +++ b/Doc/library/sqlite3.rst @@ -2610,26 +2610,10 @@ You can use a custom :attr:`~Connection.text_factory` to handle such cases. Because of SQLite's `flexible typing`_, it is not uncommon to encounter table columns with the ``TEXT`` data type, containing non-UTF-8 encodings, or even arbitrary data. -To demonstrate, let's assume we build a database by importing a ISO-8859-2 -(Latin 2) encoded CSV file, for example a list of Czech-English dictionary -entries, :file:`entries.csv`: - -.. code-block:: shell - - $ python3 - >>> with open("entries.csv", "w", encoding="latin2") as f: - ... 
f.write("czech,english\n") # header row - ... f.write("položka,entry\n") # data rows - ... f.write("částka,sum\n") - >>> quit() - $ sqlite3 dictionary.db - sqlite> .mode csv - sqlite> .import entries.csv dict - sqlite> .quit - -:file:`dictionary.db` now contains a database that contains Latin 2 encoded text. +To demonstrate, let's assume we've got a database with ISO-8859-2 (Latin 2) +encoded text, for example a table of Czech-English dictionary entries. Assuming we now have a :class:`Connection` instance :py:data:`!con` -connected to :file:`dictionary.db`, +connected to this database, we can decode the Latin 2 encoded text using this :attr:`~Connection.text_factory`: .. testcode:: From 9b78bf2b135b42746ba71b9371dedeaf25f1906d Mon Sep 17 00:00:00 2001 From: "Erlend E. Aasland" Date: Mon, 4 Sep 2023 11:29:26 +0200 Subject: [PATCH 10/14] Remove execute*() changes --- Doc/library/sqlite3.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Doc/library/sqlite3.rst b/Doc/library/sqlite3.rst index df77c01a06c2c3..6f19afa983d3d9 100644 --- a/Doc/library/sqlite3.rst +++ b/Doc/library/sqlite3.rst @@ -1539,8 +1539,6 @@ Cursor objects Use :meth:`executescript` to execute multiple SQL statements. - :meth:`!execute` only accepts UTF-8 encoded strings. - .. method:: executemany(sql, parameters, /) For every item in *parameters*, @@ -1589,8 +1587,6 @@ Cursor objects Starting with Python 3.14, :exc:`ProgrammingError` will be raised instead. - :meth:`!executemany` only accepts UTF-8 encoded strings. - .. method:: executescript(sql_script, /) Execute the SQL statements in *sql_script*. @@ -1616,8 +1612,6 @@ Cursor objects COMMIT; """) - :meth:`!executescript` only accepts UTF-8 encoded strings. - .. method:: fetchone() If :attr:`~Cursor.row_factory` is ``None``, From a91fa7cf1b63901baaa9cb2b21c0307c9c295571 Mon Sep 17 00:00:00 2001 From: "Erlend E. 
Aasland" Date: Thu, 7 Sep 2023 09:34:31 +0200 Subject: [PATCH 11/14] Update Doc/library/sqlite3.rst --- Doc/library/sqlite3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/sqlite3.rst b/Doc/library/sqlite3.rst index 6f19afa983d3d9..451d54a964cafe 100644 --- a/Doc/library/sqlite3.rst +++ b/Doc/library/sqlite3.rst @@ -1227,7 +1227,7 @@ Connection objects .. versionadded:: 3.7 - .. seealso: + .. seealso:: :ref:`sqlite3-howto-encoding` From 341179169acff88020d4339d432987422c9d87eb Mon Sep 17 00:00:00 2001 From: "Erlend E. Aasland" Date: Thu, 7 Sep 2023 09:39:10 +0200 Subject: [PATCH 12/14] Update Doc/library/sqlite3.rst --- Doc/library/sqlite3.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Doc/library/sqlite3.rst b/Doc/library/sqlite3.rst index 451d54a964cafe..df472ecfc0b3f5 100644 --- a/Doc/library/sqlite3.rst +++ b/Doc/library/sqlite3.rst @@ -2624,9 +2624,9 @@ you can use the following technique, borrowed from the :ref:`unicode-howto`: .. note:: Strings containing surrogate escapes and must be treated with care. - You cannot simply pass them back to SQLite, - for example using :meth:`~Cursor.execute`, - since the :mod:`!sqlite3` module APIs only accept UTF-8 encoded strings. + For example, you cannot pass them back to SQLite, + since the :mod:`!sqlite3` module API does not support strings + containing surrogate escape codes. .. seealso:: From dc4c8205ea174f10fc7205cf88e6a845e9f52884 Mon Sep 17 00:00:00 2001 From: "Erlend E. Aasland" Date: Thu, 12 Oct 2023 18:50:16 +0200 Subject: [PATCH 13/14] Apply suggestions from CAM and Ezio Co-authored-by: C.A.M. 
Gerlach Co-authored-by: Ezio Melotti --- Doc/library/sqlite3.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Doc/library/sqlite3.rst b/Doc/library/sqlite3.rst index df472ecfc0b3f5..3682f715ebf75d 100644 --- a/Doc/library/sqlite3.rst +++ b/Doc/library/sqlite3.rst @@ -2595,20 +2595,20 @@ instead of a :class:`~collections.namedtuple`. How to handle non-UTF-8 text encodings ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -By default, :mod:`!sqlite3` adapts SQLite values with the ``TEXT`` data type -using :class:`str`. -This works well for UTF-8 encoded text, but it will fail for other encodings +By default, :mod:`!sqlite3` uses :class:`str` to adapt SQLite values +with the ``TEXT`` data type. +This works well for UTF-8 encoded text, but it might fail for other encodings and invalid UTF-8. You can use a custom :attr:`~Connection.text_factory` to handle such cases. Because of SQLite's `flexible typing`_, it is not uncommon to encounter table -columns with the ``TEXT`` data type, containing non-UTF-8 encodings, +columns with the ``TEXT`` data type containing non-UTF-8 encodings, or even arbitrary data. -To demonstrate, let's assume we've got a database with ISO-8859-2 (Latin 2) +To demonstrate, let's assume we have a database with ISO-8859-2 (Latin-2) encoded text, for example a table of Czech-English dictionary entries. Assuming we now have a :class:`Connection` instance :py:data:`!con` connected to this database, -we can decode the Latin 2 encoded text using this :attr:`~Connection.text_factory`: +we can decode the Latin-2 encoded text using this :attr:`~Connection.text_factory`: .. testcode:: @@ -2623,7 +2623,7 @@ you can use the following technique, borrowed from the :ref:`unicode-howto`: .. note:: - Strings containing surrogate escapes and must be treated with care. + Strings containing surrogate escapes must be treated with care. 
For example, you cannot pass them back to SQLite, since the :mod:`!sqlite3` module API does not support strings containing surrogate escape codes. From 7e0e615f4533b54af6a2ed62f9b499f91632f1ff Mon Sep 17 00:00:00 2001 From: "Erlend E. Aasland" Date: Fri, 13 Oct 2023 09:58:05 +0200 Subject: [PATCH 14/14] Keep it simple Co-authored-by: Ezio Melotti --- Doc/library/sqlite3.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Doc/library/sqlite3.rst b/Doc/library/sqlite3.rst index 3682f715ebf75d..f558dee0ef5503 100644 --- a/Doc/library/sqlite3.rst +++ b/Doc/library/sqlite3.rst @@ -2623,10 +2623,8 @@ you can use the following technique, borrowed from the :ref:`unicode-howto`: .. note:: - Strings containing surrogate escapes must be treated with care. - For example, you cannot pass them back to SQLite, - since the :mod:`!sqlite3` module API does not support strings - containing surrogate escape codes. + The :mod:`!sqlite3` module API does not support strings + containing surrogates. .. seealso::