|
1 |
| -import unittest |
| 1 | +import struct |
2 | 2 | import sys
|
| 3 | +import unittest |
3 | 4 | from test import support
|
4 | 5 | from test.support import import_helper
|
5 | 6 |
|
@@ -28,6 +29,14 @@ class Str(str):
|
28 | 29 | pass
|
29 | 30 |
|
30 | 31 |
|
# Bit flags passed as the format argument of the
# _testlimitedcapi.unicode_export() and unicode_import() helpers.
# Several flags may be OR-ed together when exporting.
PyUnicode_FORMAT_ASCII = 1 << 0
PyUnicode_FORMAT_UCS1 = 1 << 1
PyUnicode_FORMAT_UCS2 = 1 << 2
PyUnicode_FORMAT_UCS4 = 1 << 3
PyUnicode_FORMAT_UTF8 = 1 << 4
# Invalid native format
PyUnicode_FORMAT_INVALID = 1 << 5
| 39 | + |
31 | 40 | class CAPITest(unittest.TestCase):
|
32 | 41 |
|
33 | 42 | @support.cpython_only
|
@@ -1721,6 +1730,139 @@ def test_pep393_utf8_caching_bug(self):
|
1721 | 1730 | # Check that the second call returns the same result
|
1722 | 1731 | self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
|
1723 | 1732 |
|
def test_unicode_export(self):
    # Test PyUnicode_Export() and PyUnicode_FreeExport()
    #
    # Every successful export is compared against a 4-tuple:
    # (exported bytes, format flag, item size, struct-style buffer code).
    export = _testlimitedcapi.unicode_export

    # UCS2/UCS4 data is expected in the byte order of the machine.
    byte_order = 'le' if sys.byteorder == 'little' else 'be'
    ucs2_enc = 'utf-16' + byte_order
    ucs4_enc = 'utf-32' + byte_order

    BUFFER_UCS1 = 'B'
    BUFFER_UCS2 = 'H'
    # Use whichever of 'I' / 'L' is exactly 4 bytes wide on this build.
    for candidate in ('I', 'L'):
        if struct.calcsize(candidate) == 4:
            BUFFER_UCS4 = candidate
            break
    else:
        self.fail("unable to get BUFFER_UCS4 ")

    ASCII = PyUnicode_FORMAT_ASCII
    UCS1 = PyUnicode_FORMAT_UCS1
    UCS2 = PyUnicode_FORMAT_UCS2
    UCS4 = PyUnicode_FORMAT_UCS4
    UTF8 = PyUnicode_FORMAT_UTF8
    formats = ASCII | UCS1 | UCS2 | UCS4

    latin1 = "latin1:\xe9"
    ucs2 = 'ucs2:\u20ac'
    ucs4 = 'ucs4:\U0010ffff'

    # (string, requested formats, expected export result)
    cases = (
        # export to the native format
        ("abc", formats,
         (b'abc', ASCII, 1, BUFFER_UCS1)),
        (latin1, formats,
         (b'latin1:\xe9', UCS1, 1, BUFFER_UCS1)),
        (ucs2, formats,
         (ucs2.encode(ucs2_enc), UCS2, 2, BUFFER_UCS2)),
        (ucs4, formats,
         (ucs4.encode(ucs4_enc), UCS4, 4, BUFFER_UCS4)),

        # export ASCII as UCS1
        ("abc", UCS1,
         (b'abc', UCS1, 1, BUFFER_UCS1)),

        # export ASCII and UCS1 to UCS2
        ("abc", UCS2,
         ('abc'.encode(ucs2_enc), UCS2, 2, BUFFER_UCS2)),
        (latin1, UCS2,
         (latin1.encode(ucs2_enc), UCS2, 2, BUFFER_UCS2)),

        # always export to UCS4
        ("abc", UCS4,
         ('abc'.encode(ucs4_enc), UCS4, 4, BUFFER_UCS4)),
        (latin1, UCS4,
         (latin1.encode(ucs4_enc), UCS4, 4, BUFFER_UCS4)),
        (ucs2, UCS4,
         (ucs2.encode(ucs4_enc), UCS4, 4, BUFFER_UCS4)),
        (ucs4, UCS4,
         (ucs4.encode(ucs4_enc), UCS4, 4, BUFFER_UCS4)),

        # always export to UTF8
        ("abc", UTF8,
         ('abc'.encode('utf8'), UTF8, 1, 'B')),
        (latin1, UTF8,
         (latin1.encode('utf8'), UTF8, 1, 'B')),
        (ucs2, UTF8,
         (ucs2.encode('utf8'), UTF8, 1, 'B')),
        (ucs4, UTF8,
         (ucs4.encode('utf8'), UTF8, 1, 'B')),
    )
    for text, requested, expected in cases:
        self.assertEqual(export(text, requested), expected)

    # No supported format or invalid format
    for requested in (0, PyUnicode_FORMAT_INVALID):
        with self.assertRaisesRegex(
                ValueError,
                "unable to find a matching export format"):
            export('abc', requested)
| 1809 | + |
def test_unicode_import(self):
    # Test PyUnicode_Import()
    create = _testlimitedcapi.unicode_import

    # UCS2/UCS4 input is supplied in the byte order of the machine.
    byte_order = 'le' if sys.byteorder == 'little' else 'be'
    ucs2_enc = 'utf-16' + byte_order
    ucs4_enc = 'utf-32' + byte_order

    ASCII = PyUnicode_FORMAT_ASCII
    UCS1 = PyUnicode_FORMAT_UCS1
    UCS2 = PyUnicode_FORMAT_UCS2
    UCS4 = PyUnicode_FORMAT_UCS4
    UTF8 = PyUnicode_FORMAT_UTF8

    ucs2_text = 'ucs2:\u20ac'
    ucs4_text = 'ucs4:\U0010ffff'
    mixed_text = "abc\xe9\U0010ffff"

    # (input bytes, format of the input, expected string)
    valid_inputs = (
        (b'abc', ASCII, "abc"),
        (b'latin1:\xe9', UCS1, "latin1:\xe9"),
        (ucs2_text.encode(ucs2_enc), UCS2, ucs2_text),
        (ucs4_text.encode(ucs4_enc), UCS4, ucs4_text),
        (mixed_text.encode('utf8'), UTF8, mixed_text),
    )
    for data, data_format, expected in valid_inputs:
        self.assertEqual(create(data, data_format), expected)

    # Empty string
    for native_format in (ASCII, UCS1, UCS2, UCS4, UTF8):
        with self.subTest(native_format=native_format):
            self.assertEqual(create(b'', native_format), '')

    # Invalid format
    with self.assertRaises(ValueError):
        create(b'', PyUnicode_FORMAT_INVALID)

    # Invalid size: the input length is not a multiple of the
    # 2-byte (UCS2) or 4-byte (UCS4) code unit size.
    ucs2 = ucs2_text.encode(ucs2_enc)
    ucs4 = ucs4_text.encode(ucs4_enc)
    truncated_inputs = [(ucs2[:-1], UCS2)]
    truncated_inputs += [(ucs4[:-cut], UCS4) for cut in (1, 2, 3)]
    for data, data_format in truncated_inputs:
        with self.assertRaises(ValueError):
            create(data, data_format)
| 1865 | + |
1724 | 1866 |
|
1725 | 1867 | class PyUnicodeWriterTest(unittest.TestCase):
|
1726 | 1868 | def create_writer(self, size):
|
@@ -1903,6 +2045,38 @@ def test_recover_error(self):
|
1903 | 2045 |
|
1904 | 2046 | self.assertEqual(writer.finish(), 'Hello World.')
|
1905 | 2047 |
|
1906 |
| - |
1907 |
| -if __name__ == "__main__": |
def test_unicode_export_import_roundtrip(self):
    # Whatever unicode_export() returns must be converted back to the
    # original string by unicode_import().
    export = _testlimitedcapi.unicode_export
    reimport = _testlimitedcapi.unicode_import

    ASCII = PyUnicode_FORMAT_ASCII
    UCS1 = PyUnicode_FORMAT_UCS1
    UCS2 = PyUnicode_FORMAT_UCS2
    UCS4 = PyUnicode_FORMAT_UCS4
    UTF8 = PyUnicode_FORMAT_UTF8
    single_formats = (ASCII, UCS1, UCS2, UCS4, UTF8)
    ALL = ASCII | UCS1 | UCS2 | UCS4 | UTF8

    # Each sample string is paired with the single formats it can be
    # exported to; requesting any other one must raise ValueError.
    samples = (
        ('', set(single_formats)),
        ('ascii', set(single_formats)),
        ('latin1:\xe9', {UCS1, UCS2, UCS4, UTF8}),
        ('ucs2:\u20ac', {UCS2, UCS4, UTF8}),
        ('ucs4:\U0001f638', {UCS4, UTF8}),
    )

    for string, exportable in samples:
        for requested in single_formats:
            with self.subTest(string=string, format=requested):
                if requested in exportable:
                    data, data_format, _item_size, _view_format = (
                        export(string, requested))
                    self.assertEqual(reimport(data, data_format), string)
                else:
                    with self.assertRaises(ValueError):
                        export(string, requested)

        # Offer every format at once and let the exporter pick one.
        data, data_format, _item_size, _view_format = export(string, ALL)
        self.assertEqual(reimport(data, data_format), string)
| 2079 | + |
| 2080 | + |
| 2081 | +if __name__ == '__main__': |
1908 | 2082 | unittest.main()
|
0 commit comments