bpo-24665: Add CJK support in textwrap by default.

JulienPalard · fgallaire · JulienPalard · commit 57b28823c7df · 2018-03-06T23:43:38.000+01:00
Co-authored-by: Florent Gallaire <fgallaire@gmail.com>
diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py
@@ -435,6 +435,9 @@ def test_bad_width(self):
         text = "Whatever, it doesn't matter."
         self.assertRaises(ValueError, wrap, text, 0)
         self.assertRaises(ValueError, wrap, text, -1)
+        # Ensure that we raise while trying to split wide characters.
+        text = 'Did you say "いろはにほへとちりぬるをいろはにほ?"'
+        self.assertRaises(ValueError, wrap, text, 1)
 
     def test_no_split_at_umlaut(self):
         text = "Die Empf\xe4nger-Auswahl"
@@ -578,7 +581,10 @@ def setUp(self):
 Did you say "supercalifragilisticexpialidocious?"
 How *do* you spell that odd word, anyways?
 '''
-
+        self.text_cjk = '''\
+Did you say "いろはにほへとちりぬるをいろはにほ?"
+How りぬ るをいろはにほり ぬるは, anyways?
+'''
     def test_break_long(self):
         # Wrap text with long words and lots of punctuation
 
@@ -590,7 +596,14 @@ def test_break_long(self):
         self.check_wrap(self.text, 50,
                         ['Did you say "supercalifragilisticexpialidocious?"',
                          'How *do* you spell that odd word, anyways?'])
-
+        self.check_wrap(self.text_cjk, 30,
+                        ['Did you say "いろはにほへとち',
+                         'りぬるをいろはにほ?" How りぬ',
+                         'るをいろはにほり ぬるは,',
+                         'anyways?'])
+        self.check_wrap(self.text_cjk, 50,
+                        ['Did you say "いろはにほへとちりぬるをいろはにほ?"',
+                         'How りぬ るをいろはにほり ぬるは, anyways?'])
         # SF bug 797650.  Prevent an infinite loop by making sure that at
         # least one character gets split off on every pass.
         self.check_wrap('-'*10+'hello', 10,
diff --git a/Lib/textwrap.py b/Lib/textwrap.py
@@ -14,6 +14,40 @@
 # some Unicode spaces (like \u00a0) are non-breaking whitespaces.
 _whitespace = '\t\n\x0b\x0c\r '
 
+try:
+    from unicodedata import east_asian_width
+
+    def _width(text):
+        """Return the display width of the text in columns, according to
+        unicodedata.east_asian_width only.
+        """
+        return sum(2 if east_asian_width(char) in {'F', 'W'} else 1
+                 for char in text)
+
+    def _slice(text, index):
+        """Return the two slices of text cut to index.
+        """
+        width = 0
+        pos = 0
+        for char in text:
+            width += 2 if east_asian_width(char) in {'F', 'W'} else 1
+            if width > index:
+                break
+            pos += 1
+        return text[:pos], text[pos:]
+
+except ImportError:
+
+    def _width(text):
+        """Fallback in case unicodedata is not available: The display width of
+        a text is just its number of characters.
+        """
+        return len(text)
+
+    def _slice(text, index):
+        return text[:index], text[index:]
+
+
 class TextWrapper:
     """
     Object for wrapping/filling text.  The public interface consists of
@@ -215,8 +249,9 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
         # If we're allowed to break long words, then do so: put as much
         # of the next chunk onto the current line as will fit.
         if self.break_long_words:
-            cur_line.append(reversed_chunks[-1][:space_left])
-            reversed_chunks[-1] = reversed_chunks[-1][space_left:]
+            left, right = _slice(reversed_chunks[-1], space_left)
+            cur_line.append(left)
+            reversed_chunks[-1] = right
 
         # Otherwise, we have to preserve the long word intact.  Only add
         # it to the current line if there's nothing already there --
@@ -244,14 +279,13 @@ def _wrap_chunks(self, chunks):
         lines, but apart from that whitespace is preserved.
         """
         lines = []
-        if self.width <= 0:
-            raise ValueError("invalid width %r (must be > 0)" % self.width)
         if self.max_lines is not None:
             if self.max_lines > 1:
                 indent = self.subsequent_indent
             else:
                 indent = self.initial_indent
-            if len(indent) + len(self.placeholder.lstrip()) > self.width:
+            if (_width(indent) +
+                _width(self.placeholder.lstrip()) > self.width):
                 raise ValueError("placeholder too large for max width")
 
         # Arrange in reverse order so items can be efficiently popped
@@ -272,15 +306,15 @@ def _wrap_chunks(self, chunks):
                 indent = self.initial_indent
 
             # Maximum width for this line.
-            width = self.width - len(indent)
+            width = self.width - _width(indent)
 
             # First chunk on line is whitespace -- drop it, unless this
             # is the very beginning of the text (ie. no lines started yet).
             if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                 del chunks[-1]
 
             while chunks:
-                l = len(chunks[-1])
+                l = _width(chunks[-1])
 
                 # Can at least squeeze this chunk onto the current line.
                 if cur_len + l <= width:
@@ -290,16 +324,15 @@ def _wrap_chunks(self, chunks):
                 # Nope, this line is full.
                 else:
                     break
-
             # The current line is full, and the next chunk is too big to
             # fit on *any* line (not just this one).
-            if chunks and len(chunks[-1]) > width:
+            if chunks and _width(chunks[-1]) > width:
                 self._handle_long_word(chunks, cur_line, cur_len, width)
-                cur_len = sum(map(len, cur_line))
+                cur_len = sum(map(_width, cur_line))
 
             # If the last chunk on this line is all whitespace, drop it.
             if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
-                cur_len -= len(cur_line[-1])
+                cur_len -= _width(cur_line[-1])
                 del cur_line[-1]
 
             if cur_line:
@@ -315,17 +348,17 @@ def _wrap_chunks(self, chunks):
                 else:
                     while cur_line:
                         if (cur_line[-1].strip() and
-                            cur_len + len(self.placeholder) <= width):
+                            cur_len + _width(self.placeholder) <= width):
                             cur_line.append(self.placeholder)
                             lines.append(indent + ''.join(cur_line))
                             break
-                        cur_len -= len(cur_line[-1])
+                        cur_len -= _width(cur_line[-1])
                         del cur_line[-1]
                     else:
                         if lines:
                             prev_line = lines[-1].rstrip()
-                            if (len(prev_line) + len(self.placeholder) <=
-                                    self.width):
+                            if (_width(prev_line) +
+                                _width(self.placeholder) <= self.width):
                                 lines[-1] = prev_line + self.placeholder
                                 break
                         lines.append(indent + self.placeholder.lstrip())
@@ -348,6 +381,10 @@ def wrap(self, text):
         and all other whitespace characters (including newline) are
         converted to space.
         """
+        if self.width <= 0:
+            raise ValueError("invalid width %r (must be > 0)" % self.width)
+        elif self.width == 1 and _width(text) > len(text):
+            raise ValueError("invalid width 1 (must be > 1 when CJK chars)")
         chunks = self._split_chunks(text)
         if self.fix_sentence_endings:
             self._fix_sentence_endings(chunks)
diff --git a/Misc/ACKS b/Misc/ACKS
@@ -514,6 +514,7 @@ Lele Gaifax
 Santiago Gala
 Yitzchak Gale
 Matthew Gallagher
+Florent Gallaire
 Quentin Gallet-Gilles
 Riccardo Attilio Galli
 Raymund Galvin
diff --git a/Misc/NEWS.d/next/Library/2018-02-13-02-06-24.bpo-24665.re7KqM.rst b/Misc/NEWS.d/next/Library/2018-02-13-02-06-24.bpo-24665.re7KqM.rst
@@ -0,0 +1,2 @@
+Textwrap now takes into account CJK double-width characters while measuring
+line width.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+Textwrap now takes into account CJK double-width characters while measuring`
	`2`	`+line width.`