Skip to content

Commit 57b2882

Browse files
bpo-24665: Add CJK support in textwrap by default.
Co-authored-by: Florent Gallaire <[email protected]>
1 parent f34e03e commit 57b2882

File tree

4 files changed

+70
-17
lines changed

4 files changed

+70
-17
lines changed

Lib/test/test_textwrap.py

+15-2
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,9 @@ def test_bad_width(self):
435435
text = "Whatever, it doesn't matter."
436436
self.assertRaises(ValueError, wrap, text, 0)
437437
self.assertRaises(ValueError, wrap, text, -1)
438+
# Ensure that we raise while trying to split wide characters.
439+
text = 'Did you say "いろはにほへとちりぬるをいろはにほ?"'
440+
self.assertRaises(ValueError, wrap, text, 1)
438441

439442
def test_no_split_at_umlaut(self):
440443
text = "Die Empf\xe4nger-Auswahl"
@@ -578,7 +581,10 @@ def setUp(self):
578581
Did you say "supercalifragilisticexpialidocious?"
579582
How *do* you spell that odd word, anyways?
580583
'''
581-
584+
self.text_cjk = '''\
585+
Did you say "いろはにほへとちりぬるをいろはにほ?"
586+
How りぬ るをいろはにほり ぬるは, anyways?
587+
'''
582588
def test_break_long(self):
583589
# Wrap text with long words and lots of punctuation
584590

@@ -590,7 +596,14 @@ def test_break_long(self):
590596
self.check_wrap(self.text, 50,
591597
['Did you say "supercalifragilisticexpialidocious?"',
592598
'How *do* you spell that odd word, anyways?'])
593-
599+
self.check_wrap(self.text_cjk, 30,
600+
['Did you say "いろはにほへとち',
601+
'りぬるをいろはにほ?" How りぬ',
602+
'るをいろはにほり ぬるは,',
603+
'anyways?'])
604+
self.check_wrap(self.text_cjk, 50,
605+
['Did you say "いろはにほへとちりぬるをいろはにほ?"',
606+
'How りぬ るをいろはにほり ぬるは, anyways?'])
594607
# SF bug 797650. Prevent an infinite loop by making sure that at
595608
# least one character gets split off on every pass.
596609
self.check_wrap('-'*10+'hello', 10,

Lib/textwrap.py

+52-15
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,40 @@
1414
# some Unicode spaces (like \u00a0) are non-breaking whitespaces.
1515
_whitespace = '\t\n\x0b\x0c\r '
1616

17+
try:
18+
from unicodedata import east_asian_width
19+
20+
def _width(text):
21+
"""Return the display width of the text in columns, according to
22+
unicodedata.east_asian_width only.
23+
"""
24+
return sum(2 if east_asian_width(char) in {'F', 'W'} else 1
25+
for char in text)
26+
27+
def _slice(text, index):
28+
"""Return the two slices of text cut to index.
29+
"""
30+
width = 0
31+
pos = 0
32+
for char in text:
33+
width += 2 if east_asian_width(char) in {'F', 'W'} else 1
34+
if width > index:
35+
break
36+
pos += 1
37+
return text[:pos], text[pos:]
38+
39+
except ImportError:
40+
41+
def _width(text):
42+
"""Fallback in case unicodedata is not available: The display width of
43+
a text is just its number of characters.
44+
"""
45+
return len(text)
46+
47+
def _slice(text, index):
48+
return text[:index], text[index:]
49+
50+
1751
class TextWrapper:
1852
"""
1953
Object for wrapping/filling text. The public interface consists of
@@ -215,8 +249,9 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
215249
# If we're allowed to break long words, then do so: put as much
216250
# of the next chunk onto the current line as will fit.
217251
if self.break_long_words:
218-
cur_line.append(reversed_chunks[-1][:space_left])
219-
reversed_chunks[-1] = reversed_chunks[-1][space_left:]
252+
left, right = _slice(reversed_chunks[-1], space_left)
253+
cur_line.append(left)
254+
reversed_chunks[-1] = right
220255

221256
# Otherwise, we have to preserve the long word intact. Only add
222257
# it to the current line if there's nothing already there --
@@ -244,14 +279,13 @@ def _wrap_chunks(self, chunks):
244279
lines, but apart from that whitespace is preserved.
245280
"""
246281
lines = []
247-
if self.width <= 0:
248-
raise ValueError("invalid width %r (must be > 0)" % self.width)
249282
if self.max_lines is not None:
250283
if self.max_lines > 1:
251284
indent = self.subsequent_indent
252285
else:
253286
indent = self.initial_indent
254-
if len(indent) + len(self.placeholder.lstrip()) > self.width:
287+
if (_width(indent) +
288+
_width(self.placeholder.lstrip()) > self.width):
255289
raise ValueError("placeholder too large for max width")
256290

257291
# Arrange in reverse order so items can be efficiently popped
@@ -272,15 +306,15 @@ def _wrap_chunks(self, chunks):
272306
indent = self.initial_indent
273307

274308
# Maximum width for this line.
275-
width = self.width - len(indent)
309+
width = self.width - _width(indent)
276310

277311
# First chunk on line is whitespace -- drop it, unless this
278312
# is the very beginning of the text (ie. no lines started yet).
279313
if self.drop_whitespace and chunks[-1].strip() == '' and lines:
280314
del chunks[-1]
281315

282316
while chunks:
283-
l = len(chunks[-1])
317+
l = _width(chunks[-1])
284318

285319
# Can at least squeeze this chunk onto the current line.
286320
if cur_len + l <= width:
@@ -290,16 +324,15 @@ def _wrap_chunks(self, chunks):
290324
# Nope, this line is full.
291325
else:
292326
break
293-
294327
# The current line is full, and the next chunk is too big to
295328
# fit on *any* line (not just this one).
296-
if chunks and len(chunks[-1]) > width:
329+
if chunks and _width(chunks[-1]) > width:
297330
self._handle_long_word(chunks, cur_line, cur_len, width)
298-
cur_len = sum(map(len, cur_line))
331+
cur_len = sum(map(_width, cur_line))
299332

300333
# If the last chunk on this line is all whitespace, drop it.
301334
if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
302-
cur_len -= len(cur_line[-1])
335+
cur_len -= _width(cur_line[-1])
303336
del cur_line[-1]
304337

305338
if cur_line:
@@ -315,17 +348,17 @@ def _wrap_chunks(self, chunks):
315348
else:
316349
while cur_line:
317350
if (cur_line[-1].strip() and
318-
cur_len + len(self.placeholder) <= width):
351+
cur_len + _width(self.placeholder) <= width):
319352
cur_line.append(self.placeholder)
320353
lines.append(indent + ''.join(cur_line))
321354
break
322-
cur_len -= len(cur_line[-1])
355+
cur_len -= _width(cur_line[-1])
323356
del cur_line[-1]
324357
else:
325358
if lines:
326359
prev_line = lines[-1].rstrip()
327-
if (len(prev_line) + len(self.placeholder) <=
328-
self.width):
360+
if (_width(prev_line) +
361+
_width(self.placeholder) <= self.width):
329362
lines[-1] = prev_line + self.placeholder
330363
break
331364
lines.append(indent + self.placeholder.lstrip())
@@ -348,6 +381,10 @@ def wrap(self, text):
348381
and all other whitespace characters (including newline) are
349382
converted to space.
350383
"""
384+
if self.width <= 0:
385+
raise ValueError("invalid width %r (must be > 0)" % self.width)
386+
elif self.width == 1 and _width(text) > len(text):
387+
raise ValueError("invalid width 1 (must be > 1 when CJK chars)")
351388
chunks = self._split_chunks(text)
352389
if self.fix_sentence_endings:
353390
self._fix_sentence_endings(chunks)

Misc/ACKS

+1
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,7 @@ Lele Gaifax
514514
Santiago Gala
515515
Yitzchak Gale
516516
Matthew Gallagher
517+
Florent Gallaire
517518
Quentin Gallet-Gilles
518519
Riccardo Attilio Galli
519520
Raymund Galvin
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Textwrap now takes into account CJK double-width characters while measuring line
2+
width.

0 commit comments

Comments
 (0)