From df3f367ca583f55d7bf47d45629d0b26895d21cb Mon Sep 17 00:00:00 2001 From: Tobias Bengfort Date: Fri, 3 Sep 2021 10:06:17 +0200 Subject: [PATCH 01/10] bpo-12499: textwrap.wrap: add control for fonts with different character widths This also provides a generic solution for bpo-24665 --- Doc/library/textwrap.rst | 6 ++++++ Lib/test/test_textwrap.py | 20 ++++++++++++++++++++ Lib/textwrap.py | 22 +++++++++++++--------- 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/Doc/library/textwrap.rst b/Doc/library/textwrap.rst index 7780e241769657..59958e2bfe143f 100644 --- a/Doc/library/textwrap.rst +++ b/Doc/library/textwrap.rst @@ -281,6 +281,12 @@ hyphenated words; only then will long words be broken if necessary, unless .. versionadded:: 3.4 + .. attribute:: text_len + + (default: ``len``) Used to determine the length of a string. You can + provide a custom function, e.g. to account for wide characters. + + .. index:: single: ...; placeholder .. attribute:: placeholder diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index dfbc2b93dfc0d6..7ebb4a49534ec1 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -9,6 +9,7 @@ # import unittest +import unicodedata from textwrap import TextWrapper, wrap, fill, dedent, indent, shorten @@ -1076,5 +1077,24 @@ def test_first_word_too_long_but_placeholder_fits(self): self.check_shorten("Helloo", 5, "[...]") +class WideCharacterTestCase(BaseTestCase): + def setUp(self): + def text_len(text): + n = 0 + for c in text: + if unicodedata.east_asian_width(c) in ['F', 'W']: + n += 2 + else: + n += 1 + return n + + self.wrapper = TextWrapper(width=5, text_len=text_len) + + def test_wide_character(self): + text = "123 🔧" + result = self.wrapper.wrap(text, **kwargs) + self.check(result, ["123", "🔧"]) + + if __name__ == '__main__': unittest.main() diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 841de9baecf5d8..c304571a672850 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -125,7 
+125,8 @@ def __init__(self, tabsize=8, *, max_lines=None, - placeholder=' [...]'): + placeholder=' [...]', + text_len=len): self.width = width self.initial_indent = initial_indent self.subsequent_indent = subsequent_indent @@ -138,6 +139,7 @@ def __init__(self, self.tabsize = tabsize self.max_lines = max_lines self.placeholder = placeholder + self.text_len = text_len # -- Private methods ----------------------------------------------- @@ -217,7 +219,7 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): if self.break_long_words: end = space_left chunk = reversed_chunks[-1] - if self.break_on_hyphens and len(chunk) > space_left: + if self.break_on_hyphens and self.text_len(chunk) > space_left: # break after last hyphen, but only if there are # non-hyphens before it hyphen = chunk.rfind('-', 0, space_left) @@ -259,7 +261,8 @@ def _wrap_chunks(self, chunks): indent = self.subsequent_indent else: indent = self.initial_indent - if len(indent) + len(self.placeholder.lstrip()) > self.width: + if self.text_len(indent) + + self.text_len(self.placeholder.lstrip()) > self.width: raise ValueError("placeholder too large for max width") # Arrange in reverse order so items can be efficiently popped @@ -280,7 +283,7 @@ def _wrap_chunks(self, chunks): indent = self.initial_indent # Maximum width for this line. - width = self.width - len(indent) + width = self.width - self.text_len(indent) # First chunk on line is whitespace -- drop it, unless this # is the very beginning of the text (ie. no lines started yet). @@ -303,11 +306,11 @@ def _wrap_chunks(self, chunks): # fit on *any* line (not just this one). if chunks and len(chunks[-1]) > width: self._handle_long_word(chunks, cur_line, cur_len, width) - cur_len = sum(map(len, cur_line)) + cur_len = sum(map(self.text_len, cur_line)) # If the last chunk on this line is all whitespace, drop it. 
if self.drop_whitespace and cur_line and cur_line[-1].strip() == '': - cur_len -= len(cur_line[-1]) + cur_len -= self.text_len(cur_line[-1]) del cur_line[-1] if cur_line: @@ -323,16 +326,17 @@ def _wrap_chunks(self, chunks): else: while cur_line: if (cur_line[-1].strip() and - cur_len + len(self.placeholder) <= width): + cur_len + self.text_len(self.placeholder) <= width): cur_line.append(self.placeholder) lines.append(indent + ''.join(cur_line)) break - cur_len -= len(cur_line[-1]) + cur_len -= self.text_len(cur_line[-1]) del cur_line[-1] else: if lines: prev_line = lines[-1].rstrip() - if (len(prev_line) + len(self.placeholder) <= + if (self.text_len(prev_line) + + self.text_len(self.placeholder) <= self.width): lines[-1] = prev_line + self.placeholder break From 97a2ec81f13e87ca5a9f89488bf633c3dfa86f91 Mon Sep 17 00:00:00 2001 From: Tobias Bengfort Date: Fri, 3 Sep 2021 18:09:00 +0200 Subject: [PATCH 02/10] react to feedback --- Doc/library/textwrap.rst | 2 ++ Lib/test/test_textwrap.py | 11 ++++------- Lib/textwrap.py | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/Doc/library/textwrap.rst b/Doc/library/textwrap.rst index 59958e2bfe143f..e7c153877e7a18 100644 --- a/Doc/library/textwrap.rst +++ b/Doc/library/textwrap.rst @@ -286,6 +286,8 @@ hyphenated words; only then will long words be broken if necessary, unless (default: ``len``) Used to determine the length of a string. You can provide a custom function, e.g. to account for wide characters. + .. versionadded:: 3.11 + .. 
index:: single: ...; placeholder diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index 7ebb4a49534ec1..21167008e4ae39 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -1078,22 +1078,19 @@ def test_first_word_too_long_but_placeholder_fits(self): class WideCharacterTestCase(BaseTestCase): - def setUp(self): + def test_wide_character(self): def text_len(text): n = 0 for c in text: - if unicodedata.east_asian_width(c) in ['F', 'W']: + if unicodedata.east_asian_width(c) in {'F', 'W'}: n += 2 else: n += 1 return n - self.wrapper = TextWrapper(width=5, text_len=text_len) - - def test_wide_character(self): text = "123 🔧" - result = self.wrapper.wrap(text, **kwargs) - self.check(result, ["123", "🔧"]) + expected = ["123", "🔧"] + self.check_wrap(text, 6, expected, text_len=text_len) if __name__ == '__main__': diff --git a/Lib/textwrap.py b/Lib/textwrap.py index c304571a672850..9dd7d940552510 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -261,8 +261,8 @@ def _wrap_chunks(self, chunks): indent = self.subsequent_indent else: indent = self.initial_indent - if self.text_len(indent) + - self.text_len(self.placeholder.lstrip()) > self.width: + if (self.text_len(indent) + + self.text_len(self.placeholder.lstrip()) > self.width): raise ValueError("placeholder too large for max width") # Arrange in reverse order so items can be efficiently popped From db82b6cdc04e3ed7f4628b6840742060d7c933c4 Mon Sep 17 00:00:00 2001 From: Tobias Bengfort Date: Fri, 3 Sep 2021 18:15:40 +0200 Subject: [PATCH 03/10] typo --- Lib/test/test_textwrap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index 21167008e4ae39..bb3586073d9e88 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -1090,7 +1090,7 @@ def text_len(text): text = "123 🔧" expected = ["123", "🔧"] - self.check_wrap(text, 6, expected, text_len=text_len) + self.check_wrap(text, 5, expected, 
text_len=text_len) if __name__ == '__main__': From f457e20d835bc5099da512fb621b64ea887306ef Mon Sep 17 00:00:00 2001 From: Tobias Bengfort Date: Fri, 3 Sep 2021 20:18:38 +0200 Subject: [PATCH 04/10] fix missing len occurrences --- Lib/textwrap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 9dd7d940552510..76358b235c4a32 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -291,7 +291,7 @@ def _wrap_chunks(self, chunks): del chunks[-1] while chunks: - l = len(chunks[-1]) + l = self.text_len(chunks[-1]) # Can at least squeeze this chunk onto the current line. if cur_len + l <= width: @@ -304,7 +304,7 @@ def _wrap_chunks(self, chunks): # The current line is full, and the next chunk is too big to # fit on *any* line (not just this one). - if chunks and len(chunks[-1]) > width: + if chunks and self.text_len(chunks[-1]) > width: self._handle_long_word(chunks, cur_line, cur_len, width) cur_len = sum(map(self.text_len, cur_line)) From e164780bf4ebf5713c10c088929e3876626699be Mon Sep 17 00:00:00 2001 From: Tobias Bengfort Date: Sat, 4 Sep 2021 22:43:34 +0200 Subject: [PATCH 05/10] add more tests --- Lib/test/test_textwrap.py | 50 ++++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index bb3586073d9e88..3e07ea9f9e3999 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -1078,19 +1078,47 @@ def test_first_word_too_long_but_placeholder_fits(self): class WideCharacterTestCase(BaseTestCase): - def test_wide_character(self): - def text_len(text): - n = 0 - for c in text: - if unicodedata.east_asian_width(c) in {'F', 'W'}: - n += 2 - else: - n += 1 - return n + def text_len(self, text): + n = 0 + for c in text: + if unicodedata.east_asian_width(c) in {'F', 'W'}: + n += 2 + else: + n += 1 + return n + def check_shorten(self, text, width, expect, **kwargs): + result = shorten(text, width, 
**kwargs) + self.check(result, expect) + + def test_wrap(self): text = "123 🔧" - expected = ["123", "🔧"] - self.check_wrap(text, 5, expected, text_len=text_len) + self.check_wrap(text, 5, ["123 🔧"]) + self.check_wrap(text, 5, ["123", "🔧"], text_len=self.text_len) + + def test_wrap_initial_indent(self): + text = "12 12" + self.check_wrap(text, 6, ["🔧12 12"], initial_indent="🔧") + self.check_wrap(text, 6, ["🔧12", "12"], initial_indent="🔧", + text_len=self.text_len) + + def test_wrap_subsequent_indent(self): + text = "12 12 12 12" + self.check_wrap(text, 6, ["12 12", "🔧12 12"], subsequent_indent="🔧") + self.check_wrap(text, 6, ["12 12", "🔧12", "🔧12"], + subsequent_indent="🔧", text_len=self.text_len) + + def test_shorten(self): + text = "123 1234🔧" + expected = "123 [...]" + self.check_shorten(text, 9, "123 1234🔧") + self.check_shorten(text, 9, "123 [...]", text_len=self.text_len) + + def test_shorten_placeholder(self): + text = "123 1 123" + self.check_shorten(text, 7, "123 1 🔧", placeholder=" 🔧") + self.check_shorten(text, 7, "123 🔧", placeholder=" 🔧", + text_len=self.text_len) if __name__ == '__main__': From 09ce4cee17454ce8067c8dd1f82ac170c1054e06 Mon Sep 17 00:00:00 2001 From: Tobias Bengfort Date: Sun, 5 Sep 2021 07:50:29 +0200 Subject: [PATCH 06/10] fix idle test --- Lib/idlelib/idle_test/test_calltip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/idlelib/idle_test/test_calltip.py b/Lib/idlelib/idle_test/test_calltip.py index b23915c5ab7849..0900d7fca6c4a8 100644 --- a/Lib/idlelib/idle_test/test_calltip.py +++ b/Lib/idlelib/idle_test/test_calltip.py @@ -99,7 +99,7 @@ def test_signature_wrap(self): (width=70, initial_indent='', subsequent_indent='', expand_tabs=True, replace_whitespace=True, fix_sentence_endings=False, break_long_words=True, drop_whitespace=True, break_on_hyphens=True, tabsize=8, *, max_lines=None, - placeholder=' [...]') + placeholder=' [...]', text_len=<built-in function len>) Object for wrapping/filling text. 
The public interface consists of the wrap() and fill() methods; the other methods are just there for subclasses to override in order to tweak the default behaviour. From bf8dad5755cd86ba60af8deb6805e2244b400650 Mon Sep 17 00:00:00 2001 From: Tobias Bengfort Date: Sun, 5 Sep 2021 07:50:35 +0200 Subject: [PATCH 07/10] optimize text_len function --- Lib/test/test_textwrap.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index 3e07ea9f9e3999..831d826d100ea1 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -1079,13 +1079,10 @@ def test_first_word_too_long_but_placeholder_fits(self): class WideCharacterTestCase(BaseTestCase): def text_len(self, text): - n = 0 - for c in text: - if unicodedata.east_asian_width(c) in {'F', 'W'}: - n += 2 - else: - n += 1 - return n + sum( + 2 if unicodedata.east_asian_width(c) in {'F', 'W'} else 1 + for c in text + ) def check_shorten(self, text, width, expect, **kwargs): result = shorten(text, width, **kwargs) From 68e8098a68a952ddca55981abefb8b159c4b2bcf Mon Sep 17 00:00:00 2001 From: Tobias Bengfort Date: Wed, 13 Oct 2021 07:38:00 +0200 Subject: [PATCH 08/10] fixup --- Lib/test/test_textwrap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index 831d826d100ea1..b14fe366aeb7ae 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -1079,7 +1079,7 @@ def test_first_word_too_long_but_placeholder_fits(self): class WideCharacterTestCase(BaseTestCase): def text_len(self, text): - sum( + return sum( 2 if unicodedata.east_asian_width(c) in {'F', 'W'} else 1 for c in text ) From e5d6d88c57615a9aa70ccf763a546917c836187e Mon Sep 17 00:00:00 2001 From: Tip ten Brink <75669206+tiptenbrink@users.noreply.github.com> Date: Tue, 9 Nov 2021 21:00:49 +0100 Subject: [PATCH 09/10] _find_width_index and _handle_long_word change --- Lib/test/test_textwrap.py 
| 13 +++++++++++++ Lib/textwrap.py | 21 +++++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index b14fe366aeb7ae..d983d1ab3f01ac 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -1118,5 +1118,18 @@ def test_shorten_placeholder(self): text_len=self.text_len) +class ZeroWidthTestCase(BaseTestCase): + def text_len(self, text): + return sum( + 0 if c == 'Q' else 1 + for c in text + ) + + def test_zero_width_text_len(self): + + text = "0QQ1234QQ56789" + self.check_wrap(text, 6, ["0QQ1234QQ5", "6789"], text_len=self.text_len) + + if __name__ == '__main__': unittest.main() diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 76358b235c4a32..5dfec9e1ca746b 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -199,6 +199,23 @@ def _fix_sentence_endings(self, chunks): else: i += 1 + def _find_width_index(self, text, width): + """_find_width_index(text : string, width: int) + + Find at which index the text has the required width. + """ + # In most cases text_len will just use the number of characters, so this heuristic prevents calculating width + # for each character + if self.text_len(text[:width]) == width: + # For character widths greater than one, width can be more than the number of characters + return min(width, len(text)) + cur_text = '' + for i, c in enumerate(text): + cur_text += c + cur_width = self.text_len(cur_text) + if cur_width >= width: + return i+1 + def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): """_handle_long_word(chunks : [string], cur_line : [string], @@ -217,12 +234,12 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): # If we're allowed to break long words, then do so: put as much # of the next chunk onto the current line as will fit. 
if self.break_long_words: - end = space_left chunk = reversed_chunks[-1] + end = self._find_width_index(chunk, space_left) if self.break_on_hyphens and self.text_len(chunk) > space_left: # break after last hyphen, but only if there are # non-hyphens before it - hyphen = chunk.rfind('-', 0, space_left) + hyphen = chunk.rfind('-', 0, end) if hyphen > 0 and any(c != '-' for c in chunk[:hyphen]): end = hyphen + 1 cur_line.append(chunk[:end]) From 7b32d0b1228987e2e13fcd324988b854c754f51b Mon Sep 17 00:00:00 2001 From: Tip ten Brink <75669206+tiptenbrink@users.noreply.github.com> Date: Wed, 10 Nov 2021 17:38:55 +0100 Subject: [PATCH 10/10] Apply changes, ensure min 1 char on line --- Lib/test/test_textwrap.py | 19 ++++++++++++++++--- Lib/textwrap.py | 16 +++++++++------- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index d983d1ab3f01ac..665418e7a072fe 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -1118,18 +1118,31 @@ def test_shorten_placeholder(self): text_len=self.text_len) -class ZeroWidthTestCase(BaseTestCase): +class CustomWidthTestCase(BaseTestCase): def text_len(self, text): + lengths = { + 'A': 4, + 'B': 2, + 'Q': 0, + } + return sum( - 0 if c == 'Q' else 1 + lengths[c] if c in lengths else 1 for c in text ) def test_zero_width_text_len(self): - text = "0QQ1234QQ56789" self.check_wrap(text, 6, ["0QQ1234QQ5", "6789"], text_len=self.text_len) + def test_char_longer_than_width(self): + text = "AA0123" + self.check_wrap(text, 3, ["A", "A", "012", "3"], text_len=self.text_len) + + def test_next_char_overflow(self): + text = "BB0123" + self.check_wrap(text, 3, ["B", "B0", "123"], text_len=self.text_len) + if __name__ == '__main__': unittest.main() diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 5dfec9e1ca746b..d4334154d9d9cb 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -200,21 +200,23 @@ def _fix_sentence_endings(self, chunks): i += 1 def 
_find_width_index(self, text, width): - """_find_width_index(text : string, width: int) + """_find_width_index(text : string, width : int) - Find at which index the text has the required width. + Find at which index the text has the required width, since when using a + different text_len, this index will not be equal to the required width. """ - # In most cases text_len will just use the number of characters, so this heuristic prevents calculating width - # for each character + # When using default len as self.text_len, the required index and width + # are equal, so this shortcut avoids measuring each prefix. if self.text_len(text[:width]) == width: - # For character widths greater than one, width can be more than the number of characters + # For character widths greater than one, width can be more than the + # number of characters return min(width, len(text)) cur_text = '' for i, c in enumerate(text): cur_text += c cur_width = self.text_len(cur_text) - if cur_width >= width: - return i+1 + if cur_width > width: + return max(i, 1) def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): """_handle_long_word(chunks : [string],