From 4d7082c2a9541ea1ddff0fc36873838836971a5b Mon Sep 17 00:00:00 2001
From: Rob Brackett
Date: Mon, 1 May 2023 17:37:11 -0700
Subject: [PATCH 1/7] Clean up docstring style
Trim spaces around docstring text and use a consistent style for multiline docstrings.
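For reference, this is the style being standardized on (an illustrative example, not a docstring taken from the module itself):

    def example():
        """
        Docstring text flush against the margins, with the opening and closing
        triple quotes on their own lines and no padding spaces inside them.
        """
        return None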
---
web_monitoring_diff/html_render_diff.py | 42 ++++++++++++++++---------
1 file changed, 27 insertions(+), 15 deletions(-)
diff --git a/web_monitoring_diff/html_render_diff.py b/web_monitoring_diff/html_render_diff.py
index 3ddc24c..5af5ede 100644
--- a/web_monitoring_diff/html_render_diff.py
+++ b/web_monitoring_diff/html_render_diff.py
@@ -619,8 +619,9 @@ def _count_changes(opcodes):
# lxml.html.diff. We plan to change it significantly.
def expand_tokens(tokens, equal=False):
- """Given a list of tokens, return a generator of the chunks of
- text for the data in the tokens.
+ """
+ Given a list of tokens, return a generator of the chunks of text for the
+ data in the tokens.
"""
for token in tokens:
for pre in token.pre_tags:
@@ -635,7 +636,8 @@ def expand_tokens(tokens, equal=False):
class DiffToken(str):
- """ Represents a diffable token, generally a word that is displayed to
+ """
+ Represents a diffable token, generally a word that is displayed to
the user. Opening tags are attached to this token when they are
adjacent (pre_tags) and closing tags that follow the word
(post_tags). Some exceptions occur when there are empty tags
@@ -645,7 +647,8 @@ class DiffToken(str):
We also keep track of whether the word was originally followed by
whitespace, even though we do not want to treat the word as
equivalent to a similar word that does not have a trailing
- space."""
+ space.
+ """
# When this is true, the token will be eliminated from the
# displayed diff if no change has occurred:
@@ -677,10 +680,11 @@ def html(self):
class tag_token(DiffToken):
-
- """ Represents a token that is actually a tag. Currently this is just
+ """
+ Represents a token that is actually a tag. Currently this is just
the <img> tag, which takes up visible space just like a word but
- is only represented in a document by a tag. """
+ is only represented in a document by a tag.
+ """
def __new__(cls, tag, data, html_repr, comparator, pre_tags=None,
post_tags=None, trailing_whitespace=""):
@@ -708,8 +712,10 @@ def html(self):
class href_token(DiffToken):
- """ Represents the href in an anchor tag. Unlike other words, we only
- show the href when it changes. """
+ """
+ Represents the href in an anchor tag. Unlike other words, we only
+ show the href when it changes.
+ """
hide_when_equal = True
@@ -851,12 +857,14 @@ def fixup_chunks(chunks, comparator):
def flatten_el(el, include_hrefs, skip_tag=False):
- """ Takes an lxml element el, and generates all the text chunks for
+ """
+ Takes an lxml element el, and generates all the text chunks for
that tag. Each start tag is a chunk, each word is a chunk, and each
end tag is a chunk.
If skip_tag is true, then the outermost container tag is
- not returned (just its contents)."""
+ not returned (just its contents).
+ """
if not skip_tag:
if el.tag == 'img':
src_array = []
@@ -899,8 +907,10 @@ def flatten_el(el, include_hrefs, skip_tag=False):
split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)
def split_words(text):
- """ Splits some text into words. Includes trailing whitespace
- on each word when appropriate. """
+ """
+ Splits some text into words. Includes trailing whitespace on each word when
+ appropriate.
+ """
if not text or not text.strip():
return []
@@ -918,8 +928,10 @@ def start_tag(el):
for name, value in el.attrib.items()]))
def end_tag(el):
- """ The text representation of an end tag for a tag. Includes
- trailing whitespace when appropriate. """
+ """
+ The text representation of an end tag for a tag. Includes trailing
+ whitespace when appropriate.
+ """
if el.tail and start_whitespace_re.search(el.tail):
extra = ' '
else:
From 5b2db97bd866599df2bf55c074b0dd41037f9100 Mon Sep 17 00:00:00 2001
From: Rob Brackett
Date: Mon, 1 May 2023 17:49:11 -0700
Subject: [PATCH 2/7] Fix bug in `_limit_spacers()`
In diffs that went over the maximum number of spacers, it turns out that the `_limit_spacers()` function stripped out important tag information! This fixes the issue (sketched below), but introduces some performance overhead. To handle that, a follow-on change should consider:
1. Moving the spacer-limiting logic into `_customize_tokens()` so we don't even create too many spacers in the first place.
2. Revisiting the whole spacer approach; there may be better approaches now.
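For illustration, a minimal sketch of the carry-over behavior this fix adds (simplified stand-in classes, not the real token types from `html_render_diff`): when a spacer past the limit is dropped, its tags get folded into the next kept token, or onto the last token's `post_tags` if nothing follows, instead of being thrown away.

    class Token:
        def __init__(self, text, pre_tags=None, post_tags=None):
            self.text = text
            self.pre_tags = list(pre_tags or [])
            self.post_tags = list(post_tags or [])

    class Spacer(Token):
        pass

    def limit_spacers(tokens, max_spacers):
        kept, carried = [], []
        for token in tokens:
            if isinstance(token, Spacer):
                if max_spacers <= 0:
                    # Drop the spacer, but keep its tags for the next token.
                    carried.extend(token.pre_tags)
                    carried.extend(token.post_tags)
                    continue
                max_spacers -= 1
            if carried:
                token.pre_tags = [*carried, *token.pre_tags]
                carried.clear()
            kept.append(token)
        if carried and kept:
            kept[-1].post_tags = [*kept[-1].post_tags, *carried]
        return kept

    tokens = [Token('Hi', pre_tags=['<p>']),
              Spacer('\nSPACER', post_tags=['</p>']),
              Token('Bye', pre_tags=['<p>'], post_tags=['</p>'])]
    assert limit_spacers(tokens, 0)[1].pre_tags == ['</p>', '<p>']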
---
web_monitoring_diff/html_render_diff.py | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/web_monitoring_diff/html_render_diff.py b/web_monitoring_diff/html_render_diff.py
index 5af5ede..8235262 100644
--- a/web_monitoring_diff/html_render_diff.py
+++ b/web_monitoring_diff/html_render_diff.py
@@ -595,13 +595,25 @@ def render_diff(diff_type):
# tokenization phase, though.
def _limit_spacers(tokens, max_spacers):
limited_tokens = []
+ extra_pre_tags = []
for token in tokens:
if isinstance(token, SpacerToken):
if max_spacers <= 0:
+ extra_pre_tags.extend(token.pre_tags)
+ extra_pre_tags.extend(token.post_tags)
continue
max_spacers -= 1
+
+ if len(extra_pre_tags):
+ token.pre_tags = [*extra_pre_tags, *token.pre_tags]
+ extra_pre_tags.clear()
+
limited_tokens.append(token)
+ if len(extra_pre_tags):
+ last = limited_tokens[-1]
+ last.post_tags = [*last.post_tags, *extra_pre_tags]
+
return limited_tokens
From 38245ef054527f7c5da1cc6028f3e558856f3abe Mon Sep 17 00:00:00 2001
From: Rob Brackett
Date: Mon, 1 May 2023 17:52:11 -0700
Subject: [PATCH 3/7] Make contrast script a raw string
This resolves some unhelpful warnings about invalid escapes that we were getting. Nothing should be escaped here in the first place; it's a pure JavaScript string with no substitutions or dynamic values.
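As a generic illustration of the warning (not the actual script contents): any backslash sequence that isn't a valid Python escape, like the `\d` in a JavaScript regex, triggers an "invalid escape sequence" warning at compile time unless the string literal is raw.

    # Plain string: Python treats "\d" as an (invalid) escape sequence and
    # warns at compile time ("invalid escape sequence '\d'").
    pattern = "value.match(/\d+/)"

    # Raw string: backslashes pass through untouched, so no warning.
    pattern = r"value.match(/\d+/)"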
---
web_monitoring_diff/html_render_diff.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/web_monitoring_diff/html_render_diff.py b/web_monitoring_diff/html_render_diff.py
index 8235262..d1a4132 100644
--- a/web_monitoring_diff/html_render_diff.py
+++ b/web_monitoring_diff/html_render_diff.py
@@ -1905,7 +1905,7 @@ def get_diff_styles():
script {{display: none !important;}}'''
-UPDATE_CONTRAST_SCRIPT = """
+UPDATE_CONTRAST_SCRIPT = r"""
(function () {
// Update the text color of change elements to ensure a readable level
// of contrast with the background color
From 2e4483d190bfcf301f7a088ba68dd1dfd2338b11 Mon Sep 17 00:00:00 2001
From: Rob Brackett
Date: Mon, 1 May 2023 18:02:33 -0700
Subject: [PATCH 4/7] Remove vestigial token balancing code
There's a big TODO about removing this when we finally fully forked lxml's differ. That happened a long time ago, and we did in fact make the changes that turned this into effectively dead code (wasted iteration). I ran a few tests over a variety of big and small diffs to make sure the code being removed here really doesn't do anything anymore, and that seems to be the case. Reading the logic, it also seems like this should be entirely vestigial and never actually change the tokens.
---
web_monitoring_diff/html_render_diff.py | 61 ++++++-------------------
1 file changed, 13 insertions(+), 48 deletions(-)
diff --git a/web_monitoring_diff/html_render_diff.py b/web_monitoring_diff/html_render_diff.py
index d1a4132..4170114 100644
--- a/web_monitoring_diff/html_render_diff.py
+++ b/web_monitoring_diff/html_render_diff.py
@@ -850,6 +850,19 @@ def fixup_chunks(chunks, comparator):
tag_accum.append(chunk[1])
elif current_token == TokenType.end_tag:
+ # Ensure any closing tags get added to the previous token as
+ # `post_tags`, rather than the next token as `pre_tags`. This makes
+ # placing the end of elements in the right place when re-assembling
+ # the final diff from added/removed tokens easier to do.
+ #
+ # That is, given HTML like:
+ #
+ # <p><b>Hello!</b></p> <div>…there.</div>
+ #
+ # We want output like:
+ #
+ # [('Hello!', pre=['<p>', '<b>'], post=['</b>', '</p>']),
+ # ('…there.', pre=['<div>'], post=['</div>'])]
if tag_accum:
tag_accum.append(chunk[1])
else:
@@ -1034,54 +1047,6 @@ def __hash__(self):
def _customize_tokens(tokens):
SPACER_STRING = '\nSPACER'
- # Balance out pre- and post-tags so that a token of text is surrounded by
- # the opening and closing tags of the element it's in. For example:
- #
- # <p><b>Hello!</b></p> <div>…there.</div>
- #
- # Currently parses as:
- # [('Hello!', pre=['<p>', '<b>'], post=[]),
- # ('…there.', pre=['</b>', '</p>', '<div>'], post=['</div>'])]
- # (Note the '</div>' post tag is only present at the end of the doc)
- #
- # But this attempts to make it more like:
- #
- # [('Hello!', pre=['<p>', '<b>'], post=['</b>', '</p>']),
- # ('…there.', pre=['<div>'], post=['</div>'])]
- #
- # TODO: when we get around to also forking the parse/tokenize part of this
- # diff, do this as part of the original tokenization instead.
- for token_index, token in enumerate(tokens):
- # logger.debug(f'Handling token {token_index}: {token}')
- if token_index == 0:
- continue
- previous = tokens[token_index - 1]
- previous_post_complete = False
- for post_index, tag in enumerate(previous.post_tags):
- if not tag.startswith('</'):
- # TODO: should we attempt to fill pure-structure tags here with
- # spacers? e.g. should we take the "</p><p>" here and
- # wrap a spacer token in it instead of moving to "next-text's"
- # pre_tags? "text</p><p>next-text"
- token.pre_tags = previous.post_tags[post_index:] + token.pre_tags
- previous.post_tags = previous.post_tags[:post_index]
- previous_post_complete = True
- break
-
- if not previous_post_complete:
- for pre_index, tag in enumerate(token.pre_tags):
- if not tag.startswith('</'):
- if pre_index > 0:
- previous.post_tags.extend(token.pre_tags[:pre_index])
- token.pre_tags = token.pre_tags[pre_index:]
- break
- else:
- previous.post_tags.extend(token.pre_tags)
- token.pre_tags = []
-
-
- # logger.debug(f' Result...\n pre: {token.pre_tags}\n token: "{token}"\n post: {token.post_tags}')
-
result = []
# for token in tokens:
for token_index, token in enumerate(tokens):
From cb540b612047065af823d5577cc19648bcc99ed9 Mon Sep 17 00:00:00 2001
From: Rob Brackett
Date: Mon, 1 May 2023 18:12:48 -0700
Subject: [PATCH 5/7] Get rid of `_customize_token()`
The only thing this function was doing was replacing `href_token` instances with `MinimalHrefToken`. We did this at a time when we were using parts of the tokenization internals from lxml instead of fully forking it. We have long since fully forked it, however, and we should just be creating `MinimalHrefToken` where we want them in the first place instead of looping through and replacing other tokens with them.
---
web_monitoring_diff/html_render_diff.py | 38 ++++++-------------------
1 file changed, 9 insertions(+), 29 deletions(-)
diff --git a/web_monitoring_diff/html_render_diff.py b/web_monitoring_diff/html_render_diff.py
index 4170114..78c23cc 100644
--- a/web_monitoring_diff/html_render_diff.py
+++ b/web_monitoring_diff/html_render_diff.py
@@ -831,7 +831,7 @@ def fixup_chunks(chunks, comparator):
elif current_token == TokenType.href:
href = chunk[1]
- cur_word = href_token(href, comparator=comparator, pre_tags=tag_accum, trailing_whitespace=" ")
+ cur_word = MinimalHrefToken(href, comparator=comparator, pre_tags=tag_accum, trailing_whitespace=" ")
tag_accum = []
result.append(cur_word)
@@ -1112,20 +1112,19 @@ def _customize_tokens(tokens):
# result.append(SpacerToken(SPACER_STRING))
# result.append(SpacerToken(SPACER_STRING))
- customized = _customize_token(token)
- result.append(customized)
+ result.append(token)
- if str(customized) == "Posts" and str(tokens[token_index - 1]) == 'Other' and str(tokens[token_index - 2]) == 'and': # and str(tokens[token_index - 3]) == 'posts':
+ if str(token) == "Posts" and str(tokens[token_index - 1]) == 'Other' and str(tokens[token_index - 2]) == 'and': # and str(tokens[token_index - 3]) == 'posts':
logger.debug(f'SPECIAL TAG!\n pre: {token.pre_tags}\n token: "{token}"\n post: {token.post_tags}')
next_token = tokens[token_index + 1]
logger.debug(f'SPECIAL TAG!\n pre: {next_token.pre_tags}\n token: "{next_token}"\n post: {next_token.post_tags}')
- for tag_index, tag in enumerate(customized.post_tags):
+ for tag_index, tag in enumerate(token.post_tags):
if tag.startswith(''):
new_token = SpacerToken(SPACER_STRING)
result.append(new_token)
- new_token = SpacerToken(SPACER_STRING, pre_tags=customized.post_tags[tag_index:])
+ new_token = SpacerToken(SPACER_STRING, pre_tags=token.post_tags[tag_index:])
result.append(new_token)
- customized.post_tags = customized.post_tags[:tag_index]
+ token.post_tags = token.post_tags[:tag_index]
# if isinstance(customized, ImgTagToken):
# result.append(SpacerToken(SPACER_STRING))
@@ -1143,7 +1142,7 @@ def _customize_tokens(tokens):
# # result.append(SpacerToken(SPACER_STRING, post_tags=customized.post_tags, trailing_whitespace=customized.trailing_whitespace))
# customized.post_tags = []
# # customized.trailing_whitespace = ''
- for tag_index, tag in enumerate(customized.post_tags):
+ for tag_index, tag in enumerate(token.post_tags):
split_here = False
for name in SEPARATABLE_TAGS:
if tag.startswith(f'<{name}'):
@@ -1156,8 +1155,8 @@ def _customize_tokens(tokens):
# new_token = SpacerToken(SPACER_STRING, pre_tags=customized.post_tags[tag_index:])
# customized.post_tags = customized.post_tags[0:tag_index]
- new_token = SpacerToken(SPACER_STRING, post_tags=customized.post_tags[tag_index:])
- customized.post_tags = customized.post_tags[0:tag_index]
+ new_token = SpacerToken(SPACER_STRING, post_tags=token.post_tags[tag_index:])
+ token.post_tags = token.post_tags[0:tag_index]
# tokens.insert(token_index + 1, token)
# token = new_token
@@ -1193,25 +1192,6 @@ def _has_heading_tags(tag_list):
return True
-# Seemed so nice and clean! But should probably be merged into
-# `_customize_tokens()` now. Or otherwise it needs to be able to produce more
-# than one token to replace the given token in the stream.
-def _customize_token(token):
- """
- Replace existing diffing tokens with customized ones for better output.
- """
- if isinstance(token, href_token):
- return MinimalHrefToken(
- str(token),
- comparator=token.comparator,
- pre_tags=token.pre_tags,
- post_tags=token.post_tags,
- trailing_whitespace=token.trailing_whitespace)
- # return token
- else:
- return token
-
-
# TODO: merge and reconcile this with `merge_change_groups()`, which is 90%
# the same thing; it outputs the change elements as nested lists of tokens.
def merge_changes(change_chunks, doc, tag_type='ins'):
From 6b425b68ea335c05d0a22c88d4b6338e5f4347d8 Mon Sep 17 00:00:00 2001
From: Rob Brackett
Date: Mon, 1 May 2023 18:17:58 -0700
Subject: [PATCH 6/7] Rename `_customize_tokens` to `_insert_spacers`
The new name is more accurate to what the function does.
---
web_monitoring_diff/html_render_diff.py | 18 +++++++++++++-----
1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/web_monitoring_diff/html_render_diff.py b/web_monitoring_diff/html_render_diff.py
index 78c23cc..84f335f 100644
--- a/web_monitoring_diff/html_render_diff.py
+++ b/web_monitoring_diff/html_render_diff.py
@@ -554,10 +554,8 @@ def _htmldiff(old, new, comparator, include='all'):
"""
old_tokens = tokenize(old, comparator)
new_tokens = tokenize(new, comparator)
- # old_tokens = [_customize_token(token) for token in old_tokens]
- # new_tokens = [_customize_token(token) for token in new_tokens]
- old_tokens = _limit_spacers(_customize_tokens(old_tokens), MAX_SPACERS)
- new_tokens = _limit_spacers(_customize_tokens(new_tokens), MAX_SPACERS)
+ old_tokens = _limit_spacers(_insert_spacers(old_tokens), MAX_SPACERS)
+ new_tokens = _limit_spacers(_insert_spacers(new_tokens), MAX_SPACERS)
# result = htmldiff_tokens(old_tokens, new_tokens)
# result = diff_tokens(old_tokens, new_tokens) #, include='delete')
logger.debug('CUSTOMIZED!')
@@ -1044,7 +1042,17 @@ def __hash__(self):
return super().__hash__()
-def _customize_tokens(tokens):
+# FIXME: Add a `max` parameter or similar to limit the number of spacers. We
+# currently do this by making a second pass to remove extra spacers
+# (in `_limit_spacers()`), which is completely pointless extra work, and very
+# expensive on big HTML documents.
+#
+# TODO: This entire bit of functionality should be rethought. The spacers were
+# a bit of a hack from the early days when we were slightly customizing lxml's
+# differ, and we've since changed the internals a lot. The spacers have never
+# worked especially well, and we may be better off without them. This needs
+# *lots* of testing, though.
+def _insert_spacers(tokens):
SPACER_STRING = '\nSPACER'
result = []
From b020d0cea6178e390e74578975aeed25b9e528f1 Mon Sep 17 00:00:00 2001
From: Rob Brackett
Date: Mon, 1 May 2023 19:32:57 -0700
Subject: [PATCH 7/7] Don't insert more spacers than we are allowed
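For illustration, the budgeting pattern this change introduces via `allocate_spacer()` looks roughly like the following standalone sketch (hypothetical names, not the code in this diff): a closure decrements a shared budget and reports whether the requested number of spacers is still available, so call sites simply skip inserting spacers once the budget is spent.

    def make_allocator(budget):
        # Grant `count` units at a time until the shared budget runs out.
        def allocate(count=1):
            nonlocal budget
            if budget < count:
                return False
            budget -= count
            return True
        return allocate

    allocate_spacer = make_allocator(5)
    inserted = [i for i in range(10) if allocate_spacer(2)]  # each insert costs 2
    assert inserted == [0, 1]  # a budget of 5 only covers two insertions of 2

Once the budget hits zero, `_insert_spacers()` can also bail out of its per-token work early, which is the other half of this change.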
---
web_monitoring_diff/html_render_diff.py | 108 +++++++++++-------------
1 file changed, 48 insertions(+), 60 deletions(-)
diff --git a/web_monitoring_diff/html_render_diff.py b/web_monitoring_diff/html_render_diff.py
index 84f335f..a94711c 100644
--- a/web_monitoring_diff/html_render_diff.py
+++ b/web_monitoring_diff/html_render_diff.py
@@ -554,8 +554,8 @@ def _htmldiff(old, new, comparator, include='all'):
"""
old_tokens = tokenize(old, comparator)
new_tokens = tokenize(new, comparator)
- old_tokens = _limit_spacers(_insert_spacers(old_tokens), MAX_SPACERS)
- new_tokens = _limit_spacers(_insert_spacers(new_tokens), MAX_SPACERS)
+ old_tokens = _insert_spacers(old_tokens, MAX_SPACERS)
+ new_tokens = _insert_spacers(new_tokens, MAX_SPACERS)
# result = htmldiff_tokens(old_tokens, new_tokens)
# result = diff_tokens(old_tokens, new_tokens) #, include='delete')
logger.debug('CUSTOMIZED!')
@@ -586,35 +586,6 @@ def render_diff(diff_type):
return metadata, diffs
-# FIXME: this is utterly ridiculous -- the crazy spacer token solution we came
-# up with can add so much extra stuff to some kinds of pages that
-# SequenceMatcher chokes on it. This strips out excess spacers. We should
-# really re-examine the whole spacer token concept now that we control the
-# tokenization phase, though.
-def _limit_spacers(tokens, max_spacers):
- limited_tokens = []
- extra_pre_tags = []
- for token in tokens:
- if isinstance(token, SpacerToken):
- if max_spacers <= 0:
- extra_pre_tags.extend(token.pre_tags)
- extra_pre_tags.extend(token.post_tags)
- continue
- max_spacers -= 1
-
- if len(extra_pre_tags):
- token.pre_tags = [*extra_pre_tags, *token.pre_tags]
- extra_pre_tags.clear()
-
- limited_tokens.append(token)
-
- if len(extra_pre_tags):
- last = limited_tokens[-1]
- last.post_tags = [*last.post_tags, *extra_pre_tags]
-
- return limited_tokens
-
-
def _count_changes(opcodes):
counts = Counter(map(lambda operation: operation[0], opcodes))
return {
@@ -1052,12 +1023,28 @@ def __hash__(self):
# differ, and we've since changed the internals a lot. The spacers have never
# worked especially well, and we may be better off without them. This needs
# *lots* of testing, though.
-def _insert_spacers(tokens):
+def _insert_spacers(tokens, max = MAX_SPACERS):
+ if max < 1:
+ return tokens
+
SPACER_STRING = '\nSPACER'
+ def allocate_spacer(count = 1):
+ nonlocal max
+ if max < count:
+ return False
+ else:
+ max -= count
+ return True
+
result = []
# for token in tokens:
for token_index, token in enumerate(tokens):
+ # Bail out early if we've run out of spacers.
+ if max < 1:
+ result.append(token)
+ continue
+
# if str(token).lower().startswith('impacts'):
# if str(token).lower().startswith('although'):
# logger.debug(f'SPECIAL TAG!\n pre: {token.pre_tags}\n token: "{token}"\n post: {token.post_tags}')
@@ -1073,6 +1060,7 @@ def _insert_spacers(tokens):
try_splitting = len(token.pre_tags) > 0
split_start = 0
while try_splitting:
+ try_splitting = False
for tag_index, tag in enumerate(token.pre_tags[split_start:]):
split_here = False
for name in SEPARATABLE_TAGS:
@@ -1080,22 +1068,21 @@ def _insert_spacers(tokens):
split_here = True
break
if split_here:
- # new_token = SpacerToken(SPACER_STRING, pre_tags=token.pre_tags[0:tag_index + 1])
- # token.pre_tags = token.pre_tags[tag_index + 1:]
-
- new_token = SpacerToken(SPACER_STRING, pre_tags=token.pre_tags[0:tag_index + split_start])
- token.pre_tags = token.pre_tags[tag_index + split_start:]
-
- # tokens.insert(token_index + 1, token)
- # token = new_token
- result.append(new_token)
- result.append(SpacerToken(SPACER_STRING))
- result.append(SpacerToken(SPACER_STRING))
- try_splitting = len(token.pre_tags) > 1
- split_start = 1
+ if allocate_spacer(3):
+ # new_token = SpacerToken(SPACER_STRING, pre_tags=token.pre_tags[0:tag_index + 1])
+ # token.pre_tags = token.pre_tags[tag_index + 1:]
+
+ new_token = SpacerToken(SPACER_STRING, pre_tags=token.pre_tags[0:tag_index + split_start])
+ token.pre_tags = token.pre_tags[tag_index + split_start:]
+
+ # tokens.insert(token_index + 1, token)
+ # token = new_token
+ result.append(new_token)
+ result.append(SpacerToken(SPACER_STRING))
+ result.append(SpacerToken(SPACER_STRING))
+ try_splitting = len(token.pre_tags) > 1
+ split_start = 1
break
- else:
- try_splitting = False
# This is a CRITICAL scenario, but should probably be generalized and
@@ -1110,7 +1097,7 @@ def _insert_spacers(tokens):
for index, tag in enumerate(token.pre_tags):
if tag.startswith(' index + 1:
next_tag = token.pre_tags[index + 1]
- if next_tag and next_tag.startswith(''):
+ if tag.startswith('') and allocate_spacer(2):
new_token = SpacerToken(SPACER_STRING)
result.append(new_token)
new_token = SpacerToken(SPACER_STRING, pre_tags=token.post_tags[tag_index:])
@@ -1157,20 +1144,21 @@ def _insert_spacers(tokens):
split_here = True
break
if split_here:
- # new_token = SpacerToken(SPACER_STRING, pre_tags=customized.post_tags[tag_index + 1:])
- # customized.post_tags = customized.post_tags[0:tag_index + 1]
+ if allocate_spacer(3):
+ # new_token = SpacerToken(SPACER_STRING, pre_tags=customized.post_tags[tag_index + 1:])
+ # customized.post_tags = customized.post_tags[0:tag_index + 1]
- # new_token = SpacerToken(SPACER_STRING, pre_tags=customized.post_tags[tag_index:])
- # customized.post_tags = customized.post_tags[0:tag_index]
+ # new_token = SpacerToken(SPACER_STRING, pre_tags=customized.post_tags[tag_index:])
+ # customized.post_tags = customized.post_tags[0:tag_index]
- new_token = SpacerToken(SPACER_STRING, post_tags=token.post_tags[tag_index:])
- token.post_tags = token.post_tags[0:tag_index]
+ new_token = SpacerToken(SPACER_STRING, post_tags=token.post_tags[tag_index:])
+ token.post_tags = token.post_tags[0:tag_index]
- # tokens.insert(token_index + 1, token)
- # token = new_token
- result.append(new_token)
- result.append(SpacerToken(SPACER_STRING))
- result.append(SpacerToken(SPACER_STRING))
+ # tokens.insert(token_index + 1, token)
+ # token = new_token
+ result.append(new_token)
+ result.append(SpacerToken(SPACER_STRING))
+ result.append(SpacerToken(SPACER_STRING))
break
return result