From 56a249db111a611809c834603d93693863d464dd Mon Sep 17 00:00:00 2001
From: Jason Ward <jason.ward@policystat.com>
Date: Mon, 4 Nov 2013 16:20:08 -0500
Subject: [PATCH 1/6] refs #64: Added memoization around certain expensive
 operations.

---
 pydocx/utils.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/pydocx/utils.py b/pydocx/utils.py
index fabe7863..6c88f37e 100644
--- a/pydocx/utils.py
+++ b/pydocx/utils.py
@@ -1,4 +1,6 @@
 import re
+import collections
+import functools
 
 from collections import defaultdict
 from xml.etree import cElementTree
@@ -21,6 +23,39 @@
 )
 
 
+class memoized(object):
+    '''
+    Decorator. Caches a function's return value each time it is called.
+    If called later with the same arguments, the cached value is returned
+    (not reevaluated).
+    Stolen from: https://wiki.python.org/moin/PythonDecoratorLibrary#Memoize
+    '''
+    def __init__(self, func):
+        self.func = func
+        self.cache = {}
+
+    def __call__(self, *args):
+        if not isinstance(args, collections.Hashable):
+            # uncacheable. a list, for instance.
+            # better to not cache than blow up.
+            return self.func(*args)
+        if args in self.cache:
+            return self.cache[args]
+        else:
+            value = self.func(*args)
+            self.cache[args] = value
+            return value
+
+    def __repr__(self):
+        '''Return the function's docstring.'''
+        return self.func.__doc__
+
+    def __get__(self, obj, objtype):
+        '''Support instance methods.'''
+        return functools.partial(self.__call__, obj)
+
+
+@memoized
 def el_iter(el):
     """
     Go through all elements
@@ -31,6 +66,7 @@ def el_iter(el):
         return el.findall('.//*')
 
 
+@memoized
 def find_first(el, tag):
     """
     Find the first occurrence of a tag beneath the current element.
@@ -38,6 +74,7 @@ def find_first(el, tag):
     return el.find('.//' + tag)
 
 
+@memoized
 def find_all(el, tag):
     """
     Find all occurrences of a tag
@@ -45,6 +82,7 @@ def find_all(el, tag):
     return el.findall('.//' + tag)
 
 
+@memoized
 def find_ancestor_with_tag(pre_processor, el, tag):
     """
     Find the first ancestor with that is a `tag`.
@@ -56,6 +94,7 @@ def find_ancestor_with_tag(pre_processor, el, tag):
     return None
 
 
+@memoized
 def has_descendant_with_tag(el, tag):
     """
     Determine if there is a child ahead in the element tree.

From e3cc3ecfce607c36fbf742f43c3430265b3c13ad Mon Sep 17 00:00:00 2001
From: Jason Ward <jason.ward@policystat.com>
Date: Mon, 4 Nov 2013 16:21:07 -0500
Subject: [PATCH 2/6] refs #64: Add 0.3.13 entry to CHANGELOG

---
 CHANGELOG | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index 829d1041..2522b5ec 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,9 @@
 
 Changelog
 =========
+* 0.3.13
+    * Significant performance gains for documents with a large number of table
+      cells.
 * 0.3.12
     * Added command line support to convert from docx to either html or
       markdown.

From 154fded8cf9f374d56ab78809a6e7afbff64a620 Mon Sep 17 00:00:00 2001
From: Jason Ward <jason.ward@policystat.com>
Date: Mon, 4 Nov 2013 16:53:46 -0500
Subject: [PATCH 3/6] refs #64: Stop memoizing el_iter (something odd in 2.7)

---
 pydocx/utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pydocx/utils.py b/pydocx/utils.py
index 6c88f37e..d0f604ca 100644
--- a/pydocx/utils.py
+++ b/pydocx/utils.py
@@ -55,7 +55,6 @@ def __get__(self, obj, objtype):
         return functools.partial(self.__call__, obj)
 
 
-@memoized
 def el_iter(el):
     """
     Go through all elements

From ff1ba6567f16474170a6684b599d268834df4726 Mon Sep 17 00:00:00 2001
From: Jason Ward <jason.ward@policystat.com>
Date: Tue, 5 Nov 2013 13:13:41 -0500
Subject: [PATCH 4/6] refs #64: Fix typos, add per-instance memoization, use a
 set for visited elements (significantly faster than a list in this case)

---
 README.rst           |  8 ++---
 pydocx/DocxParser.py | 58 +++++++++++++++++++++-------------
 pydocx/utils.py      | 75 +++++++++++++++++++++++---------------------
 3 files changed, 80 insertions(+), 61 deletions(-)

diff --git a/README.rst b/README.rst
index fe21f717..6c41ad8c 100644
--- a/README.rst
+++ b/README.rst
@@ -185,16 +185,16 @@ When creating your own Parser (as described above) you can now add in your own c
 ::
 
     class Docx2Foo(DocxParser):
-        pre_processor_class = FooPrePorcessor
+        pre_processor_class = FooPreProcessor
 
 
-The `FooPrePorcessor` will need a few things to get you going:
+The `FooPreProcessor` will need a few things to get you going:
 
 ::
 
-    class FooPrePorcessor(PydocxPrePorcessor):
+    class FooPreProcessor(PydocxPreProcessor):
         def perform_pre_processing(self, root, *args, **kwargs):
-            super(FooPrePorcessor, self).perform_pre_processing(root, *args, **kwargs)
+            super(FooPreProcessor, self).perform_pre_processing(root, *args, **kwargs)
             self._set_foo(root)
 
         def _set_foo(self, root):
diff --git a/pydocx/DocxParser.py b/pydocx/DocxParser.py
index 5d618ecd..70bf3643 100644
--- a/pydocx/DocxParser.py
+++ b/pydocx/DocxParser.py
@@ -6,13 +6,14 @@
 from contextlib import contextmanager
 
 from pydocx.utils import (
-    PydocxPrePorcessor,
-    get_list_style,
-    parse_xml_from_string,
-    find_first,
+    MulitMemoizeMixin,
+    PydocxPreProcessor,
     find_all,
     find_ancestor_with_tag,
+    find_first,
+    get_list_style,
     has_descendant_with_tag,
+    parse_xml_from_string,
 )
 from pydocx.exceptions import MalformedDocxException
 
@@ -46,9 +47,9 @@ def ZipFile(path):  # This is not needed in python 3.2+
     f.close()
 
 
-class DocxParser:
+class DocxParser(MulitMemoizeMixin):
     __metaclass__ = ABCMeta
-    pre_processor_class = PydocxPrePorcessor
+    pre_processor_class = PydocxPreProcessor
 
     def _extract_xml(self, f, xml_path):
         try:
@@ -161,13 +162,19 @@ def __init__(
 
         #all blank when we init
         self.comment_store = None
-        self.visited = []
+        self.visited = set()
         self.list_depth = 0
         self.rels_dict = self._parse_rels_root()
         self.styles_dict = self._parse_styles()
         self.parse_begin(self.root)  # begin to parse
 
     def parse_begin(self, el):
+        self.populate_memoization({
+            'find_all': find_all,
+            'find_first': find_first,
+            'has_descendant_with_tag': has_descendant_with_tag,
+            'get_tcs_in_column': self.get_tcs_in_column,
+        })
         self.pre_processor = self.pre_processor_class(
             convert_root_level_upper_roman=self.convert_root_level_upper_roman,
             styles_dict=self.styles_dict,
@@ -179,7 +186,7 @@ def parse_begin(self, el):
     def parse(self, el):
         if el in self.visited:
             return ''
-        self.visited.append(el)
+        self.visited.add(el)
         parsed = ''
         for child in el:
             # recursive. So you can get all the way to the bottom
@@ -417,7 +424,7 @@ def _should_append_break_tag(self, next_el):
         if self.pre_processor.previous(next_el) is None:
             return False
         tag_is_inline_like = any(
-            has_descendant_with_tag(next_el, tag) for
+            self.memod_tree_op('has_descendant_with_tag', next_el, tag) for
             tag in inline_like_tags
         )
         if tag_is_inline_like:
@@ -478,7 +485,20 @@ def _should_parse_next_as_content(el):
         # Create the actual li element
         return self.list_element(parsed)
 
+    def get_tcs_in_column(self, tbl, column_index):
+        return [
+            tc for tc in self.memod_tree_op('find_all', tbl, 'tc')
+            if self.pre_processor.column_index(tc) == column_index
+        ]
+
     def _get_rowspan(self, el, v_merge):
+        restart_in_v_merge = False
+        if v_merge is not None and 'val' in v_merge.attrib:
+            restart_in_v_merge = 'restart' in v_merge.attrib['val']
+
+        if not restart_in_v_merge:
+            return ''
+
         current_row = self.pre_processor.row_index(el)
         current_col = self.pre_processor.column_index(el)
         rowspan = 1
@@ -488,24 +508,20 @@ def _get_rowspan(self, el, v_merge):
         # than the current_row and that are on the current_col
         if tbl is None:
             return ''
+
         tcs = [
-            tc for tc in find_all(tbl, 'tc')
-            if self.pre_processor.row_index(tc) >= current_row and
-            self.pre_processor.column_index(tc) == current_col
+            tc for tc in self.memod_tree_op(
+                'get_tcs_in_column', tbl, current_col,
+            ) if self.pre_processor.row_index(tc) >= current_row
         ]
-        restart_in_v_merge = False
-        if v_merge is not None and 'val' in v_merge.attrib:
-            restart_in_v_merge = 'restart' in v_merge.attrib['val']
 
-        def increment_rowspan(tc):
-            if not restart_in_v_merge:
-                return False
+        def should_increment_rowspan(tc):
             if not self.pre_processor.vmerge_continue(tc):
                 return False
             return True
 
         for tc in tcs:
-            if increment_rowspan(tc):
+            if should_increment_rowspan(tc):
                 rowspan += 1
             else:
                 rowspan = 1
@@ -517,7 +533,7 @@ def get_colspan(self, el):
         grid_span = find_first(el, 'gridSpan')
         if grid_span is None:
             return ''
-        return find_first(el, 'gridSpan').attrib['val']
+        return grid_span.attrib['val']
 
     def parse_table_cell_contents(self, el, text):
         parsed = text
@@ -640,7 +656,7 @@ def parse_r(self, el, parsed):
 
         # Get the rPr for the current style, they are the defaults.
         p = find_ancestor_with_tag(self.pre_processor, el, 'p')
-        paragraph_style = find_first(p, 'pStyle')
+        paragraph_style = self.memod_tree_op('find_first', p, 'pStyle')
         if paragraph_style is not None:
             style = paragraph_style.get('val')
             style_defaults = self.styles_dict.get(style, {})
diff --git a/pydocx/utils.py b/pydocx/utils.py
index d0f604ca..1323302b 100644
--- a/pydocx/utils.py
+++ b/pydocx/utils.py
@@ -1,6 +1,5 @@
 import re
 import collections
-import functools
 
 from collections import defaultdict
 from xml.etree import cElementTree
@@ -23,36 +22,41 @@
 )
 
 
-class memoized(object):
+class MulitMemoize(object):
     '''
-    Decorator. Caches a function's return value each time it is called.
-    If called later with the same arguments, the cached value is returned
-    (not reevaluated).
-    Stolen from: https://wiki.python.org/moin/PythonDecoratorLibrary#Memoize
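+    Memoizes several named functions at once, one cache per function name.
+    Adapted from: https://wiki.python.org/moin/PythonDecoratorLibrary#Memoize
+    `func_names` maps each name to the callable it stands for, e.g.
+        {'find_all': find_all, ...}
+    Call the instance as memo(func_name, *args) to get the cached result.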
     '''
-    def __init__(self, func):
-        self.func = func
-        self.cache = {}
+    def __init__(self, func_names):
+        self.cache = dict((func_name, {}) for func_name in func_names)
+        self.func_names = func_names
 
-    def __call__(self, *args):
-        if not isinstance(args, collections.Hashable):
-            # uncacheable. a list, for instance.
-            # better to not cache than blow up.
-            return self.func(*args)
-        if args in self.cache:
-            return self.cache[args]
-        else:
-            value = self.func(*args)
-            self.cache[args] = value
-            return value
+    def __call__(self, func_name, *args):
+        '''Return func_names[func_name](*args), cached per name and args.'''
+        func = self.func_names[func_name]
+        cache = self.cache[func_name]
+        try:
+            return cache[args]
+        except TypeError:
+            # unhashable argument (a list, for instance): don't cache.
+            return func(*args)
+        except KeyError:
+            return cache.setdefault(args, func(*args))
 
-    def __repr__(self):
-        '''Return the function's docstring.'''
-        return self.func.__doc__
 
-    def __get__(self, obj, objtype):
-        '''Support instance methods.'''
-        return functools.partial(self.__call__, obj)
+class MulitMemoizeMixin(object):
+    def __init__(self, *args, **kwargs):
+        super(MulitMemoizeMixin, self).__init__(*args, **kwargs)
+        self._memoization = None
+
+    def memod_tree_op(self, func_name, *args):
+        return self._memoization(func_name, *args)
+
+    def populate_memoization(self, func_names):
+        self._memoization = MulitMemoize(func_names)
 
 
 def el_iter(el):
@@ -65,7 +69,6 @@ def el_iter(el):
         return el.findall('.//*')
 
 
-@memoized
 def find_first(el, tag):
     """
     Find the first occurrence of a tag beneath the current element.
@@ -73,7 +76,6 @@ def find_first(el, tag):
     return el.find('.//' + tag)
 
 
-@memoized
 def find_all(el, tag):
     """
     Find all occurrences of a tag
@@ -81,7 +83,6 @@ def find_all(el, tag):
     return el.findall('.//' + tag)
 
 
-@memoized
 def find_ancestor_with_tag(pre_processor, el, tag):
     """
     Find the first ancestor with that is a `tag`.
@@ -93,13 +94,12 @@ def find_ancestor_with_tag(pre_processor, el, tag):
     return None
 
 
-@memoized
 def has_descendant_with_tag(el, tag):
     """
     Determine if there is a child ahead in the element tree.
     """
     # Get child. stop at first child.
-    return True if el.find('.//' + tag) is not None else False
+    return True if find_first(el, tag) is not None else False
 
 
 def _filter_children(element, tags):
@@ -192,7 +192,7 @@ def num_id(self):
         return self._num_id
 
 
-class PydocxPrePorcessor(object):
+class PydocxPreProcessor(MulitMemoizeMixin):
     def __init__(
             self,
             convert_root_level_upper_roman=False,
@@ -205,6 +205,9 @@ def __init__(
         self.numbering_root = numbering_root
 
     def perform_pre_processing(self, root, *args, **kwargs):
+        self.populate_memoization({
+            'find_first': find_first,
+        })
         self._add_parent(root)
         # If we don't have a numbering root there cannot be any lists.
         if self.numbering_root is not None:
@@ -289,14 +292,12 @@ def _set_list_attributes(self, el):
             # Deleted text in a list will have a numId but no ilvl.
             if parent is None:
                 continue
-            if find_first(parent, 'ilvl') is None:
+            parent_ilvl = self.memod_tree_op('find_first', parent, 'ilvl')
+            if parent_ilvl is None:
                 continue
             self.meta_data[parent]['is_list_item'] = True
             self.meta_data[parent]['num_id'] = self._generate_num_id(parent)
-            self.meta_data[parent]['ilvl'] = find_first(
-                parent,
-                'ilvl',
-            ).attrib['val']
+            self.meta_data[parent]['ilvl'] = parent_ilvl.attrib['val']
 
     def _generate_num_id(self, el):
         '''
@@ -402,9 +403,10 @@ def _set_headers(self, elements):
 
         for element in elements:
             # This element is using the default style which is not a heading.
-            if find_first(element, 'pStyle') is None:
+            p_style = find_first(element, 'pStyle')
+            if p_style is None:
                 continue
-            style = find_first(element, 'pStyle').attrib.get('val', '')
+            style = p_style.attrib.get('val', '')
             metadata = self.styles_dict.get(style, {})
             style_name = metadata.get('style_name')
 
@@ -427,6 +429,7 @@ def _convert_upper_roman(self, body):
             if self.is_first_list_item(el)
         ]
         visited_num_ids = []
+        all_p_tags_in_body = find_all(body, 'p')
         for root_list_item in first_root_list_items:
             if self.num_id(root_list_item) in visited_num_ids:
                 continue
@@ -439,11 +442,11 @@ def _convert_upper_roman(self, body):
             if lst_style != 'upperRoman':
                 continue
             ilvl = min(
-                self.ilvl(el) for el in find_all(body, 'p')
+                self.ilvl(el) for el in all_p_tags_in_body
                 if self.num_id(el) == self.num_id(root_list_item)
             )
             root_upper_roman_list_items = [
-                el for el in find_all(body, 'p')
+                el for el in all_p_tags_in_body
                 if self.num_id(el) == self.num_id(root_list_item) and
                 self.ilvl(el) == ilvl
             ]

From 928bcd189837f3649845b8408688d0b24b6fb521 Mon Sep 17 00:00:00 2001
From: Jason Ward <jason.ward@policystat.com>
Date: Tue, 5 Nov 2013 13:16:01 -0500
Subject: [PATCH 5/6] refs #64: Note large-document speedup in CHANGELOG

---
 CHANGELOG | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG b/CHANGELOG
index 2522b5ec..d40440c9 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,7 @@ Changelog
 * 0.3.13
     * Significant performance gains for documents with a large number of table
       cells.
+    * Significant performance gains for large documents.
 * 0.3.12
     * Added command line support to convert from docx to either html or
       markdown.

From 014377af180f99738c4f7dd1fcaf958b0639f43d Mon Sep 17 00:00:00 2001
From: Jason Ward <jason.ward@policystat.com>
Date: Tue, 5 Nov 2013 14:41:36 -0500
Subject: [PATCH 6/6] refs #64: Simple name change.

---
 pydocx/DocxParser.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pydocx/DocxParser.py b/pydocx/DocxParser.py
index 70bf3643..3e24f98f 100644
--- a/pydocx/DocxParser.py
+++ b/pydocx/DocxParser.py
@@ -173,7 +173,7 @@ def parse_begin(self, el):
             'find_all': find_all,
             'find_first': find_first,
             'has_descendant_with_tag': has_descendant_with_tag,
-            'get_tcs_in_column': self.get_tcs_in_column,
+            '_get_tcs_in_column': self._get_tcs_in_column,
         })
         self.pre_processor = self.pre_processor_class(
             convert_root_level_upper_roman=self.convert_root_level_upper_roman,
@@ -485,7 +485,7 @@ def _should_parse_next_as_content(el):
         # Create the actual li element
         return self.list_element(parsed)
 
-    def get_tcs_in_column(self, tbl, column_index):
+    def _get_tcs_in_column(self, tbl, column_index):
         return [
             tc for tc in self.memod_tree_op('find_all', tbl, 'tc')
             if self.pre_processor.column_index(tc) == column_index
@@ -511,7 +511,7 @@ def _get_rowspan(self, el, v_merge):
 
         tcs = [
             tc for tc in self.memod_tree_op(
-                'get_tcs_in_column', tbl, current_col,
+                '_get_tcs_in_column', tbl, current_col,
             ) if self.pre_processor.row_index(tc) >= current_row
         ]