Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@

Changelog
=========
* 0.3.13
* Significant performance gains for documents with a large number of table
cells.
* Significant performance gains for large documents.
* 0.3.12
* Added command line support to convert from docx to either html or
markdown.
Expand Down
8 changes: 4 additions & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -185,16 +185,16 @@ When creating your own Parser (as described above) you can now add in your own c
::

class Docx2Foo(DocxParser):
pre_processor_class = FooPrePorcessor
pre_processor_class = FooPreProcessor


The `FooPrePorcessor` will need a few things to get you going:
The `FooPreProcessor` will need a few things to get you going:

::

class FooPrePorcessor(PydocxPrePorcessor):
class FooPreProcessor(PydocxPreProcessor):
def perform_pre_processing(self, root, *args, **kwargs):
super(FooPrePorcessor, self).perform_pre_processing(root, *args, **kwargs)
super(FooPreProcessor, self).perform_pre_processing(root, *args, **kwargs)
self._set_foo(root)

def _set_foo(self, root):
Expand Down
58 changes: 37 additions & 21 deletions pydocx/DocxParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,14 @@
from contextlib import contextmanager

from pydocx.utils import (
PydocxPrePorcessor,
get_list_style,
parse_xml_from_string,
find_first,
MulitMemoizeMixin,
PydocxPreProcessor,
find_all,
find_ancestor_with_tag,
find_first,
get_list_style,
has_descendant_with_tag,
parse_xml_from_string,
)
from pydocx.exceptions import MalformedDocxException

Expand Down Expand Up @@ -46,9 +47,9 @@ def ZipFile(path): # This is not needed in python 3.2+
f.close()


class DocxParser:
class DocxParser(MulitMemoizeMixin):
__metaclass__ = ABCMeta
pre_processor_class = PydocxPrePorcessor
pre_processor_class = PydocxPreProcessor

def _extract_xml(self, f, xml_path):
try:
Expand Down Expand Up @@ -161,13 +162,19 @@ def __init__(

#all blank when we init
self.comment_store = None
self.visited = []
self.visited = set()
self.list_depth = 0
self.rels_dict = self._parse_rels_root()
self.styles_dict = self._parse_styles()
self.parse_begin(self.root) # begin to parse

def parse_begin(self, el):
self.populate_memoization({
'find_all': find_all,
'find_first': find_first,
'has_descendant_with_tag': has_descendant_with_tag,
'_get_tcs_in_column': self._get_tcs_in_column,
})
self.pre_processor = self.pre_processor_class(
convert_root_level_upper_roman=self.convert_root_level_upper_roman,
styles_dict=self.styles_dict,
Expand All @@ -179,7 +186,7 @@ def parse_begin(self, el):
def parse(self, el):
if el in self.visited:
return ''
self.visited.append(el)
self.visited.add(el)
parsed = ''
for child in el:
# recursive. So you can get all the way to the bottom
Expand Down Expand Up @@ -417,7 +424,7 @@ def _should_append_break_tag(self, next_el):
if self.pre_processor.previous(next_el) is None:
return False
tag_is_inline_like = any(
has_descendant_with_tag(next_el, tag) for
self.memod_tree_op('has_descendant_with_tag', next_el, tag) for
tag in inline_like_tags
)
if tag_is_inline_like:
Expand Down Expand Up @@ -478,7 +485,20 @@ def _should_parse_next_as_content(el):
# Create the actual li element
return self.list_element(parsed)

def _get_tcs_in_column(self, tbl, column_index):
    """Return the ``tc`` (table cell) elements of *tbl* that live in the
    column numbered *column_index*.

    Cell lookup goes through the memoized ``find_all`` so repeated scans
    of the same table are cheap; column position comes from the
    pre-processor's metadata.
    """
    cells_in_column = []
    for cell in self.memod_tree_op('find_all', tbl, 'tc'):
        if self.pre_processor.column_index(cell) == column_index:
            cells_in_column.append(cell)
    return cells_in_column

def _get_rowspan(self, el, v_merge):
restart_in_v_merge = False
if v_merge is not None and 'val' in v_merge.attrib:
restart_in_v_merge = 'restart' in v_merge.attrib['val']

if not restart_in_v_merge:
return ''

current_row = self.pre_processor.row_index(el)
current_col = self.pre_processor.column_index(el)
rowspan = 1
Expand All @@ -488,24 +508,20 @@ def _get_rowspan(self, el, v_merge):
# than the current_row and that are on the current_col
if tbl is None:
return ''

tcs = [
tc for tc in find_all(tbl, 'tc')
if self.pre_processor.row_index(tc) >= current_row and
self.pre_processor.column_index(tc) == current_col
tc for tc in self.memod_tree_op(
'_get_tcs_in_column', tbl, current_col,
) if self.pre_processor.row_index(tc) >= current_row
]
restart_in_v_merge = False
if v_merge is not None and 'val' in v_merge.attrib:
restart_in_v_merge = 'restart' in v_merge.attrib['val']

def increment_rowspan(tc):
if not restart_in_v_merge:
return False
def should_increment_rowspan(tc):
if not self.pre_processor.vmerge_continue(tc):
return False
return True

for tc in tcs:
if increment_rowspan(tc):
if should_increment_rowspan(tc):
rowspan += 1
else:
rowspan = 1
Expand All @@ -517,7 +533,7 @@ def get_colspan(self, el):
grid_span = find_first(el, 'gridSpan')
if grid_span is None:
return ''
return find_first(el, 'gridSpan').attrib['val']
return grid_span.attrib['val']

def parse_table_cell_contents(self, el, text):
parsed = text
Expand Down Expand Up @@ -640,7 +656,7 @@ def parse_r(self, el, parsed):

# Get the rPr for the current style, they are the defaults.
p = find_ancestor_with_tag(self.pre_processor, el, 'p')
paragraph_style = find_first(p, 'pStyle')
paragraph_style = self.memod_tree_op('find_first', p, 'pStyle')
if paragraph_style is not None:
style = paragraph_style.get('val')
style_defaults = self.styles_dict.get(style, {})
Expand Down
63 changes: 52 additions & 11 deletions pydocx/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
import collections

from collections import defaultdict
from xml.etree import cElementTree
Expand All @@ -21,6 +22,43 @@
)


class MulitMemoize(object):
    '''
    Memoize several functions at once, each under its own name.

    Adapted from: https://wiki.python.org/moin/PythonDecoratorLibrary#Memoize

    ``func_names`` maps a lookup name to the callable it memoizes, e.g.::

        func_names = {
            'find_all': find_all,
            ...
        }

    Call the instance as ``memoizer('find_all', *args)``; results are cached
    per function, keyed by the positional-argument tuple.
    '''
    def __init__(self, func_names):
        # One independent result cache per registered function.
        self.cache = dict((func_name, {}) for func_name in func_names)
        self.func_names = func_names

    def __call__(self, func_name, *args):
        cache = self.cache[func_name]
        try:
            return cache[args]
        except KeyError:
            # First call with these args: compute and remember the result.
            value = self.func_names[func_name](*args)
            cache[args] = value
            return value
        except TypeError:
            # args contains something unhashable (a list, for instance).
            # Better to not cache than blow up.
            # NOTE: the previous isinstance(args, collections.Hashable)
            # guard could never catch this case -- ``args`` is always a
            # tuple, and tuples are "Hashable" even when their contents
            # are not, so the lookup raised TypeError anyway.
            return self.func_names[func_name](*args)


class MulitMemoizeMixin(object):
    """Mixin that adds a per-instance memoized-call facility.

    A class using this mixin first calls ``populate_memoization`` with a
    mapping of names to callables; afterwards ``memod_tree_op(name, *args)``
    dispatches through a shared ``MulitMemoize`` instance so repeated calls
    with the same arguments reuse cached results.
    """

    def __init__(self, *args, **kwargs):
        super(MulitMemoizeMixin, self).__init__(*args, **kwargs)
        # No memoizer exists until populate_memoization() is called.
        self._memoization = None

    def populate_memoization(self, func_names):
        # (Re)build the memoizer; any previously cached results are dropped.
        self._memoization = MulitMemoize(func_names)

    def memod_tree_op(self, func_name, *args):
        return self._memoization(func_name, *args)


def el_iter(el):
"""
Go through all elements
Expand Down Expand Up @@ -61,7 +99,7 @@ def has_descendant_with_tag(el, tag):
Determine if there is a child ahead in the element tree.
"""
# Get child. stop at first child.
return True if el.find('.//' + tag) is not None else False
return True if find_first(el, tag) is not None else False


def _filter_children(element, tags):
Expand Down Expand Up @@ -154,7 +192,7 @@ def num_id(self):
return self._num_id


class PydocxPrePorcessor(object):
class PydocxPreProcessor(MulitMemoizeMixin):
def __init__(
self,
convert_root_level_upper_roman=False,
Expand All @@ -167,6 +205,9 @@ def __init__(
self.numbering_root = numbering_root

def perform_pre_processing(self, root, *args, **kwargs):
self.populate_memoization({
'find_first': find_first,
})
self._add_parent(root)
# If we don't have a numbering root there cannot be any lists.
if self.numbering_root is not None:
Expand Down Expand Up @@ -251,14 +292,12 @@ def _set_list_attributes(self, el):
# Deleted text in a list will have a numId but no ilvl.
if parent is None:
continue
if find_first(parent, 'ilvl') is None:
parent_ilvl = self.memod_tree_op('find_first', parent, 'ilvl')
if parent_ilvl is None:
continue
self.meta_data[parent]['is_list_item'] = True
self.meta_data[parent]['num_id'] = self._generate_num_id(parent)
self.meta_data[parent]['ilvl'] = find_first(
parent,
'ilvl',
).attrib['val']
self.meta_data[parent]['ilvl'] = parent_ilvl.attrib['val']

def _generate_num_id(self, el):
'''
Expand Down Expand Up @@ -364,9 +403,10 @@ def _set_headers(self, elements):

for element in elements:
# This element is using the default style which is not a heading.
if find_first(element, 'pStyle') is None:
p_style = find_first(element, 'pStyle')
if p_style is None:
continue
style = find_first(element, 'pStyle').attrib.get('val', '')
style = p_style.attrib.get('val', '')
metadata = self.styles_dict.get(style, {})
style_name = metadata.get('style_name')

Expand All @@ -389,6 +429,7 @@ def _convert_upper_roman(self, body):
if self.is_first_list_item(el)
]
visited_num_ids = []
all_p_tags_in_body = find_all(body, 'p')
for root_list_item in first_root_list_items:
if self.num_id(root_list_item) in visited_num_ids:
continue
Expand All @@ -401,11 +442,11 @@ def _convert_upper_roman(self, body):
if lst_style != 'upperRoman':
continue
ilvl = min(
self.ilvl(el) for el in find_all(body, 'p')
self.ilvl(el) for el in all_p_tags_in_body
if self.num_id(el) == self.num_id(root_list_item)
)
root_upper_roman_list_items = [
el for el in find_all(body, 'p')
el for el in all_p_tags_in_body
if self.num_id(el) == self.num_id(root_list_item) and
self.ilvl(el) == ilvl
]
Expand Down