From 56a249db111a611809c834603d93693863d464dd Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Mon, 4 Nov 2013 16:20:08 -0500 Subject: [PATCH 1/6] refs #64: Added memoization around certain expensive operations. --- pydocx/utils.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/pydocx/utils.py b/pydocx/utils.py index fabe7863..6c88f37e 100644 --- a/pydocx/utils.py +++ b/pydocx/utils.py @@ -1,4 +1,6 @@ import re +import collections +import functools from collections import defaultdict from xml.etree import cElementTree @@ -21,6 +23,39 @@ ) +class memoized(object): + ''' + Decorator. Caches a function's return value each time it is called. + If called later with the same arguments, the cached value is returned + (not reevaluated). + Stolen from: https://wiki.python.org/moin/PythonDecoratorLibrary#Memoize + ''' + def __init__(self, func): + self.func = func + self.cache = {} + + def __call__(self, *args): + if not isinstance(args, collections.Hashable): + # uncacheable. a list, for instance. + # better to not cache than blow up. + return self.func(*args) + if args in self.cache: + return self.cache[args] + else: + value = self.func(*args) + self.cache[args] = value + return value + + def __repr__(self): + '''Return the function's docstring.''' + return self.func.__doc__ + + def __get__(self, obj, objtype): + '''Support instance methods.''' + return functools.partial(self.__call__, obj) + + +@memoized def el_iter(el): """ Go through all elements @@ -31,6 +66,7 @@ def el_iter(el): return el.findall('.//*') +@memoized def find_first(el, tag): """ Find the first occurrence of a tag beneath the current element. @@ -38,6 +74,7 @@ def find_first(el, tag): return el.find('.//' + tag) +@memoized def find_all(el, tag): """ Find all occurrences of a tag @@ -45,6 +82,7 @@ def find_all(el, tag): return el.findall('.//' + tag) +@memoized def find_ancestor_with_tag(pre_processor, el, tag): """ Find the first ancestor with that is a `tag`. @@ -56,6 +94,7 @@ def find_ancestor_with_tag(pre_processor, el, tag): return None +@memoized def has_descendant_with_tag(el, tag): """ Determine if there is a child ahead in the element tree. From e3cc3ecfce607c36fbf742f43c3430265b3c13ad Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Mon, 4 Nov 2013 16:21:07 -0500 Subject: [PATCH 2/6] refs #64: Update note --- CHANGELOG | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 829d1041..2522b5ec 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,9 @@ Changelog ========= +* 0.3.13 + * Significant performance gains for documents with a large number of table + cells. * 0.3.12 * Added command line support to convert from docx to either html or markdown. From 154fded8cf9f374d56ab78809a6e7afbff64a620 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Mon, 4 Nov 2013 16:53:46 -0500 Subject: [PATCH 3/6] refs #64: Something odd with el_iter in 2.7 --- pydocx/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pydocx/utils.py b/pydocx/utils.py index 6c88f37e..d0f604ca 100644 --- a/pydocx/utils.py +++ b/pydocx/utils.py @@ -55,7 +55,6 @@ def __get__(self, obj, objtype): return functools.partial(self.__call__, obj) -@memoized def el_iter(el): """ Go through all elements From ff1ba6567f16474170a6684b599d268834df4726 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Tue, 5 Nov 2013 13:13:41 -0500 Subject: [PATCH 4/6] refs #64: fixed typos, added memoization, set() are significantly faster than lists (in this case) --- README.rst | 8 ++--- pydocx/DocxParser.py | 58 +++++++++++++++++++++------------- pydocx/utils.py | 75 +++++++++++++++++++++++--------------------- 3 files changed, 80 insertions(+), 61 deletions(-) diff --git a/README.rst b/README.rst index fe21f717..6c41ad8c 100644 --- a/README.rst +++ b/README.rst @@ -185,16 +185,16 @@ When creating your own Parser (as described above) you can now add in your own c :: class Docx2Foo(DocxParser): - pre_processor_class = FooPrePorcessor + pre_processor_class = FooPreProcessor -The `FooPrePorcessor` will need a few things to get you going: +The `FooPreProcessor` will need a few things to get you going: :: - class FooPrePorcessor(PydocxPrePorcessor): + class FooPreProcessor(PydocxPreProcessor): def perform_pre_processing(self, root, *args, **kwargs): - super(FooPrePorcessor, self).perform_pre_processing(root, *args, **kwargs) + super(FooPreProcessor, self).perform_pre_processing(root, *args, **kwargs) self._set_foo(root) def _set_foo(self, root): diff --git a/pydocx/DocxParser.py b/pydocx/DocxParser.py index 5d618ecd..70bf3643 100644 --- a/pydocx/DocxParser.py +++ b/pydocx/DocxParser.py @@ -6,13 +6,14 @@ from contextlib import contextmanager from pydocx.utils import ( - PydocxPrePorcessor, - get_list_style, - parse_xml_from_string, - find_first, + MulitMemoizeMixin, + PydocxPreProcessor, find_all, find_ancestor_with_tag, + find_first, + get_list_style, has_descendant_with_tag, + parse_xml_from_string, ) from pydocx.exceptions import MalformedDocxException @@ -46,9 +47,9 @@ def ZipFile(path): # This is not needed in python 3.2+ f.close() -class DocxParser: +class DocxParser(MulitMemoizeMixin): __metaclass__ = ABCMeta - pre_processor_class = PydocxPrePorcessor + pre_processor_class = PydocxPreProcessor def _extract_xml(self, f, xml_path): try: @@ -161,13 +162,19 @@ def __init__( #all blank when we init self.comment_store = None - self.visited = [] + self.visited = set() self.list_depth = 0 self.rels_dict = self._parse_rels_root() self.styles_dict = self._parse_styles() self.parse_begin(self.root) # begin to parse def parse_begin(self, el): + self.populate_memoization({ + 'find_all': find_all, + 'find_first': find_first, + 'has_descendant_with_tag': has_descendant_with_tag, + 'get_tcs_in_column': self.get_tcs_in_column, + }) self.pre_processor = self.pre_processor_class( convert_root_level_upper_roman=self.convert_root_level_upper_roman, styles_dict=self.styles_dict, @@ -179,7 +186,7 @@ def parse_begin(self, el): def parse(self, el): if el in self.visited: return '' - self.visited.append(el) + self.visited.add(el) parsed = '' for child in el: # recursive. So you can get all the way to the bottom @@ -417,7 +424,7 @@ def _should_append_break_tag(self, next_el): if self.pre_processor.previous(next_el) is None: return False tag_is_inline_like = any( - has_descendant_with_tag(next_el, tag) for + self.memod_tree_op('has_descendant_with_tag', next_el, tag) for tag in inline_like_tags ) if tag_is_inline_like: @@ -478,7 +485,20 @@ def _should_parse_next_as_content(el): # Create the actual li element return self.list_element(parsed) + def get_tcs_in_column(self, tbl, column_index): + return [ + tc for tc in self.memod_tree_op('find_all', tbl, 'tc') + if self.pre_processor.column_index(tc) == column_index + ] + def _get_rowspan(self, el, v_merge): + restart_in_v_merge = False + if v_merge is not None and 'val' in v_merge.attrib: + restart_in_v_merge = 'restart' in v_merge.attrib['val'] + + if not restart_in_v_merge: + return '' + current_row = self.pre_processor.row_index(el) current_col = self.pre_processor.column_index(el) rowspan = 1 @@ -488,24 +508,20 @@ def _get_rowspan(self, el, v_merge): # than the current_row and that are on the current_col if tbl is None: return '' + tcs = [ - tc for tc in find_all(tbl, 'tc') - if self.pre_processor.row_index(tc) >= current_row and - self.pre_processor.column_index(tc) == current_col + tc for tc in self.memod_tree_op( + 'get_tcs_in_column', tbl, current_col, + ) if self.pre_processor.row_index(tc) >= current_row ] - restart_in_v_merge = False - if v_merge is not None and 'val' in v_merge.attrib: - restart_in_v_merge = 'restart' in v_merge.attrib['val'] - def increment_rowspan(tc): - if not restart_in_v_merge: - return False + def should_increment_rowspan(tc): if not self.pre_processor.vmerge_continue(tc): return False return True for tc in tcs: - if increment_rowspan(tc): + if should_increment_rowspan(tc): rowspan += 1 else: rowspan = 1 @@ -517,7 +533,7 @@ def get_colspan(self, el): grid_span = find_first(el, 'gridSpan') if grid_span is None: return '' - return find_first(el, 'gridSpan').attrib['val'] + return grid_span.attrib['val'] def parse_table_cell_contents(self, el, text): parsed = text @@ -640,7 +656,7 @@ def parse_r(self, el, parsed): # Get the rPr for the current style, they are the defaults. p = find_ancestor_with_tag(self.pre_processor, el, 'p') - paragraph_style = find_first(p, 'pStyle') + paragraph_style = self.memod_tree_op('find_first', p, 'pStyle') if paragraph_style is not None: style = paragraph_style.get('val') style_defaults = self.styles_dict.get(style, {}) diff --git a/pydocx/utils.py b/pydocx/utils.py index d0f604ca..1323302b 100644 --- a/pydocx/utils.py +++ b/pydocx/utils.py @@ -1,6 +1,5 @@ import re import collections -import functools from collections import defaultdict from xml.etree import cElementTree @@ -23,36 +22,41 @@ ) -class memoized(object): +class MulitMemoize(object): ''' - Decorator. Caches a function's return value each time it is called. - If called later with the same arguments, the cached value is returned - (not reevaluated). - Stolen from: https://wiki.python.org/moin/PythonDecoratorLibrary#Memoize + Adapted from: https://wiki.python.org/moin/PythonDecoratorLibrary#Memoize + func_names = { + 'find_all': find_all, + ... + } ''' - def __init__(self, func): - self.func = func - self.cache = {} + def __init__(self, func_names): + self.cache = dict((func_name, {}) for func_name in func_names) + self.func_names = func_names - def __call__(self, *args): + def __call__(self, func_name, *args): if not isinstance(args, collections.Hashable): # uncacheable. a list, for instance. # better to not cache than blow up. - return self.func(*args) - if args in self.cache: - return self.cache[args] + return self.func_names[func_name](*args) + if args in self.cache[func_name]: + return self.cache[func_name][args] else: - value = self.func(*args) - self.cache[args] = value + value = self.func_names[func_name](*args) + self.cache[func_name][args] = value return value - def __repr__(self): - '''Return the function's docstring.''' - return self.func.__doc__ - def __get__(self, obj, objtype): - '''Support instance methods.''' - return functools.partial(self.__call__, obj) +class MulitMemoizeMixin(object): + def __init__(self, *args, **kwargs): + super(MulitMemoizeMixin, self).__init__(*args, **kwargs) + self._memoization = None + + def memod_tree_op(self, func_name, *args): + return self._memoization(func_name, *args) + + def populate_memoization(self, func_names): + self._memoization = MulitMemoize(func_names) def el_iter(el): @@ -65,7 +69,6 @@ def el_iter(el): return el.findall('.//*') -@memoized def find_first(el, tag): """ Find the first occurrence of a tag beneath the current element. @@ -73,7 +76,6 @@ def find_first(el, tag): return el.find('.//' + tag) -@memoized def find_all(el, tag): """ Find all occurrences of a tag @@ -81,7 +83,6 @@ def find_all(el, tag): return el.findall('.//' + tag) -@memoized def find_ancestor_with_tag(pre_processor, el, tag): """ Find the first ancestor with that is a `tag`. @@ -93,13 +94,12 @@ def find_ancestor_with_tag(pre_processor, el, tag): return None -@memoized def has_descendant_with_tag(el, tag): """ Determine if there is a child ahead in the element tree. """ # Get child. stop at first child. - return True if el.find('.//' + tag) is not None else False + return True if find_first(el, tag) is not None else False def _filter_children(element, tags): @@ -192,7 +192,7 @@ def num_id(self): return self._num_id -class PydocxPrePorcessor(object): +class PydocxPreProcessor(MulitMemoizeMixin): def __init__( self, convert_root_level_upper_roman=False, @@ -205,6 +205,9 @@ def __init__( self.numbering_root = numbering_root def perform_pre_processing(self, root, *args, **kwargs): + self.populate_memoization({ + 'find_first': find_first, + }) self._add_parent(root) # If we don't have a numbering root there cannot be any lists. if self.numbering_root is not None: @@ -289,14 +292,12 @@ def _set_list_attributes(self, el): # Deleted text in a list will have a numId but no ilvl. if parent is None: continue - if find_first(parent, 'ilvl') is None: + parent_ilvl = self.memod_tree_op('find_first', parent, 'ilvl') + if parent_ilvl is None: continue self.meta_data[parent]['is_list_item'] = True self.meta_data[parent]['num_id'] = self._generate_num_id(parent) - self.meta_data[parent]['ilvl'] = find_first( - parent, - 'ilvl', - ).attrib['val'] + self.meta_data[parent]['ilvl'] = parent_ilvl.attrib['val'] def _generate_num_id(self, el): ''' @@ -402,9 +403,10 @@ def _set_headers(self, elements): for element in elements: # This element is using the default style which is not a heading. - if find_first(element, 'pStyle') is None: + p_style = find_first(element, 'pStyle') + if p_style is None: continue - style = find_first(element, 'pStyle').attrib.get('val', '') + style = p_style.attrib.get('val', '') metadata = self.styles_dict.get(style, {}) style_name = metadata.get('style_name') @@ -427,6 +429,7 @@ def _convert_upper_roman(self, body): if self.is_first_list_item(el) ] visited_num_ids = [] + all_p_tags_in_body = find_all(body, 'p') for root_list_item in first_root_list_items: if self.num_id(root_list_item) in visited_num_ids: continue @@ -439,11 +442,11 @@ def _convert_upper_roman(self, body): if lst_style != 'upperRoman': continue ilvl = min( - self.ilvl(el) for el in find_all(body, 'p') + self.ilvl(el) for el in all_p_tags_in_body if self.num_id(el) == self.num_id(root_list_item) ) root_upper_roman_list_items = [ - el for el in find_all(body, 'p') + el for el in all_p_tags_in_body if self.num_id(el) == self.num_id(root_list_item) and self.ilvl(el) == ilvl ] From 928bcd189837f3649845b8408688d0b24b6fb521 Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Tue, 5 Nov 2013 13:16:01 -0500 Subject: [PATCH 5/6] refs #64: Update note --- CHANGELOG | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG b/CHANGELOG index 2522b5ec..d40440c9 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ Changelog * 0.3.13 * Significant performance gains for documents with a large number of table cells. + * Significant performance gains for large documents. * 0.3.12 * Added command line support to convert from docx to either html or markdown. From 014377af180f99738c4f7dd1fcaf958b0639f43d Mon Sep 17 00:00:00 2001 From: Jason Ward Date: Tue, 5 Nov 2013 14:41:36 -0500 Subject: [PATCH 6/6] refs #64: Simple name change. --- pydocx/DocxParser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pydocx/DocxParser.py b/pydocx/DocxParser.py index 70bf3643..3e24f98f 100644 --- a/pydocx/DocxParser.py +++ b/pydocx/DocxParser.py @@ -173,7 +173,7 @@ def parse_begin(self, el): 'find_all': find_all, 'find_first': find_first, 'has_descendant_with_tag': has_descendant_with_tag, - 'get_tcs_in_column': self.get_tcs_in_column, + '_get_tcs_in_column': self._get_tcs_in_column, }) self.pre_processor = self.pre_processor_class( convert_root_level_upper_roman=self.convert_root_level_upper_roman, @@ -485,7 +485,7 @@ def _should_parse_next_as_content(el): # Create the actual li element return self.list_element(parsed) - def get_tcs_in_column(self, tbl, column_index): + def _get_tcs_in_column(self, tbl, column_index): return [ tc for tc in self.memod_tree_op('find_all', tbl, 'tc') if self.pre_processor.column_index(tc) == column_index @@ -511,7 +511,7 @@ def _get_rowspan(self, el, v_merge): tcs = [ tc for tc in self.memod_tree_op( - 'get_tcs_in_column', tbl, current_col, + '_get_tcs_in_column', tbl, current_col, ) if self.pre_processor.row_index(tc) >= current_row ]