Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@

Changelog
=========
* 0.3.13
* Significant performance gains for documents with a large number of table
cells.
* Significant performance gains for large documents.
* 0.3.12
* Added command line support to convert from docx to either html or
markdown.
Expand Down
8 changes: 4 additions & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -185,16 +185,16 @@ When creating your own Parser (as described above) you can now add in your own c
::

class Docx2Foo(DocxParser):
pre_processor_class = FooPrePorcessor
pre_processor_class = FooPreProcessor


The `FooPrePorcessor` will need a few things to get you going:
The `FooPreProcessor` will need a few things to get you going:

::

class FooPrePorcessor(PydocxPrePorcessor):
class FooPreProcessor(PydocxPreProcessor):
def perform_pre_processing(self, root, *args, **kwargs):
super(FooPrePorcessor, self).perform_pre_processing(root, *args, **kwargs)
super(FooPreProcessor, self).perform_pre_processing(root, *args, **kwargs)
self._set_foo(root)

def _set_foo(self, root):
Expand Down
58 changes: 37 additions & 21 deletions pydocx/DocxParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,14 @@
from contextlib import contextmanager

from pydocx.utils import (
PydocxPrePorcessor,
get_list_style,
parse_xml_from_string,
find_first,
MulitMemoizeMixin,
PydocxPreProcessor,
find_all,
find_ancestor_with_tag,
find_first,
get_list_style,
has_descendant_with_tag,
parse_xml_from_string,
)
from pydocx.exceptions import MalformedDocxException

Expand Down Expand Up @@ -46,9 +47,9 @@ def ZipFile(path): # This is not needed in python 3.2+
f.close()


class DocxParser:
class DocxParser(MulitMemoizeMixin):
__metaclass__ = ABCMeta
pre_processor_class = PydocxPrePorcessor
pre_processor_class = PydocxPreProcessor

def _extract_xml(self, f, xml_path):
try:
Expand Down Expand Up @@ -161,13 +162,19 @@ def __init__(

#all blank when we init
self.comment_store = None
self.visited = []
self.visited = set()
self.list_depth = 0
self.rels_dict = self._parse_rels_root()
self.styles_dict = self._parse_styles()
self.parse_begin(self.root) # begin to parse

def parse_begin(self, el):
self.populate_memoization({
'find_all': find_all,
'find_first': find_first,
'has_descendant_with_tag': has_descendant_with_tag,
'_get_tcs_in_column': self._get_tcs_in_column,
})
self.pre_processor = self.pre_processor_class(
convert_root_level_upper_roman=self.convert_root_level_upper_roman,
styles_dict=self.styles_dict,
Expand All @@ -179,7 +186,7 @@ def parse_begin(self, el):
def parse(self, el):
if el in self.visited:
return ''
self.visited.append(el)
self.visited.add(el)
parsed = ''
for child in el:
# recursive. So you can get all the way to the bottom
Expand Down Expand Up @@ -417,7 +424,7 @@ def _should_append_break_tag(self, next_el):
if self.pre_processor.previous(next_el) is None:
return False
tag_is_inline_like = any(
has_descendant_with_tag(next_el, tag) for
self.memod_tree_op('has_descendant_with_tag', next_el, tag) for
tag in inline_like_tags
)
if tag_is_inline_like:
Expand Down Expand Up @@ -478,7 +485,20 @@ def _should_parse_next_as_content(el):
# Create the actual li element
return self.list_element(parsed)

def _get_tcs_in_column(self, tbl, column_index):
    """Return the ``tc`` (table cell) elements of *tbl* that live in the
    column numbered *column_index*.

    Cell lookup goes through the memoized ``find_all`` so repeated scans
    of the same table are cheap; column position comes from the
    pre-processor's metadata.
    """
    cells_in_column = []
    for cell in self.memod_tree_op('find_all', tbl, 'tc'):
        if self.pre_processor.column_index(cell) == column_index:
            cells_in_column.append(cell)
    return cells_in_column

def _get_rowspan(self, el, v_merge):
restart_in_v_merge = False
if v_merge is not None and 'val' in v_merge.attrib:
restart_in_v_merge = 'restart' in v_merge.attrib['val']

if not restart_in_v_merge:
return ''

current_row = self.pre_processor.row_index(el)
current_col = self.pre_processor.column_index(el)
rowspan = 1
Expand All @@ -488,24 +508,20 @@ def _get_rowspan(self, el, v_merge):
# than the current_row and that are on the current_col
if tbl is None:
return ''

tcs = [
tc for tc in find_all(tbl, 'tc')
if self.pre_processor.row_index(tc) >= current_row and
self.pre_processor.column_index(tc) == current_col
tc for tc in self.memod_tree_op(
'_get_tcs_in_column', tbl, current_col,
) if self.pre_processor.row_index(tc) >= current_row
]
restart_in_v_merge = False
if v_merge is not None and 'val' in v_merge.attrib:
restart_in_v_merge = 'restart' in v_merge.attrib['val']

def increment_rowspan(tc):
if not restart_in_v_merge:
return False
def should_increment_rowspan(tc):
if not self.pre_processor.vmerge_continue(tc):
return False
return True

for tc in tcs:
if increment_rowspan(tc):
if should_increment_rowspan(tc):
rowspan += 1
else:
rowspan = 1
Expand All @@ -517,7 +533,7 @@ def get_colspan(self, el):
grid_span = find_first(el, 'gridSpan')
if grid_span is None:
return ''
return find_first(el, 'gridSpan').attrib['val']
return grid_span.attrib['val']

def parse_table_cell_contents(self, el, text):
parsed = text
Expand Down Expand Up @@ -640,7 +656,7 @@ def parse_r(self, el, parsed):

# Get the rPr for the current style, they are the defaults.
p = find_ancestor_with_tag(self.pre_processor, el, 'p')
paragraph_style = find_first(p, 'pStyle')
paragraph_style = self.memod_tree_op('find_first', p, 'pStyle')
if paragraph_style is not None:
style = paragraph_style.get('val')
style_defaults = self.styles_dict.get(style, {})
Expand Down
63 changes: 52 additions & 11 deletions pydocx/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
import collections

from collections import defaultdict
from xml.etree import cElementTree
Expand All @@ -21,6 +22,43 @@
)


class MulitMemoize(object):
    '''
    Memoize several functions at once, each under its own name.

    Adapted from: https://wiki.python.org/moin/PythonDecoratorLibrary#Memoize

    ``func_names`` maps a lookup name to the callable it memoizes, e.g.::

        func_names = {
            'find_all': find_all,
            ...
        }

    Call the instance as ``memoizer('find_all', *args)``; results are cached
    per function, keyed by the positional-argument tuple.
    '''
    def __init__(self, func_names):
        # One independent result cache per registered function.
        self.cache = dict((func_name, {}) for func_name in func_names)
        self.func_names = func_names

    def __call__(self, func_name, *args):
        cache = self.cache[func_name]
        try:
            return cache[args]
        except KeyError:
            # First call with these args: compute and remember the result.
            value = self.func_names[func_name](*args)
            cache[args] = value
            return value
        except TypeError:
            # args contains something unhashable (a list, for instance).
            # Better to not cache than blow up.
            # NOTE: the previous isinstance(args, collections.Hashable)
            # guard could never catch this case -- ``args`` is always a
            # tuple, and tuples are "Hashable" even when their contents
            # are not, so the lookup raised TypeError anyway.
            return self.func_names[func_name](*args)


class MulitMemoizeMixin(object):
    """Mixin that adds a per-instance memoized-call facility.

    A class using this mixin first calls ``populate_memoization`` with a
    mapping of names to callables; afterwards ``memod_tree_op(name, *args)``
    dispatches through a shared ``MulitMemoize`` instance so repeated calls
    with the same arguments reuse cached results.
    """

    def __init__(self, *args, **kwargs):
        super(MulitMemoizeMixin, self).__init__(*args, **kwargs)
        # No memoizer exists until populate_memoization() is called.
        self._memoization = None

    def populate_memoization(self, func_names):
        # (Re)build the memoizer; any previously cached results are dropped.
        self._memoization = MulitMemoize(func_names)

    def memod_tree_op(self, func_name, *args):
        return self._memoization(func_name, *args)


def el_iter(el):
"""
Go through all elements
Expand Down Expand Up @@ -61,7 +99,7 @@ def has_descendant_with_tag(el, tag):
Determine if there is a child ahead in the element tree.
"""
# Get child. stop at first child.
return True if el.find('.//' + tag) is not None else False
return True if find_first(el, tag) is not None else False


def _filter_children(element, tags):
Expand Down Expand Up @@ -154,7 +192,7 @@ def num_id(self):
return self._num_id


class PydocxPrePorcessor(object):
class PydocxPreProcessor(MulitMemoizeMixin):
def __init__(
self,
convert_root_level_upper_roman=False,
Expand All @@ -167,6 +205,9 @@ def __init__(
self.numbering_root = numbering_root

def perform_pre_processing(self, root, *args, **kwargs):
self.populate_memoization({
'find_first': find_first,
})
self._add_parent(root)
# If we don't have a numbering root there cannot be any lists.
if self.numbering_root is not None:
Expand Down Expand Up @@ -251,14 +292,12 @@ def _set_list_attributes(self, el):
# Deleted text in a list will have a numId but no ilvl.
if parent is None:
continue
if find_first(parent, 'ilvl') is None:
parent_ilvl = self.memod_tree_op('find_first', parent, 'ilvl')
if parent_ilvl is None:
continue
self.meta_data[parent]['is_list_item'] = True
self.meta_data[parent]['num_id'] = self._generate_num_id(parent)
self.meta_data[parent]['ilvl'] = find_first(
parent,
'ilvl',
).attrib['val']
self.meta_data[parent]['ilvl'] = parent_ilvl.attrib['val']

def _generate_num_id(self, el):
'''
Expand Down Expand Up @@ -364,9 +403,10 @@ def _set_headers(self, elements):

for element in elements:
# This element is using the default style which is not a heading.
if find_first(element, 'pStyle') is None:
p_style = find_first(element, 'pStyle')
if p_style is None:
continue
style = find_first(element, 'pStyle').attrib.get('val', '')
style = p_style.attrib.get('val', '')
metadata = self.styles_dict.get(style, {})
style_name = metadata.get('style_name')

Expand All @@ -389,6 +429,7 @@ def _convert_upper_roman(self, body):
if self.is_first_list_item(el)
]
visited_num_ids = []
all_p_tags_in_body = find_all(body, 'p')
for root_list_item in first_root_list_items:
if self.num_id(root_list_item) in visited_num_ids:
continue
Expand All @@ -401,11 +442,11 @@ def _convert_upper_roman(self, body):
if lst_style != 'upperRoman':
continue
ilvl = min(
self.ilvl(el) for el in find_all(body, 'p')
self.ilvl(el) for el in all_p_tags_in_body
if self.num_id(el) == self.num_id(root_list_item)
)
root_upper_roman_list_items = [
el for el in find_all(body, 'p')
el for el in all_p_tags_in_body
if self.num_id(el) == self.num_id(root_list_item) and
self.ilvl(el) == ilvl
]
Expand Down