diff --git a/.gitignore b/.gitignore index ce7a7cef..40efeefd 100644 --- a/.gitignore +++ b/.gitignore @@ -37,3 +37,6 @@ pip-log.txt nosetests.xml *.mo .idea + +test.html +testxml.html diff --git a/.travis.yml b/.travis.yml index 6a5babb4..4251ba15 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,9 +2,13 @@ language: python python: - "2.6" - "2.7" -script: python main.py +script: ./run_tests.sh install: + - python setup.py -q install - pip install -r requirements.txt +env: + - TRAVIS_EXECUTE_PERFORMANCE=1 notifications: email: - jason.louard.ward@gmail.com + - samson91787@gmail.com diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 00000000..33954f41 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,2 @@ +Sam Portnow +Jason Ward diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 00000000..d8aa3f16 --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,38 @@ + +Changelog +========= +* 0.3.4 + * It is possible for `w:t` tags to have `text` set to `None`. This no longer causes an error when escaping that text. +* 0.3.3 + * In the event that `cElementTree` has a problem parsing the document, a + `MalformedDocxException` is raised instead of a `SyntaxError` +* 0.3.2 + * We were not taking into account that vertical merges should have a + continue attribute, but sometimes they do not, and in those cases word + assumes the continue attribute. We updated the parser to handle the + cases in which the continue attribute is not there. + * We now correctly handle documents with unicode character in the + namespace. + * In rare cases, some text would be output with a style when it should not + have been. This issue has been fixed. +* 0.3.1 + * Added support for several more OOXML tags including: + * caps + * smallCaps + * strike + * dstrike + * vanish + * webHidden + More details in the README. +* 0.3.0 + * We switched from using stock *xml.etree.ElementTree* to using + *xml.etree.cElementTree*. 
This has resulted in a fairly significant speed + increase for python 2.6 + * It is now possible to create your own pre processor to do additional pre + processing. + * Superscripts and subscripts are now extracted correctly. +* 0.2.1 + * Added a changelog + * Added the version in pydocx.__init__ + * Fixed an issue with duplicating content if there was indentation or + justification on a p element that had multiple t tags. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..88fbbf67 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,7 @@ +include AUTHORS +include CHANGELOG +include LICENSE +include MANIFEST.in +include README.rst +include pydocx/fixtures/* +include pydocx/tests/templates/* diff --git a/README.md b/README.md deleted file mode 100644 index e3773551..00000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -pydocx -====== \ No newline at end of file diff --git a/README.rst b/README.rst new file mode 100644 index 00000000..2f750299 --- /dev/null +++ b/README.rst @@ -0,0 +1,228 @@ +====== +pydocx +====== +.. image:: https://travis-ci.org/OpenScienceFramework/pydocx.png?branch=master + :align: left + :target: https://travis-ci.org/OpenScienceFramework/pydocx + +pydocx is a parser that breaks down the elements of a docxfile and converts them +into different markup languages. Right now, HTML is supported. Markdown and LaTex +will be available soon. You can extend any of the available parsers to customize it +to your needs. You can also create your own class that inherits DocxParser +to create your own methods for a markup language not yet supported. 
+ +Currently Supported +################### + +* tables + * nested tables + * rowspans + * colspans + * lists in tables +* lists + * list styles + * nested lists + * list of tables + * list of paragraphs +* justification +* images +* styles + * bold + * italics + * underline + * hyperlinks +* headings + +Usage +##### + +DocxParser includes abstract methods that each parser overrides to satisfy its own needs. The abstract methods are as follows: + +:: + + class DocxParser: + + @property + def parsed(self): + return self._parsed + + @property + def escape(self, text): + return text + + @abstractmethod + def linebreak(self): + return '' + + @abstractmethod + def paragraph(self, text): + return text + + @abstractmethod + def heading(self, text, heading_level): + return text + + @abstractmethod + def insertion(self, text, author, date): + return text + + @abstractmethod + def hyperlink(self, text, href): + return text + + @abstractmethod + def image_handler(self, path): + return path + + @abstractmethod + def image(self, path, x, y): + return self.image_handler(path) + + @abstractmethod + def deletion(self, text, author, date): + return text + + @abstractmethod + def bold(self, text): + return text + + @abstractmethod + def italics(self, text): + return text + + @abstractmethod + def underline(self, text): + return text + + @abstractmethod + def superscript(self, text): + return text + + @abstractmethod + def subscript(self, text): + return text + + @abstractmethod + def tab(self): + return True + + @abstractmethod + def ordered_list(self, text): + return text + + @abstractmethod + def unordered_list(self, text): + return text + + @abstractmethod + def list_element(self, text): + return text + + @abstractmethod + def table(self, text): + return text + @abstractmethod + def table_row(self, text): + return text + + @abstractmethod + def table_cell(self, text): + return text + + @abstractmethod + def page_break(self): + return True + + @abstractmethod + def indent(self, 
text, left='', right='', firstLine=''): + return text + +Docx2Html inherits DocxParser and implements basic HTML handling. Ex. + +:: + + class Docx2Html(DocxParser): + + # Escape '&', '<', and '>' so we render the HTML correctly + def escape(self, text): + return xml.sax.saxutils.quoteattr(text)[1:-1] + + # return a line break + def linebreak(self, pre=None): + return '
<br />' + + # add paragraph tags + def paragraph(self, text, pre=None): + return '<p>' + text + '</p>
' + + +However, let's say you want to add a specific style to your HTML document. In order to do this, you want to give each paragraph the class `my_implementation`. Simply extend Docx2Html and add what you need. + +:: + + class My_Implementation_of_Docx2Html(Docx2Html): + + def paragraph(self, text, pre = None): + return '<p class="my_implementation">' + text + '</p>
' + + + +OR, let's say FOO is your new favorite markup language. Simply write your own new parser, overriding the abstract methods of DocxParser + +:: + + class Docx2Foo(DocxParser): + + # because linebreaks are denoted by '!!!!!!!!!!!!' in the FOO markup language :) + def linebreak(self): + return '!!!!!!!!!!!!' + +Custom Pre-Processor +#################### + +When creating your own Parser (as described above) you can now add in your own custom Pre Processor. To do so you will need to set the `pre_processor_class` field on the custom parser, like so: + +:: + + class Docx2Foo(DocxParser): + pre_processor_class = FooPrePorcessor + + +The `FooPrePorcessor` will need a few things to get you going: + +:: + + class FooPrePorcessor(PydocxPrePorcessor): + def perform_pre_processing(self, root, *args, **kwargs): + super(FooPrePorcessor, self).perform_pre_processing(root, *args, **kwargs) + self._set_foo(root) + + def _set_foo(self, root): + pass + +If you want `_set_foo` to be called you must add it to `perform_pre_processing`, which is called in the base parser for pydocx. + +Everything done during pre-processing is executed prior to `parse` being called for the first time. + + +Styles +###### + +The base parser `Docx2Html` relies on certain CSS classes being set for certain behaviour to occur. Currently these include: + +* class `pydocx-insert` -> Turns the text green. +* class `pydocx-delete` -> Turns the text red and draws a line through the text. +* class `pydocx-center` -> Aligns the text to the center. +* class `pydocx-right` -> Aligns the text to the right. +* class `pydocx-left` -> Aligns the text to the left. +* class `pydocx-comment` -> Turns the text blue. +* class `pydocx-underline` -> Underlines the text. +* class `pydocx-caps` -> Makes all text uppercase. +* class `pydocx-small-caps` -> Makes all text uppercase, however truly lowercase letters will be smaller than their uppercase counterparts. +* class `pydocx-strike` -> Strikes a line through the text. 
+* class `pydocx-hidden` -> Hide the text. + +Optional Arguments +################## + +You can pass in `convert_root_level_upper_roman=True` to the parser and it will convert all root level upper roman lists to headings instead. diff --git a/main.py b/main.py deleted file mode 100644 index c9e8e1d4..00000000 --- a/main.py +++ /dev/null @@ -1,12 +0,0 @@ -from pydocx import * -from bs4 import BeautifulSoup -import xml.etree.ElementTree as ElementTree -#import lxml.etree as etree - -with open('test.html', 'w') as f: - f.write(docx2html('helloworld.docx')) -with open('testxml.html','w') as f: - f.write(BeautifulSoup(ElementTree.tostring(Docx2Html('helloworld.docx').root)).prettify()) - -#print docx2html('helloworld.docx') -#print docx2markdown('helloworld.docx') \ No newline at end of file diff --git a/pydocx/DocxParser.py b/pydocx/DocxParser.py index b3006ef0..5364289b 100644 --- a/pydocx/DocxParser.py +++ b/pydocx/DocxParser.py @@ -1,323 +1,686 @@ -from abc import abstractmethod, ABCMeta -import zipfile import logging -import xml.etree.ElementTree as ElementTree -from xml.etree.ElementTree import _ElementInterface +import os +import zipfile + +from abc import abstractmethod, ABCMeta +from contextlib import contextmanager + +from pydocx.utils import ( + PydocxPrePorcessor, + get_list_style, + parse_xml_from_string, + find_first, + find_all, + find_ancestor_with_tag, + has_descendant_with_tag, +) logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger("NewParser") -def remove_namespaces(document): - root = ElementTree.fromstring(document) - for child in el_iter(root): - child.tag = child.tag.split("}")[1] - child.attrib = dict( - (k.split("}")[1], v) - for k, v in child.attrib.items() - ) - return ElementTree.tostring(root) - -# Add some helper functions to Element to make it slightly more readable - - -def has_child(self, tag): - return True if self.find(tag) is not None else False +# http://openxmldeveloper.org/discussions/formats/f/15/p/396/933.aspx 
+EMUS_PER_PIXEL = 9525 +USE_ALIGNMENTS = True +JUSTIFY_CENTER = 'center' +JUSTIFY_LEFT = 'left' +JUSTIFY_RIGHT = 'right' -def has_child_all(self, tag): - return True if self.find('.//' + tag) is not None else False - - -def find_all(self, tag): - return self.find('.//' + tag) - - -def findall_all(self, tag): - return self.findall('.//' + tag) - - -def el_iter(el): - try: - return el.iter() - except AttributeError: - return el.findall('.//*') +INDENTATION_RIGHT = 'right' +INDENTATION_LEFT = 'left' +INDENTATION_FIRST_LINE = 'firstLine' +DISABLED_VALUES = ['false', '0'] +# Add some helper functions to Element to make it slightly more readable -setattr(_ElementInterface, 'has_child', has_child) -setattr(_ElementInterface, 'has_child_all', has_child_all) -setattr(_ElementInterface, 'find_all', find_all) -setattr(_ElementInterface, 'findall_all', findall_all) -setattr(_ElementInterface, 'parent', None) -setattr(_ElementInterface, 'parent_list', []) -# End helpers +@contextmanager +def ZipFile(path): # This is not needed in python 3.2+ + f = zipfile.ZipFile(path) + yield f + f.close() class DocxParser: __metaclass__ = ABCMeta + pre_processor_class = PydocxPrePorcessor - def __init__(self, path): - self._parsed = '' - self.in_list = False - - f = zipfile.ZipFile(path) - try: + def _build_data(self, path, *args, **kwargs): + with ZipFile(path) as f: self.document_text = f.read('word/document.xml') + self.styles_text = f.read('word/styles.xml') try: + self.fonts = f.read('/word/fontTable.xml') + except KeyError: + self.fonts = None + try: # Only present if there are lists self.numbering_text = f.read('word/numbering.xml') - except zipfile.BadZipfile: - pass - try: + except KeyError: + self.numbering_text = None + try: # Only present if there are comments self.comment_text = f.read('word/comments.xml') - except zipfile.BadZipfile: - pass - finally: - f.close() - - self.root = ElementTree.fromstring( - remove_namespaces(self.document_text), - ) - - def add_parent(el): - for 
child in el.getchildren(): - setattr(child, 'parent', el) - add_parent(child) - add_parent(self.root) - - def create_parent_list(el, tmp=None): - if tmp is None: - tmp = [] - for child in el: - tmp.append(el) - tmp = create_parent_list(child, tmp) - el.parent_list = tmp[:] - try: - tmp.pop() - except: - tmp = [] - return tmp - - create_parent_list(self.root) - + except KeyError: + self.comment_text = None + self.relationship_text = f.read('word/_rels/document.xml.rels') + zipped_image_files = [ + e for e in f.infolist() + if e.filename.startswith('word/media/') + ] + for e in zipped_image_files: + self._image_data[e.filename] = f.read(e.filename) + + self.root = parse_xml_from_string(self.document_text) + self.numbering_root = None + self.expon = '' + self.degree = '' + if self.numbering_text: + self.numbering_root = parse_xml_from_string(self.numbering_text) + self.comment_root = None + if self.comment_text: + self.comment_root = parse_xml_from_string(self.comment_text) + + def _parse_styles(self): + tree = parse_xml_from_string(self.styles_text) + result = {} + for style in find_all(tree, 'style'): + style_val = find_first(style, 'name').attrib['val'] + result[style.attrib['styleId']] = style_val + return result + + def _parse_rels_root(self): + tree = parse_xml_from_string(self.relationship_text) + rels_dict = {} + for el in tree: + rId = el.get('Id') + target = el.get('Target') + rels_dict[rId] = target + return rels_dict + + def __init__( + self, + path, + convert_root_level_upper_roman=False, + *args, + **kwargs): + self._parsed = '' + self.block_text = '' + self.page_width = 0 + self.convert_root_level_upper_roman = convert_root_level_upper_roman + self._image_data = {} + self._build_data(path, *args, **kwargs) + self.pre_processor = None + + #divide by 20 to get to pt (Office works in 20th's of a point) + """ + see http://msdn.microsoft.com/en-us/library/documentformat + .openxml.wordprocessing.indentation.aspx + """ + if find_first(self.root, 'pgSz') is 
not None: + self.page_width = int( + find_first(self.root, 'pgSz').attrib['w'] + ) / 20 + + #all blank when we init self.comment_store = None - self.numbering_store = None - self.ignore_current = False - self.elements = [] - self.tables_seen = [] self.visited = [] - try: - self.numbering_root = ElementTree.fromstring( - remove_namespaces(self.numbering_text), - ) - except: - pass - self.parse_begin(self.root) + self.list_depth = 0 + self.rels_dict = self._parse_rels_root() + self.styles_dict = self._parse_styles() + self.parse_begin(self.root) # begin to parse def parse_begin(self, el): - self._parsed += self.parse_lists(el) - -### parse table function and is_table flag - def parse_lists(self, el): - parsed = '' - first_p = el.find_all('p') - children = [] - for child in first_p.parent: - if child.tag == 'p' or child.tag == 'tbl': - children.append(child) - p_list = children - list_started = False - list_type = '' - list_chunks = [] - index_start = 0 - index_end = 1 - for i, el in enumerate(p_list): - if not list_started and el.has_child_all('ilvl'): - list_started = True - list_type = self.get_list_style( - el.find_all('numId').attrib['val'], - ) - list_chunks.append(p_list[index_start:index_end]) - index_start = i - index_end = i+1 - elif ( - list_started and - el.has_child_all('ilvl') and - not list_type == self.get_list_style( - el.find_all('numId').attrib['val'] - )): - list_type = self.get_list_style( - el.find_all('numId').attrib['val'], - ) - list_started = True - list_chunks.append(p_list[index_start:index_end]) - index_start = i - index_end = i+1 - elif list_started and not el.has_child_all('ilvl'): - list_started = False - list_chunks.append(p_list[index_start:index_end]) - index_start = i - index_end = i+1 - else: - index_end = i+1 - list_chunks.append(p_list[index_start:index_end]) - for chunk in list_chunks: - chunk_parsed = '' - for el in chunk: - chunk_parsed += self.parse(el) - if chunk[0].has_child_all('ilvl'): - lst_style = self.get_list_style( - 
chunk[0].find_all('numId').attrib['val'], - ) - if lst_style['val'] == 'bullet': - parsed += self.unordered_list(chunk_parsed) - else: - parsed += self.ordered_list(chunk_parsed) - elif chunk[0].has_child_all('br'): - parsed += self.page_break() - else: - parsed += chunk_parsed - - return parsed + self.pre_processor = self.pre_processor_class( + convert_root_level_upper_roman=self.convert_root_level_upper_roman, + styles_dict=self.styles_dict, + numbering_root=self.numbering_root, + ) + self.pre_processor.perform_pre_processing(el) + self._parsed += self.parse(el) def parse(self, el): + if el in self.visited: + return '' + self.visited.append(el) parsed = '' - if not self.ignore_current: - tmp_d = dict( - (tmpel.tag, i) - for i, tmpel in enumerate(el.parent_list) - ) - if ( - 'tbl' in tmp_d and - el.parent_list[tmp_d['tbl']] not in self.tables_seen): - self.ignore_current = True - self.tables_seen.append(el.parent_list[tmp_d['tbl']]) - tmpout = self.table(self.parse(el.parent_list[tmp_d['tbl']])) - self.ignore_current = False - return tmpout - for child in el: + # recursive. So you can get all the way to the bottom parsed += self.parse(child) - - if el.tag == 'br' and el.attrib['type'] == 'page': - #TODO figure out what parsed is getting overwritten - return self.page_break() - # add it to the list so we don't repeat! 
- if el.tag == 'ilvl' and el not in self.visited: - self.in_list = True - self.visited.append(el) - ## This starts the returns + if el.tag == 'br' and el.attrib.get('type') == 'page': + return self.parse_page_break(el, parsed) + elif el.tag == 'tbl': + return self.parse_table(el, parsed) elif el.tag == 'tr': - return self.table_row(parsed) + return self.parse_table_row(el, parsed) elif el.tag == 'tc': - self.elements.append(el) - return self.table_cell(parsed) - if el.tag == 'r' and el not in self.elements: - self.elements.append(el) - return self.parse_r(el) + return self.parse_table_cell(el, parsed) + elif el.tag == 'm': + return self.matrix(parsed) + elif el.tag == 'mr': + return self.parse_matrix_row(el, parsed) + elif el.tag == 'rad': + return self.radical(self.degree, self.expon) + elif el.tag == 'deg': + return self.parse_deg(el, parsed) + elif el.tag == 'e': + return self.parse_exp(el, parsed) + elif el.tag == 'num': + return self.num(parsed) + elif el.tag == 'den': + return self.den(parsed) + elif el.tag == 'sup': + return self.superscript(parsed) + elif el.tag == 'sub': + return self.subscript(parsed) + elif el.tag == 'r': + return self.parse_r(el, parsed) + elif el.tag == 't': + return self.parse_t(el, parsed) + elif el.tag == 'br': + return self.parse_break_tag(el, parsed) + elif el.tag == 'delText': + return self.parse_deletion(el, parsed) elif el.tag == 'p': return self.parse_p(el, parsed) elif el.tag == 'ins': - return self.insertion(parsed, '', '') + return self.parse_insertion(el, parsed) + elif el.tag == 'hyperlink': + return self.parse_hyperlink(el, parsed) + elif el.tag in ('pict', 'drawing'): + return self.parse_image(el) else: return parsed - def parse_p(self, el, text): - parsed = text - if self.in_list: - self.in_list = False - parsed = self.list_element(parsed) - elif ( - not el.has_child_all('t') and - 'tbl' not in [i.tag for i in el.parent_list]): - parsed = self.linebreak() - elif el.parent not in self.elements: - parsed = 
self.paragraph(parsed) + def parse_page_break(self, el, text): + #TODO figure out what parsed is getting overwritten + return self.page_break() + + def parse_table(self, el, text): + return self.table(text) + + def parse_table_row(self, el, text): + return self.table_row(text) + + def parse_table_cell(self, el, text): + v_merge = find_first(el, 'vMerge') + if v_merge is not None and ( + 'restart' != v_merge.get('val', '')): + return self.empty_cell() + colspan = self.get_colspan(el) + rowspan = self._get_rowspan(el, v_merge) + if rowspan > 1: + rowspan = str(rowspan) + else: + rowspan = '' + return self.table_cell( + text, col=colspan, row=rowspan, + is_last_row_item=self.pre_processor.is_last_row_item(el), + is_list_item=has_descendant_with_tag(el, 'ilvl')) + + def parse_list(self, el, text): + """ + All the meat of building the list is done in _parse_list, however we + call this method for two reasons: It is the naming convention we are + following. And we need a reliable way to raise and lower the list_depth + (which is used to determine if we are in a list). I could have done + this in _parse_list, however it seemed cleaner to do it here. 
+ """ + self.list_depth += 1 + parsed = self._parse_list(el, text) + self.list_depth -= 1 + if self.pre_processor.is_in_table(el): + return self.parse_table_cell_contents(el, parsed) return parsed - def parse_r(self, el): - is_deleted = False - text = None - if el.has_child('t'): - text = self.escape(el.find('t').text) - elif el.has_child('delText'): - text = self.escape(el.find('delText').text) - is_deleted = True - if text: - rpr = el.find('rPr') - if rpr is not None: - fns = [] - if rpr.has_child('b'): - fns.append(self.bold) - if rpr.has_child('i'): - fns.append(self.italics) - if rpr.has_child('u'): - fns.append(self.underline) - for fn in fns: - text = fn(text) - ppr = el.parent.find('pPr') - if ppr is not None: - jc = ppr.find('jc') - if jc is not None: - if jc.attrib['val'] == 'right': - text = self.right_justify(text) - if jc.attrib['val'] == 'center': - text = self.center_justify(text) - ind = ppr.find('ind') - if ind is not None: - right = None - left = None - firstLine = None - if 'right' in ind.attrib: - right = ind.attrib['right'] - right = int(right)/20 - right = str(right) - if 'left' in ind.attrib: - left = ind.attrib['left'] - left = int(left)/20 - left = str(left) - if 'firstLine' in ind.attrib: - firstLine = ind.attrib['firstLine'] - firstLine = int(firstLine)/20 - firstLine = str(firstLine) - text = self.indent(text, right, left, firstLine) - if is_deleted: - text = self.deletion(text, '', '') + def get_list_style(self, num_id, ilvl): + return get_list_style(self.numbering_root, num_id, ilvl) + + def _build_list(self, el, text): + # Get the list style for the pending list. + lst_style = self.get_list_style( + self.pre_processor.num_id(el).num_id, + self.pre_processor.ilvl(el), + ) + + parsed = text + # Create the actual list and return it. 
+ if lst_style == 'bullet': + return self.unordered_list(parsed) + else: + return self.ordered_list( + parsed, + lst_style, + ) + + def _parse_list(self, el, text): + parsed = self.parse_list_item(el, text) + num_id = self.pre_processor.num_id(el) + ilvl = self.pre_processor.ilvl(el) + # Everything after this point assumes the first element is not also the + # last. If the first element is also the last then early return by + # building and returning the completed list. + if self.pre_processor.is_last_list_item_in_root(el): + return self._build_list(el, parsed) + next_el = self.pre_processor.next(el) + + def is_same_list(next_el, num_id, ilvl): + # Bail if next_el is not an element + if next_el is None: + return False + if self.pre_processor.is_last_list_item_in_root(next_el): + return False + # If next_el is not a list item then roll it into the list by + # returning True. + if not self.pre_processor.is_list_item(next_el): + return True + if self.pre_processor.num_id(next_el) != num_id: + # The next element is a new list entirely + return False + if self.pre_processor.ilvl(next_el) < ilvl: + # The next element is de-indented, so this is really the last + # element in the list + return False + return True + + while is_same_list(next_el, num_id, ilvl): + if next_el in self.visited: + # Early continue for elements we have already visited. 
+ next_el = self.pre_processor.next(next_el) + continue + + if self.pre_processor.is_list_item(next_el): + # Reset the ilvl + ilvl = self.pre_processor.ilvl(next_el) + + parsed += self.parse(next_el) + next_el = self.pre_processor.next(next_el) + + def should_parse_last_el(last_el, first_el): + if last_el is None: + return False + # Different list + if ( + self.pre_processor.num_id(last_el) != + self.pre_processor.num_id(first_el)): + return False + # Will be handled when the ilvls do match (nesting issue) + if ( + self.pre_processor.ilvl(last_el) != + self.pre_processor.ilvl(first_el)): + return False + # We only care about last items that have not been + # parsed before (first list items are + # always parsed at the beginning of this method.) + return ( + not self.pre_processor.is_first_list_item(last_el) and + self.pre_processor.is_last_list_item_in_root(last_el) + ) + if should_parse_last_el(next_el, el): + parsed += self.parse(next_el) + + # If the list has no content, then we don't need to worry about the + # list styling, because it will be stripped out. + if parsed == '': + return parsed + + return self._build_list(el, parsed) + + def justification(self, el, text): + paragraph_tag_property = el.find('pPr') + if paragraph_tag_property is None: return text + + _justification = paragraph_tag_property.find('jc') + indentation = paragraph_tag_property.find('ind') + if _justification is None and indentation is None: + return text + alignment = None + right = None + left = None + firstLine = None + if _justification is not None: # text alignments + value = _justification.attrib['val'] + if value in [JUSTIFY_LEFT, JUSTIFY_CENTER, JUSTIFY_RIGHT]: + alignment = value + if indentation is not None: + if INDENTATION_RIGHT in indentation.attrib: + right = indentation.attrib[INDENTATION_RIGHT] + # divide by 20 to get to pt. 
multiply by (4/3) to get to px + right = (int(right) / 20) * float(4) / float(3) + right = str(right) + if INDENTATION_LEFT in indentation.attrib: + left = indentation.attrib[INDENTATION_LEFT] + left = (int(left) / 20) * float(4) / float(3) + left = str(left) + if INDENTATION_FIRST_LINE in indentation.attrib: + firstLine = indentation.attrib[INDENTATION_FIRST_LINE] + firstLine = (int(firstLine) / 20) * float(4) / float(3) + firstLine = str(firstLine) + if any([alignment, firstLine, left, right]): + return self.indent( + text, just=alignment, firstLine=firstLine, + left=left, right=right, + is_in_table=self.pre_processor.is_in_table(el)) + return text + + def parse_p(self, el, text): + if text == '': + return '' + # TODO This is still not correct, however it fixes the bug. We need to + # apply the classes/styles on p, td, li and h tags instead of inline, + # but that is for another ticket. + text = self.justification(el, text) + if self.pre_processor.is_first_list_item(el): + return self.parse_list(el, text) + if self.pre_processor.heading_level(el): + return self.parse_heading(el, text) + if self.pre_processor.is_list_item(el): + return self.parse_list_item(el, text) + if self.pre_processor.is_in_table(el): + return self.parse_table_cell_contents(el, text) + parsed = text + # No p tags in li tags + if el.find('oMathPara') is not None: + math = True else: + math = False + if self.list_depth == 0: + parsed = self.paragraph(parsed, math) + return parsed + + def _should_append_break_tag(self, next_el): + paragraph_like_tags = [ + 'p', + ] + inline_like_tags = [ + 'smartTag', + 'ins', + 'delText', + ] + if self.pre_processor.is_list_item(next_el): + return False + if self.pre_processor.previous(next_el) is None: + return False + tag_is_inline_like = any( + has_descendant_with_tag(next_el, tag) for + tag in inline_like_tags + ) + if tag_is_inline_like: + return False + if ( + self.pre_processor.is_last_list_item_in_root( + self.pre_processor.previous(next_el))): + return 
False + if self.pre_processor.previous(next_el).tag not in paragraph_like_tags: + return False + if next_el.tag not in paragraph_like_tags: + return False + return True + + def parse_heading(self, el, parsed): + return self.heading(parsed, self.pre_processor.heading_level(el)) + + def parse_list_item(self, el, text): + # If for whatever reason we are not currently in a list, then start + # a list here. This will only happen if the num_id/ilvl combinations + # between lists is not well formed. + parsed = text + if self.list_depth == 0: + return self.parse_list(el, parsed) + + def _should_parse_next_as_content(el): + """ + Get the contents of the next el and append it to the + contents of the current el (that way things like tables + are actually in the li tag instead of in the ol/ul tag). + """ + next_el = self.pre_processor.next(el) + if next_el is None: + return False + if ( + not self.pre_processor.is_list_item(next_el) and + not self.pre_processor.is_last_list_item_in_root(el) + ): + return True + if self.pre_processor.is_first_list_item(next_el): + if ( + self.pre_processor.num_id(next_el) == + self.pre_processor.num_id(el)): + return True + return False + + while el is not None: + if _should_parse_next_as_content(el): + el = self.pre_processor.next(el) + next_elements_content = self.parse(el) + if not next_elements_content: + continue + if self._should_append_break_tag(el): + parsed += self.break_tag( + self.pre_processor.is_in_table(el)) + parsed += next_elements_content + else: + break + # Create the actual li element + return self.list_element(parsed) + + def _get_rowspan(self, el, v_merge): + current_row = self.pre_processor.row_index(el) + current_col = self.pre_processor.column_index(el) + rowspan = 1 + result = '' + tbl = find_ancestor_with_tag(self.pre_processor, el, 'tbl') + # We only want table cells that have a higher row_index that is greater + # than the current_row and that are on the current_col + if tbl is None: + return '' + tcs = [ + tc for 
tc in find_all(tbl, 'tc') + if self.pre_processor.row_index(tc) >= current_row and + self.pre_processor.column_index(tc) == current_col + ] + restart_in_v_merge = False + if v_merge is not None and 'val' in v_merge.attrib: + restart_in_v_merge = 'restart' in v_merge.attrib['val'] + + def increment_rowspan(tc): + if not restart_in_v_merge: + return False + if not self.pre_processor.vmerge_continue(tc): + return False + return True + + for tc in tcs: + if increment_rowspan(tc): + rowspan += 1 + else: + rowspan = 1 + if rowspan > 1: + result = rowspan + return str(result) + + def get_colspan(self, el): + grid_span = find_first(el, 'gridSpan') + if grid_span is None: return '' + return find_first(el, 'gridSpan').attrib['val'] - def get_list_style(self, numval): - ids = self.numbering_root.findall_all('num') - for _id in ids: - if _id.attrib['numId'] == numval: - abstractid = _id.find('abstractNumId') - abstractid = abstractid.attrib['val'] - style_information = self.numbering_root.findall_all( - 'abstractNum', - ) - for info in style_information: - if info.attrib['abstractNumId'] == abstractid: - for i in el_iter(info): - if i.find('numFmt') is not None: - return i.find('numFmt').attrib - - def get_comments(self, doc_id): - if self.comment_store is None: - # TODO throw appropriate error - comment_root = ElementTree.fromstring( - remove_namespaces(self.comment_text), + def parse_table_cell_contents(self, el, text): + parsed = text + + def _should_parse_next_as_content(el): + next_el = self.pre_processor.next(el) + if next_el is None: + return False + if self.pre_processor.is_in_table(next_el): + return True + while el is not None: + if _should_parse_next_as_content(el): + el = self.pre_processor.next(el) + next_elements_content = self.parse(el) + if not next_elements_content: + continue + if self._should_append_break_tag(el): + parsed += self.break_tag( + self.pre_processor.is_in_table(el)) + parsed += next_elements_content + else: + break + return parsed + + def 
parse_hyperlink(self, el, text): + rId = el.get('id') + href = self.rels_dict.get(rId) + if not href: + return text + href = self.escape(href) + return self.hyperlink(text, href) + + def _get_image_id(self, el): + # Drawings + blip = find_first(el, 'blip') + if blip is not None: + # On drawing tags the id is actually whatever is returned from the + # embed attribute on the blip tag. Thanks a lot Microsoft. + return blip.get('embed') + # Picts + imagedata = find_first(el, 'imagedata') + if imagedata is not None: + return imagedata.get('id') + + def _convert_image_size(self, size): + return size / EMUS_PER_PIXEL + + def _get_image_size(self, el): + """ + If we can't find a height or width, return 0 for whichever is not + found, then rely on the `image` handler to strip those attributes. This + functionality can change once we integrate PIL. + """ + sizes = find_first(el, 'ext') + if sizes is not None and sizes.get('cx'): + if sizes.get('cx'): + x = self._convert_image_size(int(sizes.get('cx'))) + if sizes.get('cy'): + y = self._convert_image_size(int(sizes.get('cy'))) + return ( + '%dpx' % x, + '%dpx' % y, ) - ids_and_info = {} - ids = comment_root.findall_all('comment') - for _id in ids: - ids_and_info[_id.attrib['id']] = { - "author": _id.attrib['author'], - "date": _id.attrib['date'], - "text": _id.findall_all('t')[0].text, - } - self.comment_store = ids_and_info - return self.comment_store[doc_id] + shape = find_first(el, 'shape') + if shape is not None and shape.get('style') is not None: + # If either of these are not set, rely on the method `image` to not + # use either of them. 
+ x = 0 + y = 0 + styles = shape.get('style').split(';') + + for s in styles: + if s.startswith('height:'): + y = s.split(':')[1] + if s.startswith('width:'): + x = s.split(':')[1] + return x, y + return 0, 0 + + def parse_image(self, el): + x, y = self._get_image_size(el) + rId = self._get_image_id(el) + src = self.rels_dict.get(rId) + if not src: + return '' + src = os.path.join( + 'word', + src, + ) + if src in self._image_data: + filename = os.path.split(src)[-1] + return self.image(self._image_data[src], filename, x, y) + return '' + + def _is_style_on(self, el): + """ + For b, i, u (bold, italics, and underline) merely having the tag is not + sufficient. You need to check to make sure it is not set to "false" as + well. + """ + val = el.get('val', '').lower() + return val.lower() not in DISABLED_VALUES + + def parse_t(self, el, parsed): + if el.text is None: + return '' + return self.escape(el.text) + + def parse_break_tag(self, el, parsed): + return self.break_tag(self.pre_processor.is_in_table(el)) + + def parse_deletion(self, el, parsed): + if el.text is None: + return '' + return self.deletion(el.text, '', '') + + def parse_insertion(self, el, parsed): + return self.insertion(parsed, '', '') + + def parse_r(self, el, parsed): + """ + Parse the running text. + """ + text = parsed + if not text: + return '' + run_tag_property = el.find('rPr') + + def _has_style_on(run_tag_property, tag): + el = run_tag_property.find(tag) + if el is not None: + return self._is_style_on(el) + inline_tags = { + 'b': self.bold, + 'i': self.italics, + 'u': self.underline, + 'caps': self.caps, + 'smallCaps': self.small_caps, + 'strike': self.strike, + 'dstrike': self.strike, + 'vanish': self.hide, + 'webHidden': self.hide, + } + if run_tag_property is not None: + for child in run_tag_property: + # These tags are a little different, handle them separately + # from the rest. 
+ # This could be a superscript or a subscript + if child.tag == 'vertAlign': + if child.attrib['val'] == 'superscript': + text = self.superscript(text) + elif child.attrib['val'] == 'subscript': + text = self.subscript(text) + elif child.tag in inline_tags and self._is_style_on(child): + text = inline_tags[child.tag](text) + + return text + + def parse_rad(self, el, parsed): + return '' + + def parse_deg(self, el, parsed): + self.degree = self.deg(parsed) + return '' + + def parse_exp(self, el, parsed): + if find_ancestor_with_tag(self.pre_processor, el, 'rad'): + self.expon = self.exp(parsed) + return '' + elif find_ancestor_with_tag(self.pre_processor, el, 'm'): + return self.matrix_cell( + parsed, self.pre_processor.is_last_matrix_row_item(el)) + else: + return self.exp(parsed) + + def parse_matrix_row(self, el, parsed): + return self.matrix_row(parsed) @property def parsed(self): @@ -332,13 +695,29 @@ def linebreak(self): return '' @abstractmethod - def paragraph(self, text): + def paragraph(self, text, math): + return text + + @abstractmethod + def heading(self, text, heading_level): return text @abstractmethod def insertion(self, text, author, date): return text + @abstractmethod + def hyperlink(self, text, href): + return text + + @abstractmethod + def image_handler(self, path): + return path + + @abstractmethod + def image(self, data, filename, x, y): + return self.image_handler(data) + @abstractmethod def deletion(self, text, author, date): return text @@ -355,6 +734,30 @@ def italics(self, text): def underline(self, text): return text + @abstractmethod + def caps(self, text): + return text + + @abstractmethod + def small_caps(self, text): + return text + + @abstractmethod + def strike(self, text): + return text + + @abstractmethod + def hide(self, text): + return text + + @abstractmethod + def superscript(self, text): + return text + + @abstractmethod + def subscript(self, text): + return text + @abstractmethod def tab(self): return True @@ -388,15 
+791,41 @@ def page_break(self): return True @abstractmethod - def right_justify(self, text): + def indent(self, text, left='', right='', firstLine=''): return text @abstractmethod - def center_justify(self, text): + def empty_cell(self): + return '' + + @abstractmethod + def num(self, text): + return text + + @abstractmethod + def radical(self, deg, num): + return True + + @abstractmethod + def den(self, text): + return text + + @abstractmethod + def deg(self, text): return text @abstractmethod - def indent(self, text, left=None, right=None, firstLine=None): + def exp(self, text): return text - #TODO JUSTIFIED JUSTIFIED TEXT + @abstractmethod + def matrix_row(self, text): + return text + + @abstractmethod + def matrix_cell(self, text, is_last_row_item=False): + return text + + @abstractmethod + def matrix(self, text): + return text diff --git a/pydocx/__init__.py b/pydocx/__init__.py index 9b42e00f..dad89dc2 100644 --- a/pydocx/__init__.py +++ b/pydocx/__init__.py @@ -1,8 +1,15 @@ -from .parsers import * +from .parsers import Docx2LaTex, Docx2Html, Docx2Markdown + def docx2html(path): return Docx2Html(path).parsed + def docx2markdown(path): return Docx2Markdown(path).parsed + +def docx2latex(path): + return Docx2LaTex(path).parsed + +VERSION = '0.3.3' diff --git a/pydocx/exceptions.py b/pydocx/exceptions.py new file mode 100644 index 00000000..cdff556a --- /dev/null +++ b/pydocx/exceptions.py @@ -0,0 +1,2 @@ +class MalformedDocxException(Exception): + pass diff --git a/pydocx/fixtures/all_configured_styles.docx b/pydocx/fixtures/all_configured_styles.docx new file mode 100644 index 00000000..8f514372 Binary files /dev/null and b/pydocx/fixtures/all_configured_styles.docx differ diff --git a/pydocx/fixtures/attachment_is_tiff.docx b/pydocx/fixtures/attachment_is_tiff.docx new file mode 100644 index 00000000..774362ca Binary files /dev/null and b/pydocx/fixtures/attachment_is_tiff.docx differ diff --git a/pydocx/fixtures/bigger_font_size_to_header.docx 
b/pydocx/fixtures/bigger_font_size_to_header.docx new file mode 100644 index 00000000..c722888b Binary files /dev/null and b/pydocx/fixtures/bigger_font_size_to_header.docx differ diff --git a/pydocx/fixtures/convert_p_to_h.docx b/pydocx/fixtures/convert_p_to_h.docx new file mode 100644 index 00000000..53769e15 Binary files /dev/null and b/pydocx/fixtures/convert_p_to_h.docx differ diff --git a/pydocx/fixtures/fake_headings_by_length.docx b/pydocx/fixtures/fake_headings_by_length.docx new file mode 100644 index 00000000..a130f5ba Binary files /dev/null and b/pydocx/fixtures/fake_headings_by_length.docx differ diff --git a/pydocx/fixtures/greek_alphabet.docx b/pydocx/fixtures/greek_alphabet.docx new file mode 100644 index 00000000..46ab5429 Binary files /dev/null and b/pydocx/fixtures/greek_alphabet.docx differ diff --git a/pydocx/fixtures/has_image.docx b/pydocx/fixtures/has_image.docx new file mode 100644 index 00000000..2ebd0bd0 Binary files /dev/null and b/pydocx/fixtures/has_image.docx differ diff --git a/pydocx/fixtures/has_missing_image.docx b/pydocx/fixtures/has_missing_image.docx new file mode 100644 index 00000000..996e6671 Binary files /dev/null and b/pydocx/fixtures/has_missing_image.docx differ diff --git a/pydocx/fixtures/has_title.docx b/pydocx/fixtures/has_title.docx new file mode 100644 index 00000000..a87d88ed Binary files /dev/null and b/pydocx/fixtures/has_title.docx differ diff --git a/pydocx/fixtures/header_footer_problem.docx b/pydocx/fixtures/header_footer_problem.docx new file mode 100644 index 00000000..6bc49a7a Binary files /dev/null and b/pydocx/fixtures/header_footer_problem.docx differ diff --git a/pydocx/fixtures/headers.docx b/pydocx/fixtures/headers.docx new file mode 100644 index 00000000..890104c7 Binary files /dev/null and b/pydocx/fixtures/headers.docx differ diff --git a/pydocx/fixtures/headers_with_full_line_styles.docx b/pydocx/fixtures/headers_with_full_line_styles.docx new file mode 100644 index 00000000..38d6f6a8 Binary 
files /dev/null and b/pydocx/fixtures/headers_with_full_line_styles.docx differ diff --git a/pydocx/fixtures/inline_tags.docx b/pydocx/fixtures/inline_tags.docx new file mode 100644 index 00000000..4aba2347 Binary files /dev/null and b/pydocx/fixtures/inline_tags.docx differ diff --git a/pydocx/fixtures/justification.docx b/pydocx/fixtures/justification.docx new file mode 100644 index 00000000..7f8a3bf1 Binary files /dev/null and b/pydocx/fixtures/justification.docx differ diff --git a/pydocx/fixtures/list_in_table.docx b/pydocx/fixtures/list_in_table.docx new file mode 100644 index 00000000..d1a87388 Binary files /dev/null and b/pydocx/fixtures/list_in_table.docx differ diff --git a/pydocx/fixtures/list_to_header.docx b/pydocx/fixtures/list_to_header.docx new file mode 100644 index 00000000..f9b3946e Binary files /dev/null and b/pydocx/fixtures/list_to_header.docx differ diff --git a/pydocx/fixtures/lists_with_styles.docx b/pydocx/fixtures/lists_with_styles.docx new file mode 100644 index 00000000..c1c7ecf8 Binary files /dev/null and b/pydocx/fixtures/lists_with_styles.docx differ diff --git a/pydocx/fixtures/localDpi.docx b/pydocx/fixtures/localDpi.docx new file mode 100644 index 00000000..0f6d7f77 Binary files /dev/null and b/pydocx/fixtures/localDpi.docx differ diff --git a/pydocx/fixtures/missing_content.docx b/pydocx/fixtures/missing_content.docx new file mode 100644 index 00000000..21bed964 Binary files /dev/null and b/pydocx/fixtures/missing_content.docx differ diff --git a/pydocx/fixtures/nested_lists.docx b/pydocx/fixtures/nested_lists.docx new file mode 100644 index 00000000..f4000dfa Binary files /dev/null and b/pydocx/fixtures/nested_lists.docx differ diff --git a/pydocx/fixtures/nested_table_rowspan.docx b/pydocx/fixtures/nested_table_rowspan.docx new file mode 100644 index 00000000..b43b8a0d Binary files /dev/null and b/pydocx/fixtures/nested_table_rowspan.docx differ diff --git a/pydocx/fixtures/nested_tables.docx 
b/pydocx/fixtures/nested_tables.docx new file mode 100644 index 00000000..af704d4d Binary files /dev/null and b/pydocx/fixtures/nested_tables.docx differ diff --git a/pydocx/fixtures/resized_image.docx b/pydocx/fixtures/resized_image.docx new file mode 100644 index 00000000..913099c4 Binary files /dev/null and b/pydocx/fixtures/resized_image.docx differ diff --git a/pydocx/fixtures/shift_enter.docx b/pydocx/fixtures/shift_enter.docx new file mode 100644 index 00000000..4128c0a2 Binary files /dev/null and b/pydocx/fixtures/shift_enter.docx differ diff --git a/pydocx/fixtures/simple.docx b/pydocx/fixtures/simple.docx new file mode 100644 index 00000000..1d2a1c23 Binary files /dev/null and b/pydocx/fixtures/simple.docx differ diff --git a/pydocx/fixtures/simple_lists.docx b/pydocx/fixtures/simple_lists.docx new file mode 100644 index 00000000..c09ad744 Binary files /dev/null and b/pydocx/fixtures/simple_lists.docx differ diff --git a/pydocx/fixtures/simple_table.docx b/pydocx/fixtures/simple_table.docx new file mode 100644 index 00000000..26de483c Binary files /dev/null and b/pydocx/fixtures/simple_table.docx differ diff --git a/pydocx/fixtures/special_chars.docx b/pydocx/fixtures/special_chars.docx new file mode 100644 index 00000000..b4b9287f Binary files /dev/null and b/pydocx/fixtures/special_chars.docx differ diff --git a/pydocx/fixtures/split_header.docx b/pydocx/fixtures/split_header.docx new file mode 100644 index 00000000..cc4bd5cf Binary files /dev/null and b/pydocx/fixtures/split_header.docx differ diff --git a/pydocx/fixtures/super_and_subscript.docx b/pydocx/fixtures/super_and_subscript.docx new file mode 100644 index 00000000..06ea2d7a Binary files /dev/null and b/pydocx/fixtures/super_and_subscript.docx differ diff --git a/pydocx/fixtures/table_col_row_span.docx b/pydocx/fixtures/table_col_row_span.docx new file mode 100644 index 00000000..856abfdf Binary files /dev/null and b/pydocx/fixtures/table_col_row_span.docx differ diff --git 
a/pydocx/fixtures/tables_in_lists.docx b/pydocx/fixtures/tables_in_lists.docx new file mode 100644 index 00000000..11859541 Binary files /dev/null and b/pydocx/fixtures/tables_in_lists.docx differ diff --git a/pydocx/fixtures/track_changes_on.docx b/pydocx/fixtures/track_changes_on.docx new file mode 100644 index 00000000..dcb7ba1c Binary files /dev/null and b/pydocx/fixtures/track_changes_on.docx differ diff --git a/pydocx/fixtures/upper_alpha_all_bold.docx b/pydocx/fixtures/upper_alpha_all_bold.docx new file mode 100644 index 00000000..d518b2c5 Binary files /dev/null and b/pydocx/fixtures/upper_alpha_all_bold.docx differ diff --git a/pydocx/lxmlparser.py b/pydocx/lxmlparser.py deleted file mode 100644 index 94b130d3..00000000 --- a/pydocx/lxmlparser.py +++ /dev/null @@ -1,111 +0,0 @@ -import zipfile -from lxml import etree -from StringIO import StringIO -__author__ = 'samportnow' - -#for el in tree.iter(): - # The way lists are handled could double visit certain elements; keep - # track of which elements have been visited and skip any that have been - # visited already. - #if el in visited_nodes: - #continue -with zipfile.ZipFile('/Users/samportnow/Documents/pydocx/helloworld.docx') as f: - document = f.read('word/document.xml') - numbering= f.read('word/numbering.xml') -parser=etree.XMLParser(ns_clean=True) -document=StringIO(document) -numbering=StringIO(numbering) -numbering_tree=etree.parse(numbering,parser) -numbering_namespace=numbering_tree.getroot().nsmap['w'] -visited_els=[] - -def get_parsed(): - parser=etree.XMLParser(ns_clean=True) - tree=etree.parse(document,parser) - namespace=tree.getroot().nsmap['w'] - #rpr is run properties for the paragraph mark - paragraph='' - run_text='' - running_text='' - for el in tree.iter(): - if el.tag=='{%s}p' %namespace: - for wp in el.iter(): - if wp.tag =='{%s}ins' %namespace: - for text in wp.iterchildren(): - if text not in visited_els: - run_text +='
'+get_text(text,namespace,visited_els)+'
' - visited_els.append(text) - if wp.tag=='{%s}r' %namespace and wp not in visited_els: - run_text+=get_text(wp,namespace,visited_els) - visited_els.append(wp) - if not el.getchildren(): - run_text+='
' - if wp.tag == '{%s}ilvl' %namespace: - for lst in el.iter(): - if lst.find('{%s}numId' %namespace) is not None and el not in visited_els: - numval = lst.find('{%s}numId' %namespace).attrib['{%s}val' %namespace] - lst_type=get_list_style(numval) - if get_text(lst,namespace,visited_els) and el not in visited_els and lst_type['{%s}val' %namespace] != 'bullet': - if lst.getnext() is not None: - if lst not in visited_els: - while lst.getnext() is not None: - if lst not in visited_els: - text = get_text(lst,namespace,visited_els) - next_txt = get_text(lst.getnext(),namespace,visited_els) - running_text += text + next_txt - visited_els.append(lst) - visited_els.append(lst.getnext()) - lst=lst.getnext() - else: - run_text += '
  • ' + running_text + '
  • ' - break - else: - run_text +='
  • ' + get_text(lst, namespace, visited_els) + '
  • ' - visited_els.append(lst) - print running_text - return run_text - - -def get_text(wp,namespace,visited_els): - run_text= '' - decorator = '' - closing = '' - if wp.find('{%s}tab' %namespace) is not None: - run_text+='%nbsp' - if wp.find('{%s}rPr' %namespace) is not None: - for tag in wp.iter(): - if tag.find('{%s}u' %namespace) is not None: - if wp.find('{%s}t' %namespace) is not None: - decorator +='' - closing += '' - visited_els.append(wp.find('{%s}t' %namespace)) - if tag.find('{%s}i' %namespace) is not None: - if wp.find('{%s}t' %namespace) is not None: - decorator += '' - closing += '' - visited_els.append(wp.find('{%s}t' %namespace)) - if tag.find('{%s}b' %namespace) is not None: - if wp.find('{%s}t' %namespace) is not None: - decorator += '' - closing += '' - visited_els.append(wp.find('{%s}t' %namespace)) - run_text = wp.find('{%s}t' %namespace).text - run_text = decorator + run_text + closing - if wp.find('{%s}t' %namespace) is not None and wp.find('{%s}t' %namespace) not in visited_els: - run_text+=wp.find('{%s}t' %namespace).text - return run_text - -def get_list_style(numval): - ids = numbering_tree.findall('{%s}num' %numbering_namespace) - for id in ids: - if id.attrib['{%s}numId' %numbering_namespace] == numval: - abstractid=id.find('{%s}abstractNumId' %numbering_namespace) - abstractid=abstractid.attrib['{%s}val' %numbering_namespace] - style_information=numbering_tree.findall('{%s}abstractNum' %numbering_namespace) - for info in style_information: - if info.attrib['{%s}abstractNumId' %numbering_namespace] == abstractid: - for i in info.iter(): - if i.find('{%s}numFmt' %numbering_namespace) is not None: - return i.find('{%s}numFmt' %numbering_namespace).attrib - -print get_parsed() diff --git a/pydocx/parsers/Docx2Html.py b/pydocx/parsers/Docx2Html.py index bfaad2a6..5950f267 100644 --- a/pydocx/parsers/Docx2Html.py +++ b/pydocx/parsers/Docx2Html.py @@ -1,21 +1,46 @@ -from pydocx.DocxParser import DocxParser - +import base64 import 
xml.sax.saxutils +from pydocx.DocxParser import DocxParser + class Docx2Html(DocxParser): @property def parsed(self): - self._parsed = self._parsed.replace('

    ', '
    ') - self._parsed = self._parsed.replace('


    ', '

    ') - self._parsed = self._parsed.replace('