Merge pull request #45 from OpenScienceFramework/table_fix

jlward · jlward · commit e385ae0fb2e7 · 2013-06-18T07:54:24.000-07:00
Table fix
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,7 +1,11 @@
 
 Changelog
 =========
-
+* 0.3.2
+    * Cells that continue a vertical merge normally carry a vMerge value
+      of "continue", but some documents omit the value, and Word then
+      treats the bare vMerge as "continue". The parser now handles
+      vMerge elements that have no value in the same way.
 * 0.3.1
     * Added support for several more OOXML tags including:
         * caps
diff --git a/pydocx/DocxParser.py b/pydocx/DocxParser.py
@@ -143,7 +143,6 @@ def parse(self, el):
         for child in el:
             # recursive. So you can get all the way to the bottom
             parsed += self.parse(child)
-
         if el.tag == 'br' and el.attrib.get('type') == 'page':
             return self.parse_page_break(el, parsed)
         elif el.tag == 'tbl':
@@ -183,10 +182,15 @@ def parse_table_row(self, el, text):
 
     def parse_table_cell(self, el, text):
         v_merge = find_first(el, 'vMerge')
-        if v_merge is not None and 'continue' == v_merge.get('val', ''):
+        if v_merge is not None and (
+                'restart' != v_merge.get('val', '')):
             return ''
         colspan = self.get_colspan(el)
         rowspan = self._get_rowspan(el, v_merge)
+        if rowspan > 1:
+            rowspan = str(rowspan)
+        else:
+            rowspan = ''
         return self.table_cell(text, colspan, rowspan)
 
     def parse_list(self, el, text):
@@ -434,7 +438,6 @@ def _get_rowspan(self, el, v_merge):
         current_col = self.pre_processor.column_index(el)
         rowspan = 1
         result = ''
-
         tbl = find_ancestor_with_tag(self.pre_processor, el, 'tbl')
         # We only want table cells that have a higher row_index that is greater
         # than the current_row and that are on the current_col
diff --git a/pydocx/__init__.py b/pydocx/__init__.py
@@ -8,5 +8,4 @@ def docx2html(path):
 def docx2markdown(path):
     return Docx2Markdown(path).parsed
 
-
 VERSION = '0.3.1'
diff --git a/pydocx/tests/document_builder.py b/pydocx/tests/document_builder.py
@@ -188,18 +188,22 @@ def li(self, text, ilvl, numId, bold=False):
         return template.render(**kwargs)
 
     @classmethod
-    def table(self, num_rows, num_columns, text, merge=False):
-
-        def _tc(cell_value):
-            template = env.get_template(templates['tc'])
-            return template.render(p_tag=cell_value, merge=merge)
+    def table_cell(self, paragraph, merge=False, merge_continue=False):
+        kwargs = {
+            'paragraph': paragraph,
+            'merge': merge,
+            'merge_continue': merge_continue
+        }
+        template = env.get_template(templates['tc'])
+        return template.render(**kwargs)
 
-        def _tr(rows, text):
-            tcs = [_tc(text.next()) for _ in range(rows)]
-            template = env.get_template(templates['tr'])
-            return template.render(table_cells=tcs)
+    @classmethod
+    def table_row(self, tcs):
+        template = env.get_template(templates['tr'])
+        return template.render(table_cells=tcs)
 
-        trs = [_tr(num_rows, text) for _ in range(num_rows)]
+    @classmethod
+    def table(self, trs):
         template = env.get_template(templates['table'])
         return template.render(table_rows=trs)
 
diff --git a/pydocx/tests/templates/tc.xml b/pydocx/tests/templates/tc.xml
@@ -1,9 +1,13 @@
 <w:tc>
 	<w:tcPr>
 		<w:tcW w:type="dxa" w:w="4986"/>
+        {% if merge_continue %}
+        <w:vMerge>
+        </w:vMerge>
+        {% endif %}
         {% if merge %}
-        <w:vmerge val="restart">
-        </w:vmerge>
+        <w:vMerge val="restart">
+        </w:vMerge>
         {% endif %}
 		<w:tcBorders>
 			<w:top w:color="000000" w:space="0" w:sz="2" w:val="single"/>
@@ -18,5 +22,7 @@
 			<w:right w:type="dxa" w:w="55"/>
 		</w:tcMar>
 	</w:tcPr>
-	{{ p_tag }}
+    {% if paragraph %}
+	{{ paragraph }}
+    {% endif %}
 </w:tc>
diff --git a/pydocx/tests/test_docx.py b/pydocx/tests/test_docx.py
@@ -671,15 +671,15 @@ def test_simple_table():
     assert_html_equal(actual_html, BASE_HTML % '''
     <table border="1">
         <tr>
-            <td>Cell1<br />
+            <td rowspan="2">
+                Cell1<br />
                 Cell3
             </td>
             <td>Cell2<br />
                 And I am writing in the table
             </td>
         </tr>
         <tr>
-            <td></td>
             <td>Cell4</td>
         </tr>
     </table>
diff --git a/pydocx/tests/test_xml.py b/pydocx/tests/test_xml.py
@@ -1,6 +1,5 @@
 import os
 import time
-from itertools import chain
 
 from nose.plugins.skip import SkipTest
 
@@ -46,6 +45,7 @@ def get_xml(self):
 
 
 class HyperlinkVanillaTestCase(_TranslationTestCase):
+
     relationship_dict = {
         'rId0': 'www.google.com',
     }
@@ -296,12 +296,40 @@ class TableTag(_TranslationTestCase):
     '''
 
     def get_xml(self):
-        table = DXB.table(num_rows=2, num_columns=2, text=chain(
-            [DXB.p_tag('AAA')],
-            [DXB.p_tag('BBB')],
-            [DXB.p_tag('CCC')],
-            [DXB.p_tag('DDD')],
-        ))
+        cell1 = DXB.table_cell(paragraph=DXB.p_tag('AAA'))
+        cell2 = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
+        cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
+        cell4 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
+        rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+        table = DXB.table(rows)
+        body = table
+        xml = DXB.xml(body)
+        return xml
+
+
+class RowSpanTestCase(_TranslationTestCase):
+
+    expected_output = '''
+           <table border="1">
+            <tr>
+                <td rowspan="2">AAA</td>
+                <td>BBB</td>
+            </tr>
+            <tr>
+                <td>CCC</td>
+            </tr>
+        </table>
+    '''
+
+    def get_xml(self):
+        cell1 = DXB.table_cell(
+            paragraph=DXB.p_tag('AAA'), merge=True, merge_continue=False)
+        cell2 = DXB.table_cell(
+            paragraph=DXB.p_tag(None), merge=False, merge_continue=True)
+        cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
+        cell4 = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
+        rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+        table = DXB.table(rows)
         body = table
         xml = DXB.xml(body)
         return xml
@@ -333,18 +361,18 @@ class NestedTableTag(_TranslationTestCase):
     '''
 
     def get_xml(self):
-        nested_table = DXB.table(num_rows=2, num_columns=2, text=chain(
-            [DXB.p_tag('DDD')],
-            [DXB.p_tag('EEE')],
-            [DXB.p_tag('FFF')],
-            [DXB.p_tag('GGG')],
-        ))
-        table = DXB.table(num_rows=2, num_columns=2, text=chain(
-            [DXB.p_tag('AAA')],
-            [DXB.p_tag('BBB')],
-            [DXB.p_tag('CCC')],
-            [nested_table],
-        ))
+        cell1 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
+        cell2 = DXB.table_cell(paragraph=DXB.p_tag('FFF'))
+        cell3 = DXB.table_cell(paragraph=DXB.p_tag('EEE'))
+        cell4 = DXB.table_cell(paragraph=DXB.p_tag('GGG'))
+        rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+        nested_table = DXB.table(rows)
+        cell1 = DXB.table_cell(paragraph=DXB.p_tag('AAA'))
+        cell2 = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
+        cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
+        cell4 = DXB.table_cell(nested_table)
+        rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+        table = DXB.table(rows)
         body = table
         xml = DXB.xml(body)
         return xml
@@ -365,14 +393,12 @@ class TableWithInvalidTag(_TranslationTestCase):
     '''
 
     def get_xml(self):
-        table = DXB.table(num_rows=2, num_columns=2, text=chain(
-            [DXB.p_tag('AAA')],
-            [DXB.p_tag('BBB')],
-            # This tag may have CCC in it, however this tag has no meaning
-            # pertaining to content.
-            ['<w:invalidTag>CCC</w:invalidTag>'],
-            [DXB.p_tag('DDD')],
-        ))
+        cell1 = DXB.table_cell(paragraph=DXB.p_tag('AAA'))
+        cell2 = DXB.table_cell('<w:invalidTag>CCC</w:invalidTag>')
+        cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
+        cell4 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
+        rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+        table = DXB.table(rows)
         body = table
         xml = DXB.xml(body)
         return xml
@@ -410,9 +436,9 @@ def get_xml(self):
         td = ''
         for el in els:
             td += el
-        table = DXB.table(num_rows=1, num_columns=1, text=chain(
-            [td],
-        ))
+        cell1 = DXB.table_cell(td)
+        row = DXB.table_row([cell1])
+        table = DXB.table([row])
         body = table
         xml = DXB.xml(body)
         return xml
@@ -495,12 +521,12 @@ class ListWithContinuationTestCase(_TranslationTestCase):
     '''
 
     def get_xml(self):
-        table = DXB.table(num_rows=2, num_columns=2, text=chain(
-            [DXB.p_tag('DDD')],
-            [DXB.p_tag('EEE')],
-            [DXB.p_tag('FFF')],
-            [DXB.p_tag('GGG')],
-        ))
+        cell1 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
+        cell2 = DXB.table_cell(paragraph=DXB.p_tag('FFF'))
+        cell3 = DXB.table_cell(paragraph=DXB.p_tag('EEE'))
+        cell4 = DXB.table_cell(paragraph=DXB.p_tag('GGG'))
+        rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+        table = DXB.table(rows)
         tags = [
             DXB.li(text='AAA', ilvl=0, numId=1),
             DXB.p_tag('BBB'),
@@ -536,12 +562,12 @@ class ListWithMultipleContinuationTestCase(_TranslationTestCase):
     '''
 
     def get_xml(self):
-        table1 = DXB.table(num_rows=1, num_columns=1, text=chain(
-            [DXB.p_tag('BBB')],
-        ))
-        table2 = DXB.table(num_rows=1, num_columns=1, text=chain(
-            [DXB.p_tag('CCC')],
-        ))
+        cell = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
+        row = DXB.table_row([cell])
+        table1 = DXB.table([row])
+        cell = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
+        row = DXB.table_row([cell])
+        table2 = DXB.table([row])
         tags = [
             DXB.li(text='AAA', ilvl=0, numId=1),
             table1,
@@ -648,12 +674,12 @@ class DeeplyNestedTableTestCase(_TranslationTestCase):
     run_expected_output = False
 
     def get_xml(self):
-        table = DXB.p_tag('AAA')
+        paragraph = DXB.p_tag('AAA')
 
         for _ in range(50):
-            table = DXB.table(num_rows=1, num_columns=1, text=chain(
-                [table],
-            ))
+            cell = DXB.table_cell(paragraph)
+            row = DXB.table_cell([cell])
+            table = DXB.table([row])
         body = table
         xml = DXB.xml(body)
         return xml
@@ -816,19 +842,20 @@ class SimpleTableTest(_TranslationTestCase):
         </table>'''
 
     def get_xml(self):
-        table = DXB.table(num_rows=3, num_columns=3, text=chain(
-            [DXB.p_tag('Blank')],
-            [DXB.p_tag('Column 1')],
-            [DXB.p_tag('Column 2')],
-            [DXB.p_tag('Row 1')],
-            [DXB.p_tag('First')],
-            [DXB.p_tag('Second')],
-            [DXB.p_tag('Row 2')],
-            [DXB.p_tag('Third')],
-            [DXB.p_tag('Fourth')],
-        ), merge=True)
+        cell1 = DXB.table_cell(paragraph=DXB.p_tag('Blank'))
+        cell2 = DXB.table_cell(paragraph=DXB.p_tag('Row 1'))
+        cell3 = DXB.table_cell(paragraph=DXB.p_tag('Row 2'))
+        cell4 = DXB.table_cell(paragraph=DXB.p_tag('Column 1'))
+        cell5 = DXB.table_cell(paragraph=DXB.p_tag('First'))
+        cell6 = DXB.table_cell(paragraph=DXB.p_tag('Third'))
+        cell7 = DXB.table_cell(paragraph=DXB.p_tag('Column 2'))
+        cell8 = DXB.table_cell(paragraph=DXB.p_tag('Second'))
+        cell9 = DXB.table_cell(paragraph=DXB.p_tag('Fourth'))
+        rows = [DXB.table_row([cell1, cell4, cell7]),
+                DXB.table_row([cell2, cell5, cell8]),
+                DXB.table_row([cell3, cell6, cell9])]
+        table = DXB.table(rows)
         body = table
-
         xml = DXB.xml(body)
         return xml
 
@@ -889,15 +916,14 @@ def get_xml(self):
         lis = ''
         for text, ilvl, numId in li_text:
             lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
-        table = DXB.table(num_rows=1, num_columns=1, text=chain(
-            [lis],
-        ))
+        cell1 = DXB.table_cell(lis)
+        rows = DXB.table_row([cell1])
+        table = DXB.table([rows])
         lis = ''
         lis += DXB.li(text='AAA', ilvl=0, numId=1)
         lis += table
         lis += DXB.li(text='CCC', ilvl=0, numId=1)
         body = lis
-
         xml = DXB.xml(body)
         return xml
 
@@ -931,6 +957,7 @@ class HeadingTestCase(_TranslationTestCase):
         <h6>GGG</h6>
         <p>HHH</p>
     '''
+
     styles_dict = {
         'style0': 'heading 1',
         'style1': 'heading 2',
diff --git a/pydocx/utils.py b/pydocx/utils.py
@@ -304,7 +304,8 @@ def _set_table_attributes(self, el):
                     v_merge = find_first(child, 'vMerge')
                     if (
                             v_merge is not None and
-                            'continue' == v_merge.get('val', '')
+                            ('continue' == v_merge.get('val', '') or
+                             v_merge.attrib == {})
                     ):
                         self.meta_data[child]['vmerge_continue'] = True