Skip to content

Commit e385ae0

Browse files
committed
Merge pull request #45 from OpenScienceFramework/table_fix
Table fix
2 parents e6c9137 + f7c9a49 commit e385ae0

File tree

8 files changed

+127
-83
lines changed

8 files changed

+127
-83
lines changed

CHANGELOG

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11

22
Changelog
33
=========
4-
4+
* 0.3.2
5+
* Vertically merged cells are supposed to mark continuation cells with a
6+
"continue" attribute, but sometimes the attribute is missing, and in
7+
those cases Word assumes "continue". The parser did not take this into
8+
account; it now also handles cells where the attribute is not there.
59
* 0.3.1
610
* Added support for several more OOXML tags including:
711
* caps

pydocx/DocxParser.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,6 @@ def parse(self, el):
143143
for child in el:
144144
# recursive. So you can get all the way to the bottom
145145
parsed += self.parse(child)
146-
147146
if el.tag == 'br' and el.attrib.get('type') == 'page':
148147
return self.parse_page_break(el, parsed)
149148
elif el.tag == 'tbl':
@@ -183,10 +182,15 @@ def parse_table_row(self, el, text):
183182

184183
def parse_table_cell(self, el, text):
185184
v_merge = find_first(el, 'vMerge')
186-
if v_merge is not None and 'continue' == v_merge.get('val', ''):
185+
if v_merge is not None and (
186+
'restart' != v_merge.get('val', '')):
187187
return ''
188188
colspan = self.get_colspan(el)
189189
rowspan = self._get_rowspan(el, v_merge)
190+
if rowspan > 1:
191+
rowspan = str(rowspan)
192+
else:
193+
rowspan = ''
190194
return self.table_cell(text, colspan, rowspan)
191195

192196
def parse_list(self, el, text):
@@ -434,7 +438,6 @@ def _get_rowspan(self, el, v_merge):
434438
current_col = self.pre_processor.column_index(el)
435439
rowspan = 1
436440
result = ''
437-
438441
tbl = find_ancestor_with_tag(self.pre_processor, el, 'tbl')
439442
# We only want table cells that have a higher row_index that is greater
440443
# than the current_row and that are on the current_col

pydocx/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,4 @@ def docx2html(path):
88
def docx2markdown(path):
99
return Docx2Markdown(path).parsed
1010

11-
1211
VERSION = '0.3.1'

pydocx/tests/document_builder.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -188,18 +188,22 @@ def li(self, text, ilvl, numId, bold=False):
188188
return template.render(**kwargs)
189189

190190
@classmethod
191-
def table(self, num_rows, num_columns, text, merge=False):
192-
193-
def _tc(cell_value):
194-
template = env.get_template(templates['tc'])
195-
return template.render(p_tag=cell_value, merge=merge)
191+
def table_cell(self, paragraph, merge=False, merge_continue=False):
192+
kwargs = {
193+
'paragraph': paragraph,
194+
'merge': merge,
195+
'merge_continue': merge_continue
196+
}
197+
template = env.get_template(templates['tc'])
198+
return template.render(**kwargs)
196199

197-
def _tr(rows, text):
198-
tcs = [_tc(text.next()) for _ in range(rows)]
199-
template = env.get_template(templates['tr'])
200-
return template.render(table_cells=tcs)
200+
@classmethod
201+
def table_row(self, tcs):
202+
template = env.get_template(templates['tr'])
203+
return template.render(table_cells=tcs)
201204

202-
trs = [_tr(num_rows, text) for _ in range(num_rows)]
205+
@classmethod
206+
def table(self, trs):
203207
template = env.get_template(templates['table'])
204208
return template.render(table_rows=trs)
205209

pydocx/tests/templates/tc.xml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
<w:tc>
22
<w:tcPr>
33
<w:tcW w:type="dxa" w:w="4986"/>
4+
{% if merge_continue %}
5+
<w:vMerge>
6+
</w:vMerge>
7+
{% endif %}
48
{% if merge %}
5-
<w:vmerge val="restart">
6-
</w:vmerge>
9+
<w:vMerge val="restart">
10+
</w:vMerge>
711
{% endif %}
812
<w:tcBorders>
913
<w:top w:color="000000" w:space="0" w:sz="2" w:val="single"/>
@@ -18,5 +22,7 @@
1822
<w:right w:type="dxa" w:w="55"/>
1923
</w:tcMar>
2024
</w:tcPr>
21-
{{ p_tag }}
25+
{% if paragraph %}
26+
{{ paragraph }}
27+
{% endif %}
2228
</w:tc>

pydocx/tests/test_docx.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -671,15 +671,15 @@ def test_simple_table():
671671
assert_html_equal(actual_html, BASE_HTML % '''
672672
<table border="1">
673673
<tr>
674-
<td>Cell1<br />
674+
<td rowspan="2">
675+
Cell1<br />
675676
Cell3
676677
</td>
677678
<td>Cell2<br />
678679
And I am writing in the table
679680
</td>
680681
</tr>
681682
<tr>
682-
<td></td>
683683
<td>Cell4</td>
684684
</tr>
685685
</table>

pydocx/tests/test_xml.py

Lines changed: 89 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import os
22
import time
3-
from itertools import chain
43

54
from nose.plugins.skip import SkipTest
65

@@ -46,6 +45,7 @@ def get_xml(self):
4645

4746

4847
class HyperlinkVanillaTestCase(_TranslationTestCase):
48+
4949
relationship_dict = {
5050
'rId0': 'www.google.com',
5151
}
@@ -296,12 +296,40 @@ class TableTag(_TranslationTestCase):
296296
'''
297297

298298
def get_xml(self):
299-
table = DXB.table(num_rows=2, num_columns=2, text=chain(
300-
[DXB.p_tag('AAA')],
301-
[DXB.p_tag('BBB')],
302-
[DXB.p_tag('CCC')],
303-
[DXB.p_tag('DDD')],
304-
))
299+
cell1 = DXB.table_cell(paragraph=DXB.p_tag('AAA'))
300+
cell2 = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
301+
cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
302+
cell4 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
303+
rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
304+
table = DXB.table(rows)
305+
body = table
306+
xml = DXB.xml(body)
307+
return xml
308+
309+
310+
class RowSpanTestCase(_TranslationTestCase):
311+
312+
expected_output = '''
313+
<table border="1">
314+
<tr>
315+
<td rowspan="2">AAA</td>
316+
<td>BBB</td>
317+
</tr>
318+
<tr>
319+
<td>CCC</td>
320+
</tr>
321+
</table>
322+
'''
323+
324+
def get_xml(self):
325+
cell1 = DXB.table_cell(
326+
paragraph=DXB.p_tag('AAA'), merge=True, merge_continue=False)
327+
cell2 = DXB.table_cell(
328+
paragraph=DXB.p_tag(None), merge=False, merge_continue=True)
329+
cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
330+
cell4 = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
331+
rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
332+
table = DXB.table(rows)
305333
body = table
306334
xml = DXB.xml(body)
307335
return xml
@@ -333,18 +361,18 @@ class NestedTableTag(_TranslationTestCase):
333361
'''
334362

335363
def get_xml(self):
336-
nested_table = DXB.table(num_rows=2, num_columns=2, text=chain(
337-
[DXB.p_tag('DDD')],
338-
[DXB.p_tag('EEE')],
339-
[DXB.p_tag('FFF')],
340-
[DXB.p_tag('GGG')],
341-
))
342-
table = DXB.table(num_rows=2, num_columns=2, text=chain(
343-
[DXB.p_tag('AAA')],
344-
[DXB.p_tag('BBB')],
345-
[DXB.p_tag('CCC')],
346-
[nested_table],
347-
))
364+
cell1 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
365+
cell2 = DXB.table_cell(paragraph=DXB.p_tag('FFF'))
366+
cell3 = DXB.table_cell(paragraph=DXB.p_tag('EEE'))
367+
cell4 = DXB.table_cell(paragraph=DXB.p_tag('GGG'))
368+
rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
369+
nested_table = DXB.table(rows)
370+
cell1 = DXB.table_cell(paragraph=DXB.p_tag('AAA'))
371+
cell2 = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
372+
cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
373+
cell4 = DXB.table_cell(nested_table)
374+
rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
375+
table = DXB.table(rows)
348376
body = table
349377
xml = DXB.xml(body)
350378
return xml
@@ -365,14 +393,12 @@ class TableWithInvalidTag(_TranslationTestCase):
365393
'''
366394

367395
def get_xml(self):
368-
table = DXB.table(num_rows=2, num_columns=2, text=chain(
369-
[DXB.p_tag('AAA')],
370-
[DXB.p_tag('BBB')],
371-
# This tag may have CCC in it, however this tag has no meaning
372-
# pertaining to content.
373-
['<w:invalidTag>CCC</w:invalidTag>'],
374-
[DXB.p_tag('DDD')],
375-
))
396+
cell1 = DXB.table_cell(paragraph=DXB.p_tag('AAA'))
397+
cell2 = DXB.table_cell('<w:invalidTag>CCC</w:invalidTag>')
398+
cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
399+
cell4 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
400+
rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
401+
table = DXB.table(rows)
376402
body = table
377403
xml = DXB.xml(body)
378404
return xml
@@ -410,9 +436,9 @@ def get_xml(self):
410436
td = ''
411437
for el in els:
412438
td += el
413-
table = DXB.table(num_rows=1, num_columns=1, text=chain(
414-
[td],
415-
))
439+
cell1 = DXB.table_cell(td)
440+
row = DXB.table_row([cell1])
441+
table = DXB.table([row])
416442
body = table
417443
xml = DXB.xml(body)
418444
return xml
@@ -495,12 +521,12 @@ class ListWithContinuationTestCase(_TranslationTestCase):
495521
'''
496522

497523
def get_xml(self):
498-
table = DXB.table(num_rows=2, num_columns=2, text=chain(
499-
[DXB.p_tag('DDD')],
500-
[DXB.p_tag('EEE')],
501-
[DXB.p_tag('FFF')],
502-
[DXB.p_tag('GGG')],
503-
))
524+
cell1 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
525+
cell2 = DXB.table_cell(paragraph=DXB.p_tag('FFF'))
526+
cell3 = DXB.table_cell(paragraph=DXB.p_tag('EEE'))
527+
cell4 = DXB.table_cell(paragraph=DXB.p_tag('GGG'))
528+
rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
529+
table = DXB.table(rows)
504530
tags = [
505531
DXB.li(text='AAA', ilvl=0, numId=1),
506532
DXB.p_tag('BBB'),
@@ -536,12 +562,12 @@ class ListWithMultipleContinuationTestCase(_TranslationTestCase):
536562
'''
537563

538564
def get_xml(self):
539-
table1 = DXB.table(num_rows=1, num_columns=1, text=chain(
540-
[DXB.p_tag('BBB')],
541-
))
542-
table2 = DXB.table(num_rows=1, num_columns=1, text=chain(
543-
[DXB.p_tag('CCC')],
544-
))
565+
cell = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
566+
row = DXB.table_row([cell])
567+
table1 = DXB.table([row])
568+
cell = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
569+
row = DXB.table_row([cell])
570+
table2 = DXB.table([row])
545571
tags = [
546572
DXB.li(text='AAA', ilvl=0, numId=1),
547573
table1,
@@ -648,12 +674,12 @@ class DeeplyNestedTableTestCase(_TranslationTestCase):
648674
run_expected_output = False
649675

650676
def get_xml(self):
651-
table = DXB.p_tag('AAA')
677+
paragraph = DXB.p_tag('AAA')
652678

653679
for _ in range(50):
654-
table = DXB.table(num_rows=1, num_columns=1, text=chain(
655-
[table],
656-
))
680+
cell = DXB.table_cell(paragraph)
681+
row = DXB.table_cell([cell])
682+
table = DXB.table([row])
657683
body = table
658684
xml = DXB.xml(body)
659685
return xml
@@ -816,19 +842,20 @@ class SimpleTableTest(_TranslationTestCase):
816842
</table>'''
817843

818844
def get_xml(self):
819-
table = DXB.table(num_rows=3, num_columns=3, text=chain(
820-
[DXB.p_tag('Blank')],
821-
[DXB.p_tag('Column 1')],
822-
[DXB.p_tag('Column 2')],
823-
[DXB.p_tag('Row 1')],
824-
[DXB.p_tag('First')],
825-
[DXB.p_tag('Second')],
826-
[DXB.p_tag('Row 2')],
827-
[DXB.p_tag('Third')],
828-
[DXB.p_tag('Fourth')],
829-
), merge=True)
845+
cell1 = DXB.table_cell(paragraph=DXB.p_tag('Blank'))
846+
cell2 = DXB.table_cell(paragraph=DXB.p_tag('Row 1'))
847+
cell3 = DXB.table_cell(paragraph=DXB.p_tag('Row 2'))
848+
cell4 = DXB.table_cell(paragraph=DXB.p_tag('Column 1'))
849+
cell5 = DXB.table_cell(paragraph=DXB.p_tag('First'))
850+
cell6 = DXB.table_cell(paragraph=DXB.p_tag('Third'))
851+
cell7 = DXB.table_cell(paragraph=DXB.p_tag('Column 2'))
852+
cell8 = DXB.table_cell(paragraph=DXB.p_tag('Second'))
853+
cell9 = DXB.table_cell(paragraph=DXB.p_tag('Fourth'))
854+
rows = [DXB.table_row([cell1, cell4, cell7]),
855+
DXB.table_row([cell2, cell5, cell8]),
856+
DXB.table_row([cell3, cell6, cell9])]
857+
table = DXB.table(rows)
830858
body = table
831-
832859
xml = DXB.xml(body)
833860
return xml
834861

@@ -889,15 +916,14 @@ def get_xml(self):
889916
lis = ''
890917
for text, ilvl, numId in li_text:
891918
lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
892-
table = DXB.table(num_rows=1, num_columns=1, text=chain(
893-
[lis],
894-
))
919+
cell1 = DXB.table_cell(lis)
920+
rows = DXB.table_row([cell1])
921+
table = DXB.table([rows])
895922
lis = ''
896923
lis += DXB.li(text='AAA', ilvl=0, numId=1)
897924
lis += table
898925
lis += DXB.li(text='CCC', ilvl=0, numId=1)
899926
body = lis
900-
901927
xml = DXB.xml(body)
902928
return xml
903929

@@ -931,6 +957,7 @@ class HeadingTestCase(_TranslationTestCase):
931957
<h6>GGG</h6>
932958
<p>HHH</p>
933959
'''
960+
934961
styles_dict = {
935962
'style0': 'heading 1',
936963
'style1': 'heading 2',

pydocx/utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,8 @@ def _set_table_attributes(self, el):
304304
v_merge = find_first(child, 'vMerge')
305305
if (
306306
v_merge is not None and
307-
'continue' == v_merge.get('val', '')
307+
('continue' == v_merge.get('val', '') or
308+
v_merge.attrib == {})
308309
):
309310
self.meta_data[child]['vmerge_continue'] = True
310311

0 commit comments

Comments
 (0)