1+ using DocumentFormat . OpenXml . Packaging ;
2+ using OpenXmlPowerTools ;
3+ using OpenXmlPowerTools . OpenXMLWordprocessingMLToHtmlConverter ;
4+ using OpenXmlPowerTools . Tests ;
5+ using System . IO ;
6+ using System . Linq ;
7+ using System . Text ;
8+ using System . Xml . Linq ;
9+ using Xunit ;
10+
11+ namespace OxPt
12+ {
13+ public class WmlToHtmlConverterTests
14+ {
15+ // PowerShell one-liner that generates InlineData for all files in a directory
16+ // dir | % { '[InlineData("' + $_.Name + '")]' } | clip
17+
/// <summary>
/// Smoke test: converts one sample document from the TestFiles directory to HTML with
/// fabricated CSS classes and verifies that a non-empty HTML file is written to the
/// test temp directory.
/// </summary>
/// <param name="name">File name of the source .docx, relative to the TestFiles directory.</param>
[Theory]
[InlineData("HC001-5DayTourPlanTemplate.docx")]
[InlineData("HC002-Hebrew-01.docx")]
[InlineData("HC003-Hebrew-02.docx")]
[InlineData("HC004-ResumeTemplate.docx")]
[InlineData("HC005-TaskPlanTemplate.docx")]
[InlineData("HC006-Test-01.docx")]
[InlineData("HC007-Test-02.docx")]
[InlineData("HC008-Test-03.docx")]
[InlineData("HC009-Test-04.docx")]
[InlineData("HC010-Test-05.docx")]
[InlineData("HC011-Test-06.docx")]
[InlineData("HC012-Test-07.docx")]
[InlineData("HC013-Test-08.docx")]
[InlineData("HC014-RTL-Table-01.docx")]
[InlineData("HC015-Vertical-Spacing-atLeast.docx")]
[InlineData("HC016-Horizontal-Spacing-firstLine.docx")]
[InlineData("HC017-Vertical-Alignment-Cell-01.docx")]
[InlineData("HC018-Vertical-Alignment-Para-01.docx")]
[InlineData("HC019-Hidden-Run.docx")]
[InlineData("HC020-Small-Caps.docx")]
[InlineData("HC021-Symbols.docx")]
[InlineData("HC022-Table-Of-Contents.docx")]
[InlineData("HC023-Hyperlink.docx")]
[InlineData("HC024-Tabs-01.docx")]
[InlineData("HC025-Tabs-02.docx")]
[InlineData("HC026-Tabs-03.docx")]
[InlineData("HC027-Tabs-04.docx")]
[InlineData("HC028-No-Break-Hyphen.docx")]
[InlineData("HC029-Table-Merged-Cells.docx")]
[InlineData("HC030-Content-Controls.docx")]
[InlineData("HC031-Complicated-Document.docx")]
[InlineData("HC032-Named-Color.docx")]
[InlineData("HC033-Run-With-Border.docx")]
[InlineData("HC034-Run-With-Position.docx")]
[InlineData("HC035-Strike-Through.docx")]
[InlineData("HC036-Super-Script.docx")]
[InlineData("HC037-Sub-Script.docx")]
[InlineData("HC038-Conflicting-Border-Weight.docx")]
[InlineData("HC039-Bold.docx")]
[InlineData("HC040-Hyperlink-Fieldcode-01.docx")]
[InlineData("HC041-Hyperlink-Fieldcode-02.docx")]
[InlineData("HC042-Image-Png.docx")]
[InlineData("HC043-Chart.docx")]
[InlineData("HC044-Embedded-Workbook.docx")]
[InlineData("HC045-Italic.docx")]
[InlineData("HC046-BoldAndItalic.docx")]
[InlineData("HC047-No-Section.docx")]
[InlineData("HC048-Excerpt.docx")]
[InlineData("HC049-Borders.docx")]
[InlineData("HC050-Shaded-Text-01.docx")]
[InlineData("HC051-Shaded-Text-02.docx")]
[InlineData("HC060-Image-with-Hyperlink.docx")]
[InlineData("HC061-Hyperlink-in-Field.docx")]
public void HC001(string name)
{
    // Relative path: resolved against the current working directory of the test run.
    var sourceDir = new DirectoryInfo("../../../../TestFiles/");
    var sourceDocx = new FileInfo(Path.Combine(sourceDir.FullName, name));

    // Swap only the extension; string.Replace would also rewrite a ".docx" occurring
    // in the middle of the name and would miss an upper-case extension.
    var htmlName = Path.GetFileNameWithoutExtension(sourceDocx.Name) + "-3-OxPt.html";
    var oxPtConvertedDestHtml = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, htmlName));

    // The baseline conversion exercises the CSS-class-fabricating path.
    ConvertToHtml(sourceDocx, oxPtConvertedDestHtml, fabricateCssClasses: true);

    // FileInfo caches file-system state, so refresh before inspecting the result.
    oxPtConvertedDestHtml.Refresh();
    Assert.True(
        oxPtConvertedDestHtml.Exists && oxPtConvertedDestHtml.Length > 0,
        "Expected a non-empty HTML file at " + oxPtConvertedDestHtml.FullName);
}
80+
/// <summary>
/// Converts a sample document to HTML with CSS-class fabrication switched off and
/// verifies that a non-empty HTML file is written to the test temp directory.
/// </summary>
/// <param name="name">File name of the source .docx, relative to the TestFiles directory.</param>
[Theory]
[InlineData("HC006-Test-01.docx")]
public void HC002_NoCssClasses(string name)
{
    // Relative path: resolved against the current working directory of the test run.
    var sourceDir = new DirectoryInfo("../../../../TestFiles/");
    var sourceDocx = new FileInfo(Path.Combine(sourceDir.FullName, name));

    // Swap only the extension; string.Replace would also rewrite a ".docx" occurring
    // in the middle of the name and would miss an upper-case extension.
    var htmlName = Path.GetFileNameWithoutExtension(sourceDocx.Name) + "-5-OxPt-No-CSS-Classes.html";
    var oxPtConvertedDestHtml = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, htmlName));

    // "No CSS classes" means fabrication must be disabled; the flag was previously
    // passed as true, which tested the opposite of what the test name promises.
    ConvertToHtml(sourceDocx, oxPtConvertedDestHtml, fabricateCssClasses: false);

    // FileInfo caches file-system state, so refresh before inspecting the result.
    oxPtConvertedDestHtml.Refresh();
    Assert.True(
        oxPtConvertedDestHtml.Exists && oxPtConvertedDestHtml.Length > 0,
        "Expected a non-empty HTML file at " + oxPtConvertedDestHtml.FullName);
}
91+
/// <summary>
/// Writes a processed copy of <paramref name="source"/> to <paramref name="dest"/>:
/// revisions are accepted, the markup is simplified and the formatting assembler is
/// run over the document. All work happens on an in-memory copy of the source bytes.
/// </summary>
/// <param name="source">The .docx file to read.</param>
/// <param name="dest">The file that receives the processed document bytes.</param>
private static void CopyFormattingAssembledDocx(FileInfo source, FileInfo dest)
{
    var originalBytes = File.ReadAllBytes(source.FullName);

    // Default-constructed stream so that it can grow while the package is edited.
    using var buffer = new MemoryStream();
    buffer.Write(originalBytes, 0, originalBytes.Length);

    var markupOptions = new SimplifyMarkupSettings
    {
        RemoveComments = true,
        RemoveContentControls = true,
        RemoveEndAndFootNotes = true,
        RemoveFieldCodes = false,
        RemoveLastRenderedPageBreak = true,
        RemovePermissions = true,
        RemoveProof = true,
        RemoveRsidInfo = true,
        RemoveSmartTags = true,
        RemoveSoftHyphens = true,
        RemoveGoBackBookmark = true,
        ReplaceTabsWithSpaces = false,
    };

    var assemblerOptions = new FormattingAssemblerSettings
    {
        RemoveStyleNamesFromParagraphAndRunProperties = false,
        ClearStyles = false,
        RestrictToSupportedLanguages = false,
        RestrictToSupportedNumberingFormats = false,
        CreateHtmlConverterAnnotationAttributes = true,
        OrderElementsPerStandard = false,
        ListItemRetrieverSettings = new ListItemRetrieverSettings
        {
            ListItemTextImplementations = ListItemRetrieverSettings.DefaultListItemTextImplementations,
        },
    };

    // The package is disposed before the buffer is read back
    // (presumably so pending changes are flushed into the stream - confirm against the SDK).
    using (var document = WordprocessingDocument.Open(buffer, true))
    {
        RevisionAccepter.AcceptRevisions(document);
        MarkupSimplifier.SimplifyMarkup(document, markupOptions);
        FormattingAssembler.AssembleFormatting(document, assemblerOptions);
    }

    File.WriteAllBytes(dest.FullName, buffer.ToArray());
}
138+
/// <summary>
/// Converts a WordprocessingML document to HTML with <see cref="WmlToHtmlConverter"/>
/// and writes the resulting markup to <paramref name="destFileName"/> as UTF-8 text.
/// The source file is only read; the conversion runs on an in-memory copy.
/// </summary>
/// <param name="sourceDocx">The .docx file to convert.</param>
/// <param name="destFileName">The HTML file to write.</param>
/// <param name="fabricateCssClasses">
/// Passed through to <c>WmlToHtmlConverterSettings.FabricateCssClasses</c>; when true the
/// CSS class prefix is "pt-", otherwise no prefix is set.
/// </param>
private static void ConvertToHtml(FileInfo sourceDocx, FileInfo destFileName, bool fabricateCssClasses)
{
    // Copy the file into an expandable stream because the package is opened for editing.
    var byteArray = File.ReadAllBytes(sourceDocx.FullName);
    using var memoryStream = new MemoryStream();
    memoryStream.Write(byteArray, 0, byteArray.Length);
    using var wDoc = WordprocessingDocument.Open(memoryStream, true);

    // A package is not required to contain a core-properties part, so guard against null
    // instead of dereferencing it. Casting a null XElement to string yields null.
    var titleElement = wDoc.CoreFilePropertiesPart?.GetXDocument().Descendants(DC.title).FirstOrDefault();
    var pageTitle = (string)titleElement;

    // Fall back to the source path when the title is missing or empty.
    if (string.IsNullOrEmpty(pageTitle))
    {
        pageTitle = sourceDocx.FullName;
    }

    var settings = new WmlToHtmlConverterSettings()
    {
        PageTitle = pageTitle,
        FabricateCssClasses = fabricateCssClasses,
        CssClassPrefix = fabricateCssClasses ? "pt-" : null,
        RestrictToSupportedLanguages = false,
        RestrictToSupportedNumberingFormats = false
    };

    var html = WmlToHtmlConverter.ConvertToHtml(wDoc, settings);

    // Note: the XHTML returned by the converter contains objects of type XEntity, which is
    // defined in PtOpenXmlUtil.cs. See
    // http://blogs.msdn.com/ericwhite/archive/2010/01/21/writing-entity-references-using-linq-to-xml.aspx
    // for a detailed explanation. If you transform the returned XML tree further, you must
    // do it correctly, or entities will not be serialized properly.

    var htmlString = html.ToString(SaveOptions.DisableFormatting);
    File.WriteAllText(destFileName.FullName, htmlString, Encoding.UTF8);
}
171+ }
172+ }
0 commit comments