From 4971d4ae6a7ca19651da4decedf452bd0def3951 Mon Sep 17 00:00:00 2001 From: Peter Collins Date: Thu, 15 Jun 2023 21:38:10 -0400 Subject: [PATCH 1/3] [Java.Interop.Tools.JavaSource] Improve `<a>` parsing Parsing for `<a>` elements has been improved to fix all 83 cases where conversion to a `<see>` element would fail. Some examples of such failures include: ## Unable to parse HTML element: here System.Xml.XmlException: 'https' is an unexpected token. The expected token is '"' or '''. Line 1, position 11. ## Unable to parse HTML element: Progress & activity System.Xml.XmlException: An error occurred while parsing EntityName. Line 2, position 11. ## Unable to parse HTML element: broken System.Xml.XmlException: '#' is an unexpected token. The expected token is '"' or '''. Line 1, position 11. ## Unable to parse HTML element: RFC 2045 System.Xml.XmlException: 'http' is an unexpected token. The expected token is '"' or '''. Line 1, position 11. When we encounter an `<a>` element that points to code or a local path we will now only include the element value in the javadoc, and not the full `href` attribute value. 
--- ...urceJavadocToXmldocGrammar.HtmlBnfTerms.cs | 79 +++++++++++-------- ...avadocToXmldocGrammar.HtmlBnfTermsTests.cs | 6 +- 2 files changed, 49 insertions(+), 36 deletions(-) diff --git a/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.HtmlBnfTerms.cs b/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.HtmlBnfTerms.cs index f17b63600..c07785a00 100644 --- a/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.HtmlBnfTerms.cs +++ b/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.HtmlBnfTerms.cs @@ -25,7 +25,6 @@ internal void CreateRules (SourceJavadocToXmldocGrammar grammar) AllHtmlTerms.Rule = TopLevelInlineDeclaration | PBlockDeclaration | PreBlockDeclaration - | IgnorableElementDeclaration ; var inlineDeclaration = new NonTerminal ("", ConcatChildNodes) { @@ -100,43 +99,39 @@ internal void CreateRules (SourceJavadocToXmldocGrammar grammar) parseNode.AstNode = p; }; - InlineHyperLinkDeclaration.Rule = InlineHyperLinkOpenTerm + InlineDeclarations + CreateEndElement ("a", grammar, optional: true); + InlineHyperLinkDeclaration.Rule = HtmlAElementStart + InlineDeclarations + CreateEndElement ("a", grammar, optional: true); InlineHyperLinkDeclaration.AstConfig.NodeCreator = (context, parseNode) => { - var unparsedAElementValue = string.Empty; - foreach (var cn in parseNode.ChildNodes) { - if (cn.ChildNodes?.Count > 1) { - foreach (var gcn in cn.ChildNodes) { - unparsedAElementValue += gcn.AstNode?.ToString (); - } - } else { - unparsedAElementValue += cn.AstNode?.ToString (); - } - } + var nodesAsString = GetChildNodesAsString (parseNode); + var tokenValue = parseNode.ChildNodes [0].Token.Text; + int stopIndex = nodesAsString.IndexOf ('>'); - var seeElement = TryParseHRef (unparsedAElementValue); - if (seeElement == null) - seeElement = TryParseHRef (WebUtility.HtmlDecode 
(unparsedAElementValue), logError: true); + if (stopIndex == -1 || !tokenValue.Contains ("href", StringComparison.OrdinalIgnoreCase)) { + parseNode.AstNode = new XText (nodesAsString); + return; + } - var hrefValue = seeElement?.Attribute ("href")?.Value ?? string.Empty; - if (!string.IsNullOrEmpty (hrefValue) && - (hrefValue.StartsWith ("http", StringComparison.OrdinalIgnoreCase) || hrefValue.StartsWith ("www", StringComparison.OrdinalIgnoreCase))) { - parseNode.AstNode = seeElement; + var attributeName = parseNode.ChildNodes [0].Term.Name; + var attributeValue = nodesAsString.Substring (0, stopIndex).Trim ().Trim ('\'', '"'); + var elementValue = nodesAsString.Substring (stopIndex + 1); + if (!string.IsNullOrEmpty (attributeValue) && + (attributeValue.StartsWith ("http", StringComparison.OrdinalIgnoreCase) || attributeValue.StartsWith ("www", StringComparison.OrdinalIgnoreCase))) { + var unparsed = $"{elementValue}"; + XNode? seeElement = TryParseElement (unparsed); + if (seeElement == null) { + // Try to parse with HTML entities decoded + seeElement = TryParseElement (WebUtility.HtmlDecode (unparsed)); + if (seeElement == null) { + // Finally, try to parse with only the element value encoded + seeElement = TryParseElement ($"{WebUtility.HtmlEncode (elementValue)}", logError: true); + } + } + parseNode.AstNode = seeElement ?? new XText (nodesAsString); } else { // TODO: Need to convert relative paths or code references to appropriate CREF value. 
- parseNode.AstNode = new XText (unparsedAElementValue); + parseNode.AstNode = new XText (elementValue); } }; - // Start to trim out unusable HTML elements/tags, but not any inner values - IgnorableElementDeclaration.Rule = - CreateStartElementIgnoreAttribute ("a", "name") + InlineDeclarations + CreateEndElement ("a", grammar, optional: true) - | CreateStartElementIgnoreAttribute ("a", "id") + InlineDeclarations + CreateEndElement ("a", grammar, optional: true) - ; - IgnorableElementDeclaration.AstConfig.NodeCreator = (context, parseNode) => { - var aElementValue = new XText (parseNode.ChildNodes [1].AstNode.ToString () ?? string.Empty); - parseNode.AstNode = aElementValue; - }; - CodeElementDeclaration.Rule = CreateStartElement ("code", grammar) + InlineDeclarations + CreateEndElement ("code", grammar); CodeElementDeclaration.AstConfig.NodeCreator = (context, parseNode) => { var target = parseNode.ChildNodes [1].AstNode; @@ -184,13 +179,28 @@ static IEnumerable GetParagraphs (ParseTreeNodeList children) } } - static XElement? TryParseHRef (string unparsedAElementValue, bool logError = false) + static string GetChildNodesAsString (ParseTreeNode parseNode) + { + var unparsed = string.Empty; + foreach (var cn in parseNode.ChildNodes) { + if (cn.ChildNodes?.Count > 1) { + foreach (var gcn in cn.ChildNodes) { + unparsed += gcn.AstNode?.ToString (); + } + } else { + unparsed += cn.AstNode?.ToString (); + } + } + return unparsed; + } + + static XElement? 
TryParseElement (string unparsed, bool logError = false) { try { - return XElement.Parse ($""); + return XElement.Parse (unparsed); } catch (Exception x) { if (logError) - Console.Error.WriteLine ($"## Unable to parse HTML element: \n{x.GetType ()}: {x.Message}"); + Console.Error.WriteLine ($"## Unable to parse HTML element: `{unparsed}`\n{x.GetType ()}: {x.Message}"); return null; } } @@ -221,10 +231,9 @@ static IEnumerable GetParagraphs (ParseTreeNodeList children) public readonly NonTerminal PBlockDeclaration = new NonTerminal (nameof (PBlockDeclaration), ConcatChildNodes); public readonly NonTerminal PreBlockDeclaration = new NonTerminal (nameof (PreBlockDeclaration), ConcatChildNodes); public readonly NonTerminal InlineHyperLinkDeclaration = new NonTerminal (nameof (InlineHyperLinkDeclaration), ConcatChildNodes); - public readonly NonTerminal IgnorableElementDeclaration = new NonTerminal (nameof (IgnorableElementDeclaration), ConcatChildNodes); public readonly NonTerminal CodeElementDeclaration = new NonTerminal (nameof (CodeElementDeclaration), ConcatChildNodes); - public readonly Terminal InlineHyperLinkOpenTerm = new RegexBasedTerminal ("field classification"); Assert.IsFalse (r.HasErrors (), DumpMessages (r, p)); - Assert.AreEqual ("\"AutofillService.html#FieldClassification\">field classification", + Assert.AreEqual ("field classification", r.Root.AstNode.ToString ()); + + r = p.Parse ("\nProgress & activity"); + Assert.IsFalse (r.HasErrors (), DumpMessages (r, p)); + Assert.AreEqual ($"{Environment.NewLine}Progress & activity", r.Root.AstNode.ToString ()); } From 76aec0dc26306b811cf1c5256b67081ccd47b734 Mon Sep 17 00:00:00 2001 From: Peter Collins Date: Fri, 16 Jun 2023 15:24:15 -0400 Subject: [PATCH 2/3] Apply feedback --- .../SourceJavadocToXmldocGrammar.HtmlBnfTerms.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.HtmlBnfTerms.cs 
b/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.HtmlBnfTerms.cs index c07785a00..af113eb5f 100644 --- a/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.HtmlBnfTerms.cs +++ b/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.HtmlBnfTerms.cs @@ -99,7 +99,7 @@ internal void CreateRules (SourceJavadocToXmldocGrammar grammar) parseNode.AstNode = p; }; - InlineHyperLinkDeclaration.Rule = HtmlAElementStart + InlineDeclarations + CreateEndElement ("a", grammar, optional: true); + InlineHyperLinkDeclaration.Rule = InlineHyperLinkOpenTerm + InlineDeclarations + CreateEndElement ("a", grammar, optional: true); InlineHyperLinkDeclaration.AstConfig.NodeCreator = (context, parseNode) => { var nodesAsString = GetChildNodesAsString (parseNode); var tokenValue = parseNode.ChildNodes [0].Token.Text; @@ -233,7 +233,7 @@ static string GetChildNodesAsString (ParseTreeNode parseNode) public readonly NonTerminal InlineHyperLinkDeclaration = new NonTerminal (nameof (InlineHyperLinkDeclaration), ConcatChildNodes); public readonly NonTerminal CodeElementDeclaration = new NonTerminal (nameof (CodeElementDeclaration), ConcatChildNodes); - public readonly Terminal HtmlAElementStart = new RegexBasedTerminal (" Date: Fri, 16 Jun 2023 17:49:20 -0400 Subject: [PATCH 3/3] Update tests and IgnorableCharTerminal --- ...urceJavadocToXmldocGrammar.HtmlBnfTerms.cs | 3 +- ...vadocToXmldocGrammar.InlineTagsBnfTerms.cs | 42 ++++++++++++++----- ...avadocToXmldocGrammar.HtmlBnfTermsTests.cs | 16 +++++++ 3 files changed, 48 insertions(+), 13 deletions(-) diff --git a/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.HtmlBnfTerms.cs b/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.HtmlBnfTerms.cs index af113eb5f..475da73b5 100644 --- 
a/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.HtmlBnfTerms.cs +++ b/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.HtmlBnfTerms.cs @@ -113,8 +113,7 @@ internal void CreateRules (SourceJavadocToXmldocGrammar grammar) var attributeName = parseNode.ChildNodes [0].Term.Name; var attributeValue = nodesAsString.Substring (0, stopIndex).Trim ().Trim ('\'', '"'); var elementValue = nodesAsString.Substring (stopIndex + 1); - if (!string.IsNullOrEmpty (attributeValue) && - (attributeValue.StartsWith ("http", StringComparison.OrdinalIgnoreCase) || attributeValue.StartsWith ("www", StringComparison.OrdinalIgnoreCase))) { + if (!string.IsNullOrEmpty (attributeValue) && attributeValue.StartsWith ("http", StringComparison.OrdinalIgnoreCase)) { var unparsed = $"{elementValue}"; XNode? seeElement = TryParseElement (unparsed); if (seeElement == null) { diff --git a/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.InlineTagsBnfTerms.cs b/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.InlineTagsBnfTerms.cs index f78fab170..2592762f0 100644 --- a/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.InlineTagsBnfTerms.cs +++ b/src/Java.Interop.Tools.JavaSource/Java.Interop.Tools.JavaSource/SourceJavadocToXmldocGrammar.InlineTagsBnfTerms.cs @@ -109,15 +109,6 @@ internal void CreateRules (SourceJavadocToXmldocGrammar grammar) } }; - // Inline content may contain reserved characters with no tags or special parsing rules, do not throw when encountering them - IgnorableDeclaration.Rule = grammar.ToTerm ("@ ") - | grammar.ToTerm ("{") - | grammar.ToTerm ("}") - ; - IgnorableDeclaration.AstConfig.NodeCreator = (context, parseNode) => { - parseNode.AstNode = new XText (parseNode.ChildNodes [0].Term.Name.Trim ()); - }; - InlineParamDeclaration.Rule = grammar.ToTerm 
("{@param") + InlineValue + "}"; InlineParamDeclaration.AstConfig.NodeCreator = (context, parseNode) => { var target = parseNode.ChildNodes [1].AstNode; @@ -156,9 +147,38 @@ internal void CreateRules (SourceJavadocToXmldocGrammar grammar) // https://docs.oracle.com/javase/7/docs/technotes/tools/windows/javadoc.html#value public readonly NonTerminal ValueDeclaration = new NonTerminal (nameof (ValueDeclaration)); - public readonly NonTerminal IgnorableDeclaration = new NonTerminal (nameof (IgnorableDeclaration)); - public readonly NonTerminal InlineParamDeclaration = new NonTerminal (nameof (InlineParamDeclaration)); + + public readonly Terminal IgnorableDeclaration = new IgnorableCharTerminal (nameof (IgnorableDeclaration)) { + AstConfig = new AstNodeConfig { + NodeCreator = (context, parseNode) => parseNode.AstNode = parseNode.Token.Value.ToString (), + }, + }; + } } + + class IgnorableCharTerminal : Terminal + { + public IgnorableCharTerminal (string name) + : base (name) + { + Priority = TerminalPriority.Low - 1; + } + + public override Token? 
TryMatch (ParsingContext context, ISourceStream source) + { + var startChar = source.Text [source.Location.Position]; + if (startChar != '@' + && startChar != '{' + && startChar != '}' + ) { + return null; + } + source.PreviewPosition += 1; + return source.CreateToken (OutputTerminal, startChar); + } + + } + } diff --git a/tests/Java.Interop.Tools.JavaSource-Tests/SourceJavadocToXmldocGrammar.HtmlBnfTermsTests.cs b/tests/Java.Interop.Tools.JavaSource-Tests/SourceJavadocToXmldocGrammar.HtmlBnfTermsTests.cs index 85810cc38..e699d9c1a 100644 --- a/tests/Java.Interop.Tools.JavaSource-Tests/SourceJavadocToXmldocGrammar.HtmlBnfTermsTests.cs +++ b/tests/Java.Interop.Tools.JavaSource-Tests/SourceJavadocToXmldocGrammar.HtmlBnfTermsTests.cs @@ -71,6 +71,22 @@ public void HyperLinkDeclaration () Assert.IsFalse (r.HasErrors (), DumpMessages (r, p)); Assert.AreEqual ("field classification", r.Root.AstNode.ToString ()); + r = p.Parse ("here"); + Assert.IsFalse (r.HasErrors (), DumpMessages (r, p)); + Assert.AreEqual ("here", r.Root.AstNode.ToString ()); + + r = p.Parse ("libphonenumber"); + Assert.IsFalse (r.HasErrors (), DumpMessages (r, p)); + Assert.AreEqual ("libphonenumber", r.Root.AstNode.ToString ()); + + r = p.Parse (" broken"); + Assert.IsFalse (r.HasErrors (), DumpMessages (r, p)); + Assert.AreEqual (" broken", r.Root.AstNode.ToString ()); + + r = p.Parse ("nobody"); + Assert.IsFalse (r.HasErrors (), DumpMessages (r, p)); + Assert.AreEqual ("nobody", r.Root.AstNode.ToString ()); + r = p.Parse ("\nProgress & activity"); Assert.IsFalse (r.HasErrors (), DumpMessages (r, p)); Assert.AreEqual ($"{Environment.NewLine}Progress & activity",