Merge branch 'cc-14-wat-decode-html-char-entities', fixes #14

sebastian-nagel · sebastian-nagel · commit c7e79be728f7 · 2019-07-08T16:56:15.000+02:00
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -51,6 +51,8 @@ public class ExtractingParseObserver implements ParseObserver {
 			Pattern.compile(jsOnClickUrl2PatString)
 	};
 
+	protected static Pattern wsPattern = Pattern.compile("\\s+");
+
 	private final static int MAX_TEXT_LEN = 100;
 
 	private final static String[] BLOCK_ELEMENTS = { "address", "article", "aside", "blockquote", "body", "br",
@@ -161,6 +163,7 @@ public void handleTagOpen(TagNode tag) {
 			}
 			attrName = attrName.toLowerCase(Locale.ROOT);
 			if (globalHrefAttributes.contains(attrName)) {
+				attrValue = Translate.decode(attrValue);
 				data.addHref(PATH,makePath(name,attrName),"url",attrValue);
 			}
 		}
@@ -196,7 +199,7 @@ public void handleTagClose(TagNode tag) {
 				if((vals != null) && (vals.size() > 0)) {
 					if(text != null) {
 						// contained an href - we want to ignore <a name="X"></a>:
-						String trimmed = text.toString().trim().replaceAll("\\s+", " ");
+						String trimmed = wsPattern.matcher(Translate.decode(text.toString()).trim()).replaceAll(" ");
 						if(trimmed.length() > MAX_TEXT_LEN) {
 							trimmed = trimmed.substring(0,MAX_TEXT_LEN);
 						}
@@ -240,7 +243,7 @@ public void handleTextNode(TextNode text) {
 			}
 		}
 
-		String t = text.getText().replaceAll("\\s+", " ");
+		String t = wsPattern.matcher(txt).replaceAll(" ");
 
 		if(t.length() > MAX_TEXT_LEN) {
 			t = t.substring(0,MAX_TEXT_LEN);
@@ -271,8 +274,9 @@ public void handleScriptNode(TextNode text) {
 	}
 
 	public void handleStyleNode(TextNode text) {
-		patternCSSExtract(data, cssUrlPattern, text.getText());
-		patternCSSExtract(data, cssImportNoUrlPattern, text.getText());
+		String cssStr = Translate.decode(text.getText()); // decode character entities (e.g. &amp;) once; the result feeds both extractions below
+		patternCSSExtract(data, cssUrlPattern, cssStr);
+		patternCSSExtract(data, cssImportNoUrlPattern, cssStr);
 	}
 
 	public void handleRemarkNode(RemarkNode remark) {
@@ -299,6 +303,7 @@ private static void addBasicHrefs(HTMLMetaData data, TagNode node, String... att
 		for(String attr : attrs) {
 			String val = node.getAttribute(attr);
 			if(val != null) {
+				val = Translate.decode(val);
 				data.addHref(PATH,makePath(node.getTagName(),attr),"url",val);
 			}
 		}
@@ -309,6 +314,7 @@ private static ArrayList<String> getAttrList(TagNode node, String... attrs) {
 		for(String attr : attrs) {
 			String val = node.getAttribute(attr);
 			if(val != null) {
+				val = Translate.decode(val);
 				l.add(attr);
 				l.add(val);
 			}
@@ -324,6 +330,7 @@ private static ArrayList<String> getAttrListUrl(TagNode node,
 		String url = node.getAttribute(urlAttr);
 		ArrayList<String> l = null;
 		if(url != null) {
+			url = Translate.decode(url);
 			l = new ArrayList<String>();
 			l.add(PATH);
 			l.add(makePath(node.getTagName(),urlAttr));
@@ -333,6 +340,7 @@ private static ArrayList<String> getAttrListUrl(TagNode node,
 			for(String attr : optionalAttrs) {
 				String val = node.getAttribute(attr);
 				if(val != null) {
+					val = Translate.decode(val);
 					l.add(attr);
 					l.add(val);
 				}
@@ -356,6 +364,7 @@ private static void addHrefsOnclick(HTMLMetaData data, TagNode node) {
 			for (Pattern pattern : jsOnClickUrlPatterns) {
 				String url = patternJSExtract(pattern, onclick);
 				if (url != null) {
+					// TODO: decode HTML character entities (Translate.decode) in URLs extracted from onclick?
 					data.addHref(PATH, path, "url", url);
 				}
 			}
@@ -395,13 +404,15 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
 			String url = node.getAttribute("href");
 			if(url != null) {
 				// got data:
+				url = Translate.decode(url);
 				l.add(PATH);
 				l.add(makePath("A","href"));
 				l.add("url");
 				l.add(url);
 				for(String a : new String[] {"target","alt","title","rel","hreflang","type"}) {
 					String v = node.getAttribute(a);
 					if(v != null) {
+						v = Translate.decode(v);
 						l.add(a);
 						l.add(v);
 					}
@@ -428,6 +439,7 @@ private static class AreaTagExtractor implements TagExtractor {
 		public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
 			String url = node.getAttribute("href");
 			if(url != null) {
+				url = Translate.decode(url);
 				ArrayList<String> l = new ArrayList<String>();
 				l.add(PATH);
 				l.add(makePath("AREA","href"));
@@ -449,6 +461,7 @@ private static class BaseTagExtractor implements TagExtractor {
 		public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
 			String url = node.getAttribute("href");
 			if(url != null) {
+				url = Translate.decode(url);
 				data.setBaseHref(url);
 			}
 		}
@@ -483,6 +496,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
 			ArrayList<String> l = new ArrayList<String>();
 			String url = node.getAttribute("action");
 			if(url != null) {
+				url = Translate.decode(url);
 				// got data:
 				l.add(PATH);
 				l.add(makePath("FORM","action"));
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -15,6 +15,7 @@
 import org.archive.resource.ResourceParseException;
 import org.archive.resource.ResourceProducer;
 import org.htmlparser.nodes.TextNode;
+import org.htmlparser.util.Translate;
 import org.json.JSONArray;
 import org.json.JSONException;
 import org.json.JSONObject;
@@ -141,16 +142,24 @@ private void checkExtract(String[] data) throws JSONException {
 	}
 	
 	private void checkLink(Multimap<String,String> links, String url, String path) {
-		assertTrue("Link with URL " + url + " not found", links.containsKey(url));
+		assertTrue("Link with URL " + url + " not found in [" + String.join(", ", links.keySet()) + "]",
+				links.containsKey(url)); // failure message lists every key of links to ease debugging
 		assertTrue("Wrong path " + path + " for " + url, links.get(url).contains(path));
 	}
 
+	private void checkAnchor(Multimap<String,String> anchors, String url, String anchor) { // asserts that anchor is among the values stored for url
+		assertTrue("Anchor for URL " + url + " not found in [" + String.join(", ", anchors.keySet()) + "]",
+				anchors.containsKey(url)); // first: url must be present as a key at all
+		assertTrue("Wrong anchor text " + anchor + " for " + url, anchors.get(url).contains(anchor));
+	}
+
 	private void checkLinks(Resource resource, String[][] expectedLinks) {
 		assertNotNull(resource);
 		assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
 		MetaData md = resource.getMetaData();
 		LOG.info(md.toString());
 		Multimap<String, String> links = ArrayListMultimap.create();
+		Multimap<String, String> anchors = ArrayListMultimap.create();
 		JSONObject head = md.optJSONObject("Head");
 		if (head != null) {
 			// <base href="http://www.example.com/" />
@@ -189,9 +198,22 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
 			for (int i = 0; i < ldata.length(); i++) {
 				JSONObject o = (JSONObject) ldata.optJSONObject(i);
 				try {
-					String url = o.getString("url");
+					String url;
+					if (o.has("url")) {
+						url = o.getString("url");
+					} else if (o.has("href")) {
+						url = o.getString("href");
+					} else {
+						fail("No URL found in: " + o);
+						continue;
+					}
 					links.put(url, o.getString("path"));
-					LOG.info(" found link: " + o.getString("url") + " " + o.getString("path"));
+					LOG.info(" found link: " + url + " " + o.getString("path"));
+					if (o.has("text")) {
+						anchors.put(url, o.getString("text"));
+					} else if (o.has("alt")) {
+						anchors.put(url, o.getString("alt"));
+					}
 				} catch (JSONException e) {
 					fail("Failed to extract URL from link: " + e.getMessage());
 				}
@@ -200,6 +222,9 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
 		assertEquals("Unexpected number of links", expectedLinks.length, links.size());
 		for (String[] l : expectedLinks) {
 			checkLink(links, l[0], l[1]);
+			if (l.length > 2 && l[2] != null) {
+				checkAnchor(anchors, l[0], l[2]);
+			}
 		}
 	}
 
@@ -225,8 +250,8 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
 		};
 		checkLinks(extractor.getNext(), html4links);
 		String[][] html5links = {
-				{"http:///www.example.com/video.html", "LINK@/href", "canonical"},
-				{"video.rss", "LINK@/href", "alternate"},
+				{"http:///www.example.com/video.html", "LINK@/href", null, "canonical"},
+				{"video.rss", "LINK@/href", null, "alternate"},
 				{"https://archive.org/download/WebmVp8Vorbis/webmvp8.gif", "VIDEO@/poster"},
 				{"https://archive.org/download/WebmVp8Vorbis/webmvp8.webm", "SOURCE@/src"},
 				{"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"},
@@ -245,7 +270,7 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
 		};
 		checkLinks(extractor.getNext(), fbVideoLinks);
 		String[][] dataHrefLinks = {
-				{"standard.css", "LINK@/href", "stylesheet"},
+				{"standard.css", "LINK@/href", null, "stylesheet"},
 				{"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"},
 				{"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"},
 				{"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
@@ -265,9 +290,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
 				{"jackbox/img/thumbs/4.jpg",  "IMG@/src"},
 				{"//venobox-destination", "A@/data-href"},
 				{"#", "A@/href"},
-				{"http://www.youtube.com/v/itTskyFLSS8&amp;rel=0&amp;autohide=1&amp;showinfo=0&amp;autoplay=1", "DIV@/data-href"},
+				{"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0&autoplay=1", "DIV@/data-href"},
 				{"#", "A@/href"},
-				{"http://www.youtube.com/v/itTskyFLSS8&amp;rel=0&amp;autohide=1&amp;showinfo=0", "IFRAME@/src"}
+				{"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0", "IFRAME@/src"}
 		};
 		checkLinks(extractor.getNext(), dataHrefLinks);
 		String[][] fbSocialLinks = {
@@ -292,6 +317,30 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
 				{"http://example.com/location/href/2.html", "INPUT@/onclick"}
 		};
 		checkLinks(extractor.getNext(), onClickLinks);
+		String[][] escapedEntitiesLinks = {
+				{"http://www.example.com/", "__base__"},
+				{"http://www.example.com/redirected.html", "__meta_refresh__"},
+				{"/view?id=logo&action=edit", "A@/href"},
+				{"http://www.example.com/search?q=examples&n=20", "A@/href", "Examples & more"},
+				{"/view?id=logo&res=420x180", "STYLE/#text"},
+				{"https://img.example.org/view?id=867&res=10x16", "IMG@/src",
+					"image URL containing escaped ampersand (\"&amp;\")" }
+		};
+		Resource resource = extractor.getNext();
+		assertNotNull(resource);
+		checkLinks(resource, escapedEntitiesLinks);
+		MetaData md = resource.getMetaData();
+		assertEquals("Wrong title", "Title – \"Title\" written using character entities",
+				md.getJSONObject(ResourceConstants.HTML_HEAD).getString(ResourceConstants.HTML_TITLE));
+		JSONArray metas = md.getJSONObject(ResourceConstants.HTML_HEAD).getJSONArray(ResourceConstants.HTML_META_TAGS);
+		for (int i = 0; i < metas.length(); i++) {
+			JSONObject o = (JSONObject) metas.optJSONObject(i);
+			String property = o.optString("property");
+			if (property.equals("og:description")) {
+				String content = o.optString("content");
+				assertEquals(content, "Apostrophe's description");
+			}
+		}
 	}
 
 	public void testTextExtraction() throws ResourceParseException, IOException {
@@ -323,4 +372,31 @@ public void testTextExtraction() throws ResourceParseException, IOException {
 		// assertTrue(text.matches("CDATA in MathML:\\W*x<y"));
 	}
 
+	public void testHtmlParserEntityDecoding() { // checks Translate.decode against a table of { entity, expected decoded string } pairs
+		String[][] entities = { //
+				// ampersand
+				{ "&amp;", "&" },
+				// apostrophe
+				// TODO: { "&apos;", "'" },
+				// comma
+				// TODO: { "&comma;", "," },
+				// % percent
+				// TODO: { "&percnt;", "%" },
+				// ’ right single quotation mark
+				{ "&rsquo;", "\u2019" },
+				// » right-pointing double angle quotation mark (entity written without trailing ';' - presumably intentional, TODO confirm)
+				{ "&raquo", "\u00bb" },
+				// … horizontal ellipsis
+				{ "&hellip;", "\u2026" },
+				// 𤆑 CJK UNIFIED IDEOGRAPH-24191
+				// TODO: { "&#x24191;", new String(Character.toChars(0x24191)) },
+				// 😊 U+1F60A SMILING FACE WITH SMILING EYES
+				// TODO: { "&#x1F60A;", new String(Character.toChars(0x1f60a)) },
+		};
+		for (String[] ent : entities) {
+			String decoded = Translate.decode(ent[0]);
+			assertEquals("Entity " + ent[0] + " not properly decoded", ent[1], decoded);
+		}
+	}
+
 }
diff --git a/src/test/resources/org/archive/resource/html/link-extraction-test.warc b/src/test/resources/org/archive/resource/html/link-extraction-test.warc
@@ -126,6 +126,7 @@ This is valid HTML5!
 <P>paragraph one with <A hRef = http://www.example.com/ >link</a>.
 
 
+
 WARC/1.0
 WARC-Type: response
 WARC-Target-URI: http://www.example.com/fb-video.html
@@ -321,6 +322,7 @@ Content-Type: text/html
 WARC/1.0
 WARC-Type: response
 WARC-Date: 2017-08-23T13:54:59Z
+WARC-Target-URI: http://www.example.com/link-extraction-test-onclick-attr.html
 Content-Type: application/http;msgtype=response
 Content-Length: 1279
 
@@ -360,3 +362,44 @@ Content-Type: text/html
 </html>
 
 
+WARC/1.0
+WARC-Type: response
+WARC-Target-URI: http://www.example.com/link-extraction-test-unescaped-entities.xhtml
+WARC-Date: 2019-06-19T13:13:38Z
+WARC-IP-Address: 127.0.0.1
+Content-Type: application/http;msgtype=response
+Content-Length: 1520
+
+HTTP/1.1 200 OK
+Date: Wed, 19 Jun 2019 13:13:38 GMT
+Server: Apache/2.4.29 (Ubuntu)
+Last-Modified: Wed, 19 Jun 2019 13:11:24 GMT
+ETag: "4c6-58bacf761e299"
+Accept-Ranges: bytes
+Content-Length: 1223
+Keep-Alive: timeout=5, max=100
+Connection: Keep-Alive
+Content-Type: application/xhtml+xml
+
+<?xml version="1.0"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xmlns:og="http://ogp.me/ns#">
+<head>
+<meta property="og:description" content="Apostrophe&#39;s description" />
+<meta content="Apostrophe&apos;s description" name="description" /><!-- Note: &apos; is defined in XML 1.0 but is not part of HTML -->
+<meta http-equiv="Refresh" content="5; URL=http:&#x2f;&#x2f;www.example.com&#x2f;redirected.html" />
+<base href="http://www.example.com/" />
+<title>Title &#8211; &quot;&#84;&#x69;&#116;&#x6c;&#101;&quot; written using character entities</title>
+<style type="text/css">.logo{background-image:url("/view?id=logo&amp;res=420x180"); background-color: #cccccc; display:block; height:180px; width:420px;}</style>
+</head>
+<body>
+<p class="logo">Here is the <a href="/view?id=logo&amp;action=edit">logo</a></p>
+<p>
+<a href="http://www.example.com/search?q&#x3D;examples&amp;n&#x3D;20" target="_blank" rel="nofollow">Examples &amp; more</a>
+<img src="https://img.example.org/view?id=867&amp;res=10x16" alt="image URL containing escaped ampersand (&quot;&amp;amp;&quot;)" />
+</p>
+</body>
+</html>
+
+