Skip to content

Commit c7e79be

Browse files
Merge branch 'cc-14-wat-decode-html-char-entities', fixes #14
2 parents 538ec1d + fb514ab commit c7e79be

File tree

3 files changed

+145
-12
lines changed

3 files changed

+145
-12
lines changed

src/main/java/org/archive/resource/html/ExtractingParseObserver.java

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ public class ExtractingParseObserver implements ParseObserver {
5151
Pattern.compile(jsOnClickUrl2PatString)
5252
};
5353

54+
protected static Pattern wsPattern = Pattern.compile("\\s+");
55+
5456
private final static int MAX_TEXT_LEN = 100;
5557

5658
private final static String[] BLOCK_ELEMENTS = { "address", "article", "aside", "blockquote", "body", "br",
@@ -161,6 +163,7 @@ public void handleTagOpen(TagNode tag) {
161163
}
162164
attrName = attrName.toLowerCase(Locale.ROOT);
163165
if (globalHrefAttributes.contains(attrName)) {
166+
attrValue = Translate.decode(attrValue);
164167
data.addHref(PATH,makePath(name,attrName),"url",attrValue);
165168
}
166169
}
@@ -196,7 +199,7 @@ public void handleTagClose(TagNode tag) {
196199
if((vals != null) && (vals.size() > 0)) {
197200
if(text != null) {
198201
// contained an href - we want to ignore <a name="X"></a>:
199-
String trimmed = text.toString().trim().replaceAll("\\s+", " ");
202+
String trimmed = wsPattern.matcher(Translate.decode(text.toString()).trim()).replaceAll(" ");
200203
if(trimmed.length() > MAX_TEXT_LEN) {
201204
trimmed = trimmed.substring(0,MAX_TEXT_LEN);
202205
}
@@ -240,7 +243,7 @@ public void handleTextNode(TextNode text) {
240243
}
241244
}
242245

243-
String t = text.getText().replaceAll("\\s+", " ");
246+
String t = wsPattern.matcher(txt).replaceAll(" ");
244247

245248
if(t.length() > MAX_TEXT_LEN) {
246249
t = t.substring(0,MAX_TEXT_LEN);
@@ -271,8 +274,9 @@ public void handleScriptNode(TextNode text) {
271274
}
272275

273276
public void handleStyleNode(TextNode text) {
274-
patternCSSExtract(data, cssUrlPattern, text.getText());
275-
patternCSSExtract(data, cssImportNoUrlPattern, text.getText());
277+
String cssStr = Translate.decode(text.getText());
278+
patternCSSExtract(data, cssUrlPattern, cssStr);
279+
patternCSSExtract(data, cssImportNoUrlPattern, cssStr);
276280
}
277281

278282
public void handleRemarkNode(RemarkNode remark) {
@@ -299,6 +303,7 @@ private static void addBasicHrefs(HTMLMetaData data, TagNode node, String... att
299303
for(String attr : attrs) {
300304
String val = node.getAttribute(attr);
301305
if(val != null) {
306+
val = Translate.decode(val);
302307
data.addHref(PATH,makePath(node.getTagName(),attr),"url",val);
303308
}
304309
}
@@ -309,6 +314,7 @@ private static ArrayList<String> getAttrList(TagNode node, String... attrs) {
309314
for(String attr : attrs) {
310315
String val = node.getAttribute(attr);
311316
if(val != null) {
317+
val = Translate.decode(val);
312318
l.add(attr);
313319
l.add(val);
314320
}
@@ -324,6 +330,7 @@ private static ArrayList<String> getAttrListUrl(TagNode node,
324330
String url = node.getAttribute(urlAttr);
325331
ArrayList<String> l = null;
326332
if(url != null) {
333+
url = Translate.decode(url);
327334
l = new ArrayList<String>();
328335
l.add(PATH);
329336
l.add(makePath(node.getTagName(),urlAttr));
@@ -333,6 +340,7 @@ private static ArrayList<String> getAttrListUrl(TagNode node,
333340
for(String attr : optionalAttrs) {
334341
String val = node.getAttribute(attr);
335342
if(val != null) {
343+
val = Translate.decode(val);
336344
l.add(attr);
337345
l.add(val);
338346
}
@@ -356,6 +364,7 @@ private static void addHrefsOnclick(HTMLMetaData data, TagNode node) {
356364
for (Pattern pattern : jsOnClickUrlPatterns) {
357365
String url = patternJSExtract(pattern, onclick);
358366
if (url != null) {
367+
// TODO: translate?
359368
data.addHref(PATH, path, "url", url);
360369
}
361370
}
@@ -395,13 +404,15 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
395404
String url = node.getAttribute("href");
396405
if(url != null) {
397406
// got data:
407+
url = Translate.decode(url);
398408
l.add(PATH);
399409
l.add(makePath("A","href"));
400410
l.add("url");
401411
l.add(url);
402412
for(String a : new String[] {"target","alt","title","rel","hreflang","type"}) {
403413
String v = node.getAttribute(a);
404414
if(v != null) {
415+
v = Translate.decode(v);
405416
l.add(a);
406417
l.add(v);
407418
}
@@ -428,6 +439,7 @@ private static class AreaTagExtractor implements TagExtractor {
428439
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
429440
String url = node.getAttribute("href");
430441
if(url != null) {
442+
url = Translate.decode(url);
431443
ArrayList<String> l = new ArrayList<String>();
432444
l.add(PATH);
433445
l.add(makePath("AREA","href"));
@@ -449,6 +461,7 @@ private static class BaseTagExtractor implements TagExtractor {
449461
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
450462
String url = node.getAttribute("href");
451463
if(url != null) {
464+
url = Translate.decode(url);
452465
data.setBaseHref(url);
453466
}
454467
}
@@ -483,6 +496,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
483496
ArrayList<String> l = new ArrayList<String>();
484497
String url = node.getAttribute("action");
485498
if(url != null) {
499+
url = Translate.decode(url);
486500
// got data:
487501
l.add(PATH);
488502
l.add(makePath("FORM","action"));

src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java

Lines changed: 84 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import org.archive.resource.ResourceParseException;
1616
import org.archive.resource.ResourceProducer;
1717
import org.htmlparser.nodes.TextNode;
18+
import org.htmlparser.util.Translate;
1819
import org.json.JSONArray;
1920
import org.json.JSONException;
2021
import org.json.JSONObject;
@@ -141,16 +142,24 @@ private void checkExtract(String[] data) throws JSONException {
141142
}
142143

143144
private void checkLink(Multimap<String,String> links, String url, String path) {
144-
assertTrue("Link with URL " + url + " not found", links.containsKey(url));
145+
assertTrue("Link with URL " + url + " not found in [" + String.join(", ", links.keySet()) + "]",
146+
links.containsKey(url));
145147
assertTrue("Wrong path " + path + " for " + url, links.get(url).contains(path));
146148
}
147149

150+
private void checkAnchor(Multimap<String,String> anchors, String url, String anchor) {
151+
assertTrue("Anchor for URL " + url + " not found in [" + String.join(", ", anchors.keySet()) + "]",
152+
anchors.containsKey(url));
153+
assertTrue("Wrong anchor text " + anchor + " for " + url, anchors.get(url).contains(anchor));
154+
}
155+
148156
private void checkLinks(Resource resource, String[][] expectedLinks) {
149157
assertNotNull(resource);
150158
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
151159
MetaData md = resource.getMetaData();
152160
LOG.info(md.toString());
153161
Multimap<String, String> links = ArrayListMultimap.create();
162+
Multimap<String, String> anchors = ArrayListMultimap.create();
154163
JSONObject head = md.optJSONObject("Head");
155164
if (head != null) {
156165
// <base href="http://www.example.com/" />
@@ -189,9 +198,22 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
189198
for (int i = 0; i < ldata.length(); i++) {
190199
JSONObject o = (JSONObject) ldata.optJSONObject(i);
191200
try {
192-
String url = o.getString("url");
201+
String url;
202+
if (o.has("url")) {
203+
url = o.getString("url");
204+
} else if (o.has("href")) {
205+
url = o.getString("href");
206+
} else {
207+
fail("No URL found in: " + o);
208+
continue;
209+
}
193210
links.put(url, o.getString("path"));
194-
LOG.info(" found link: " + o.getString("url") + " " + o.getString("path"));
211+
LOG.info(" found link: " + url + " " + o.getString("path"));
212+
if (o.has("text")) {
213+
anchors.put(url, o.getString("text"));
214+
} else if (o.has("alt")) {
215+
anchors.put(url, o.getString("alt"));
216+
}
195217
} catch (JSONException e) {
196218
fail("Failed to extract URL from link: " + e.getMessage());
197219
}
@@ -200,6 +222,9 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
200222
assertEquals("Unexpected number of links", expectedLinks.length, links.size());
201223
for (String[] l : expectedLinks) {
202224
checkLink(links, l[0], l[1]);
225+
if (l.length > 2 && l[2] != null) {
226+
checkAnchor(anchors, l[0], l[2]);
227+
}
203228
}
204229
}
205230

@@ -225,8 +250,8 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
225250
};
226251
checkLinks(extractor.getNext(), html4links);
227252
String[][] html5links = {
228-
{"http:///www.example.com/video.html", "LINK@/href", "canonical"},
229-
{"video.rss", "LINK@/href", "alternate"},
253+
{"http:///www.example.com/video.html", "LINK@/href", null, "canonical"},
254+
{"video.rss", "LINK@/href", null, "alternate"},
230255
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.gif", "VIDEO@/poster"},
231256
{"https://archive.org/download/WebmVp8Vorbis/webmvp8.webm", "SOURCE@/src"},
232257
{"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4", "SOURCE@/src"},
@@ -245,7 +270,7 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
245270
};
246271
checkLinks(extractor.getNext(), fbVideoLinks);
247272
String[][] dataHrefLinks = {
248-
{"standard.css", "LINK@/href", "stylesheet"},
273+
{"standard.css", "LINK@/href", null, "stylesheet"},
249274
{"https://www.facebook.com/elegantthemes/videos/10153760379211923/", "DIV@/data-href"},
250275
{"https://www.facebook.com/facebook/videos/10153231379946729/", "DIV@/data-href"},
251276
{"https://www.facebook.com/facebook/videos/10153231379946729/", "BLOCKQUOTE@/cite"},
@@ -265,9 +290,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
265290
{"jackbox/img/thumbs/4.jpg", "IMG@/src"},
266291
{"//venobox-destination", "A@/data-href"},
267292
{"#", "A@/href"},
268-
{"http://www.youtube.com/v/itTskyFLSS8&amp;rel=0&amp;autohide=1&amp;showinfo=0&amp;autoplay=1", "DIV@/data-href"},
293+
{"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0&autoplay=1", "DIV@/data-href"},
269294
{"#", "A@/href"},
270-
{"http://www.youtube.com/v/itTskyFLSS8&amp;rel=0&amp;autohide=1&amp;showinfo=0", "IFRAME@/src"}
295+
{"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0", "IFRAME@/src"}
271296
};
272297
checkLinks(extractor.getNext(), dataHrefLinks);
273298
String[][] fbSocialLinks = {
@@ -292,6 +317,30 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
292317
{"http://example.com/location/href/2.html", "INPUT@/onclick"}
293318
};
294319
checkLinks(extractor.getNext(), onClickLinks);
320+
String[][] escapedEntitiesLinks = {
321+
{"http://www.example.com/", "__base__"},
322+
{"http://www.example.com/redirected.html", "__meta_refresh__"},
323+
{"/view?id=logo&action=edit", "A@/href"},
324+
{"http://www.example.com/search?q=examples&n=20", "A@/href", "Examples & more"},
325+
{"/view?id=logo&res=420x180", "STYLE/#text"},
326+
{"https://img.example.org/view?id=867&res=10x16", "IMG@/src",
327+
"image URL containing escaped ampersand (\"&amp;\")" }
328+
};
329+
Resource resource = extractor.getNext();
330+
assertNotNull(resource);
331+
checkLinks(resource, escapedEntitiesLinks);
332+
MetaData md = resource.getMetaData();
333+
assertEquals("Wrong title", "Title – \"Title\" written using character entities",
334+
md.getJSONObject(ResourceConstants.HTML_HEAD).getString(ResourceConstants.HTML_TITLE));
335+
JSONArray metas = md.getJSONObject(ResourceConstants.HTML_HEAD).getJSONArray(ResourceConstants.HTML_META_TAGS);
336+
for (int i = 0; i < metas.length(); i++) {
337+
JSONObject o = (JSONObject) metas.optJSONObject(i);
338+
String property = o.optString("property");
339+
if (property.equals("og:description")) {
340+
String content = o.optString("content");
341+
assertEquals(content, "Apostrophe's description");
342+
}
343+
}
295344
}
296345

297346
public void testTextExtraction() throws ResourceParseException, IOException {
@@ -323,4 +372,31 @@ public void testTextExtraction() throws ResourceParseException, IOException {
323372
// assertTrue(text.matches("CDATA in MathML:\\W*x<y"));
324373
}
325374

375+
public void testHtmlParserEntityDecoding() {
376+
String[][] entities = { //
377+
// ampersand
378+
{ "&amp;", "&" },
379+
// apostrophe
380+
// TODO: { "&apos;", "'" },
381+
// comma
382+
// TODO: { "&comma;", "," },
383+
// % percent
384+
// TODO: { "&percnt;", "%" },
385+
// ’ right single quotation mark
386+
{ "&rsquo;", "\u2019" },
387+
// » right-pointing double angle quotation mark
388+
{ "&raquo", "\u00bb" },
389+
// … horizontal ellipsis
390+
{ "&hellip;", "\u2026" },
391+
// 𤆑 CJK UNIFIED IDEOGRAPH-24191
392+
// TODO: { "&#x24191;", new String(Character.toChars(0x24191)) },
393+
// 😊 U+1F60A SMILING FACE WITH SMILING EYES
394+
// TODO: { "&#x1F60A;", new String(Character.toChars(0x1f60a)) },
395+
};
396+
for (String[] ent : entities) {
397+
String decoded = Translate.decode(ent[0]);
398+
assertEquals("Entity " + ent[0] + " not properly decoded", ent[1], decoded);
399+
}
400+
}
401+
326402
}

src/test/resources/org/archive/resource/html/link-extraction-test.warc

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ This is valid HTML5!
126126
<P>paragraph one with <A hRef = http://www.example.com/ >link</a>.
127127

128128

129+
129130
WARC/1.0
130131
WARC-Type: response
131132
WARC-Target-URI: http://www.example.com/fb-video.html
@@ -321,6 +322,7 @@ Content-Type: text/html
321322
WARC/1.0
322323
WARC-Type: response
323324
WARC-Date: 2017-08-23T13:54:59Z
325+
WARC-Target-URI: http://www.example.com/link-extraction-test-onclick-attr.html
324326
Content-Type: application/http;msgtype=response
325327
Content-Length: 1279
326328

@@ -360,3 +362,44 @@ Content-Type: text/html
360362
</html>
361363

362364

365+
WARC/1.0
366+
WARC-Type: response
367+
WARC-Target-URI: http://www.example.com/link-extraction-test-unescaped-entities.xhtml
368+
WARC-Date: 2019-06-19T13:13:38Z
369+
WARC-IP-Address: 127.0.0.1
370+
Content-Type: application/http;msgtype=response
371+
Content-Length: 1520
372+
373+
HTTP/1.1 200 OK
374+
Date: Wed, 19 Jun 2019 13:13:38 GMT
375+
Server: Apache/2.4.29 (Ubuntu)
376+
Last-Modified: Wed, 19 Jun 2019 13:11:24 GMT
377+
ETag: "4c6-58bacf761e299"
378+
Accept-Ranges: bytes
379+
Content-Length: 1223
380+
Keep-Alive: timeout=5, max=100
381+
Connection: Keep-Alive
382+
Content-Type: application/xhtml+xml
383+
384+
<?xml version="1.0"?>
385+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
386+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
387+
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:og="http://ogp.me/ns#">
388+
<head>
389+
<meta property="og:description" content="Apostrophe&#39;s description" />
390+
<meta content="Apostrophe&apos;s description" name="description" /><!-- Note: &apos; is defined in XML 1.0 but is not part of HTML -->
391+
<meta http-equiv="Refresh" content="5; URL=http:&#x2f;&#x2f;www.example.com&#x2f;redirected.html" />
392+
<base href="http://www.example.com/" />
393+
<title>Title &#8211; &quot;&#84;&#x69;&#116;&#x6c;&#101;&quot; written using character entities</title>
394+
<style type="text/css">.logo{background-image:url("/view?id=logo&amp;res=420x180"); background-color: #cccccc; display:block; height:180px; width:420px;}</style>
395+
</head>
396+
<body>
397+
<p class="logo">Here is the <a href="/view?id=logo&amp;action=edit">logo</a></p>
398+
<p>
399+
<a href="http://www.example.com/search?q&#x3D;examples&amp;n&#x3D;20" target="_blank" rel="nofollow">Examples &amp; more</a>
400+
<img src="https://img.example.org/view?id=867&amp;res=10x16" alt="image URL containing escaped ampersand (&quot;&amp;amp;&quot;)" />
401+
</p>
402+
</body>
403+
</html>
404+
405+

0 commit comments

Comments
 (0)