|
15 | 15 | import org.archive.resource.ResourceParseException; |
16 | 16 | import org.archive.resource.ResourceProducer; |
17 | 17 | import org.htmlparser.nodes.TextNode; |
| 18 | +import org.htmlparser.util.Translate; |
18 | 19 | import org.json.JSONArray; |
19 | 20 | import org.json.JSONException; |
20 | 21 | import org.json.JSONObject; |
@@ -371,4 +372,31 @@ public void testTextExtraction() throws ResourceParseException, IOException { |
371 | 372 | // assertTrue(text.matches("CDATA in MathML:\\W*x<y")); |
372 | 373 | } |
373 | 374 |
|
| 375 | + public void testHtmlParserEntityDecoding() { |
| 376 | + String[][] entities = { // |
| 377 | + // ampersand |
| 378 | + { "&", "&" }, |
| 379 | + // apostrophe |
| 380 | + // TODO: { "'", "'" }, |
| 381 | + // comma |
| 382 | + // TODO: { ",", "," }, |
| 383 | + // % percent |
| 384 | + // TODO: { "percnt", "%" }, |
| 385 | + // ’ right single quotation mark |
| 386 | + { "’", "\u2019" }, |
| 387 | + // » right-pointing double angle quotation mark |
| 388 | + { "»", "\u00bb" }, |
| 389 | + // … horizontal ellipsis |
| 390 | + { "…", "\u2026" }, |
| 391 | + // 𤆑 CJK UNIFIED IDEOGRAPH-24191 |
| 392 | + // TODO: { "𤆑", new String(Character.toChars(0x24191)) }, |
| 393 | + // 😊 U+1F60A SMILING FACE WITH SMILING EYES |
| 394 | + // TODO: { "😊", new String(Character.toChars(0x1f60a)) }, |
| 395 | + }; |
| 396 | + for (String[] ent : entities) { |
| 397 | + String decoded = Translate.decode(ent[0]); |
| 398 | + assertEquals("Entity " + ent[0] + " not properly decoded", ent[1], decoded); |
| 399 | + } |
| 400 | + } |
| 401 | + |
374 | 402 | } |
0 commit comments