Skip to content

Commit fb514ab

Browse files
Add unit test for HTML entity decoding
- includes character entities not supported by htmlparser.org as TODOs
1 parent 4a1f323 commit fb514ab

File tree

1 file changed

+28
-0
lines changed

1 file changed

+28
-0
lines changed

src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import org.archive.resource.ResourceParseException;
1616
import org.archive.resource.ResourceProducer;
1717
import org.htmlparser.nodes.TextNode;
18+
import org.htmlparser.util.Translate;
1819
import org.json.JSONArray;
1920
import org.json.JSONException;
2021
import org.json.JSONObject;
@@ -371,4 +372,31 @@ public void testTextExtraction() throws ResourceParseException, IOException {
371372
// assertTrue(text.matches("CDATA in MathML:\\W*x<y"));
372373
}
373374

375+
public void testHtmlParserEntityDecoding() {
376+
String[][] entities = { //
377+
// ampersand
378+
{ "&amp;", "&" },
379+
// apostrophe
380+
// TODO: { "&apos;", "'" },
381+
// comma
382+
// TODO: { "&comma;", "," },
383+
// % percent
384+
// TODO: { "percnt", "%" },
385+
// ’ right single quotation mark
386+
{ "&rsquo;", "\u2019" },
387+
// » right-pointing double angle quotation mark
388+
{ "&raquo", "\u00bb" },
389+
// … horizontal ellipsis
390+
{ "&hellip;", "\u2026" },
391+
// 𤆑 CJK UNIFIED IDEOGRAPH-24191
392+
// TODO: { "&#x24191;", new String(Character.toChars(0x24191)) },
393+
// 😊 U+1F60A SMILING FACE WITH SMILING EYES
394+
// TODO: { "&#x1F60A;", new String(Character.toChars(0x1f60a)) },
395+
};
396+
for (String[] ent : entities) {
397+
String decoded = Translate.decode(ent[0]);
398+
assertEquals("Entity " + ent[0] + " not properly decoded", ent[1], decoded);
399+
}
400+
}
401+
374402
}

0 commit comments

Comments
 (0)