1515import org .archive .resource .ResourceParseException ;
1616import org .archive .resource .ResourceProducer ;
1717import org .htmlparser .nodes .TextNode ;
18+ import org .htmlparser .util .Translate ;
1819import org .json .JSONArray ;
1920import org .json .JSONException ;
2021import org .json .JSONObject ;
@@ -141,16 +142,24 @@ private void checkExtract(String[] data) throws JSONException {
141142 }
142143
143144 private void checkLink (Multimap <String ,String > links , String url , String path ) {
144- assertTrue ("Link with URL " + url + " not found" , links .containsKey (url ));
145+ assertTrue ("Link with URL " + url + " not found in [" + String .join (", " , links .keySet ()) + "]" ,
146+ links .containsKey (url ));
145147 assertTrue ("Wrong path " + path + " for " + url , links .get (url ).contains (path ));
146148 }
147149
150+ private void checkAnchor (Multimap <String ,String > anchors , String url , String anchor ) {
151+ assertTrue ("Anchor for URL " + url + " not found in [" + String .join (", " , anchors .keySet ()) + "]" ,
152+ anchors .containsKey (url ));
153+ assertTrue ("Wrong anchor text " + anchor + " for " + url , anchors .get (url ).contains (anchor ));
154+ }
155+
148156 private void checkLinks (Resource resource , String [][] expectedLinks ) {
149157 assertNotNull (resource );
150158 assertTrue ("Wrong instance type of Resource: " + resource .getClass (), resource instanceof HTMLResource );
151159 MetaData md = resource .getMetaData ();
152160 LOG .info (md .toString ());
153161 Multimap <String , String > links = ArrayListMultimap .create ();
162+ Multimap <String , String > anchors = ArrayListMultimap .create ();
154163 JSONObject head = md .optJSONObject ("Head" );
155164 if (head != null ) {
156165 // <base href="http://www.example.com/" />
@@ -189,9 +198,22 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
189198 for (int i = 0 ; i < ldata .length (); i ++) {
190199 JSONObject o = (JSONObject ) ldata .optJSONObject (i );
191200 try {
192- String url = o .getString ("url" );
201+ String url ;
202+ if (o .has ("url" )) {
203+ url = o .getString ("url" );
204+ } else if (o .has ("href" )) {
205+ url = o .getString ("href" );
206+ } else {
207+ fail ("No URL found in: " + o );
208+ continue ;
209+ }
193210 links .put (url , o .getString ("path" ));
194- LOG .info (" found link: " + o .getString ("url" ) + " " + o .getString ("path" ));
211+ LOG .info (" found link: " + url + " " + o .getString ("path" ));
212+ if (o .has ("text" )) {
213+ anchors .put (url , o .getString ("text" ));
214+ } else if (o .has ("alt" )) {
215+ anchors .put (url , o .getString ("alt" ));
216+ }
195217 } catch (JSONException e ) {
196218 fail ("Failed to extract URL from link: " + e .getMessage ());
197219 }
@@ -200,6 +222,9 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
200222 assertEquals ("Unexpected number of links" , expectedLinks .length , links .size ());
201223 for (String [] l : expectedLinks ) {
202224 checkLink (links , l [0 ], l [1 ]);
225+ if (l .length > 2 && l [2 ] != null ) {
226+ checkAnchor (anchors , l [0 ], l [2 ]);
227+ }
203228 }
204229 }
205230
@@ -225,8 +250,8 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
225250 };
226251 checkLinks (extractor .getNext (), html4links );
227252 String [][] html5links = {
228- {"http:///www.example.com/video.html" , "LINK@/href" , "canonical" },
229- {"video.rss" , "LINK@/href" , "alternate" },
253+ {"http:///www.example.com/video.html" , "LINK@/href" , null , "canonical" },
254+ {"video.rss" , "LINK@/href" , null , "alternate" },
230255 {"https://archive.org/download/WebmVp8Vorbis/webmvp8.gif" , "VIDEO@/poster" },
231256 {"https://archive.org/download/WebmVp8Vorbis/webmvp8.webm" , "SOURCE@/src" },
232257 {"https://archive.org/download/WebmVp8Vorbis/webmvp8_512kb.mp4" , "SOURCE@/src" },
@@ -245,7 +270,7 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
245270 };
246271 checkLinks (extractor .getNext (), fbVideoLinks );
247272 String [][] dataHrefLinks = {
248- {"standard.css" , "LINK@/href" , "stylesheet" },
273+ {"standard.css" , "LINK@/href" , null , "stylesheet" },
249274 {"https://www.facebook.com/elegantthemes/videos/10153760379211923/" , "DIV@/data-href" },
250275 {"https://www.facebook.com/facebook/videos/10153231379946729/" , "DIV@/data-href" },
251276 {"https://www.facebook.com/facebook/videos/10153231379946729/" , "BLOCKQUOTE@/cite" },
@@ -265,9 +290,9 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
265290 {"jackbox/img/thumbs/4.jpg" , "IMG@/src" },
266291 {"//venobox-destination" , "A@/data-href" },
267292 {"#" , "A@/href" },
268- {"http://www.youtube.com/v/itTskyFLSS8& rel=0& autohide=1& showinfo=0& autoplay=1" , "DIV@/data-href" },
293+ {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0&autoplay=1" , "DIV@/data-href" },
269294 {"#" , "A@/href" },
270- {"http://www.youtube.com/v/itTskyFLSS8& rel=0& autohide=1& showinfo=0" , "IFRAME@/src" }
295+ {"http://www.youtube.com/v/itTskyFLSS8&rel=0&autohide=1&showinfo=0" , "IFRAME@/src" }
271296 };
272297 checkLinks (extractor .getNext (), dataHrefLinks );
273298 String [][] fbSocialLinks = {
@@ -292,6 +317,30 @@ public void testLinkExtraction() throws ResourceParseException, IOException {
292317 {"http://example.com/location/href/2.html" , "INPUT@/onclick" }
293318 };
294319 checkLinks (extractor .getNext (), onClickLinks );
320+ String [][] escapedEntitiesLinks = {
321+ {"http://www.example.com/" , "__base__" },
322+ {"http://www.example.com/redirected.html" , "__meta_refresh__" },
323+ {"/view?id=logo&action=edit" , "A@/href" },
324+ {"http://www.example.com/search?q=examples&n=20" , "A@/href" , "Examples & more" },
325+ {"/view?id=logo&res=420x180" , "STYLE/#text" },
326+ {"https://img.example.org/view?id=867&res=10x16" , "IMG@/src" ,
327+ "image URL containing escaped ampersand (\" &\" )" }
328+ };
329+ Resource resource = extractor .getNext ();
330+ assertNotNull (resource );
331+ checkLinks (resource , escapedEntitiesLinks );
332+ MetaData md = resource .getMetaData ();
333+ assertEquals ("Wrong title" , "Title – \" Title\" written using character entities" ,
334+ md .getJSONObject (ResourceConstants .HTML_HEAD ).getString (ResourceConstants .HTML_TITLE ));
335+ JSONArray metas = md .getJSONObject (ResourceConstants .HTML_HEAD ).getJSONArray (ResourceConstants .HTML_META_TAGS );
336+ for (int i = 0 ; i < metas .length (); i ++) {
337+ JSONObject o = (JSONObject ) metas .optJSONObject (i );
338+ String property = o .optString ("property" );
339+ if (property .equals ("og:description" )) {
340+ String content = o .optString ("content" );
341+ assertEquals (content , "Apostrophe's description" );
342+ }
343+ }
295344 }
296345
297346 public void testTextExtraction () throws ResourceParseException , IOException {
@@ -323,4 +372,31 @@ public void testTextExtraction() throws ResourceParseException, IOException {
323372 // assertTrue(text.matches("CDATA in MathML:\\W*x<y"));
324373 }
325374
375+ public void testHtmlParserEntityDecoding () {
376+ String [][] entities = { //
377+ // ampersand
378+ { "&" , "&" },
379+ // apostrophe
380+ // TODO: { "'", "'" },
381+ // comma
382+ // TODO: { ",", "," },
383+ // % percent
384+ // TODO: { "percnt", "%" },
385+ // ’ right single quotation mark
386+ { "’" , "\u2019 " },
387+ // » right-pointing double angle quotation mark
388+ { "»" , "\u00bb " },
389+ // … horizontal ellipsis
390+ { "…" , "\u2026 " },
391+ // 𤆑 CJK UNIFIED IDEOGRAPH-24191
392+ // TODO: { "𤆑", new String(Character.toChars(0x24191)) },
393+ // 😊 U+1F60A SMILING FACE WITH SMILING EYES
394+ // TODO: { "😊", new String(Character.toChars(0x1f60a)) },
395+ };
396+ for (String [] ent : entities ) {
397+ String decoded = Translate .decode (ent [0 ]);
398+ assertEquals ("Entity " + ent [0 ] + " not properly decoded" , ent [1 ], decoded );
399+ }
400+ }
401+
326402}
0 commit comments