@@ -265,41 +265,72 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
265265 const scrapedData = [ ] ;
266266
267267 while ( scrapedData . length < limit ) {
268- // Get all parent elements matching the listSelector
269- const parentElements = Array . from ( document . querySelectorAll ( listSelector ) ) ;
270-
271- // Iterate through each parent element
272- for ( const parent of parentElements ) {
273- if ( scrapedData . length >= limit ) break ;
274- const record = { } ;
275-
276- // For each field, select the corresponding element within the parent
277- for ( const [ label , { selector, attribute } ] of Object . entries ( fields ) ) {
278- const fieldElement = parent . querySelector ( selector ) ;
279-
280- if ( fieldElement ) {
281- if ( attribute === 'innerText' ) {
282- record [ label ] = fieldElement . innerText . trim ( ) ;
283- } else if ( attribute === 'innerHTML' ) {
284- record [ label ] = fieldElement . innerHTML . trim ( ) ;
285- } else if ( attribute === 'src' ) {
286- // Handle relative 'src' URLs
287- const src = fieldElement . getAttribute ( 'src' ) ;
288- record [ label ] = src ? new URL ( src , window . location . origin ) . href : null ;
289- } else if ( attribute === 'href' ) {
290- // Handle relative 'href' URLs
291- const href = fieldElement . getAttribute ( 'href' ) ;
292- record [ label ] = href ? new URL ( href , window . location . origin ) . href : null ;
293- } else {
294- record [ label ] = fieldElement . getAttribute ( attribute ) ;
268+ let parentElements = Array . from ( document . querySelectorAll ( listSelector ) ) ;
269+
270+ // If we only got one element or none, try a more generic approach
271+ if ( limit > 1 && parentElements . length <= 1 ) {
272+ const [ containerSelector , _ ] = listSelector . split ( '>' ) . map ( s => s . trim ( ) ) ;
273+ const container = document . querySelector ( containerSelector ) ;
274+
275+ if ( container ) {
276+ const allChildren = Array . from ( container . children ) ;
277+
278+ const firstMatch = document . querySelector ( listSelector ) ;
279+ if ( firstMatch ) {
280+ // Get classes from the first matching element
281+ const firstMatchClasses = Array . from ( firstMatch . classList ) ;
282+
283+ // Find similar elements by matching most of their classes
284+ parentElements = allChildren . filter ( element => {
285+ const elementClasses = Array . from ( element . classList ) ;
286+
287+ // Element should share at least 70% of classes with the first match
288+ const commonClasses = firstMatchClasses . filter ( cls =>
289+ elementClasses . includes ( cls ) ) ;
290+ return commonClasses . length >= Math . floor ( firstMatchClasses . length * 0.7 ) ;
291+ } ) ;
292+ }
295293 }
296- }
297294 }
298- scrapedData . push ( record ) ;
299- }
295+
296+ // Iterate through each parent element
297+ for ( const parent of parentElements ) {
298+ if ( scrapedData . length >= limit ) break ;
299+ const record = { } ;
300+
301+ // For each field, select the corresponding element within the parent
302+ for ( const [ label , { selector, attribute } ] of Object . entries ( fields ) ) {
303+ const fieldElement = parent . querySelector ( selector ) ;
304+
305+ if ( fieldElement ) {
306+ if ( attribute === 'innerText' ) {
307+ record [ label ] = fieldElement . innerText . trim ( ) ;
308+ } else if ( attribute === 'innerHTML' ) {
309+ record [ label ] = fieldElement . innerHTML . trim ( ) ;
310+ } else if ( attribute === 'src' ) {
311+ // Handle relative 'src' URLs
312+ const src = fieldElement . getAttribute ( 'src' ) ;
313+ record [ label ] = src ? new URL ( src , window . location . origin ) . href : null ;
314+ } else if ( attribute === 'href' ) {
315+ // Handle relative 'href' URLs
316+ const href = fieldElement . getAttribute ( 'href' ) ;
317+ record [ label ] = href ? new URL ( href , window . location . origin ) . href : null ;
318+ } else {
319+ record [ label ] = fieldElement . getAttribute ( attribute ) ;
320+ }
321+ }
322+ }
323+ scrapedData . push ( record ) ;
324+ }
325+
326+ // If we've processed all available elements and still haven't reached the limit,
327+ // break to avoid infinite loop
328+ if ( parentElements . length === 0 || scrapedData . length >= parentElements . length ) {
329+ break ;
330+ }
300331 }
301- return scrapedData
302- } ;
332+ return scrapedData ;
333+ } ;
303334
304335
305336 /**
0 commit comments