Skip to content

Commit a9dc4c8

Browse files
committed
feat: add conditional check to collect matching classes
1 parent 647cd62 commit a9dc4c8

File tree

1 file changed

+63
-32
lines changed

1 file changed

+63
-32
lines changed

maxun-core/src/browserSide/scraper.js

Lines changed: 63 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -265,41 +265,72 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
265265
const scrapedData = [];
266266

267267
while (scrapedData.length < limit) {
268-
// Get all parent elements matching the listSelector
269-
const parentElements = Array.from(document.querySelectorAll(listSelector));
270-
271-
// Iterate through each parent element
272-
for (const parent of parentElements) {
273-
if (scrapedData.length >= limit) break;
274-
const record = {};
275-
276-
// For each field, select the corresponding element within the parent
277-
for (const [label, { selector, attribute }] of Object.entries(fields)) {
278-
const fieldElement = parent.querySelector(selector);
279-
280-
if (fieldElement) {
281-
if (attribute === 'innerText') {
282-
record[label] = fieldElement.innerText.trim();
283-
} else if (attribute === 'innerHTML') {
284-
record[label] = fieldElement.innerHTML.trim();
285-
} else if (attribute === 'src') {
286-
// Handle relative 'src' URLs
287-
const src = fieldElement.getAttribute('src');
288-
record[label] = src ? new URL(src, window.location.origin).href : null;
289-
} else if (attribute === 'href') {
290-
// Handle relative 'href' URLs
291-
const href = fieldElement.getAttribute('href');
292-
record[label] = href ? new URL(href, window.location.origin).href : null;
293-
} else {
294-
record[label] = fieldElement.getAttribute(attribute);
268+
let parentElements = Array.from(document.querySelectorAll(listSelector));
269+
270+
// If we only got one element or none, try a more generic approach
271+
if (limit > 1 && parentElements.length <= 1) {
272+
const [containerSelector, _] = listSelector.split('>').map(s => s.trim());
273+
const container = document.querySelector(containerSelector);
274+
275+
if (container) {
276+
const allChildren = Array.from(container.children);
277+
278+
const firstMatch = document.querySelector(listSelector);
279+
if (firstMatch) {
280+
// Get classes from the first matching element
281+
const firstMatchClasses = Array.from(firstMatch.classList);
282+
283+
// Find similar elements by matching most of their classes
284+
parentElements = allChildren.filter(element => {
285+
const elementClasses = Array.from(element.classList);
286+
287+
// Element should share at least 70% of classes with the first match
288+
const commonClasses = firstMatchClasses.filter(cls =>
289+
elementClasses.includes(cls));
290+
return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7);
291+
});
292+
}
295293
}
296-
}
297294
}
298-
scrapedData.push(record);
299-
}
295+
296+
// Iterate through each parent element
297+
for (const parent of parentElements) {
298+
if (scrapedData.length >= limit) break;
299+
const record = {};
300+
301+
// For each field, select the corresponding element within the parent
302+
for (const [label, { selector, attribute }] of Object.entries(fields)) {
303+
const fieldElement = parent.querySelector(selector);
304+
305+
if (fieldElement) {
306+
if (attribute === 'innerText') {
307+
record[label] = fieldElement.innerText.trim();
308+
} else if (attribute === 'innerHTML') {
309+
record[label] = fieldElement.innerHTML.trim();
310+
} else if (attribute === 'src') {
311+
// Handle relative 'src' URLs
312+
const src = fieldElement.getAttribute('src');
313+
record[label] = src ? new URL(src, window.location.origin).href : null;
314+
} else if (attribute === 'href') {
315+
// Handle relative 'href' URLs
316+
const href = fieldElement.getAttribute('href');
317+
record[label] = href ? new URL(href, window.location.origin).href : null;
318+
} else {
319+
record[label] = fieldElement.getAttribute(attribute);
320+
}
321+
}
322+
}
323+
scrapedData.push(record);
324+
}
325+
326+
// If we've processed all available elements and still haven't reached the limit,
327+
// break to avoid infinite loop
328+
if (parentElements.length === 0 || scrapedData.length >= parentElements.length) {
329+
break;
330+
}
300331
}
301-
return scrapedData
302-
};
332+
return scrapedData;
333+
};
303334

304335

305336
/**

0 commit comments

Comments
 (0)