Skip to content

Commit c25975b

Browse files
authored
Merge pull request #266 from getmaxun/sel-fix
feat: selection revamp
2 parents fa2d609 + a9dc4c8 commit c25975b

File tree

3 files changed

+136
-91
lines changed

3 files changed

+136
-91
lines changed

maxun-core/src/browserSide/scraper.js

Lines changed: 63 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -265,41 +265,72 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
265265
const scrapedData = [];
266266

267267
while (scrapedData.length < limit) {
268-
// Get all parent elements matching the listSelector
269-
const parentElements = Array.from(document.querySelectorAll(listSelector));
270-
271-
// Iterate through each parent element
272-
for (const parent of parentElements) {
273-
if (scrapedData.length >= limit) break;
274-
const record = {};
275-
276-
// For each field, select the corresponding element within the parent
277-
for (const [label, { selector, attribute }] of Object.entries(fields)) {
278-
const fieldElement = parent.querySelector(selector);
279-
280-
if (fieldElement) {
281-
if (attribute === 'innerText') {
282-
record[label] = fieldElement.innerText.trim();
283-
} else if (attribute === 'innerHTML') {
284-
record[label] = fieldElement.innerHTML.trim();
285-
} else if (attribute === 'src') {
286-
// Handle relative 'src' URLs
287-
const src = fieldElement.getAttribute('src');
288-
record[label] = src ? new URL(src, window.location.origin).href : null;
289-
} else if (attribute === 'href') {
290-
// Handle relative 'href' URLs
291-
const href = fieldElement.getAttribute('href');
292-
record[label] = href ? new URL(href, window.location.origin).href : null;
293-
} else {
294-
record[label] = fieldElement.getAttribute(attribute);
268+
let parentElements = Array.from(document.querySelectorAll(listSelector));
269+
270+
// If we only got one element or none, try a more generic approach
271+
if (limit > 1 && parentElements.length <= 1) {
272+
const [containerSelector, _] = listSelector.split('>').map(s => s.trim());
273+
const container = document.querySelector(containerSelector);
274+
275+
if (container) {
276+
const allChildren = Array.from(container.children);
277+
278+
const firstMatch = document.querySelector(listSelector);
279+
if (firstMatch) {
280+
// Get classes from the first matching element
281+
const firstMatchClasses = Array.from(firstMatch.classList);
282+
283+
// Find similar elements by matching most of their classes
284+
parentElements = allChildren.filter(element => {
285+
const elementClasses = Array.from(element.classList);
286+
287+
// Element should share at least 70% of classes with the first match
288+
const commonClasses = firstMatchClasses.filter(cls =>
289+
elementClasses.includes(cls));
290+
return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7);
291+
});
292+
}
295293
}
296-
}
297294
}
298-
scrapedData.push(record);
299-
}
295+
296+
// Iterate through each parent element
297+
for (const parent of parentElements) {
298+
if (scrapedData.length >= limit) break;
299+
const record = {};
300+
301+
// For each field, select the corresponding element within the parent
302+
for (const [label, { selector, attribute }] of Object.entries(fields)) {
303+
const fieldElement = parent.querySelector(selector);
304+
305+
if (fieldElement) {
306+
if (attribute === 'innerText') {
307+
record[label] = fieldElement.innerText.trim();
308+
} else if (attribute === 'innerHTML') {
309+
record[label] = fieldElement.innerHTML.trim();
310+
} else if (attribute === 'src') {
311+
// Handle relative 'src' URLs
312+
const src = fieldElement.getAttribute('src');
313+
record[label] = src ? new URL(src, window.location.origin).href : null;
314+
} else if (attribute === 'href') {
315+
// Handle relative 'href' URLs
316+
const href = fieldElement.getAttribute('href');
317+
record[label] = href ? new URL(href, window.location.origin).href : null;
318+
} else {
319+
record[label] = fieldElement.getAttribute(attribute);
320+
}
321+
}
322+
}
323+
scrapedData.push(record);
324+
}
325+
326+
// If we've processed all available elements and still haven't reached the limit,
327+
// break to avoid infinite loop
328+
if (parentElements.length === 0 || scrapedData.length >= parentElements.length) {
329+
break;
330+
}
300331
}
301-
return scrapedData
302-
};
332+
return scrapedData;
333+
};
303334

304335

305336
/**

server/src/workflow-management/classes/Generator.ts

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -544,9 +544,9 @@ export class WorkflowGenerator {
544544
* @returns {Promise<string|null>}
545545
*/
546546
private generateSelector = async (page: Page, coordinates: Coordinates, action: ActionType) => {
547-
const elementInfo = await getElementInformation(page, coordinates, this.listSelector);
547+
const elementInfo = await getElementInformation(page, coordinates, this.listSelector, this.getList);
548548
const selectorBasedOnCustomAction = (this.getList === true)
549-
? await getNonUniqueSelectors(page, coordinates)
549+
? await getNonUniqueSelectors(page, coordinates, this.listSelector)
550550
: await getSelectors(page, coordinates);
551551

552552
const bestSelector = getBestSelectorForAction(
@@ -572,9 +572,9 @@ export class WorkflowGenerator {
572572
* @returns {Promise<void>}
573573
*/
574574
public generateDataForHighlighter = async (page: Page, coordinates: Coordinates) => {
575-
const rect = await getRect(page, coordinates, this.listSelector);
575+
const rect = await getRect(page, coordinates, this.listSelector, this.getList);
576576
const displaySelector = await this.generateSelector(page, coordinates, ActionType.Click);
577-
const elementInfo = await getElementInformation(page, coordinates, this.listSelector);
577+
const elementInfo = await getElementInformation(page, coordinates, this.listSelector, this.getList);
578578
if (rect) {
579579
if (this.getList === true) {
580580
if (this.listSelector !== '') {

server/src/workflow-management/selector.ts

Lines changed: 69 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -17,9 +17,10 @@ export const getElementInformation = async (
1717
page: Page,
1818
coordinates: Coordinates,
1919
listSelector: string,
20+
getList: boolean
2021
) => {
2122
try {
22-
if (listSelector !== '') {
23+
if (!getList || listSelector !== '') {
2324
const elementInfo = await page.evaluate(
2425
async ({ x, y }) => {
2526
const el = document.elementFromPoint(x, y) as HTMLElement;
@@ -74,22 +75,10 @@ export const getElementInformation = async (
7475
if (originalEl) {
7576
let element = originalEl;
7677

77-
const containerTags = ['DIV', 'SECTION', 'ARTICLE', 'MAIN', 'HEADER', 'FOOTER', 'NAV', 'ASIDE',
78-
'ADDRESS', 'BLOCKQUOTE', 'DETAILS', 'DIALOG', 'FIGURE', 'FIGCAPTION', 'MAIN', 'MARK', 'SUMMARY', 'TIME',
79-
'TABLE', 'THEAD', 'TBODY', 'TFOOT', 'TR', 'TH', 'TD', 'CAPTION', 'COLGROUP', 'COL', 'FORM', 'FIELDSET',
80-
'LEGEND', 'LABEL', 'INPUT', 'BUTTON', 'SELECT', 'DATALIST', 'OPTGROUP', 'OPTION', 'TEXTAREA', 'OUTPUT',
81-
'PROGRESS', 'METER', 'DETAILS', 'SUMMARY', 'MENU', 'MENUITEM', 'MENUITEM', 'APPLET', 'EMBED', 'OBJECT',
82-
'PARAM', 'VIDEO', 'AUDIO', 'SOURCE', 'TRACK', 'CANVAS', 'MAP', 'AREA', 'SVG', 'IFRAME', 'FRAME', 'FRAMESET',
83-
'LI', 'UL', 'OL', 'DL', 'DT', 'DD', 'HR', 'P', 'PRE', 'LISTING', 'PLAINTEXT', 'A'
84-
];
8578
while (element.parentElement) {
8679
const parentRect = element.parentElement.getBoundingClientRect();
8780
const childRect = element.getBoundingClientRect();
8881

89-
if (!containerTags.includes(element.parentElement.tagName)) {
90-
break;
91-
}
92-
9382
const fullyContained =
9483
parentRect.left <= childRect.left &&
9584
parentRect.right >= childRect.right &&
@@ -167,9 +156,9 @@ export const getElementInformation = async (
167156
* @category WorkflowManagement-Selectors
168157
* @returns {Promise<Rectangle|undefined|null>}
169158
*/
170-
export const getRect = async (page: Page, coordinates: Coordinates, listSelector: string) => {
159+
export const getRect = async (page: Page, coordinates: Coordinates, listSelector: string, getList: boolean) => {
171160
try {
172-
if (listSelector !== '') {
161+
if (!getList || listSelector !== '') {
173162
const rect = await page.evaluate(
174163
async ({ x, y }) => {
175164
const el = document.elementFromPoint(x, y) as HTMLElement;
@@ -202,22 +191,10 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector
202191
if (originalEl) {
203192
let element = originalEl;
204193

205-
const containerTags = ['DIV', 'SECTION', 'ARTICLE', 'MAIN', 'HEADER', 'FOOTER', 'NAV', 'ASIDE',
206-
'ADDRESS', 'BLOCKQUOTE', 'DETAILS', 'DIALOG', 'FIGURE', 'FIGCAPTION', 'MAIN', 'MARK', 'SUMMARY', 'TIME',
207-
'TABLE', 'THEAD', 'TBODY', 'TFOOT', 'TR', 'TH', 'TD', 'CAPTION', 'COLGROUP', 'COL', 'FORM', 'FIELDSET',
208-
'LEGEND', 'LABEL', 'INPUT', 'BUTTON', 'SELECT', 'DATALIST', 'OPTGROUP', 'OPTION', 'TEXTAREA', 'OUTPUT',
209-
'PROGRESS', 'METER', 'DETAILS', 'SUMMARY', 'MENU', 'MENUITEM', 'MENUITEM', 'APPLET', 'EMBED', 'OBJECT',
210-
'PARAM', 'VIDEO', 'AUDIO', 'SOURCE', 'TRACK', 'CANVAS', 'MAP', 'AREA', 'SVG', 'IFRAME', 'FRAME', 'FRAMESET',
211-
'LI', 'UL', 'OL', 'DL', 'DT', 'DD', 'HR', 'P', 'PRE', 'LISTING', 'PLAINTEXT', 'A'
212-
];
213194
while (element.parentElement) {
214195
const parentRect = element.parentElement.getBoundingClientRect();
215196
const childRect = element.getBoundingClientRect();
216197

217-
if (!containerTags.includes(element.parentElement.tagName)) {
218-
break;
219-
}
220-
221198
const fullyContained =
222199
parentRect.left <= childRect.left &&
223200
parentRect.right >= childRect.right &&
@@ -875,8 +852,10 @@ interface SelectorResult {
875852
* @returns {Promise<Selectors|null|undefined>}
876853
*/
877854

878-
export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates): Promise<SelectorResult> => {
855+
export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates, listSelector: string): Promise<SelectorResult> => {
879856
try {
857+
if (!listSelector) {
858+
console.log(`NON UNIQUE: MODE 1`)
880859
const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => {
881860
function getNonUniqueSelector(element: HTMLElement): string {
882861
let selector = element.tagName.toLowerCase();
@@ -914,47 +893,82 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
914893

915894
let element = originalEl;
916895

917-
const containerTags = ['DIV', 'SECTION', 'ARTICLE', 'MAIN', 'HEADER', 'FOOTER', 'NAV', 'ASIDE',
918-
'ADDRESS', 'BLOCKQUOTE', 'DETAILS', 'DIALOG', 'FIGURE', 'FIGCAPTION', 'MAIN', 'MARK', 'SUMMARY', 'TIME',
919-
'TABLE', 'THEAD', 'TBODY', 'TFOOT', 'TR', 'TH', 'TD', 'CAPTION', 'COLGROUP', 'COL', 'FORM', 'FIELDSET',
920-
'LEGEND', 'LABEL', 'INPUT', 'BUTTON', 'SELECT', 'DATALIST', 'OPTGROUP', 'OPTION', 'TEXTAREA', 'OUTPUT',
921-
'PROGRESS', 'METER', 'DETAILS', 'SUMMARY', 'MENU', 'MENUITEM', 'MENUITEM', 'APPLET', 'EMBED', 'OBJECT',
922-
'PARAM', 'VIDEO', 'AUDIO', 'SOURCE', 'TRACK', 'CANVAS', 'MAP', 'AREA', 'SVG', 'IFRAME', 'FRAME', 'FRAMESET',
923-
'LI', 'UL', 'OL', 'DL', 'DT', 'DD', 'HR', 'P', 'PRE', 'LISTING', 'PLAINTEXT', 'A'
924-
];
925-
926-
while (element.parentElement) {
927-
const parentRect = element.parentElement.getBoundingClientRect();
928-
const childRect = element.getBoundingClientRect();
929-
930-
if (!containerTags.includes(element.parentElement.tagName)) {
931-
break;
896+
// if (listSelector === '') {
897+
while (element.parentElement) {
898+
const parentRect = element.parentElement.getBoundingClientRect();
899+
const childRect = element.getBoundingClientRect();
900+
901+
const fullyContained =
902+
parentRect.left <= childRect.left &&
903+
parentRect.right >= childRect.right &&
904+
parentRect.top <= childRect.top &&
905+
parentRect.bottom >= childRect.bottom;
906+
907+
const significantOverlap =
908+
(childRect.width * childRect.height) /
909+
(parentRect.width * parentRect.height) > 0.5;
910+
911+
if (fullyContained && significantOverlap) {
912+
element = element.parentElement;
913+
} else {
914+
break;
915+
}
916+
}
917+
// }
918+
919+
const generalSelector = getSelectorPath(element);
920+
return {
921+
generalSelector,
922+
};
923+
}, coordinates);
924+
return selectors || { generalSelector: '' };
925+
} else {
926+
console.log(`NON UNIQUE: MODE 2`)
927+
const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => {
928+
function getNonUniqueSelector(element: HTMLElement): string {
929+
let selector = element.tagName.toLowerCase();
930+
931+
if (element.className) {
932+
const classes = element.className.split(/\s+/).filter((cls: string) => Boolean(cls));
933+
if (classes.length > 0) {
934+
const validClasses = classes.filter((cls: string) => !cls.startsWith('!') && !cls.includes(':'));
935+
if (validClasses.length > 0) {
936+
selector += '.' + validClasses.map(cls => CSS.escape(cls)).join('.');
937+
}
938+
}
932939
}
933940

934-
const fullyContained =
935-
parentRect.left <= childRect.left &&
936-
parentRect.right >= childRect.right &&
937-
parentRect.top <= childRect.top &&
938-
parentRect.bottom >= childRect.bottom;
941+
return selector;
942+
}
939943

940-
const significantOverlap =
941-
(childRect.width * childRect.height) /
942-
(parentRect.width * parentRect.height) > 0.5;
944+
function getSelectorPath(element: HTMLElement | null): string {
945+
const path: string[] = [];
946+
let depth = 0;
947+
const maxDepth = 2;
943948

944-
if (fullyContained && significantOverlap) {
949+
while (element && element !== document.body && depth < maxDepth) {
950+
const selector = getNonUniqueSelector(element);
951+
path.unshift(selector);
945952
element = element.parentElement;
946-
} else {
947-
break;
953+
depth++;
948954
}
955+
956+
return path.join(' > ');
949957
}
950958

959+
const originalEl = document.elementFromPoint(x, y) as HTMLElement;
960+
if (!originalEl) return null;
961+
962+
let element = originalEl;
963+
951964
const generalSelector = getSelectorPath(element);
952965
return {
953966
generalSelector,
954967
};
955968
}, coordinates);
956-
957969
return selectors || { generalSelector: '' };
970+
}
971+
958972
} catch (error) {
959973
console.error('Error in getNonUniqueSelectors:', error);
960974
return { generalSelector: '' };

0 commit comments

Comments
 (0)