Skip to content

Commit d24496c

Browse files
committed
Refactor: Enhance image extraction
- Export extractImages function for module usage
- Preserve original error stack trace using 'cause'
- Validate Content-Type before parsing HTML
- Filter out data: URLs to avoid bloating results
- Handle <picture> <source> tags for responsive images
- Deduplicate images
- Maintain Winston logging for errors and warnings
- Human-friendly JSDoc comments and ES6 syntax
1 parent caaf131 commit d24496c

File tree

1 file changed

+41
-19
lines changed

1 file changed

+41
-19
lines changed

mediaParser.js

Lines changed: 41 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,17 @@
1+
// mediaParser.js
2+
13
import axios from 'axios';
24
import * as cheerio from 'cheerio';
35
import { URL } from 'url';
46
import logger from './logger'; // Adjust path if necessary
57

68
/**
7-
* Fetches and extracts all images from a webpage, including responsive ones.
8-
* This includes regular <img> tags and srcset URLs used for different screen sizes.
9+
* Fetches and extracts all images from a webpage, including responsive images.
10+
* This includes regular <img> tags, srcset URLs, and <source> tags within <picture> elements.
911
*
1012
* @param {string} url - The webpage URL to extract images from.
1113
* Must be a valid, non-empty string.
12-
* @returns {Array} - An array of objects, each containing:
14+
* @returns {Array} - An array of objects:
1315
* {
1416
* url: string, // The absolute URL of the image
1517
* altText: string // The alt text of the image (if any)
@@ -18,13 +20,12 @@ import logger from './logger'; // Adjust path if necessary
1820
* @throws {Error} - If the fetch fails or the response is not HTML.
1921
*/
2022
async function extractImages(url) {
21-
// 1. Validate input
2223
if (!url || typeof url !== 'string') {
2324
throw new TypeError('URL must be a non-empty string');
2425
}
2526

2627
try {
27-
// 2. Fetch HTML with axios configured for reliability
28+
// Fetch webpage with axios
2829
const response = await axios.get(url, {
2930
timeout: 10000,
3031
maxContentLength: 10 * 1024 * 1024,
@@ -35,15 +36,13 @@ async function extractImages(url) {
3536
maxRedirects: 5
3637
});
3738

38-
// 3. Validate content-type
39+
// Validate that content is HTML
3940
const contentType = response.headers['content-type'] || '';
4041
if (!contentType.includes('text/html')) {
4142
throw new Error(`Expected HTML but got ${contentType}`);
4243
}
4344

4445
const html = response.data;
45-
46-
// 4. Load HTML into cheerio
4746
const $ = cheerio.load(html, {
4847
decodeEntities: true,
4948
normalizeWhitespace: false
@@ -52,16 +51,15 @@ async function extractImages(url) {
5251
const images = [];
5352
const seen = new Set();
5453

55-
// 5. Extract <img> tags
54+
// Extract <img> tags
5655
$('img').each((index, element) => {
5756
const alt = $(element).attr('alt') || '';
58-
59-
// 5a. Handle src
6057
let src = $(element).attr('src');
58+
6159
if (src) {
6260
try {
6361
const absoluteUrl = new URL(src, url).href;
64-
if (!seen.has(absoluteUrl)) {
62+
if (!seen.has(absoluteUrl) && !absoluteUrl.startsWith('data:')) {
6563
seen.add(absoluteUrl);
6664
images.push({ url: absoluteUrl, altText: alt });
6765
}
@@ -70,17 +68,17 @@ async function extractImages(url) {
7068
}
7169
}
7270

73-
// 5b. Handle srcset (responsive images)
71+
// Handle srcset (responsive images)
7472
const srcset = $(element).attr('srcset');
7573
if (srcset) {
7674
const srcsetUrls = srcset.split(',')
7775
.map(s => s.trim().split(/\s+/)[0])
78-
.filter(Boolean); // Remove empty strings
76+
.filter(Boolean);
7977

8078
for (const srcsetUrl of srcsetUrls) {
8179
try {
8280
const absoluteUrl = new URL(srcsetUrl, url).href;
83-
if (!seen.has(absoluteUrl)) {
81+
if (!seen.has(absoluteUrl) && !absoluteUrl.startsWith('data:')) {
8482
seen.add(absoluteUrl);
8583
images.push({ url: absoluteUrl, altText: alt });
8684
}
@@ -91,11 +89,35 @@ async function extractImages(url) {
9189
}
9290
});
9391

92+
// Extract <source> tags inside <picture> elements
93+
$('picture source').each((i, element) => {
94+
const srcset = $(element).attr('srcset');
95+
if (srcset) {
96+
const srcsetUrls = srcset.split(',')
97+
.map(s => s.trim().split(/\s+/)[0])
98+
.filter(Boolean);
99+
100+
for (const srcsetUrl of srcsetUrls) {
101+
try {
102+
const absoluteUrl = new URL(srcsetUrl, url).href;
103+
if (!seen.has(absoluteUrl) && !absoluteUrl.startsWith('data:')) {
104+
seen.add(absoluteUrl);
105+
images.push({ url: absoluteUrl, altText: '' });
106+
}
107+
} catch {
108+
logger.warn(`Invalid srcset URL in <source>: ${srcsetUrl}`);
109+
}
110+
}
111+
}
112+
});
113+
94114
return images;
95115

96116
} catch (error) {
97-
// Log errors and throw for the caller
98-
logger.error('Failed to extract images', { url, error: error.message });
99-
throw new Error(`Failed to extract images from ${url}: ${error.message}`);
117+
// Preserve original stack trace
118+
throw new Error(`Failed to extract images from ${url}`, { cause: error });
100119
}
101-
}
120+
}
121+
122+
// Export function for other modules
123+
export { extractImages };

0 commit comments

Comments
 (0)