Skip to content

Commit caaf131

Browse files
committed
Refactor: Improve image extraction
- Switch to ES6 module syntax (import/export)
- Add URL input validation
- Configure axios for timeout, headers, content limits
- Validate Content-Type before parsing HTML
- Handle srcset with empty URL filtering
- Remove duplicate images
- Use Winston logger for warnings and errors
- Add human-friendly JSDoc comments
1 parent df2efee commit caaf131

File tree

1 file changed

+36
-9
lines changed

1 file changed

+36
-9
lines changed

mediaParser.js

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,29 @@ import * as cheerio from 'cheerio';
33
import { URL } from 'url';
44
import logger from './logger'; // Adjust path if necessary
55

6+
/**
7+
* Fetches and extracts all images from a webpage, including responsive ones.
8+
* This includes regular <img> tags and srcset URLs used for different screen sizes.
9+
*
10+
* @param {string} url - The webpage URL to extract images from.
11+
* Must be a valid, non-empty string.
12+
* @returns {Promise<Array>} - Resolves to an array of objects, each containing:
13+
* {
14+
* url: string, // The absolute URL of the image
15+
* altText: string // The alt text of the image (if any)
16+
* }
17+
* @throws {TypeError} - If the URL is missing or not a string.
18+
* @throws {Error} - If the fetch fails or the response is not HTML.
19+
*/
620
async function extractImages(url) {
7-
// Input validation
21+
// 1. Validate input
822
if (!url || typeof url !== 'string') {
923
throw new TypeError('URL must be a non-empty string');
1024
}
1125

1226
try {
13-
// Fetch HTML with proper axios config
14-
const { data: html } = await axios.get(url, {
27+
// 2. Fetch HTML with axios configured for reliability
28+
const response = await axios.get(url, {
1529
timeout: 10000,
1630
maxContentLength: 10 * 1024 * 1024,
1731
maxBodyLength: 10 * 1024 * 1024,
@@ -21,7 +35,15 @@ async function extractImages(url) {
2135
maxRedirects: 5
2236
});
2337

24-
// Load HTML
38+
// 3. Validate content-type
39+
const contentType = response.headers['content-type'] || '';
40+
if (!contentType.includes('text/html')) {
41+
throw new Error(`Expected HTML but got ${contentType}`);
42+
}
43+
44+
const html = response.data;
45+
46+
// 4. Load HTML into cheerio
2547
const $ = cheerio.load(html, {
2648
decodeEntities: true,
2749
normalizeWhitespace: false
@@ -30,10 +52,11 @@ async function extractImages(url) {
3052
const images = [];
3153
const seen = new Set();
3254

55+
// 5. Extract <img> tags
3356
$('img').each((index, element) => {
3457
const alt = $(element).attr('alt') || '';
35-
36-
// Handle src
58+
59+
// 5a. Handle src
3760
let src = $(element).attr('src');
3861
if (src) {
3962
try {
@@ -47,10 +70,13 @@ async function extractImages(url) {
4770
}
4871
}
4972

50-
// Handle srcset
73+
// 5b. Handle srcset (responsive images)
5174
const srcset = $(element).attr('srcset');
5275
if (srcset) {
53-
const srcsetUrls = srcset.split(',').map(s => s.trim().split(/\s+/)[0]);
76+
const srcsetUrls = srcset.split(',')
77+
.map(s => s.trim().split(/\s+/)[0])
78+
.filter(Boolean); // Remove empty strings
79+
5480
for (const srcsetUrl of srcsetUrls) {
5581
try {
5682
const absoluteUrl = new URL(srcsetUrl, url).href;
@@ -68,7 +94,8 @@ async function extractImages(url) {
6894
return images;
6995

7096
} catch (error) {
97+
// Log errors and throw for the caller
7198
logger.error('Failed to extract images', { url, error: error.message });
7299
throw new Error(`Failed to extract images from ${url}: ${error.message}`);
73100
}
74-
}
101+
}

0 commit comments

Comments
 (0)