Refactor: Improve image extraction

Aman-Raj-bat · Aman-Raj-bat · commit caaf13164c4d · 2025-10-03T22:35:02.000+05:30
- Switch to ES6 module syntax (import/export)
- Add URL input validation
- Configure axios for timeout, headers, content limits
- Validate Content-Type before parsing HTML
- Filter out empty URLs when parsing srcset
- Remove duplicate images
- Use Winston logger for warnings and errors
- Add human-friendly JSDoc comments
diff --git a/mediaParser.js b/mediaParser.js
@@ -3,15 +3,29 @@ import * as cheerio from 'cheerio';
 import { URL } from 'url';
 import logger from './logger'; // Adjust path if necessary
 
+/**
+ * Fetches and extracts all images from a webpage, including responsive ones.
+ * This includes regular <img> tags and srcset URLs used for different screen sizes.
+ *
+ * @param {string} url - The webpage URL to extract images from.
+ *                        Must be a valid, non-empty string.
+ * @returns {Promise<Array>} - Resolves to an array of objects, each containing:
+ *                    { 
+ *                      url: string,     // The absolute URL of the image
+ *                      altText: string  // The alt text of the image (if any)
+ *                    }
+ * @throws {TypeError} - If the URL is missing or not a string.
+ * @throws {Error} - If the fetch fails or the response is not HTML.
+ */
 async function extractImages(url) {
-    // Input validation
+    // 1. Validate input
     if (!url || typeof url !== 'string') {
         throw new TypeError('URL must be a non-empty string');
     }
 
     try {
-        // Fetch HTML with proper axios config
-        const { data: html } = await axios.get(url, {
+        // 2. Fetch HTML with axios configured for reliability
+        const response = await axios.get(url, {
             timeout: 10000,
             maxContentLength: 10 * 1024 * 1024,
             maxBodyLength: 10 * 1024 * 1024,
@@ -21,7 +35,15 @@ async function extractImages(url) {
             maxRedirects: 5
         });
 
-        // Load HTML
+        // 3. Validate content-type
+        const contentType = response.headers['content-type'] || '';
+        if (!contentType.includes('text/html')) {
+            throw new Error(`Expected HTML but got ${contentType}`);
+        }
+
+        const html = response.data;
+
+        // 4. Load HTML into cheerio
         const $ = cheerio.load(html, {
             decodeEntities: true,
             normalizeWhitespace: false
@@ -30,10 +52,11 @@ async function extractImages(url) {
         const images = [];
         const seen = new Set();
 
+        // 5. Extract <img> tags
         $('img').each((index, element) => {
             const alt = $(element).attr('alt') || '';
-            
-            // Handle src
+
+            // 5a. Handle src
             let src = $(element).attr('src');
             if (src) {
                 try {
@@ -47,10 +70,13 @@ async function extractImages(url) {
                 }
             }
 
-            // Handle srcset
+            // 5b. Handle srcset (responsive images)
             const srcset = $(element).attr('srcset');
             if (srcset) {
-                const srcsetUrls = srcset.split(',').map(s => s.trim().split(/\s+/)[0]);
+                const srcsetUrls = srcset.split(',')
+                    .map(s => s.trim().split(/\s+/)[0])
+                    .filter(Boolean); // Remove empty strings
+
                 for (const srcsetUrl of srcsetUrls) {
                     try {
                         const absoluteUrl = new URL(srcsetUrl, url).href;
@@ -68,7 +94,8 @@ async function extractImages(url) {
         return images;
 
     } catch (error) {
+        // Log errors and throw for the caller
         logger.error('Failed to extract images', { url, error: error.message });
         throw new Error(`Failed to extract images from ${url}: ${error.message}`);
     }
-}
+}