1+ // mediaParser.js
2+
13import axios from 'axios' ;
24import * as cheerio from 'cheerio' ;
35import { URL } from 'url' ;
46import logger from './logger' ; // Adjust path if necessary
57
68/**
7- * Fetches and extracts all images from a webpage, including responsive ones .
8- * This includes regular <img> tags and srcset URLs used for different screen sizes .
9+ * Fetches and extracts all images from a webpage, including responsive images.
10+ * This includes regular <img> tags (src and srcset) and the srcset URLs of <source> tags within <picture> elements; data: URLs are skipped.
911 *
1012 * @param {string } url - The webpage URL to extract images from.
1113 * Must be a valid, non-empty string.
12- * @returns {Array } - An array of objects, each containing :
14+ * @returns {Array } - An array of objects:
1315 * {
1416 * url: string, // The absolute URL of the image
1517 * altText: string // The alt text of the image (if any)
@@ -18,13 +20,12 @@ import logger from './logger'; // Adjust path if necessary
1820 * @throws {Error } - If the fetch fails or the response is not HTML; a TypeError is thrown first if url is not a non-empty string.
1921 */
2022async function extractImages ( url ) {
21- // 1. Validate input
2223 if ( ! url || typeof url !== 'string' ) {
2324 throw new TypeError ( 'URL must be a non-empty string' ) ;
2425 }
2526
2627 try {
27- // 2. Fetch HTML with axios configured for reliability
28+ // Fetch webpage with axios
2829 const response = await axios . get ( url , {
2930 timeout : 10000 ,
3031 maxContentLength : 10 * 1024 * 1024 ,
@@ -35,15 +36,13 @@ async function extractImages(url) {
3536 maxRedirects : 5
3637 } ) ;
3738
38- // 3. Validate content-type
39+ // Validate that content is HTML
3940 const contentType = response . headers [ 'content-type' ] || '' ;
4041 if ( ! contentType . includes ( 'text/html' ) ) {
4142 throw new Error ( `Expected HTML but got ${ contentType } ` ) ;
4243 }
4344
4445 const html = response . data ;
45-
46- // 4. Load HTML into cheerio
4746 const $ = cheerio . load ( html , {
4847 decodeEntities : true ,
4948 normalizeWhitespace : false
@@ -52,16 +51,15 @@ async function extractImages(url) {
5251 const images = [ ] ;
5352 const seen = new Set ( ) ;
5453
55- // 5. Extract <img> tags
54+ // Extract <img> tags
5655 $ ( 'img' ) . each ( ( index , element ) => {
5756 const alt = $ ( element ) . attr ( 'alt' ) || '' ;
58-
59- // 5a. Handle src
6057 let src = $ ( element ) . attr ( 'src' ) ;
58+
6159 if ( src ) {
6260 try {
6361 const absoluteUrl = new URL ( src , url ) . href ;
64- if ( ! seen . has ( absoluteUrl ) ) {
62+ if ( ! seen . has ( absoluteUrl ) && ! absoluteUrl . startsWith ( 'data:' ) ) {
6563 seen . add ( absoluteUrl ) ;
6664 images . push ( { url : absoluteUrl , altText : alt } ) ;
6765 }
@@ -70,17 +68,17 @@ async function extractImages(url) {
7068 }
7169 }
7270
73- // 5b. Handle srcset (responsive images)
71+ // Handle srcset (responsive images)
7472 const srcset = $ ( element ) . attr ( 'srcset' ) ;
7573 if ( srcset ) {
7674 const srcsetUrls = srcset . split ( ',' )
7775 . map ( s => s . trim ( ) . split ( / \s + / ) [ 0 ] )
78- . filter ( Boolean ) ; // Remove empty strings
76+ . filter ( Boolean ) ;
7977
8078 for ( const srcsetUrl of srcsetUrls ) {
8179 try {
8280 const absoluteUrl = new URL ( srcsetUrl , url ) . href ;
83- if ( ! seen . has ( absoluteUrl ) ) {
81+ if ( ! seen . has ( absoluteUrl ) && ! absoluteUrl . startsWith ( 'data:' ) ) {
8482 seen . add ( absoluteUrl ) ;
8583 images . push ( { url : absoluteUrl , altText : alt } ) ;
8684 }
@@ -91,11 +89,35 @@ async function extractImages(url) {
9189 }
9290 } ) ;
9391
92+ // Extract <source> tags inside <picture> elements
93+ $ ( 'picture source' ) . each ( ( i , element ) => {
94+ const srcset = $ ( element ) . attr ( 'srcset' ) ;
95+ if ( srcset ) {
96+ const srcsetUrls = srcset . split ( ',' )
97+ . map ( s => s . trim ( ) . split ( / \s + / ) [ 0 ] )
98+ . filter ( Boolean ) ;
99+
100+ for ( const srcsetUrl of srcsetUrls ) {
101+ try {
102+ const absoluteUrl = new URL ( srcsetUrl , url ) . href ;
103+ if ( ! seen . has ( absoluteUrl ) && ! absoluteUrl . startsWith ( 'data:' ) ) {
104+ seen . add ( absoluteUrl ) ;
105+ images . push ( { url : absoluteUrl , altText : '' } ) ;
106+ }
107+ } catch {
108+ logger . warn ( `Invalid srcset URL in <source>: ${ srcsetUrl } ` ) ;
109+ }
110+ }
111+ }
112+ } ) ;
113+
94114 return images ;
95115
96116 } catch ( error ) {
97- // Log errors and throw for the caller
98- logger . error ( 'Failed to extract images' , { url, error : error . message } ) ;
99- throw new Error ( `Failed to extract images from ${ url } : ${ error . message } ` ) ;
117+ // Wrap the failure in a new Error; the original error (and its stack) stays reachable via `cause`
118+ throw new Error ( `Failed to extract images from ${ url } ` , { cause : error } ) ;
100119 }
101- }
120+ }
121+
122+ // Export function for other modules
123+ export { extractImages } ;
0 commit comments