@@ -3,15 +3,29 @@ import * as cheerio from 'cheerio';
33import { URL } from 'url' ;
44import logger from './logger' ; // Adjust path if necessary
55
6+ /**
7+ * Fetches and extracts all images from a webpage, including responsive ones.
8+ * This includes regular <img> tags and srcset URLs used for different screen sizes.
9+ *
10+ * @param {string } url - The webpage URL to extract images from.
11+ * Must be a valid, non-empty string.
12+ * @returns {Promise<Array<{url: string, altText: string}>>} - Resolves to an array of objects, each containing:
13+ * {
14+ * url: string, // The absolute URL of the image
15+ * altText: string // The alt text of the image (if any)
16+ * }
17+ * @throws {TypeError } - If the URL is missing or not a string.
18+ * @throws {Error } - If the fetch fails or the response is not HTML.
19+ */
620async function extractImages ( url ) {
7- // Input validation
21+ // 1. Validate input
822 if ( ! url || typeof url !== 'string' ) {
923 throw new TypeError ( 'URL must be a non-empty string' ) ;
1024 }
1125
1226 try {
13- // Fetch HTML with proper axios config
14- const { data : html } = await axios . get ( url , {
27+ // 2. Fetch HTML with axios configured for reliability
28+ const response = await axios . get ( url , {
1529 timeout : 10000 ,
1630 maxContentLength : 10 * 1024 * 1024 ,
1731 maxBodyLength : 10 * 1024 * 1024 ,
@@ -21,7 +35,15 @@ async function extractImages(url) {
2135 maxRedirects : 5
2236 } ) ;
2337
24- // Load HTML
38+ // 3. Validate content-type
39+ const contentType = response . headers [ 'content-type' ] || '' ;
40+ if ( ! contentType . includes ( 'text/html' ) ) {
41+ throw new Error ( `Expected HTML but got ${ contentType } ` ) ;
42+ }
43+
44+ const html = response . data ;
45+
46+ // 4. Load HTML into cheerio
2547 const $ = cheerio . load ( html , {
2648 decodeEntities : true ,
2749 normalizeWhitespace : false
@@ -30,10 +52,11 @@ async function extractImages(url) {
3052 const images = [ ] ;
3153 const seen = new Set ( ) ;
3254
55+ // 5. Extract <img> tags
3356 $ ( 'img' ) . each ( ( index , element ) => {
3457 const alt = $ ( element ) . attr ( 'alt' ) || '' ;
35-
36- // Handle src
58+
59+ // 5a. Handle src
3760 let src = $ ( element ) . attr ( 'src' ) ;
3861 if ( src ) {
3962 try {
@@ -47,10 +70,13 @@ async function extractImages(url) {
4770 }
4871 }
4972
50- // Handle srcset
73+ // 5b. Handle srcset (responsive images)
5174 const srcset = $ ( element ) . attr ( 'srcset' ) ;
5275 if ( srcset ) {
53- const srcsetUrls = srcset . split ( ',' ) . map ( s => s . trim ( ) . split ( / \s + / ) [ 0 ] ) ;
76+ const srcsetUrls = srcset . split ( ',' )
77+ . map ( s => s . trim ( ) . split ( / \s + / ) [ 0 ] )
78+ . filter ( Boolean ) ; // Remove empty strings
79+
5480 for ( const srcsetUrl of srcsetUrls ) {
5581 try {
5682 const absoluteUrl = new URL ( srcsetUrl , url ) . href ;
@@ -68,7 +94,8 @@ async function extractImages(url) {
6894 return images ;
6995
7096 } catch ( error ) {
97+ // Log errors and throw for the caller
7198 logger . error ( 'Failed to extract images' , { url, error : error . message } ) ;
7299 throw new Error ( `Failed to extract images from ${ url } : ${ error . message } ` ) ;
73100 }
74- }
101+ }
0 commit comments