
Commit 43d1bf6

feat: add scrape endpoint
1 parent af5500d commit 43d1bf6

File tree

8 files changed: +1632 −58 lines changed

Lines changed: 287 additions & 0 deletions
@@ -0,0 +1,287 @@
"""
Basic asynchronous example demonstrating how to use the Scrape API.

This example shows:
1. How to make async scrape requests
2. How to process multiple URLs concurrently
3. How to use render_heavy_js for JavaScript-heavy websites
4. How to add custom headers in async mode

Equivalent curl command:
curl -X POST https://api.scrapegraphai.com/v1/scrape \
  -H "Content-Type: application/json" \
  -H "SGAI-APIKEY: your-api-key-here" \
  -d '{
    "website_url": "https://example.com",
    "render_heavy_js": false
  }'

Requirements:
- Python 3.9+ (asyncio.to_thread is used for file output)
- scrapegraph-py
- python-dotenv
- aiohttp
- A .env file with your SGAI_API_KEY

Example .env file:
SGAI_API_KEY=your_api_key_here
"""

import asyncio
import time
from pathlib import Path
from typing import List, Dict, Any
from dotenv import load_dotenv

from scrapegraph_py import AsyncClient

# Load environment variables from .env file
load_dotenv()
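
# Added note (not part of the original commit): if you prefer not to rely on a
# .env file, scrapegraph-py's async client can reportedly also be constructed
# with an explicit key, assuming the constructor accepts api_key, e.g.:
#
#     client = AsyncClient(api_key="your-sgai-api-key-here")
#
# AsyncClient.from_env() below reads SGAI_API_KEY from the environment instead.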


async def basic_async_scrape():
    """Demonstrate basic async scrape functionality."""
    print("🌐 Basic Async Scrape Example")
    print("=" * 35)

    async with AsyncClient.from_env() as client:
        try:
            print("Making basic async scrape request...")
            result = await client.scrape(
                website_url="https://example.com",
                render_heavy_js=False
            )

            html_content = result.get("html", "")
            print(f"✅ Success! Received {len(html_content):,} characters of HTML")
            print(f"Request ID: {result.get('request_id', 'N/A')}")

            return result

        except Exception as e:
            print(f"❌ Error: {str(e)}")
            return None


async def async_scrape_with_heavy_js():
    """Demonstrate async scraping with heavy JavaScript rendering."""
    print("\n🚀 Async Heavy JavaScript Rendering Example")
    print("=" * 50)

    async with AsyncClient.from_env() as client:
        try:
            print("Making async scrape request with heavy JS rendering...")
            start_time = time.time()

            result = await client.scrape(
                website_url="https://example.com",
                render_heavy_js=True
            )

            execution_time = time.time() - start_time
            html_content = result.get("html", "")

            print(f"✅ Success! Received {len(html_content):,} characters of HTML")
            print(f"⏱️ Execution time: {execution_time:.2f} seconds")
            print(f"Request ID: {result.get('request_id', 'N/A')}")

            return result

        except Exception as e:
            print(f"❌ Error: {str(e)}")
            return None


async def scrape_single_url(client: AsyncClient, url: str, use_js: bool = False) -> Dict[str, Any]:
    """Scrape a single URL with error handling."""
    try:
        result = await client.scrape(
            website_url=url,
            render_heavy_js=use_js
        )

        html_content = result.get("html", "")
        return {
            "url": url,
            "success": True,
            "html_length": len(html_content),
            "request_id": result.get("request_id"),
            "result": result
        }

    except Exception as e:
        return {
            "url": url,
            "success": False,
            "error": str(e),
            "html_length": 0
        }


async def concurrent_scraping_example():
    """Demonstrate scraping multiple URLs concurrently."""
    print("\n⚡ Concurrent Scraping Example")
    print("=" * 35)

    # URLs to scrape concurrently
    urls = [
        "https://example.com",
        "https://httpbin.org/html",
        "https://httpbin.org/json"
    ]

    async with AsyncClient.from_env() as client:
        print(f"Scraping {len(urls)} URLs concurrently...")
        start_time = time.time()

        # Create tasks for concurrent execution
        tasks = [scrape_single_url(client, url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        total_time = time.time() - start_time

        # Process results
        successful = 0
        total_html_length = 0

        for result in results:
            if isinstance(result, Exception):
                print(f"❌ Exception: {result}")
                continue

            if result["success"]:
                successful += 1
                total_html_length += result["html_length"]
                print(f"✅ {result['url']}: {result['html_length']:,} chars")
            else:
                print(f"❌ {result['url']}: {result['error']}")

        print("\n📊 Results:")
        print(f"   Total time: {total_time:.2f} seconds")
        print(f"   Successful: {successful}/{len(urls)}")
        print(f"   Total HTML: {total_html_length:,} characters")
        print(f"   Average per URL: {total_time/len(urls):.2f} seconds")

        return results
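

# --- Added sketch (not part of the original commit) --------------------------
# The example above launches every request at once via asyncio.gather. A
# minimal way to cap how many scrapes run at the same time is an
# asyncio.Semaphore wrapped around the scrape_single_url helper defined above.
# The name scrape_with_limit and the default limit of 5 are illustrative
# choices here, not part of the scrapegraph-py API.
async def scrape_with_limit(client: AsyncClient, urls: List[str], max_concurrent: int = 5) -> List[Dict[str, Any]]:
    """Scrape many URLs concurrently, but never more than max_concurrent at once."""
    semaphore = asyncio.Semaphore(max_concurrent)

    async def bounded(url: str) -> Dict[str, Any]:
        # Acquire a semaphore slot before starting each scrape
        async with semaphore:
            return await scrape_single_url(client, url)

    return await asyncio.gather(*(bounded(url) for url in urls))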


async def async_scrape_with_custom_headers():
    """Demonstrate async scraping with custom headers."""
    print("\n🔧 Async Custom Headers Example")
    print("=" * 35)

    # Custom headers
    custom_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive"
    }

    async with AsyncClient.from_env() as client:
        try:
            print("Making async scrape request with custom headers...")
            result = await client.scrape(
                website_url="https://httpbin.org/headers",
                render_heavy_js=False,
                headers=custom_headers
            )

            html_content = result.get("html", "")
            print(f"✅ Success! Received {len(html_content):,} characters of HTML")
            print(f"Request ID: {result.get('request_id', 'N/A')}")

            return result

        except Exception as e:
            print(f"❌ Error: {str(e)}")
            return None
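
# Added note (not part of the original commit): https://httpbin.org/headers
# echoes the incoming request headers back in its response body, so the HTML
# returned above is a handy way to confirm the custom headers were actually sent.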


async def save_html_to_file_async(html_content: str, filename: str):
    """Save HTML content to a file asynchronously."""
    output_dir = Path("async_scrape_output")
    output_dir.mkdir(exist_ok=True)

    file_path = output_dir / f"{filename}.html"

    # Use asyncio.to_thread for file I/O
    await asyncio.to_thread(
        lambda: file_path.write_text(html_content, encoding="utf-8")
    )

    print(f"💾 HTML saved to: {file_path}")
    return file_path


def demonstrate_curl_equivalent():
    """Show the equivalent curl commands."""
    print("🌐 Equivalent curl commands:")
    print("=" * 35)

    print("1. Basic async scrape (same as sync):")
    print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\")
    print("  -H \"Content-Type: application/json\" \\")
    print("  -H \"SGAI-APIKEY: your-api-key-here\" \\")
    print("  -d '{")
    print("    \"website_url\": \"https://example.com\",")
    print("    \"render_heavy_js\": false")
    print("  }'")

    print("\n2. Multiple concurrent requests:")
    print("# Run multiple curl commands in parallel:")
    print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\")
    print("  -H \"Content-Type: application/json\" \\")
    print("  -H \"SGAI-APIKEY: your-api-key-here\" \\")
    print("  -d '{\"website_url\": \"https://example.com\"}' &")
    print("curl -X POST https://api.scrapegraphai.com/v1/scrape \\")
    print("  -H \"Content-Type: application/json\" \\")
    print("  -H \"SGAI-APIKEY: your-api-key-here\" \\")
    print("  -d '{\"website_url\": \"https://httpbin.org/html\"}' &")
    print("wait # Wait for all background jobs to complete")


async def main():
    """Main async function demonstrating scrape functionality."""
    print("🚀 Async Scrape API Examples")
    print("=" * 30)

    # Show curl equivalents first
    demonstrate_curl_equivalent()

    try:
        # Run async examples
        result1 = await basic_async_scrape()
        result2 = await async_scrape_with_heavy_js()
        result3 = await async_scrape_with_custom_headers()
        concurrent_results = await concurrent_scraping_example()

        # Save results if successful
        if result1:
            html1 = result1.get("html", "")
            if html1:
                await save_html_to_file_async(html1, "basic_async_scrape")

        if result3:
            html3 = result3.get("html", "")
            if html3:
                await save_html_to_file_async(html3, "custom_headers_async_scrape")

        print("\n🎯 Summary:")
        print(f"✅ Basic async scrape: {'Success' if result1 else 'Failed'}")
        print(f"✅ Heavy JS async scrape: {'Success' if result2 else 'Failed'}")
        print(f"✅ Custom headers async scrape: {'Success' if result3 else 'Failed'}")
        print(f"✅ Concurrent scraping: {'Success' if concurrent_results else 'Failed'}")

    except Exception as e:
        print(f"❌ Unexpected error: {str(e)}")

    print("\n📚 Next steps:")
    print("• Try running multiple curl commands in parallel")
    print("• Experiment with different concurrency levels")
    print("• Test with your own list of URLs")
    print("• Compare async vs sync performance for multiple URLs")


if __name__ == "__main__":
    asyncio.run(main())
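
Note: the "Next steps" above suggest comparing async and sync performance. A minimal sketch of a synchronous counterpart follows; it assumes scrapegraph-py also exposes a sync Client with from_env(), scrape(), and close() mirroring AsyncClient, so verify those names against the package before relying on them.

from scrapegraph_py import Client  # assumed sync counterpart of AsyncClient

def sync_scrape_once(url: str) -> int:
    """Scrape one URL synchronously and return the HTML length."""
    client = Client.from_env()  # assumed to mirror AsyncClient.from_env()
    try:
        result = client.scrape(website_url=url, render_heavy_js=False)
        return len(result.get("html", ""))
    finally:
        client.close()  # assumed cleanup method

Timing a plain loop over sync_scrape_once against asyncio.gather over scrape_single_url gives a rough async-vs-sync comparison for multiple URLs.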
