diff --git a/CHANGELOG.md b/CHANGELOG.md
index c031bcc..ebd1f79 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,44 @@
 # Changelog
+## [2.1.0](https://github.com/ScrapingBee/scrapingbee-python/compare/v2.0.2...v2.1.0) (2024-12-18)
+
+### Features
+
+- Added `html_api()` method for the HTML API with unified GET/POST support
+- Added `google_search()` method for the Google Search API
+- Added `amazon_search()` method for the Amazon Search API
+- Added `amazon_product()` method for the Amazon Product API
+- Added `walmart_search()` method for the Walmart Search API
+- Added `walmart_product()` method for the Walmart Product API
+- Added `youtube_search()` method for the YouTube Search API
+- Added `youtube_metadata()` method for the YouTube Metadata API
+- Added `youtube_transcript()` method for the YouTube Transcript API
+- Added `youtube_trainability()` method for the YouTube Trainability API
+- Added `chatgpt()` method for the ChatGPT API
+- Added `usage()` method for the Usage API
+- Refactored the internal `request()` method to be API-agnostic
+
+### Deprecated
+
+- The `get()` and `post()` methods are deprecated in favor of the new `html_api()` method
+
+## [2.0.2](https://github.com/ScrapingBee/scrapingbee-python/compare/v2.0.1...v2.0.2) (2025-10-02)
+
+### Features
+
+- Added support for the `ai_extract_rules` parameter in `SpbParams`.
+- Added Python 3.11 support.
+
+### Changes
+
+- Dropped Python 3.7 support.
+
+## [2.0.1](https://github.com/ScrapingBee/scrapingbee-python/compare/v2.0.0...v2.0.1) (2023-10-17)
+
+### Bugfix
+
+- Fix typos in `README.md` (`block_ressources` -> `block_resources`, `json_scenario` -> `js_scenario`).
+
 ## [2.0.0](https://github.com/ScrapingBee/scrapingbee-python/compare/v1.2.0...v2.0.0) (2023-10-03)
 
 ### Improvement
@@ -8,4 +47,4 @@
 
 ### Breaking change
 
-- No need to url encode params anymore.
+- No need to url encode params anymore.
\ No newline at end of file
diff --git a/README.md b/README.md
index 328dc4d..30386c6 100644
--- a/README.md
+++ b/README.md
@@ -16,130 +16,264 @@
 pip install scrapingbee
 
 ## Usage
 
-The ScrapingBee Python SDK is a wrapper around the [requests](https://docs.python-requests.org/en/master/) library. ScrapingBee supports GET and POST requests.
+The ScrapingBee Python SDK is a wrapper around the [requests](https://docs.python-requests.org/en/master/) library. Sign up for ScrapingBee to [get your API key](https://app.scrapingbee.com/account/register) and some free credits to get started.
 
-### Making a GET request
+
+## Table of Contents
+
+- [HTML API](#html-api)
+- [Google Search API](#google-search-api)
+- [Amazon API](#amazon-api)
+- [Walmart API](#walmart-api)
+- [YouTube API](#youtube-api)
+- [ChatGPT API](#chatgpt-api)
+- [Usage API](#usage-api)
+
+---
+
+## HTML API
+
+The HTML API allows you to scrape any webpage and get its HTML content.
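+Because the SDK wraps `requests`, every method below returns a standard
+`requests.Response`, so the usual `status_code`, `ok`, `text`, and `content`
+attributes are available. A minimal sketch of the usual guard (assuming a
+`client` instance built as in the next example):
+
+```python
+response = client.html_api('https://www.scrapingbee.com')
+if response.ok:  # True for 2xx status codes
+    html = response.text  # decoded HTML body
+else:
+    print(response.status_code, response.text)
+```
+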
+ +### Basic Request ```python ->>> from scrapingbee import ScrapingBeeClient +from scrapingbee import ScrapingBeeClient ->>> client = ScrapingBeeClient(api_key='REPLACE-WITH-YOUR-API-KEY') +client = ScrapingBeeClient(api_key='YOUR-API-KEY') ->>> response = client.get( - 'https://www.scrapingbee.com/blog/', +response = client.html_api( + 'https://www.scrapingbee.com', params={ - # Block ads on the page you want to scrape - 'block_ads': False, - # Block images and CSS on the page you want to scrape - 'block_resources': True, - # Premium proxy geolocation - 'country_code': '', - # Control the device the request will be sent from - 'device': 'desktop', - # Use some data extraction rules - 'extract_rules': {'title': 'h1'}, - # Use AI to extract data from the page - 'ai_extract_rules': {'product_name': 'The name of the product', 'price': 'The price in USD'}, - # Wrap response in JSON - 'json_response': False, - # Interact with the webpage you want to scrape - 'js_scenario': { - "instructions": [ - {"wait_for": "#slow_button"}, - {"click": "#slow_button"}, - {"scroll_x": 1000}, - {"wait": 1000}, - {"scroll_x": 1000}, - {"wait": 1000}, - ] - }, - # Use premium proxies to bypass difficult to scrape websites (10-25 credits/request) - 'premium_proxy': False, - # Execute JavaScript code with a Headless Browser (5 credits/request) - 'render_js': True, - # Return the original HTML before the JavaScript rendering - 'return_page_source': False, - # Return page screenshot as a png image - 'screenshot': False, - # Take a full page screenshot without the window limitation - 'screenshot_full_page': False, - # Transparently return the same HTTP code of the page requested. - 'transparent_status_code': False, - # Wait, in miliseconds, before returning the response - 'wait': 0, - # Wait for CSS selector before returning the response, ex ".title" - 'wait_for': '', - # Set the browser window width in pixel - 'window_width': 1920, - # Set the browser window height in pixel - 'window_height': 1080 - }, - headers={ - # Forward custom headers to the target website - "key": "value" - }, - cookies={ - # Forward custom cookies to the target website - "name": "value" + 'render_js': False, } ) ->>> response.text -'...' + +print(response.content) ``` -ScrapingBee takes various parameters to render JavaScript, execute a custom JavaScript script, use a premium proxy from a specific geolocation and more. +### Making a POST request -You can find all the supported parameters on [ScrapingBee's documentation](https://www.scrapingbee.com/documentation/). +```python +response = client.html_api( + 'https://httpbin.org/post', + method='POST', + data={ + 'key': 'value' + } +) +``` -You can send custom cookies and headers like you would normally do with the requests library. +--- -## Screenshot +## Google Search API -Here a little exemple on how to retrieve and store a screenshot from the ScrapingBee blog in its mobile resolution. +Scrape Google search results in real-time. ```python ->>> from scrapingbee import ScrapingBeeClient +response = client.google_search( + search='web scraping tools', + params={ + 'language': 'en', + 'country_code': 'us', + 'nb_results': 10 + } +) ->>> client = ScrapingBeeClient(api_key='REPLACE-WITH-YOUR-API-KEY') +print(response.json()) +``` + +--- + +## Amazon API + +Scrape Amazon search results and product details. 
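+Both endpoints return JSON, so `response.json()` gives you a plain dict. A
+small sketch for walking the search results (the `products` list mirrors what
+this PR's manual tests assert on, so treat the field layout as indicative
+rather than exhaustive):
+
+```python
+response = client.amazon_search(query='laptop', params={'domain': 'com'})
+for product in response.json().get('products', []):
+    print(product)
+```
+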
->>> response = client.get( - 'https://www.scrapingbee.com/blog/', +### Amazon Search + +```python +response = client.amazon_search( + query='laptop', params={ - # Take a screenshot - 'screenshot': True, - # Specify that we need the full height - 'screenshot_full_page': True, - # Specify a mobile width in pixel - 'window_width': 375 + 'domain': 'com', + 'language': 'en', + 'pages': 1 } ) ->>> if response.ok: - with open("./scrapingbee_mobile.png", "wb") as f: - f.write(response.content) +print(response.json()) ``` -## Using ScrapingBee with Scrapy +### Amazon Product + +```python +response = client.amazon_product( + query='B0D2Q9397Y', # ASIN + params={ + 'domain': 'com' + } +) -Scrapy is the most popular Python web scraping framework. You can easily [integrate ScrapingBee's API with the Scrapy middleware](https://github.com/ScrapingBee/scrapy-scrapingbee). +print(response.json()) +``` +--- -## Retries +## Walmart API -The client includes a retry mechanism for 5XX responses. +Scrape Walmart search results and product details. + +### Walmart Search + +```python +response = client.walmart_search( + query='laptop', + params={ + 'sort_by': 'best_match', + 'device': 'desktop' + } +) + +print(response.json()) +``` + +### Walmart Product ```python ->>> from scrapingbee import ScrapingBeeClient +response = client.walmart_product( + product_id='123456789', + params={ + 'device': 'desktop' + } +) + +print(response.json()) +``` + +--- ->>> client = ScrapingBeeClient(api_key='REPLACE-WITH-YOUR-API-KEY') +## YouTube API ->>> response = client.get( - 'https://www.scrapingbee.com/blog/', +Scrape YouTube search results, video metadata, and transcripts. + +### YouTube Search + +```python +response = client.youtube_search( + search='web scraping tutorial', params={ - 'render_js': True, - }, - retries=5 + 'sort_by': 'relevance', + 'type': 'video' + } ) + +print(response.json()) ``` + +### YouTube Metadata + +```python +response = client.youtube_metadata(video_id='dQw4w9WgXcQ') +print(response.json()) +``` + +### YouTube Transcript + +```python +response = client.youtube_transcript( + video_id='dQw4w9WgXcQ', + params={'language': 'en'} +) +print(response.json()) +``` + +### YouTube Trainability + +```python +response = client.youtube_trainability(video_id='dQw4w9WgXcQ') +print(response.json()) +``` + +--- + +## ChatGPT API + +Use ChatGPT with optional web search. + +```python +response = client.chatgpt( + prompt='What is web scraping?', + params={ + 'search': True, + 'country_code': 'us' + } +) + +print(response.json()) +``` + +--- + +## Usage API + +Check your API credit usage. + +```python +response = client.usage() +print(response.json()) +# { +# "max_api_credit": 8000000, +# "used_api_credit": 1000023, +# "max_concurrency": 200, +# "current_concurrency": 1 +# } +``` + +--- + +## Legacy Methods (Deprecated) + +The `get()` and `post()` methods are deprecated and will be removed in a future version. Please use `html_api()` instead. + +```python +# Deprecated +client.get(url, params={...}) + +# Use instead +client.html_api(url, method='GET', params={...}) +``` + +## Screenshot + +Here is a little example on how to retrieve and store a screenshot from the ScrapingBee blog. 
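+The 375-pixel `window_width` mimics a mobile viewport, and because the
+screenshot comes back as binary PNG data, the file is opened in `wb` mode.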
+ +```python +from scrapingbee import ScrapingBeeClient + +client = ScrapingBeeClient(api_key='YOUR-API-KEY') + +response = client.html_api( + 'https://www.scrapingbee.com/', + params={ + 'screenshot': True, + 'screenshot_full_page': True, + 'window_width': 375, + } +) + +with open('screenshot.png', 'wb') as f: + f.write(response.content) +``` + +## Retries + +The client includes a retry mechanism for 5XX responses. + +```python +client.html_api(url, params={...}, retries=5) +``` + +## Using ScrapingBee with Scrapy + +Scrapy is the most popular Python web scraping framework. You can easily [integrate ScrapingBee's API with the Scrapy middleware](https://github.com/ScrapingBee/scrapy-scrapingbee). \ No newline at end of file diff --git a/manual-test.py b/manual-test.py new file mode 100644 index 0000000..b29fed8 --- /dev/null +++ b/manual-test.py @@ -0,0 +1,605 @@ +import os +from scrapingbee import ScrapingBeeClient + +API_KEY = os.environ.get("SCRAPINGBEE_API_KEY") +client = ScrapingBeeClient(API_KEY) + + +# ============================================ +# Helper Functions +# ============================================ + +def assert_test(condition, message): + if not condition: + raise AssertionError(message) + + +# ============================================ +# HTML API Tests (Legacy) +# ============================================ + +def test_html_get(): + print("=== Testing HTML API - GET ===") + try: + response = client.get( + url="https://httpbin.org/get", + params={"render_js": False} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + assert_test(response.text, "Response is empty") + assert_test("httpbin" in response.text, "Response does not contain expected content") + + print(f"Status: {response.status_code}") + print("✅ HTML GET test passed!\n") + except Exception as e: + print(f"❌ HTML GET test failed: {e}\n") + raise + + +def test_html_post(): + print("=== Testing HTML API - POST ===") + try: + response = client.post( + url="https://httpbin.org/post", + params={"render_js": False}, + data={"test": "data"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + assert_test(response.text, "Response is empty") + assert_test("test" in response.text, "Response does not contain posted data") + + print(f"Status: {response.status_code}") + print("✅ HTML POST test passed!\n") + except Exception as e: + print(f"❌ HTML POST test failed: {e}\n") + raise + + +# ============================================ +# HTML API Tests (New) +# ============================================ + +def test_html_api_get(): + print("=== Testing HTML API (New) - GET ===") + try: + response = client.html_api( + url="https://httpbin.org/get", + method="GET", + params={"render_js": False} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + assert_test(response.text, "Response is empty") + assert_test("httpbin" in response.text, "Response does not contain expected content") + + print(f"Status: {response.status_code}") + print("✅ HTML API GET test passed!\n") + except Exception as e: + print(f"❌ HTML API GET test failed: {e}\n") + raise + + +def test_html_api_post(): + print("=== Testing HTML API (New) - POST ===") + try: + response = client.html_api( + url="https://httpbin.org/post", + method="POST", + params={"render_js": False}, + data={"test": "data"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + 
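+        # httpbin.org/post echoes the submitted form fields back in the JSON body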
+        assert_test(response.text, "Response is empty")
+        assert_test("test" in response.text, "Response does not contain posted data")
+
+        print(f"Status: {response.status_code}")
+        print("✅ HTML API POST test passed!\n")
+    except Exception as e:
+        print(f"❌ HTML API POST test failed: {e}\n")
+        raise
+
+
+def test_html_api_extract_rules():
+    print("=== Testing HTML API - Extract Rules ===")
+    try:
+        response = client.html_api(
+            url="https://www.scrapingbee.com/blog/",
+            params={
+                "render_js": False,
+                "extract_rules": {
+                    "title": "h1",
+                    "posts": {
+                        "selector": ".container > div > div > div",
+                        "type": "list",
+                        "output": {
+                            "title": "h4",
+                            "link": "a@href"
+                        }
+                    }
+                }
+            }
+        )
+
+        assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}")
+
+        data = response.json()
+        assert_test(data.get("title"), "Extracted title is missing")
+        assert_test(isinstance(data.get("posts"), list), "Extracted posts is not a list")
+        assert_test(len(data.get("posts", [])) > 0, "No posts extracted")
+
+        print(f"Status: {response.status_code}")
+        print(f"Extracted title: {data.get('title')}")
+        print(f"Extracted posts count: {len(data.get('posts', []))}")
+        print("✅ HTML API Extract Rules test passed!\n")
+    except Exception as e:
+        print(f"❌ HTML API Extract Rules test failed: {e}\n")
+        raise
+
+
+def test_html_api_js_scenario():
+    print("=== Testing HTML API - JS Scenario ===")
+    try:
+        response = client.html_api(
+            url="https://www.scrapingbee.com",
+            params={
+                "render_js": True,
+                "js_scenario": {
+                    "instructions": [
+                        {"wait": 1000},
+                        {"scroll_y": 500},
+                        {"wait": 500}
+                    ]
+                }
+            }
+        )
+
+        assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}")
+        assert_test(response.text, "Response is empty")
+
+        print(f"Status: {response.status_code}")
+        print(f"Content: {response.text[:300]}")
+        print("✅ HTML API JS Scenario test passed!\n")
+    except Exception as e:
+        print(f"❌ HTML API JS Scenario test failed: {e}\n")
+        raise
+
+
+def test_html_api_screenshot():
+    print("=== Testing HTML API - Screenshot ===")
+    try:
+        response = client.html_api(
+            url="https://www.scrapingbee.com",
+            params={
+                "render_js": True,
+                "screenshot": True,
+                "window_width": 1920,
+                "window_height": 1080
+            }
+        )
+
+        assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}")
+        assert_test(response.content, "Response is empty")
+        assert_test(len(response.content) > 10000, "Screenshot seems too small")
+
+        # Check the PNG magic bytes to confirm the body is a valid image
+        png_signature = b'\x89PNG\r\n\x1a\n'
+        assert_test(response.content[:8] == png_signature, "Response is not a valid PNG")
+
+        print(f"Status: {response.status_code}")
+        print(f"Screenshot size: {len(response.content)} bytes")
+        print("✅ HTML API Screenshot test passed!\n")
+    except Exception as e:
+        print(f"❌ HTML API Screenshot test failed: {e}\n")
+        raise
+
+
+def test_html_api_json_response():
+    print("=== Testing HTML API - JSON Response ===")
+    try:
+        response = client.html_api(
+            url="https://httpbin.org/get",
+            params={
+                "render_js": False,
+                "json_response": True
+            }
+        )
+
+        assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}")
+
+        data = response.json()
+        assert_test(data.get("body") is not None, "JSON response missing body field")
+        assert_test(data.get("xhr") is not None, "JSON response missing xhr field")
+
+        # The body field may come back as a string or as a parsed object
+        body = data.get("body")
+        body_preview = body[:300] if isinstance(body, str) else
str(body)[:300] + + print(f"Status: {response.status_code}") + print(f"Content: {body_preview}") + print("✅ HTML API JSON Response test passed!\n") + except Exception as e: + print(f"❌ HTML API JSON Response test failed: {e}\n") + raise + + +def test_html_api_with_headers(): + print("=== Testing HTML API - Custom Headers ===") + try: + response = client.html_api( + url="https://httpbin.org/headers", + params={"render_js": False}, + headers={"X-Custom-Header": "CustomValue123"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + assert_test("CustomValue123" in response.text, "Custom header not forwarded") + + print(f"Status: {response.status_code}") + print("✅ HTML API Custom Headers test passed!\n") + except Exception as e: + print(f"❌ HTML API Custom Headers test failed: {e}\n") + raise + + +def test_html_api_with_cookies(): + print("=== Testing HTML API - Custom Cookies ===") + try: + response = client.html_api( + url="https://httpbin.org/cookies", + params={"render_js": False}, + cookies={"session_id": "abc123", "user_token": "xyz789"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + assert_test("abc123" in response.text or "xyz789" in response.text, "Cookies not forwarded") + + print(f"Status: {response.status_code}") + print("✅ HTML API Custom Cookies test passed!\n") + except Exception as e: + print(f"❌ HTML API Custom Cookies test failed: {e}\n") + raise + + +def test_html_api_post_with_headers_and_cookies(): + print("=== Testing HTML API - POST with Headers + Cookies ===") + try: + response = client.html_api( + url="https://httpbin.org/post", + method="POST", + params={"render_js": False}, + headers={"X-Test-Header": "TestValue"}, + cookies={"session": "mysession123"}, + data={"action": "submit"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + assert_test("submit" in response.text, "Posted data not in response") + + print(f"Status: {response.status_code}") + print("✅ HTML API POST with Headers + Cookies test passed!\n") + except Exception as e: + print(f"❌ HTML API POST with Headers + Cookies test failed: {e}\n") + raise + + +# ============================================ +# Google Search API +# ============================================ + +def test_google_search(): + print("=== Testing Google Search API ===") + try: + response = client.google_search( + search="scrapingbee", + params={"language": "en", "country_code": "us"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("organic_results"), "Missing organic_results in response") + assert_test(isinstance(data.get("organic_results"), list), "organic_results is not a list") + assert_test(len(data.get("organic_results", [])) > 0, "No organic results found") + + print(f"Status: {response.status_code}") + print(f"Results found: {len(data.get('organic_results', []))}") + print("✅ Google Search test passed!\n") + except Exception as e: + print(f"❌ Google Search test failed: {e}\n") + raise + + +# ============================================ +# Amazon API +# ============================================ + +def test_amazon_search(): + print("=== Testing Amazon Search API ===") + try: + response = client.amazon_search( + query="laptop", + params={"domain": "com", "pages": 1} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data 
= response.json() + assert_test(data.get("products"), "Missing products in response") + assert_test(isinstance(data.get("products"), list), "products is not a list") + assert_test(len(data.get("products", [])) > 0, "No products found") + + print(f"Status: {response.status_code}") + print(f"Results found: {len(data.get('products', []))}") + print("✅ Amazon Search test passed!\n") + except Exception as e: + print(f"❌ Amazon Search test failed: {e}\n") + raise + + +def test_amazon_product(): + print("=== Testing Amazon Product API ===") + try: + response = client.amazon_product( + query="B0D2Q9397Y", + params={"domain": "com"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("title"), "Missing product title in response") + + print(f"Status: {response.status_code}") + print(f"Product title: {data.get('title', '')[:50]}") + print("✅ Amazon Product test passed!\n") + except Exception as e: + print(f"❌ Amazon Product test failed: {e}\n") + raise + + +# ============================================ +# Walmart API +# ============================================ + +def test_walmart_search(): + print("=== Testing Walmart Search API ===") + try: + response = client.walmart_search( + query="laptop", + params={"device": "desktop", "sort_by": "best_match"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("products"), "Missing products in response") + assert_test(isinstance(data.get("products"), list), "products is not a list") + assert_test(len(data.get("products", [])) > 0, "No products found") + + print(f"Status: {response.status_code}") + print(f"Results found: {len(data.get('products', []))}") + print("✅ Walmart Search test passed!\n") + except Exception as e: + print(f"❌ Walmart Search test failed: {e}\n") + raise + + +def test_walmart_product(): + print("=== Testing Walmart Product API ===") + try: + response = client.walmart_product( + product_id="454408250", + params={"device": "desktop"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("title"), "Missing product title in response") + + print(f"Status: {response.status_code}") + print(f"Product title: {data.get('title', '')[:50]}") + print("✅ Walmart Product test passed!\n") + except Exception as e: + print(f"❌ Walmart Product test failed: {e}\n") + raise + + +# ============================================ +# YouTube API +# ============================================ + +def test_youtube_search(): + print("=== Testing YouTube Search API ===") + try: + response = client.youtube_search( + search="web scraping tutorial", + params={"sort_by": "relevance", "type": "video"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("results"), "Missing results in response") + assert_test(isinstance(data.get("results"), list), "results is not a list") + assert_test(len(data.get("results", [])) > 0, "No results found") + + print(f"Status: {response.status_code}") + print(f"Results found: {len(data.get('results', []))}") + print("✅ YouTube Search test passed!\n") + except Exception as e: + print(f"❌ YouTube Search test failed: {e}\n") + raise + + +def test_youtube_metadata(): + print("=== Testing YouTube Metadata API ===") + try: + response = 
client.youtube_metadata(video_id="dQw4w9WgXcQ") + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("title") or data.get("like_count") is not None, "Missing expected metadata fields") + + print(f"Status: {response.status_code}") + print(f"Like count: {data.get('like_count')}") + print("✅ YouTube Metadata test passed!\n") + except Exception as e: + print(f"❌ YouTube Metadata test failed: {e}\n") + raise + + +def test_youtube_transcript(): + print("=== Testing YouTube Transcript API ===") + try: + response = client.youtube_transcript( + video_id="sfyL4BswUeE", + params={"language": "en"} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("text") or data.get("transcript"), "Missing transcript in response") + + transcript_preview = (data.get("text") or str(data.get("transcript", "")))[:100] + print(f"Status: {response.status_code}") + print(f"Transcript preview: {transcript_preview}") + print("✅ YouTube Transcript test passed!\n") + except Exception as e: + print(f"❌ YouTube Transcript test failed: {e}\n") + raise + + +def test_youtube_trainability(): + print("=== Testing YouTube Trainability API ===") + try: + response = client.youtube_trainability(video_id="dQw4w9WgXcQ") + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("permitted") is not None, "Missing permitted field in response") + + print(f"Status: {response.status_code}") + print(f"Permitted: {data.get('permitted')}") + print("✅ YouTube Trainability test passed!\n") + except Exception as e: + print(f"❌ YouTube Trainability test failed: {e}\n") + raise + + +# ============================================ +# ChatGPT API +# ============================================ + +def test_chatgpt(): + print("=== Testing ChatGPT API ===") + try: + response = client.chatgpt( + prompt="What is web scraping? 
Answer in one sentence.", + params={"search": True} + ) + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("results_text") or data.get("results_markdown"), "Missing response text") + + response_text = (data.get("results_text") or data.get("results_markdown", ""))[:100] + print(f"Status: {response.status_code}") + print(f"Response: {response_text}") + print("✅ ChatGPT test passed!\n") + except Exception as e: + print(f"❌ ChatGPT test failed: {e}\n") + raise + + +# ============================================ +# Usage API +# ============================================ + +def test_usage(): + print("=== Testing Usage API ===") + try: + response = client.usage() + + assert_test(response.status_code == 200, f"Expected status 200, got {response.status_code}") + + data = response.json() + assert_test(data.get("max_api_credit") is not None, "Missing max_api_credit") + assert_test(data.get("used_api_credit") is not None, "Missing used_api_credit") + assert_test(data.get("max_concurrency") is not None, "Missing max_concurrency") + + print(f"Status: {response.status_code}") + print(f"Max API credits: {data.get('max_api_credit')}") + print(f"Used API credits: {data.get('used_api_credit')}") + print(f"Max concurrency: {data.get('max_concurrency')}") + print("✅ Usage test passed!\n") + except Exception as e: + print(f"❌ Usage test failed: {e}\n") + raise + + +# ============================================ +# Run All Tests +# ============================================ + +def run_tests(): + print("\n🚀 Starting ScrapingBee Python SDK Tests\n") + + tests = [ + # Legacy HTML API + test_html_get, + test_html_post, + + # New HTML API + test_html_api_get, + test_html_api_post, + test_html_api_extract_rules, + test_html_api_js_scenario, + test_html_api_screenshot, + test_html_api_json_response, + test_html_api_with_headers, + test_html_api_with_cookies, + test_html_api_post_with_headers_and_cookies, + + # Other APIs + test_google_search, + test_amazon_search, + test_amazon_product, + test_walmart_search, + test_walmart_product, + test_youtube_search, + test_youtube_metadata, + test_youtube_transcript, + test_youtube_trainability, + test_chatgpt, + test_usage, + ] + + passed = 0 + failed = 0 + + for test in tests: + try: + test() + passed += 1 + except Exception: + failed += 1 + + print("🏁 All tests completed!") + print(f"✅ Passed: {passed}") + print(f"❌ Failed: {failed}") + print(f"📊 Total: {len(tests)}\n") + + if failed > 0: + exit(1) + + +if __name__ == "__main__": + run_tests() diff --git a/scrapingbee/__version__.py b/scrapingbee/__version__.py index 0309ae2..9aa3f90 100644 --- a/scrapingbee/__version__.py +++ b/scrapingbee/__version__.py @@ -1 +1 @@ -__version__ = "2.0.2" +__version__ = "2.1.0" diff --git a/scrapingbee/client.py b/scrapingbee/client.py index 67f72c3..766b60b 100644 --- a/scrapingbee/client.py +++ b/scrapingbee/client.py @@ -1,68 +1,116 @@ -from typing import Optional +import warnings +from functools import wraps + from requests import Response, Session from requests.adapters import HTTPAdapter from urllib3.util import Retry -from .utils import get_scrapingbee_url, process_headers + +from .utils import process_headers, process_params + + +def deprecated(reason): + """Decorator to mark functions as deprecated.""" + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + warnings.warn( + f"{func.__name__}() is deprecated. 
{reason}", + category=DeprecationWarning, + stacklevel=2 + ) + return func(*args, **kwargs) + return wrapper + return decorator class ScrapingBeeClient: - api_url = "https://app.scrapingbee.com/api/v1/" + # API Endpoints + HTML_API_URL = "https://app.scrapingbee.com/api/v1/" + GOOGLE_API_URL = "https://app.scrapingbee.com/api/v1/store/google" + AMAZON_SEARCH_API_URL = "https://app.scrapingbee.com/api/v1/amazon/search" + AMAZON_PRODUCT_API_URL = "https://app.scrapingbee.com/api/v1/amazon/product" + WALMART_SEARCH_API_URL = "https://app.scrapingbee.com/api/v1/walmart/search" + WALMART_PRODUCT_API_URL = "https://app.scrapingbee.com/api/v1/walmart/product" + YOUTUBE_SEARCH_API_URL = "https://app.scrapingbee.com/api/v1/youtube/search" + YOUTUBE_METADATA_API_URL = "https://app.scrapingbee.com/api/v1/youtube/metadata" + YOUTUBE_TRANSCRIPT_API_URL = "https://app.scrapingbee.com/api/v1/youtube/transcript" + YOUTUBE_TRAINABILITY_API_URL = "https://app.scrapingbee.com/api/v1/youtube/trainability" + CHATGPT_API_URL = "https://app.scrapingbee.com/api/v1/chatgpt" + USAGE_API_URL = "https://app.scrapingbee.com/api/v1/usage" def __init__(self, api_key: str): self.api_key = api_key + # ============================================ + # Core Request Method + # ============================================ + def request( self, method: str, url: str, - params: Optional[dict] = None, - data: Optional[dict] = None, - json: Optional[dict] = None, - headers: Optional[dict] = None, - cookies: Optional[dict] = None, - retries: Optional[int] = None, + params: dict, + headers: dict = None, + data: dict = None, + json: dict = None, + retries: int = None, **kwargs ) -> Response: - if not params: - params = {} - - # Process headers and set forward_headers - if headers: - params["forward_headers"] = True - headers = process_headers(headers) - - # Add cookies to params - if cookies: - # ScrapingBee reads cookies from url parameters - params["cookies"] = cookies - - # Get ScrapingBee API URL - spb_url = get_scrapingbee_url(self.api_url, self.api_key, url, params) + """Core request method - adds api_key and makes the HTTP call.""" + params["api_key"] = self.api_key session = Session() if retries: - # Retries if it is a network error or a 5xx error on an idempotent request (GET) - retries = Retry(total=retries, raise_on_status=False, status_forcelist=frozenset(range(500, 600))) - session.mount('https://', HTTPAdapter(max_retries=retries)) - session.mount('http://', HTTPAdapter(max_retries=retries)) + retry_strategy = Retry( + total=retries, + raise_on_status=False, + status_forcelist=frozenset(range(500, 600)) + ) + session.mount("https://", HTTPAdapter(max_retries=retry_strategy)) + session.mount("http://", HTTPAdapter(max_retries=retry_strategy)) + + if json is not None: + return session.request(method, url, params=params, json=json, headers=headers, **kwargs) + return session.request(method, url, params=params, data=data, headers=headers, **kwargs) - if not data and json is not None: - return session.request(method, spb_url, json=json, headers=headers, **kwargs) - return session.request(method, spb_url, data=data, headers=headers, **kwargs) + # ============================================ + # HTML API (Legacy - WILL BE REMOVED) + # ============================================ + @deprecated("Please use html_api() instead. 
This method will be removed in a future version.")
     def get(
         self,
         url: str,
         params: dict = None,
         headers: dict = None,
         cookies: dict = None,
-        retries: Optional[int] = None,
+        retries: int = None,
         **kwargs
     ) -> Response:
-        return self.request("GET", url, params=params, headers=headers, cookies=cookies, retries=retries, **kwargs)
+        """HTML API - GET request. DEPRECATED: Use html_api() instead."""
+        if params is None:
+            params = {}
+
+        params["url"] = url
+        if cookies:
+            params["cookies"] = cookies
+
+        processed_headers = process_headers(headers)
+        if headers:
+            params["forward_headers"] = True
+
+        return self.request(
+            method="GET",
+            url=self.HTML_API_URL,
+            params=process_params(params),
+            headers=processed_headers,
+            retries=retries,
+            **kwargs
+        )
+
+    @deprecated("Please use html_api() instead. This method will be removed in a future version.")
     def post(
         self,
         url: str,
@@ -71,8 +119,305 @@ def post(
         json: dict = None,
         headers: dict = None,
         cookies: dict = None,
+        retries: int = None,
+        **kwargs
+    ) -> Response:
+        """HTML API - POST request. DEPRECATED: Use html_api() instead."""
+        if params is None:
+            params = {}
+
+        params["url"] = url
+        if cookies:
+            params["cookies"] = cookies
+
+        processed_headers = process_headers(headers)
+        if headers:
+            params["forward_headers"] = True
+
+        return self.request(
+            method="POST",
+            url=self.HTML_API_URL,
+            params=process_params(params),
+            headers=processed_headers,
+            data=data,
+            json=json,
+            retries=retries,
+            **kwargs
+        )
+
+    # ============================================
+    # HTML API (New)
+    # ============================================
+
+    def html_api(
+        self,
+        url: str,
+        method: str = "GET",
+        params: dict = None,
+        data: dict = None,
+        json: dict = None,
+        headers: dict = None,
+        cookies: dict = None,
+        retries: int = None,
+        **kwargs
+    ) -> Response:
+        """HTML API - Scrape any webpage."""
+        if params is None:
+            params = {}
+
+        params["url"] = url
+        if cookies:
+            params["cookies"] = cookies
+
+        processed_headers = process_headers(headers)
+        if headers:
+            params["forward_headers"] = True
+
+        return self.request(
+            method=method,
+            url=self.HTML_API_URL,
+            params=process_params(params),
+            headers=processed_headers,
+            data=data,
+            json=json,
+            retries=retries,
+            **kwargs
+        )
+
+    # ============================================
+    # Google Search API
+    # ============================================
+
+    def google_search(
+        self,
+        search: str,
+        params: dict = None,
+        retries: int = None,
+        **kwargs
+    ) -> Response:
+        """Google Search API - Scrape Google search results."""
+        if params is None:
+            params = {}
+        params["search"] = search
+
+        return self.request(
+            method="GET",
+            url=self.GOOGLE_API_URL,
+            params=params,
+            retries=retries,
+            **kwargs
+        )
+
+    # ============================================
+    # Amazon API
+    # ============================================
+
+    def amazon_search(
+        self,
+        query: str,
+        params: dict = None,
+        retries: int = None,
+        **kwargs
+    ) -> Response:
+        """Amazon Search API - Scrape Amazon search results."""
+        if params is None:
+            params = {}
+        params["query"] = query
+
+        return self.request(
+            method="GET",
+            url=self.AMAZON_SEARCH_API_URL,
+            params=params,
+            retries=retries,
+            **kwargs
+        )
+
+    def amazon_product(
+        self,
+        query: str,
+        params: dict = None,
+        retries: int = None,
+        **kwargs
+    ) -> Response:
+        """Amazon Product API - Scrape Amazon product details."""
+        if params is None:
+            params = {}
+        params["query"] = query
+
+        return self.request(
+            method="GET",
+            url=self.AMAZON_PRODUCT_API_URL,
+            params=params,
+
retries=retries, + **kwargs + ) + + # ============================================ + # Walmart API + # ============================================ + + def walmart_search( + self, + query: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """Walmart Search API - Scrape Walmart search results.""" + if params is None: + params = {} + params["query"] = query + + return self.request( + method="GET", + url=self.WALMART_SEARCH_API_URL, + params=params, + retries=retries, + **kwargs + ) + + def walmart_product( + self, + product_id: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """Walmart Product API - Scrape Walmart product details.""" + if params is None: + params = {} + params["product_id"] = product_id + + return self.request( + method="GET", + url=self.WALMART_PRODUCT_API_URL, + params=params, + retries=retries, + **kwargs + ) + + # ============================================ + # YouTube API + # ============================================ + + def youtube_search( + self, + search: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """YouTube Search API - Scrape YouTube search results.""" + if params is None: + params = {} + params["search"] = search + + return self.request( + method="GET", + url=self.YOUTUBE_SEARCH_API_URL, + params=params, + retries=retries, + **kwargs + ) + + def youtube_metadata( + self, + video_id: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """YouTube Metadata API - Get YouTube video metadata.""" + if params is None: + params = {} + params["video_id"] = video_id + + return self.request( + method="GET", + url=self.YOUTUBE_METADATA_API_URL, + params=params, + retries=retries, + **kwargs + ) + + def youtube_transcript( + self, + video_id: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """YouTube Transcript API - Get YouTube video transcript.""" + if params is None: + params = {} + params["video_id"] = video_id + + return self.request( + method="GET", + url=self.YOUTUBE_TRANSCRIPT_API_URL, + params=params, + retries=retries, + **kwargs + ) + + def youtube_trainability( + self, + video_id: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """YouTube Trainability API - Check video trainability.""" + if params is None: + params = {} + params["video_id"] = video_id + + return self.request( + method="GET", + url=self.YOUTUBE_TRAINABILITY_API_URL, + params=params, + retries=retries, + **kwargs + ) + + # ============================================ + # ChatGPT API + # ============================================ + + def chatgpt( + self, + prompt: str, + params: dict = None, + retries: int = None, + **kwargs + ) -> Response: + """ChatGPT API - Use ChatGPT with optional web search.""" + if params is None: + params = {} + params["prompt"] = prompt + + return self.request( + method="GET", + url=self.CHATGPT_API_URL, + params=params, + retries=retries, + **kwargs + ) + + # ============================================ + # Usage API + # ============================================ + + def usage( + self, + retries: int = None, **kwargs ) -> Response: + """Usage API - Check API credit usage and account limits.""" return self.request( - "POST", url, params=params, data=data, json=json, headers=headers, cookies=cookies, **kwargs + method="GET", + url=self.USAGE_API_URL, + params={}, + retries=retries, + **kwargs ) diff --git a/scrapingbee/utils.py b/scrapingbee/utils.py index 7501533..ffb3d7c 
100644 --- a/scrapingbee/utils.py +++ b/scrapingbee/utils.py @@ -1,6 +1,5 @@ import base64 import json -import urllib from typing import Optional from .__version__ import __version__ @@ -53,20 +52,3 @@ def process_params(params: dict) -> dict: else: new_params[k] = v return new_params - - -def get_scrapingbee_url(api_url: str, api_key: str, url: str, params: dict) -> str: - all_params = { - 'api_key': api_key, - 'url': url - } - if params: - all_params.update(params) - - # Process params - spb_params = process_params(all_params) - - # Format url query string - qs = urllib.parse.urlencode(spb_params) - - return f'{api_url}?{qs}' diff --git a/tests/test_client.py b/tests/test_client.py index 2a3d0f8..a10d1a8 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -11,6 +11,10 @@ def client(): return ScrapingBeeClient(api_key='API_KEY') +# ============================================ +# Legacy HTML API Tests (get) +# ============================================ + @mock.patch('scrapingbee.client.Session') def test_get(mock_session, client): '''It should make a GET request with the url and API key''' @@ -18,8 +22,8 @@ def test_get(mock_session, client): mock_session.return_value.request.assert_called_with( 'GET', - 'https://app.scrapingbee.com/api/v1/' - '?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org'}, data=None, headers=DEFAULT_HEADERS ) @@ -27,13 +31,13 @@ def test_get(mock_session, client): @mock.patch('scrapingbee.client.Session') def test_get_with_params(mock_session, client): - '''It should add parameters to the url''' + '''It should add parameters to the request''' client.get('https://httpbin.org', params={'render_js': True}) mock_session.return_value.request.assert_called_with( 'GET', - 'https://app.scrapingbee.com/api/v1/' - '?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org&render_js=True', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org', 'render_js': True}, data=None, headers=DEFAULT_HEADERS, ) @@ -46,17 +50,16 @@ def test_get_with_headers(mock_session, client): mock_session.return_value.request.assert_called_with( 'GET', - 'https://app.scrapingbee.com/api/v1/' - '?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org&forward_headers=True', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org', 'forward_headers': True}, data=None, - headers={'Spb-Content-Type': 'text/html; charset=utf-8', - **DEFAULT_HEADERS}, + headers={'Spb-Content-Type': 'text/html; charset=utf-8', **DEFAULT_HEADERS}, ) @mock.patch('scrapingbee.client.Session') def test_get_with_cookies(mock_session, client): - '''It should format the cookies and add them to the url''' + '''It should format the cookies and add them to the params''' client.get('https://httpbin.org', cookies={ 'name_1': 'value_1', 'name_2': 'value_2', @@ -64,8 +67,8 @@ def test_get_with_cookies(mock_session, client): mock_session.return_value.request.assert_called_with( 'GET', - 'https://app.scrapingbee.com/api/v1/' - '?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org&cookies=name_1%3Dvalue_1%3Bname_2%3Dvalue_2', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org', 'cookies': 'name_1=value_1;name_2=value_2'}, data=None, headers=DEFAULT_HEADERS, ) @@ -73,7 +76,7 @@ def test_get_with_cookies(mock_session, client): @mock.patch('scrapingbee.client.Session') def test_get_with_extract_rules(mock_session, 
client): - '''It should format the extract_rules and add them to the url''' + '''It should format the extract_rules and add them to the params''' client.get('https://httpbin.org', params={ 'extract_rules': { "title": "h1", @@ -83,10 +86,12 @@ def test_get_with_extract_rules(mock_session, client): mock_session.return_value.request.assert_called_with( 'GET', - 'https://app.scrapingbee.com/api/v1/' - '?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org&' - 'extract_rules=%7B%22title%22%3A+%22h1%22%2C+%22' - 'subtitle%22%3A+%22%23subtitle%22%7D', + 'https://app.scrapingbee.com/api/v1/', + params={ + 'api_key': 'API_KEY', + 'url': 'https://httpbin.org', + 'extract_rules': '{"title": "h1", "subtitle": "#subtitle"}' + }, data=None, headers=DEFAULT_HEADERS, ) @@ -94,7 +99,7 @@ def test_get_with_extract_rules(mock_session, client): @mock.patch('scrapingbee.client.Session') def test_get_with_js_scenario(mock_session, client): - '''It should format the extract_rules and add them to the url''' + '''It should format the js_scenario and add them to the params''' client.get('https://httpbin.org', params={ 'js_scenario': { 'instructions': [ @@ -105,9 +110,12 @@ def test_get_with_js_scenario(mock_session, client): mock_session.return_value.request.assert_called_with( 'GET', - 'https://app.scrapingbee.com/api/v1/' - '?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org&' - 'js_scenario=%7B%22instructions%22%3A+%5B%7B%22click%22%3A+%22%23buttonId%22%7D%5D%7D', + 'https://app.scrapingbee.com/api/v1/', + params={ + 'api_key': 'API_KEY', + 'url': 'https://httpbin.org', + 'js_scenario': '{"instructions": [{"click": "#buttonId"}]}' + }, data=None, headers=DEFAULT_HEADERS, ) @@ -115,7 +123,7 @@ def test_get_with_js_scenario(mock_session, client): @mock.patch('scrapingbee.client.Session') def test_get_with_ai_extract_rules(mock_session, client): - '''It should format the ai_extract_rules and add them to the url''' + '''It should format the ai_extract_rules and add them to the params''' client.get('https://httpbin.org', params={ 'ai_extract_rules': { "product_name": "The name of the product", @@ -125,15 +133,21 @@ def test_get_with_ai_extract_rules(mock_session, client): mock_session.return_value.request.assert_called_with( 'GET', - 'https://app.scrapingbee.com/api/v1/' - '?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org&' - 'ai_extract_rules=%7B%22product_name%22%3A+%22The+name+of+the+product%22%2C+%22' - 'price%22%3A+%22The+price+in+USD%22%7D', + 'https://app.scrapingbee.com/api/v1/', + params={ + 'api_key': 'API_KEY', + 'url': 'https://httpbin.org', + 'ai_extract_rules': '{"product_name": "The name of the product", "price": "The price in USD"}' + }, data=None, headers=DEFAULT_HEADERS, ) +# ============================================ +# Legacy HTML API Tests (post) +# ============================================ + @mock.patch('scrapingbee.client.Session') def test_post(mock_session, client): '''It should make a POST request with some data''' @@ -141,7 +155,483 @@ def test_post(mock_session, client): mock_session.return_value.request.assert_called_with( 'POST', - 'https://app.scrapingbee.com/api/v1/?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org'}, + data={'KEY_1': 'VALUE_1'}, + headers=DEFAULT_HEADERS + ) + + +# ============================================ +# New HTML API Tests (html_api) +# ============================================ + +@mock.patch('scrapingbee.client.Session') +def test_html_api_get(mock_session, client): 
+ '''It should make a GET request with html_api''' + client.html_api('https://httpbin.org') + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org'}, + data=None, + headers=DEFAULT_HEADERS + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_post(mock_session, client): + '''It should make a POST request with html_api''' + client.html_api('https://httpbin.org', method='POST') + + mock_session.return_value.request.assert_called_with( + 'POST', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org'}, + data=None, + headers=DEFAULT_HEADERS + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_with_params(mock_session, client): + '''It should add parameters to html_api request''' + client.html_api('https://httpbin.org', params={'render_js': True, 'premium_proxy': True}) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org', 'render_js': True, 'premium_proxy': True}, + data=None, + headers=DEFAULT_HEADERS + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_with_headers(mock_session, client): + '''It should prefix header names with Spb- and set forward_headers''' + client.html_api('https://httpbin.org', headers={'Content-Type': 'text/html; charset=utf-8'}) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org', 'forward_headers': True}, + data=None, + headers={'Spb-Content-Type': 'text/html; charset=utf-8', **DEFAULT_HEADERS}, + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_with_cookies(mock_session, client): + '''It should format the cookies and add them to the params''' + client.html_api('https://httpbin.org', cookies={ + 'name_1': 'value_1', + 'name_2': 'value_2', + }) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org', 'cookies': 'name_1=value_1;name_2=value_2'}, + data=None, + headers=DEFAULT_HEADERS, + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_with_headers_and_cookies(mock_session, client): + '''It should handle headers and cookies in html_api''' + client.html_api( + 'https://httpbin.org', + method='POST', + headers={'X-Custom': 'value'}, + cookies={'session': 'abc123'} + ) + + mock_session.return_value.request.assert_called_with( + 'POST', + 'https://app.scrapingbee.com/api/v1/', + params={ + 'api_key': 'API_KEY', + 'url': 'https://httpbin.org', + 'cookies': 'session=abc123', + 'forward_headers': True + }, + data=None, + headers={'Spb-X-Custom': 'value', **DEFAULT_HEADERS} + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_with_extract_rules(mock_session, client): + '''It should format the extract_rules and add them to the params''' + client.html_api('https://httpbin.org', params={ + 'extract_rules': { + "title": "h1", + "subtitle": "#subtitle" + } + }) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/', + params={ + 'api_key': 'API_KEY', + 'url': 'https://httpbin.org', + 'extract_rules': '{"title": "h1", "subtitle": "#subtitle"}' + }, + data=None, + headers=DEFAULT_HEADERS, + ) + + 
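+# A minimal extra check (sketch): when retries is set, request() should mount
+# a retry-enabled HTTPAdapter on the session for both URL schemes.
+@mock.patch('scrapingbee.client.Session')
+def test_html_api_with_retries(mock_session, client):
+    '''It should mount retrying adapters when retries is set'''
+    client.html_api('https://httpbin.org', retries=3)
+
+    # request() mounts an adapter for https:// and another for http://
+    assert mock_session.return_value.mount.call_count == 2
+
+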
+@mock.patch('scrapingbee.client.Session') +def test_html_api_with_js_scenario(mock_session, client): + '''It should format the js_scenario and add them to the params''' + client.html_api('https://httpbin.org', params={ + 'js_scenario': { + 'instructions': [ + {"click": "#buttonId"} + ] + } + }) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/', + params={ + 'api_key': 'API_KEY', + 'url': 'https://httpbin.org', + 'js_scenario': '{"instructions": [{"click": "#buttonId"}]}' + }, + data=None, + headers=DEFAULT_HEADERS, + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_with_ai_extract_rules(mock_session, client): + '''It should format the ai_extract_rules and add them to the params''' + client.html_api('https://httpbin.org', params={ + 'ai_extract_rules': { + "product_name": "The name of the product", + "price": "The price in USD" + } + }) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/', + params={ + 'api_key': 'API_KEY', + 'url': 'https://httpbin.org', + 'ai_extract_rules': '{"product_name": "The name of the product", "price": "The price in USD"}' + }, + data=None, + headers=DEFAULT_HEADERS, + ) + + +@mock.patch('scrapingbee.client.Session') +def test_html_api_post_with_data(mock_session, client): + '''It should make a POST request with some data''' + client.html_api('https://httpbin.org', method='POST', data={'KEY_1': 'VALUE_1'}) + + mock_session.return_value.request.assert_called_with( + 'POST', + 'https://app.scrapingbee.com/api/v1/', + params={'api_key': 'API_KEY', 'url': 'https://httpbin.org'}, data={'KEY_1': 'VALUE_1'}, headers=DEFAULT_HEADERS ) + + +# ============================================ +# Google Search API Tests +# ============================================ + +@mock.patch('scrapingbee.client.Session') +def test_google_search(mock_session, client): + '''It should make a Google Search request''' + client.google_search('test query') + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/store/google', + params={'api_key': 'API_KEY', 'search': 'test query'}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_google_search_with_params(mock_session, client): + '''It should add parameters to Google Search request''' + client.google_search('test query', params={'language': 'en', 'country_code': 'us'}) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/store/google', + params={'api_key': 'API_KEY', 'search': 'test query', 'language': 'en', 'country_code': 'us'}, + data=None, + headers=None + ) + + +# ============================================ +# Amazon API Tests +# ============================================ + +@mock.patch('scrapingbee.client.Session') +def test_amazon_search(mock_session, client): + '''It should make an Amazon Search request''' + client.amazon_search('laptop') + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/amazon/search', + params={'api_key': 'API_KEY', 'query': 'laptop'}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_amazon_search_with_params(mock_session, client): + '''It should add parameters to Amazon Search request''' + client.amazon_search('laptop', params={'domain': 'com', 'pages': 2}) + + mock_session.return_value.request.assert_called_with( + 'GET', + 
'https://app.scrapingbee.com/api/v1/amazon/search', + params={'api_key': 'API_KEY', 'query': 'laptop', 'domain': 'com', 'pages': 2}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_amazon_product(mock_session, client): + '''It should make an Amazon Product request''' + client.amazon_product('B0D2Q9397Y') + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/amazon/product', + params={'api_key': 'API_KEY', 'query': 'B0D2Q9397Y'}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_amazon_product_with_params(mock_session, client): + '''It should add parameters to Amazon Product request''' + client.amazon_product('B0D2Q9397Y', params={'domain': 'com'}) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/amazon/product', + params={'api_key': 'API_KEY', 'query': 'B0D2Q9397Y', 'domain': 'com'}, + data=None, + headers=None + ) + + +# ============================================ +# Walmart API Tests +# ============================================ + +@mock.patch('scrapingbee.client.Session') +def test_walmart_search(mock_session, client): + '''It should make a Walmart Search request''' + client.walmart_search('laptop') + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/walmart/search', + params={'api_key': 'API_KEY', 'query': 'laptop'}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_walmart_search_with_params(mock_session, client): + '''It should add parameters to Walmart Search request''' + client.walmart_search('laptop', params={'sort_by': 'best_match'}) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/walmart/search', + params={'api_key': 'API_KEY', 'query': 'laptop', 'sort_by': 'best_match'}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_walmart_product(mock_session, client): + '''It should make a Walmart Product request''' + client.walmart_product('123456789') + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/walmart/product', + params={'api_key': 'API_KEY', 'product_id': '123456789'}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_walmart_product_with_params(mock_session, client): + '''It should add parameters to Walmart Product request''' + client.walmart_product('123456789', params={'device': 'desktop'}) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/walmart/product', + params={'api_key': 'API_KEY', 'product_id': '123456789', 'device': 'desktop'}, + data=None, + headers=None + ) + + +# ============================================ +# YouTube API Tests +# ============================================ + +@mock.patch('scrapingbee.client.Session') +def test_youtube_search(mock_session, client): + '''It should make a YouTube Search request''' + client.youtube_search('web scraping') + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/youtube/search', + params={'api_key': 'API_KEY', 'search': 'web scraping'}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_youtube_search_with_params(mock_session, client): + '''It should add parameters to YouTube Search request''' + 
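+    # Optional params should pass through untouched alongside the injected search/api_key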
client.youtube_search('web scraping', params={'sort_by': 'relevance', 'type': 'video'}) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/youtube/search', + params={'api_key': 'API_KEY', 'search': 'web scraping', 'sort_by': 'relevance', 'type': 'video'}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_youtube_metadata(mock_session, client): + '''It should make a YouTube Metadata request''' + client.youtube_metadata('dQw4w9WgXcQ') + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/youtube/metadata', + params={'api_key': 'API_KEY', 'video_id': 'dQw4w9WgXcQ'}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_youtube_transcript(mock_session, client): + '''It should make a YouTube Transcript request''' + client.youtube_transcript('dQw4w9WgXcQ') + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/youtube/transcript', + params={'api_key': 'API_KEY', 'video_id': 'dQw4w9WgXcQ'}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_youtube_transcript_with_params(mock_session, client): + '''It should add parameters to YouTube Transcript request''' + client.youtube_transcript('dQw4w9WgXcQ', params={'language': 'en'}) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/youtube/transcript', + params={'api_key': 'API_KEY', 'video_id': 'dQw4w9WgXcQ', 'language': 'en'}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_youtube_trainability(mock_session, client): + '''It should make a YouTube Trainability request''' + client.youtube_trainability('dQw4w9WgXcQ') + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/youtube/trainability', + params={'api_key': 'API_KEY', 'video_id': 'dQw4w9WgXcQ'}, + data=None, + headers=None + ) + + +# ============================================ +# ChatGPT API Tests +# ============================================ + +@mock.patch('scrapingbee.client.Session') +def test_chatgpt(mock_session, client): + '''It should make a ChatGPT request''' + client.chatgpt('What is web scraping?') + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/chatgpt', + params={'api_key': 'API_KEY', 'prompt': 'What is web scraping?'}, + data=None, + headers=None + ) + + +@mock.patch('scrapingbee.client.Session') +def test_chatgpt_with_params(mock_session, client): + '''It should add parameters to ChatGPT request''' + client.chatgpt('What is web scraping?', params={'search': True}) + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/chatgpt', + params={'api_key': 'API_KEY', 'prompt': 'What is web scraping?', 'search': True}, + data=None, + headers=None + ) + + +# ============================================ +# Usage API Tests +# ============================================ + +@mock.patch('scrapingbee.client.Session') +def test_usage(mock_session, client): + '''It should make a Usage request''' + client.usage() + + mock_session.return_value.request.assert_called_with( + 'GET', + 'https://app.scrapingbee.com/api/v1/usage', + params={'api_key': 'API_KEY'}, + data=None, + headers=None + ) diff --git a/tests/test_utils.py b/tests/test_utils.py index 583e497..aa6b13b 100644 --- 
a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,8 +3,7 @@ process_json_stringify_param, process_headers, process_cookies, - process_params, - get_scrapingbee_url, + process_params ) @@ -18,7 +17,7 @@ def test_process_headers(): """It should add a Spb- prefix to header names""" output = process_headers({"Accept-Language": "En-US"}) assert output == { - "User-Agent": "ScrapingBee-Python/2.0.2", + "User-Agent": "ScrapingBee-Python/2.1.0", "Spb-Accept-Language": "En-US", } @@ -57,14 +56,3 @@ def test_process_params(): """It should keep boolean parameters""" output = process_params({"render_js": True}) assert output == {"render_js": True} - - -def test_get_scrapingbee_url(): - """It should generate a url""" - output = get_scrapingbee_url( - "https://app.scrapingbee.com/api/v1/", "API_KEY", "https://httpbin.org", {"render_js": True, "wait_for": "#foo"} - ) - assert ( - output == "https://app.scrapingbee.com/api/v1/" - "?api_key=API_KEY&url=https%3A%2F%2Fhttpbin.org&render_js=True&wait_for=%23foo" - )