diff --git a/scrapegraph-js/README.md b/scrapegraph-js/README.md index 0b6fb8e..05fe6dd 100644 --- a/scrapegraph-js/README.md +++ b/scrapegraph-js/README.md @@ -131,6 +131,65 @@ const numberOfScrolls = 10; // Will scroll 10 times to load more content The `numberOfScrolls` parameter accepts values between 0 and 100, allowing you to control how many times the page should be scrolled before extraction. +#### Scraping with Cookies + +Use cookies for authentication and session management when scraping websites that require login or have user-specific content: + +```javascript +import { smartScraper } from 'scrapegraph-js'; + +const apiKey = 'your-api-key'; +const url = 'https://example.com/dashboard'; +const prompt = 'Extract user profile information'; + +// Define cookies for authentication +const cookies = { + session_id: 'abc123def456', + auth_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...', + user_preferences: 'dark_mode,usd' +}; + +(async () => { + try { + const response = await smartScraper(apiKey, url, prompt, null, null, null, cookies); + console.log(response.result); + } catch (error) { + console.error('Error:', error); + } +})(); +``` + +**Common Use Cases:** +- **E-commerce sites**: User authentication, shopping cart persistence +- **Social media**: Session management, user preferences +- **Banking/Financial**: Secure authentication, transaction history +- **News sites**: User preferences, subscription content +- **API endpoints**: Authentication tokens, API keys + +#### Advanced Scraping with Cookies, Scrolling, and Pagination + +Combine cookies with infinite scrolling and pagination for comprehensive data extraction: + +```javascript +import { smartScraper } from 'scrapegraph-js'; + +const apiKey = 'your-api-key'; +const url = 'https://example.com/feed'; +const prompt = 'Extract all posts from the feed'; +const cookies = { session_token: 'xyz789abc123' }; +const numberOfScrolls = 10; // Scroll 10 times +const totalPages = 5; // Scrape 5 pages + +(async () => { + try { + const response = await smartScraper(apiKey, url, prompt, null, numberOfScrolls, totalPages, cookies); + console.log('Extracted data:', response); + } catch (error) { + console.error('Error:', error); + } +})(); +``` + ### Search Scraping Search and extract information from multiple web sources using AI. diff --git a/scrapegraph-js/examples/cookies_integration_example.js b/scrapegraph-js/examples/cookies_integration_example.js new file mode 100644 index 0000000..5c71e37 --- /dev/null +++ b/scrapegraph-js/examples/cookies_integration_example.js @@ -0,0 +1,261 @@ +/** + * Comprehensive example demonstrating cookies integration for web scraping. + * + * This example shows various real-world scenarios where cookies are essential: + * 1. E-commerce site scraping with authentication + * 2. Social media scraping with session cookies + * 3. Banking/financial site scraping with secure cookies + * 4. News site scraping with user preferences + * 5. 
API endpoint scraping with authentication tokens + * + * Requirements: + * - Node.js 16+ + * - scrapegraph-js + * - A .env file with your SGAI_APIKEY + * + * Example .env file: + * SGAI_APIKEY=your_api_key_here + */ + +import { smartScraper } from 'scrapegraph-js'; +import { z } from 'zod'; +import 'dotenv/config'; + +// Define data schemas for different scenarios +const ProductInfoSchema = z.object({ + name: z.string().describe('Product name'), + price: z.string().describe('Product price'), + availability: z.string().describe('Product availability status'), + rating: z.string().optional().describe('Product rating') +}); + +const SocialMediaPostSchema = z.object({ + author: z.string().describe('Post author'), + content: z.string().describe('Post content'), + likes: z.string().optional().describe('Number of likes'), + comments: z.string().optional().describe('Number of comments'), + timestamp: z.string().optional().describe('Post timestamp') +}); + +const NewsArticleSchema = z.object({ + title: z.string().describe('Article title'), + summary: z.string().describe('Article summary'), + author: z.string().optional().describe('Article author'), + publish_date: z.string().optional().describe('Publish date') +}); + +const BankTransactionSchema = z.object({ + date: z.string().describe('Transaction date'), + description: z.string().describe('Transaction description'), + amount: z.string().describe('Transaction amount'), + type: z.string().describe('Transaction type (credit/debit)') +}); + +async function scrapeEcommerceWithAuth() { + console.log('='.repeat(60)); + console.log('E-COMMERCE SITE SCRAPING WITH AUTHENTICATION'); + console.log('='.repeat(60)); + + // Example cookies for an e-commerce site + const cookies = { + session_id: 'abc123def456', + user_id: 'user789', + cart_id: 'cart101112', + preferences: 'dark_mode,usd', + auth_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...' 
+ }; + + const websiteUrl = 'https://example-ecommerce.com/products'; + const userPrompt = 'Extract product information including name, price, availability, and rating'; + + try { + const response = await smartScraper( + process.env.SGAI_APIKEY, + websiteUrl, + userPrompt, + ProductInfoSchema, + 5, // numberOfScrolls - Scroll to load more products + null, // totalPages + cookies + ); + + console.log('✅ E-commerce scraping completed successfully'); + console.log(JSON.stringify(response, null, 2)); + + } catch (error) { + console.error(`❌ Error in e-commerce scraping: ${error.message}`); + } +} + +async function scrapeSocialMediaWithSession() { + console.log('\n' + '='.repeat(60)); + console.log('SOCIAL MEDIA SCRAPING WITH SESSION COOKIES'); + console.log('='.repeat(60)); + + // Example cookies for a social media site + const cookies = { + session_token: 'xyz789abc123', + user_session: 'def456ghi789', + csrf_token: 'jkl012mno345', + remember_me: 'true', + language: 'en_US' + }; + + const websiteUrl = 'https://example-social.com/feed'; + const userPrompt = 'Extract posts from the feed including author, content, likes, and comments'; + + try { + const response = await smartScraper( + process.env.SGAI_APIKEY, + websiteUrl, + userPrompt, + SocialMediaPostSchema, + 10, // numberOfScrolls - Scroll to load more posts + null, // totalPages + cookies + ); + + console.log('✅ Social media scraping completed successfully'); + console.log(JSON.stringify(response, null, 2)); + + } catch (error) { + console.error(`❌ Error in social media scraping: ${error.message}`); + } +} + +async function scrapeNewsWithPreferences() { + console.log('\n' + '='.repeat(60)); + console.log('NEWS SITE SCRAPING WITH USER PREFERENCES'); + console.log('='.repeat(60)); + + // Example cookies for a news site + const cookies = { + user_preferences: 'technology,science,ai', + reading_level: 'advanced', + region: 'US', + subscription_tier: 'premium', + theme: 'dark' + }; + + const websiteUrl = 'https://example-news.com/technology'; + const userPrompt = 'Extract news articles including title, summary, author, and publish date'; + + try { + const response = await smartScraper( + process.env.SGAI_APIKEY, + websiteUrl, + userPrompt, + NewsArticleSchema, + null, // numberOfScrolls + 3, // totalPages - Scrape multiple pages + cookies + ); + + console.log('✅ News scraping completed successfully'); + console.log(JSON.stringify(response, null, 2)); + + } catch (error) { + console.error(`❌ Error in news scraping: ${error.message}`); + } +} + +async function scrapeBankingWithSecureCookies() { + console.log('\n' + '='.repeat(60)); + console.log('BANKING SITE SCRAPING WITH SECURE COOKIES'); + console.log('='.repeat(60)); + + // Example secure cookies for a banking site + const cookies = { + secure_session: 'pqr678stu901', + auth_token: 'vwx234yz567', + mfa_verified: 'true', + device_id: 'device_abc123', + last_activity: '2024-01-15T10:30:00Z' + }; + + const websiteUrl = 'https://example-bank.com/transactions'; + const userPrompt = 'Extract recent transactions including date, description, amount, and type'; + + try { + const response = await smartScraper( + process.env.SGAI_APIKEY, + websiteUrl, + userPrompt, + BankTransactionSchema, + null, // numberOfScrolls + 5, // totalPages - Scrape multiple pages of transactions + cookies + ); + + console.log('✅ Banking scraping completed successfully'); + console.log(JSON.stringify(response, null, 2)); + + } catch (error) { + console.error(`❌ Error in banking scraping: ${error.message}`); + } +} + +async 
function scrapeApiWithAuthTokens() { + console.log('\n' + '='.repeat(60)); + console.log('API ENDPOINT SCRAPING WITH AUTH TOKENS'); + console.log('='.repeat(60)); + + // Example API authentication cookies + const cookies = { + api_token: 'api_abc123def456', + client_id: 'client_789', + access_token: 'access_xyz789', + refresh_token: 'refresh_abc123', + scope: 'read:all' + }; + + const websiteUrl = 'https://api.example.com/data'; + const userPrompt = 'Extract data from the API response'; + + try { + const response = await smartScraper( + process.env.SGAI_APIKEY, + websiteUrl, + userPrompt, + null, // No schema for generic API response + null, // numberOfScrolls + null, // totalPages + cookies + ); + + console.log('✅ API scraping completed successfully'); + console.log(JSON.stringify(response, null, 2)); + + } catch (error) { + console.error(`❌ Error in API scraping: ${error.message}`); + } +} + +async function main() { + const apiKey = process.env.SGAI_APIKEY; + + // Check if API key is available + if (!apiKey) { + console.error('Error: SGAI_APIKEY not found in .env file'); + console.log('Please create a .env file with your API key:'); + console.log('SGAI_APIKEY=your_api_key_here'); + return; + } + + console.log('🍪 COOKIES INTEGRATION EXAMPLES'); + console.log('This demonstrates various real-world scenarios where cookies are essential for web scraping.'); + + // Run all examples + await scrapeEcommerceWithAuth(); + await scrapeSocialMediaWithSession(); + await scrapeNewsWithPreferences(); + await scrapeBankingWithSecureCookies(); + await scrapeApiWithAuthTokens(); + + console.log('\n' + '='.repeat(60)); + console.log('✅ All examples completed!'); + console.log('='.repeat(60)); +} + +// Run the example +main().catch(console.error); \ No newline at end of file diff --git a/scrapegraph-js/examples/smartScraper_cookies_example.js b/scrapegraph-js/examples/smartScraper_cookies_example.js new file mode 100644 index 0000000..93786fb --- /dev/null +++ b/scrapegraph-js/examples/smartScraper_cookies_example.js @@ -0,0 +1,125 @@ +/** + * Example demonstrating how to use the SmartScraper API with cookies. + * + * This example shows how to: + * 1. Set up the API request with cookies for authentication + * 2. Use cookies with infinite scrolling + * 3. Define a Zod schema for structured output + * 4. Make the API call and handle the response + * 5. 
Process the extracted data + * + * Requirements: + * - Node.js 16+ + * - scrapegraph-js + * - A .env file with your SGAI_APIKEY + * + * Example .env file: + * SGAI_APIKEY=your_api_key_here + */ + +import { smartScraper } from 'scrapegraph-js'; +import { z } from 'zod'; +import 'dotenv/config'; + +// Define the data schema for structured output +const CookieInfoSchema = z.object({ + cookies: z.record(z.string()).describe('Dictionary of cookie key-value pairs') +}); + +async function main() { + const apiKey = process.env.SGAI_APIKEY; + + // Check if API key is available + if (!apiKey) { + console.error('Error: SGAI_APIKEY not found in .env file'); + console.log('Please create a .env file with your API key:'); + console.log('SGAI_APIKEY=your_api_key_here'); + return; + } + + // Example 1: Basic cookies example (httpbin.org/cookies) + console.log('='.repeat(60)); + console.log('EXAMPLE 1: Basic Cookies Example'); + console.log('='.repeat(60)); + + const websiteUrl = 'https://httpbin.org/cookies'; + const userPrompt = 'Extract all cookies info'; + const cookies = { cookies_key: 'cookies_value' }; + + try { + // Perform the scraping with cookies + const response = await smartScraper( + apiKey, + websiteUrl, + userPrompt, + CookieInfoSchema, + null, // numberOfScrolls + null, // totalPages + cookies + ); + + // Print the results + console.log('\nExtracted Cookie Information:'); + console.log(JSON.stringify(response, null, 2)); + + } catch (error) { + console.error(`Error occurred: ${error.message}`); + } + + // Example 2: Cookies with infinite scrolling + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 2: Cookies with Infinite Scrolling'); + console.log('='.repeat(60)); + + const cookiesWithScroll = { session_id: 'abc123', user_token: 'xyz789' }; + + try { + // Perform the scraping with cookies and infinite scrolling + const response = await smartScraper( + apiKey, + websiteUrl, + 'Extract all cookies and scroll information', + CookieInfoSchema, + 3, // numberOfScrolls + null, // totalPages + cookiesWithScroll + ); + + // Print the results + console.log('\nExtracted Cookie Information with Scrolling:'); + console.log(JSON.stringify(response, null, 2)); + + } catch (error) { + console.error(`Error occurred: ${error.message}`); + } + + // Example 3: Cookies with pagination + console.log('\n' + '='.repeat(60)); + console.log('EXAMPLE 3: Cookies with Pagination'); + console.log('='.repeat(60)); + + const cookiesWithPagination = { auth_token: 'secret123', preferences: 'dark_mode' }; + + try { + // Perform the scraping with cookies and pagination + const response = await smartScraper( + apiKey, + websiteUrl, + 'Extract all cookies from multiple pages', + CookieInfoSchema, + null, // numberOfScrolls + 3, // totalPages + cookiesWithPagination + ); + + // Print the results + console.log('\nExtracted Cookie Information with Pagination:'); + console.log(JSON.stringify(response, null, 2)); + + } catch (error) { + console.error(`Error occurred: ${error.message}`); + } +} + +// Run the example +main().catch(console.error); \ No newline at end of file diff --git a/scrapegraph-js/examples/smartScraper_cookies_simple_example.js b/scrapegraph-js/examples/smartScraper_cookies_simple_example.js new file mode 100644 index 0000000..dbe975c --- /dev/null +++ b/scrapegraph-js/examples/smartScraper_cookies_simple_example.js @@ -0,0 +1,40 @@ +/** + * Simple example demonstrating cookies usage with SmartScraper. + * + * This example shows the basic pattern for using cookies with the API. 
+ */ + +import { smartScraper } from 'scrapegraph-js'; +import 'dotenv/config'; + +const apiKey = process.env.SGAI_APIKEY; + +// Example cookies for authentication +const cookies = { + session_id: 'abc123def456', + auth_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...', + user_preferences: 'dark_mode,usd' +}; + +async function scrapeWithCookies() { + try { + const response = await smartScraper( + apiKey, + 'https://example.com/dashboard', + 'Extract user profile information', + null, // schema + null, // numberOfScrolls + null, // totalPages + cookies // cookies parameter + ); + + console.log('✅ Scraping with cookies completed successfully'); + console.log(JSON.stringify(response, null, 2)); + + } catch (error) { + console.error('❌ Error:', error.message); + } +} + +// Run the example +scrapeWithCookies(); \ No newline at end of file diff --git a/scrapegraph-js/package.json b/scrapegraph-js/package.json index 8528db9..a1a21b2 100644 --- a/scrapegraph-js/package.json +++ b/scrapegraph-js/package.json @@ -2,7 +2,7 @@ "name": "scrapegraph-js", "author": "ScrapeGraphAI", "version": "0.1.1", - "description": "Scrape and extract structured data from a webpage using ScrapeGraphAI's APIs.", + "description": "Scrape and extract structured data from a webpage using ScrapeGraphAI's APIs. Supports cookies for authentication, infinite scrolling, and pagination.", "repository": { "type": "git", "url": "https://github.com/ScrapeGraphAI/scrapegraph-sdk", @@ -22,7 +22,12 @@ "gpt-3", "gpt-4", "llm", - "ai" + "ai", + "cookies", + "authentication", + "session-management", + "infinite-scroll", + "pagination" ], "main": "index.js", "module": "index.js", diff --git a/scrapegraph-js/src/smartScraper.js b/scrapegraph-js/src/smartScraper.js index 2eff633..fc93573 100644 --- a/scrapegraph-js/src/smartScraper.js +++ b/scrapegraph-js/src/smartScraper.js @@ -12,10 +12,11 @@ import { zodToJsonSchema } from 'zod-to-json-schema'; * @param {Object} [schema] - Optional schema object defining the output structure * @param {number} [numberOfScrolls] - Optional number of times to scroll the page (0-100). If not provided, no scrolling will be performed. * @param {number} [totalPages] - Optional number of pages to scrape (1-10). If not provided, only the first page will be scraped. + * @param {Object} [cookies] - Optional cookies object for authentication and session management * @returns {Promise} Extracted data in JSON format matching the provided schema * @throws - Will throw an error in case of an HTTP failure. 
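+ * @example
+ * // A minimal usage sketch (placeholder values); cookies are the seventh argument
+ * const data = await smartScraper(apiKey, 'https://example.com/dashboard', 'Extract user info', null, null, null, { session_id: 'abc123' });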
*/ -export async function smartScraper(apiKey, url, prompt, schema = null, numberOfScrolls = null, totalPages = null) { +export async function smartScraper(apiKey, url, prompt, schema = null, numberOfScrolls = null, totalPages = null, cookies = null) { const endpoint = 'https://api.scrapegraphai.com/v1/smartscraper'; const headers = { 'accept': 'application/json', @@ -28,6 +29,14 @@ export async function smartScraper(apiKey, url, prompt, schema = null, numberOfS user_prompt: prompt, }; + if (cookies) { + if (typeof cookies === 'object' && cookies !== null) { + payload.cookies = cookies; + } else { + throw new Error('Cookies must be an object with key-value pairs'); + } + } + if (schema) { if (schema instanceof ZodType) { payload.output_schema = zodToJsonSchema(schema); diff --git a/scrapegraph-js/test_cookies_integration.js b/scrapegraph-js/test_cookies_integration.js new file mode 100644 index 0000000..4860f0e --- /dev/null +++ b/scrapegraph-js/test_cookies_integration.js @@ -0,0 +1,92 @@ +/** + * Test file to verify cookies integration functionality. + */ + +import { smartScraper } from './src/smartScraper.js'; + +function testCookiesIntegration() { + console.log('🧪 Testing Cookies Integration'); + console.log('='.repeat(50)); + + // Test 1: Basic cookies validation + console.log('\n1. Testing basic cookies validation...'); + + const cookies = { session_id: 'abc123', auth_token: 'xyz789' }; + + // Create a mock payload to test the logic + const mockPayload = { + website_url: 'https://httpbin.org/cookies', + user_prompt: 'Extract cookie information' + }; + + // Simulate the cookies validation logic + if (cookies) { + if (typeof cookies === 'object' && cookies !== null) { + mockPayload.cookies = cookies; + console.log('✅ Cookies validation passed'); + console.log(`✅ Cookies included: ${JSON.stringify(mockPayload.cookies)}`); + } else { + console.log('❌ Cookies validation failed - not an object'); + } + } + + // Test 2: Complex cookies scenario + console.log('\n2. Testing complex cookies scenario...'); + + const complexCookies = { + session_id: 'abc123def456', + user_id: 'user789', + auth_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...', + preferences: 'dark_mode,usd', + cart_id: 'cart101112', + csrf_token: 'csrf_xyz789' + }; + + const complexPayload = { + website_url: 'https://example.com/dashboard', + user_prompt: 'Extract user profile and preferences' + }; + + if (complexCookies) { + if (typeof complexCookies === 'object' && complexCookies !== null) { + complexPayload.cookies = complexCookies; + console.log('✅ Complex cookies validation passed'); + console.log(`✅ Complex cookies count: ${Object.keys(complexPayload.cookies).length}`); + } + } + + // Test 3: Invalid cookies + console.log('\n3. Testing invalid cookies...'); + + const invalidCookies = 'not_an_object'; + + try { + if (invalidCookies) { + if (typeof invalidCookies === 'object' && invalidCookies !== null) { + console.log('❌ Should have failed validation'); + } else { + console.log('✅ Invalid cookies correctly rejected'); + } + } + } catch (error) { + console.log('✅ Error handling works correctly'); + } + + // Test 4: Function signature validation + console.log('\n4. 
Testing function signature...'); + + // Check if the function accepts the cookies parameter + const functionString = smartScraper.toString(); + if (functionString.includes('cookies = null')) { + console.log('✅ Function signature includes cookies parameter'); + } else { + console.log('❌ Function signature missing cookies parameter'); + } + + console.log('\n' + '='.repeat(50)); + console.log('✅ All cookies integration tests completed!'); + console.log('='.repeat(50)); +} + +// Run the test +testCookiesIntegration(); \ No newline at end of file diff --git a/scrapegraph-py/README.md b/scrapegraph-py/README.md index bb4c3ce..43ce70e 100644 --- a/scrapegraph-py/README.md +++ b/scrapegraph-py/README.md @@ -95,6 +95,75 @@ response = client.smartscraper( +
+<details>
+<summary>🍪 Cookies Support</summary>
+
+Use cookies for authentication and session management:
+
+```python
+from scrapegraph_py import Client
+
+client = Client(api_key="your-api-key-here")
+
+# Define cookies for authentication
+cookies = {
+    "session_id": "abc123def456",
+    "auth_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
+    "user_preferences": "dark_mode,usd"
+}
+
+response = client.smartscraper(
+    website_url="https://example.com/dashboard",
+    user_prompt="Extract user profile information",
+    cookies=cookies
+)
+```
+
+**Common Use Cases:**
+- **E-commerce sites**: User authentication, shopping cart persistence
+- **Social media**: Session management, user preferences
+- **Banking/Financial**: Secure authentication, transaction history
+- **News sites**: User preferences, subscription content
+- **API endpoints**: Authentication tokens, API keys
+
+</details>
+
+<details>
+<summary>🔄 Advanced Features</summary>
+
+**Infinite Scrolling:**
+```python
+response = client.smartscraper(
+    website_url="https://example.com/feed",
+    user_prompt="Extract all posts from the feed",
+    cookies=cookies,
+    number_of_scrolls=10  # Scroll 10 times to load more content
+)
+```
+
+**Pagination:**
+```python
+response = client.smartscraper(
+    website_url="https://example.com/products",
+    user_prompt="Extract all product information",
+    cookies=cookies,
+    total_pages=5  # Scrape 5 pages
+)
+```
+
+**Combined with Cookies:**
+```python
+response = client.smartscraper(
+    website_url="https://example.com/dashboard",
+    user_prompt="Extract user data from all pages",
+    cookies=cookies,
+    number_of_scrolls=5,
+    total_pages=3
+)
+```
+
+</details>
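+
+<details>
+<summary>🍪 Cookies with Custom Headers</summary>
+
+The `cookies` parameter can also be combined with custom request headers (a minimal sketch; the URL and header values below are placeholders):
+
+```python
+response = client.smartscraper(
+    website_url="https://api.example.com/data",
+    user_prompt="Extract data from the API response",
+    cookies=cookies,
+    headers={"Accept": "application/json"}
+)
+```
+
+</details>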
+ ### 🔍 SearchScraper Perform AI-powered web searches with structured results and reference URLs. diff --git a/scrapegraph-py/examples/async/async_smartscraper_cookies_example.py b/scrapegraph-py/examples/async/async_smartscraper_cookies_example.py index 6cb65a9..cca05bb 100644 --- a/scrapegraph-py/examples/async/async_smartscraper_cookies_example.py +++ b/scrapegraph-py/examples/async/async_smartscraper_cookies_example.py @@ -1,413 +1,130 @@ -#!/usr/bin/env python3 """ -SmartScraper Cookies Example (Async) - -This example demonstrates how to use cookies with SmartScraper API using the asynchronous client. -Cookies are passed through the headers parameter as a Cookie header. +Example demonstrating how to use the SmartScraper API with cookies (Async). + +This example shows how to: +1. Set up the API request with cookies for authentication +2. Use cookies with infinite scrolling +3. Define a Pydantic model for structured output +4. Make the API call and handle the response +5. Process the extracted data + +Requirements: +- Python 3.7+ +- scrapegraph-py +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here """ import asyncio import json -import logging import os -import time -from pydantic import BaseModel, Field from typing import Dict, Optional + from dotenv import load_dotenv +from pydantic import BaseModel, Field from scrapegraph_py import AsyncClient -from scrapegraph_py.exceptions import APIError # Load environment variables from .env file load_dotenv() -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - handlers=[logging.StreamHandler()], -) -logger = logging.getLogger(__name__) - - +# Define the data models for structured output class CookieInfo(BaseModel): """Model representing cookie information.""" - cookies: Dict[str, str] = Field(description="Dictionary of cookie key-value pairs") - -class UserProfile(BaseModel): - """Model representing user profile information.""" - username: str = Field(description="User's username") - email: Optional[str] = Field(description="User's email address") - preferences: Optional[Dict[str, str]] = Field(description="User preferences") + cookies: Dict[str, str] = Field(description="Dictionary of cookie key-value pairs") -async def basic_cookies_example(): - """Example 1: Basic cookies example using httpbin.org/cookies""" - - print("=" * 60) - print("EXAMPLE 1: Basic Cookies Example") - print("=" * 60) - - # Initialize client from environment variable - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - print("Please either:") - print(" 1. Set environment variable: export SGAI_API_KEY='your-api-key-here'") - print(" 2. 
Create a .env file with: SGAI_API_KEY=your-api-key-here") - return - - try: - client = AsyncClient(api_key=api_key) - except Exception as e: - print(f"❌ Error initializing client: {e}") +async def main(): + """Example usage of the cookies scraper.""" + # Check if API key is available + if not os.getenv("SGAI_API_KEY"): + print("Error: SGAI_API_KEY not found in .env file") + print("Please create a .env file with your API key:") + print("SGAI_API_KEY=your_api_key_here") return - - # Configuration - website_url = "https://httpbin.org/cookies" - user_prompt = "Extract all cookies info" - - # Example cookies - these will be sent to httpbin.org/cookies - cookies = {"cookies_key": "cookies_value", "test_cookie": "test_value"} - - # Convert cookies dict to Cookie header string - cookie_header = "; ".join([f"{k}={v}" for k, v in cookies.items()]) - headers = {"Cookie": cookie_header} - - print(f"🌐 Website URL: {website_url}") - print(f"📝 User Prompt: {user_prompt}") - print(f"🍪 Cookies: {cookies}") - print("-" * 50) - - try: - # Start timing - start_time = time.time() - - # Perform the scraping with cookies - result = await client.smartscraper( - website_url=website_url, - user_prompt=user_prompt, - headers=headers, - output_schema=CookieInfo, - ) - - # Calculate duration - duration = time.time() - start_time - - print(f"✅ Request completed in {duration:.2f} seconds") - print("\nExtracted Cookie Information:") - print(json.dumps(result, indent=2, ensure_ascii=False)) - - except APIError as e: - print(f"❌ API Error: {e}") - print("This could be due to:") - print(" - Invalid API key") - print(" - Rate limiting") - print(" - Server issues") - - except Exception as e: - print(f"❌ Unexpected error: {e}") - print("This could be due to:") - print(" - Network connectivity issues") - print(" - Invalid website URL") - print(" - Cookie format issues") + # Initialize the async client + async with AsyncClient.from_env() as client: + # Example 1: Basic cookies example (httpbin.org/cookies) + print("=" * 60) + print("EXAMPLE 1: Basic Cookies Example") + print("=" * 60) -async def cookies_with_scrolling_example(): - """Example 2: Cookies with infinite scrolling""" - - print("\n" + "=" * 60) - print("EXAMPLE 2: Cookies with Infinite Scrolling") - print("=" * 60) - - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - return - - try: - client = AsyncClient(api_key=api_key) - except Exception as e: - print(f"❌ Error initializing client: {e}") - return - - # Configuration - website_url = "https://httpbin.org/cookies" - user_prompt = "Extract all cookies and scroll information" - - # Example session cookies - cookies = {"session_id": "abc123", "user_token": "xyz789"} - - # Convert cookies dict to Cookie header string - cookie_header = "; ".join([f"{k}={v}" for k, v in cookies.items()]) - headers = {"Cookie": cookie_header} - - print(f"🌐 Website URL: {website_url}") - print(f"📝 User Prompt: {user_prompt}") - print(f"🍪 Cookies: {cookies}") - print(f"🔄 Number of scrolls: 3") - print("-" * 50) - - try: - # Start timing - start_time = time.time() - - # Perform the scraping with cookies and infinite scrolling - result = await client.smartscraper( - website_url=website_url, - user_prompt=user_prompt, - headers=headers, - number_of_scrolls=3, - output_schema=CookieInfo, - ) - - # Calculate duration - duration = time.time() - start_time - - print(f"✅ Request completed in {duration:.2f} seconds") - print("\nExtracted Cookie Information with Scrolling:") - 
print(json.dumps(result, indent=2, ensure_ascii=False)) - - except APIError as e: - print(f"❌ API Error: {e}") - - except Exception as e: - print(f"❌ Unexpected error: {e}") + website_url = "https://httpbin.org/cookies" + user_prompt = "Extract all cookies info" + cookies = {"cookies_key": "cookies_value"} + try: + # Perform the scraping with cookies + response = await client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + output_schema=CookieInfo, + ) -async def cookies_with_pagination_example(): - """Example 3: Cookies with pagination""" - - print("\n" + "=" * 60) - print("EXAMPLE 3: Cookies with Pagination") - print("=" * 60) - - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - return - - try: - client = AsyncClient(api_key=api_key) - except Exception as e: - print(f"❌ Error initializing client: {e}") - return - - # Configuration - website_url = "https://httpbin.org/cookies" - user_prompt = "Extract all cookies from multiple pages" - - # Example authentication cookies - cookies = {"auth_token": "secret123", "preferences": "dark_mode"} - - # Convert cookies dict to Cookie header string - cookie_header = "; ".join([f"{k}={v}" for k, v in cookies.items()]) - headers = {"Cookie": cookie_header} - - print(f"🌐 Website URL: {website_url}") - print(f"📝 User Prompt: {user_prompt}") - print(f"🍪 Cookies: {cookies}") - print(f"📄 Total Pages: 3") - print("-" * 50) - - try: - # Start timing - start_time = time.time() - - # Perform the scraping with cookies and pagination - result = await client.smartscraper( - website_url=website_url, - user_prompt=user_prompt, - headers=headers, - total_pages=3, - output_schema=CookieInfo, - ) - - # Calculate duration - duration = time.time() - start_time - - print(f"✅ Request completed in {duration:.2f} seconds") - print("\nExtracted Cookie Information with Pagination:") - print(json.dumps(result, indent=2, ensure_ascii=False)) - - except APIError as e: - print(f"❌ API Error: {e}") - - except Exception as e: - print(f"❌ Unexpected error: {e}") + # Print the results + print("\nExtracted Cookie Information:") + print(json.dumps(response, indent=2)) + except Exception as e: + print(f"Error occurred: {str(e)}") -async def ecommerce_cookies_example(): - """Example 4: E-commerce site scraping with authentication cookies""" - - print("\n" + "=" * 60) - print("EXAMPLE 4: E-commerce Site Scraping with Authentication") - print("=" * 60) - - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - return - - try: - client = AsyncClient(api_key=api_key) - except Exception as e: - print(f"❌ Error initializing client: {e}") - return - - # Example cookies for an e-commerce site - cookies = { - "session_id": "abc123def456", - "user_id": "user789", - "cart_id": "cart101112", - "preferences": "dark_mode,usd", - "auth_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9..." 
- } - - # Convert cookies dict to Cookie header string - cookie_header = "; ".join([f"{k}={v}" for k, v in cookies.items()]) - headers = {"Cookie": cookie_header} - - # Note: Using a test URL since we can't access real e-commerce sites - website_url = "https://httpbin.org/cookies" - user_prompt = "Extract product information including name, price, availability, and rating" - - print(f"🌐 Website URL: {website_url}") - print(f"📝 User Prompt: {user_prompt}") - print(f"🍪 E-commerce Cookies: {cookies}") - print(f"🔄 Number of scrolls: 5") - print("-" * 50) - - try: - # Start timing - start_time = time.time() - - # Perform the scraping with e-commerce cookies - result = await client.smartscraper( - website_url=website_url, - user_prompt=user_prompt, - headers=headers, - number_of_scrolls=5, # Scroll to load more products - output_schema=CookieInfo, - ) - - # Calculate duration - duration = time.time() - start_time - - print(f"✅ Request completed in {duration:.2f} seconds") - print("\nE-commerce Scraping Results:") - print(json.dumps(result, indent=2, ensure_ascii=False)) - - except APIError as e: - print(f"❌ API Error: {e}") - - except Exception as e: - print(f"❌ Unexpected error: {e}") + # Example 2: Cookies with infinite scrolling + print("\n" + "=" * 60) + print("EXAMPLE 2: Cookies with Infinite Scrolling") + print("=" * 60) + website_url = "https://httpbin.org/cookies" + user_prompt = "Extract all cookies and scroll information" + cookies = {"session_id": "abc123", "user_token": "xyz789"} -async def concurrent_cookies_example(): - """Example 5: Concurrent requests with different cookies""" - - print("\n" + "=" * 60) - print("EXAMPLE 5: Concurrent Requests with Different Cookies") - print("=" * 60) - - api_key = os.getenv("SGAI_API_KEY") - if not api_key: - print("❌ Error: SGAI_API_KEY environment variable not set") - return - - try: - client = AsyncClient(api_key=api_key) - except Exception as e: - print(f"❌ Error initializing client: {e}") - return - - # Different cookie sets for different scenarios - cookie_sets = [ - { - "name": "Social Media Session", - "cookies": {"session_token": "xyz789abc123", "user_session": "def456ghi789"} - }, - { - "name": "News Site Preferences", - "cookies": {"user_preferences": "technology,science,ai", "reading_level": "advanced"} - }, - { - "name": "Banking Secure Session", - "cookies": {"secure_session": "pqr678stu901", "auth_token": "vwx234yz567"} - } - ] - - async def scrape_with_cookies(cookie_set): - """Helper function to scrape with specific cookies""" - cookies = cookie_set["cookies"] - cookie_header = "; ".join([f"{k}={v}" for k, v in cookies.items()]) - headers = {"Cookie": cookie_header} - try: - result = await client.smartscraper( - website_url="https://httpbin.org/cookies", - user_prompt=f"Extract cookies for {cookie_set['name']}", - headers=headers, + # Perform the scraping with cookies and infinite scrolling + response = await client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + number_of_scrolls=3, output_schema=CookieInfo, ) - return {"success": True, "name": cookie_set["name"], "result": result} + + # Print the results + print("\nExtracted Cookie Information with Scrolling:") + print(json.dumps(response, indent=2)) + except Exception as e: - return {"success": False, "name": cookie_set["name"], "error": str(e)} - - print("🚀 Starting concurrent requests with different cookies...") - start_time = time.time() - - # Create tasks for concurrent execution - tasks = [scrape_with_cookies(cookie_set) for 
cookie_set in cookie_sets] - results = await asyncio.gather(*tasks, return_exceptions=True) - - duration = time.time() - start_time - print(f"✅ All concurrent requests completed in {duration:.2f} seconds") - - # Display results - for result in results: - if isinstance(result, dict): - if result.get("success"): - print(f"✅ {result['name']}: Success") - print(f" Cookies: {result['result']}") - else: - print(f"❌ {result['name']}: {result.get('error', 'Unknown error')}") - else: - print(f"❌ Unexpected result type: {type(result)}") + print(f"Error occurred: {str(e)}") + # Example 3: Cookies with pagination + print("\n" + "=" * 60) + print("EXAMPLE 3: Cookies with Pagination") + print("=" * 60) -async def main(): - """Main function to run all cookies examples""" - - print("ScrapeGraph SDK - SmartScraper Cookies Examples (Async)") - print("=" * 70) - print("This example demonstrates how to use cookies with SmartScraper API") - print("Cookies are passed through the headers parameter as a Cookie header") - print("=" * 70) - - # Run all examples - await basic_cookies_example() - await cookies_with_scrolling_example() - await cookies_with_pagination_example() - await ecommerce_cookies_example() - await concurrent_cookies_example() - - print("\n" + "=" * 70) - print("All cookies examples completed!") - print("\nKey points:") - print("1. Cookies are passed via the 'headers' parameter") - print("2. Cookie format: 'key1=value1; key2=value2'") - print("3. Cookies work with all SmartScraper features (scrolling, pagination)") - print("4. Use concurrent requests for better performance") - print("\nCommon use cases:") - print("- Authentication for protected pages") - print("- Session management for dynamic content") - print("- User preferences and settings") - print("- Shopping cart and user state") + website_url = "https://httpbin.org/cookies" + user_prompt = "Extract all cookies from multiple pages" + cookies = {"auth_token": "secret123", "preferences": "dark_mode"} + + try: + # Perform the scraping with cookies and pagination + response = await client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + total_pages=3, + output_schema=CookieInfo, + ) + + # Print the results + print("\nExtracted Cookie Information with Pagination:") + print(json.dumps(response, indent=2)) + + except Exception as e: + print(f"Error occurred: {str(e)}") if __name__ == "__main__": diff --git a/scrapegraph-py/examples/miscellaneous/cookies_integration_example.py b/scrapegraph-py/examples/miscellaneous/cookies_integration_example.py new file mode 100644 index 0000000..a2d2977 --- /dev/null +++ b/scrapegraph-py/examples/miscellaneous/cookies_integration_example.py @@ -0,0 +1,278 @@ +""" +Comprehensive example demonstrating cookies integration for web scraping. + +This example shows various real-world scenarios where cookies are essential: +1. E-commerce site scraping with authentication +2. Social media scraping with session cookies +3. Banking/financial site scraping with secure cookies +4. News site scraping with user preferences +5. 
API endpoint scraping with authentication tokens + +Requirements: +- Python 3.7+ +- scrapegraph-py +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here +""" + +import json +import os +from typing import Dict, List, Optional + +from dotenv import load_dotenv +from pydantic import BaseModel, Field + +from scrapegraph_py import Client + +# Load environment variables from .env file +load_dotenv() + + +# Define data models for different scenarios +class ProductInfo(BaseModel): + """Model for e-commerce product information.""" + + name: str = Field(description="Product name") + price: str = Field(description="Product price") + availability: str = Field(description="Product availability status") + rating: Optional[str] = Field(description="Product rating", default=None) + + +class SocialMediaPost(BaseModel): + """Model for social media post information.""" + + author: str = Field(description="Post author") + content: str = Field(description="Post content") + likes: Optional[str] = Field(description="Number of likes", default=None) + comments: Optional[str] = Field(description="Number of comments", default=None) + timestamp: Optional[str] = Field(description="Post timestamp", default=None) + + +class NewsArticle(BaseModel): + """Model for news article information.""" + + title: str = Field(description="Article title") + summary: str = Field(description="Article summary") + author: Optional[str] = Field(description="Article author", default=None) + publish_date: Optional[str] = Field(description="Publish date", default=None) + + +class BankTransaction(BaseModel): + """Model for banking transaction information.""" + + date: str = Field(description="Transaction date") + description: str = Field(description="Transaction description") + amount: str = Field(description="Transaction amount") + type: str = Field(description="Transaction type (credit/debit)") + + +def scrape_ecommerce_with_auth(): + """Example: Scrape e-commerce site with authentication cookies.""" + print("=" * 60) + print("E-COMMERCE SITE SCRAPING WITH AUTHENTICATION") + print("=" * 60) + + # Example cookies for an e-commerce site + cookies = { + "session_id": "abc123def456", + "user_id": "user789", + "cart_id": "cart101112", + "preferences": "dark_mode,usd", + "auth_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9..." 
+ } + + website_url = "https://example-ecommerce.com/products" + user_prompt = "Extract product information including name, price, availability, and rating" + + try: + client = Client.from_env() + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + output_schema=ProductInfo, + number_of_scrolls=5 # Scroll to load more products + ) + + print("✅ E-commerce scraping completed successfully") + print(json.dumps(response, indent=2)) + client.close() + + except Exception as e: + print(f"❌ Error in e-commerce scraping: {str(e)}") + + +def scrape_social_media_with_session(): + """Example: Scrape social media with session cookies.""" + print("\n" + "=" * 60) + print("SOCIAL MEDIA SCRAPING WITH SESSION COOKIES") + print("=" * 60) + + # Example cookies for a social media site + cookies = { + "session_token": "xyz789abc123", + "user_session": "def456ghi789", + "csrf_token": "jkl012mno345", + "remember_me": "true", + "language": "en_US" + } + + website_url = "https://example-social.com/feed" + user_prompt = "Extract posts from the feed including author, content, likes, and comments" + + try: + client = Client.from_env() + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + output_schema=SocialMediaPost, + number_of_scrolls=10 # Scroll to load more posts + ) + + print("✅ Social media scraping completed successfully") + print(json.dumps(response, indent=2)) + client.close() + + except Exception as e: + print(f"❌ Error in social media scraping: {str(e)}") + + +def scrape_news_with_preferences(): + """Example: Scrape news site with user preference cookies.""" + print("\n" + "=" * 60) + print("NEWS SITE SCRAPING WITH USER PREFERENCES") + print("=" * 60) + + # Example cookies for a news site + cookies = { + "user_preferences": "technology,science,ai", + "reading_level": "advanced", + "region": "US", + "subscription_tier": "premium", + "theme": "dark" + } + + website_url = "https://example-news.com/technology" + user_prompt = "Extract news articles including title, summary, author, and publish date" + + try: + client = Client.from_env() + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + output_schema=NewsArticle, + total_pages=3 # Scrape multiple pages + ) + + print("✅ News scraping completed successfully") + print(json.dumps(response, indent=2)) + client.close() + + except Exception as e: + print(f"❌ Error in news scraping: {str(e)}") + + +def scrape_banking_with_secure_cookies(): + """Example: Scrape banking site with secure authentication cookies.""" + print("\n" + "=" * 60) + print("BANKING SITE SCRAPING WITH SECURE COOKIES") + print("=" * 60) + + # Example secure cookies for a banking site + cookies = { + "secure_session": "pqr678stu901", + "auth_token": "vwx234yz567", + "mfa_verified": "true", + "device_id": "device_abc123", + "last_activity": "2024-01-15T10:30:00Z" + } + + website_url = "https://example-bank.com/transactions" + user_prompt = "Extract recent transactions including date, description, amount, and type" + + try: + client = Client.from_env() + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + output_schema=BankTransaction, + total_pages=5 # Scrape multiple pages of transactions + ) + + print("✅ Banking scraping completed successfully") + print(json.dumps(response, indent=2)) + client.close() + + except Exception as e: + print(f"❌ Error in banking scraping: {str(e)}") + + 
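+# Optional fallback, mirroring the earlier headers-based workflow: cookies can
+# also be sent as a raw "Cookie" header via the `headers` parameter. A minimal
+# helper sketch for that conversion (the name and usage are illustrative):
+def build_cookie_header(cookies: Dict[str, str]) -> str:
+    """Convert a cookies dict into a 'key1=value1; key2=value2' header string."""
+    return "; ".join(f"{k}={v}" for k, v in cookies.items())
+
+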
+def scrape_api_with_auth_tokens(): + """Example: Scrape API endpoint with authentication tokens.""" + print("\n" + "=" * 60) + print("API ENDPOINT SCRAPING WITH AUTH TOKENS") + print("=" * 60) + + # Example API authentication cookies + cookies = { + "api_token": "api_abc123def456", + "client_id": "client_789", + "access_token": "access_xyz789", + "refresh_token": "refresh_abc123", + "scope": "read:all" + } + + website_url = "https://api.example.com/data" + user_prompt = "Extract data from the API response" + + try: + client = Client.from_env() + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + headers={ + "Accept": "application/json", + "Content-Type": "application/json" + } + ) + + print("✅ API scraping completed successfully") + print(json.dumps(response, indent=2)) + client.close() + + except Exception as e: + print(f"❌ Error in API scraping: {str(e)}") + + +def main(): + """Run all cookies integration examples.""" + # Check if API key is available + if not os.getenv("SGAI_API_KEY"): + print("Error: SGAI_API_KEY not found in .env file") + print("Please create a .env file with your API key:") + print("SGAI_API_KEY=your_api_key_here") + return + + print("🍪 COOKIES INTEGRATION EXAMPLES") + print("This demonstrates various real-world scenarios where cookies are essential for web scraping.") + + # Run all examples + scrape_ecommerce_with_auth() + scrape_social_media_with_session() + scrape_news_with_preferences() + scrape_banking_with_secure_cookies() + scrape_api_with_auth_tokens() + + print("\n" + "=" * 60) + print("✅ All examples completed!") + print("=" * 60) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scrapegraph-py/examples/sync/smartscraper_cookies_example.py b/scrapegraph-py/examples/sync/smartscraper_cookies_example.py new file mode 100644 index 0000000..01cba1d --- /dev/null +++ b/scrapegraph-py/examples/sync/smartscraper_cookies_example.py @@ -0,0 +1,134 @@ +""" +Example demonstrating how to use the SmartScraper API with cookies. + +This example shows how to: +1. Set up the API request with cookies for authentication +2. Use cookies with infinite scrolling +3. Define a Pydantic model for structured output +4. Make the API call and handle the response +5. 
Process the extracted data + +Requirements: +- Python 3.7+ +- scrapegraph-py +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here +""" + +import json +import os +from typing import Dict, Optional + +from dotenv import load_dotenv +from pydantic import BaseModel, Field + +from scrapegraph_py import Client + +# Load environment variables from .env file +load_dotenv() + + +# Define the data models for structured output +class CookieInfo(BaseModel): + """Model representing cookie information.""" + + cookies: Dict[str, str] = Field(description="Dictionary of cookie key-value pairs") + + +def main(): + """Example usage of the cookies scraper.""" + # Check if API key is available + if not os.getenv("SGAI_API_KEY"): + print("Error: SGAI_API_KEY not found in .env file") + print("Please create a .env file with your API key:") + print("SGAI_API_KEY=your_api_key_here") + return + + # Initialize the client + client = Client.from_env() + + # Example 1: Basic cookies example (httpbin.org/cookies) + print("=" * 60) + print("EXAMPLE 1: Basic Cookies Example") + print("=" * 60) + + website_url = "https://httpbin.org/cookies" + user_prompt = "Extract all cookies info" + cookies = {"cookies_key": "cookies_value"} + + try: + # Perform the scraping with cookies + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + output_schema=CookieInfo, + ) + + # Print the results + print("\nExtracted Cookie Information:") + print(json.dumps(response, indent=2)) + + except Exception as e: + print(f"Error occurred: {str(e)}") + + # Example 2: Cookies with infinite scrolling + print("\n" + "=" * 60) + print("EXAMPLE 2: Cookies with Infinite Scrolling") + print("=" * 60) + + website_url = "https://httpbin.org/cookies" + user_prompt = "Extract all cookies and scroll information" + cookies = {"session_id": "abc123", "user_token": "xyz789"} + + try: + # Perform the scraping with cookies and infinite scrolling + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + number_of_scrolls=3, + output_schema=CookieInfo, + ) + + # Print the results + print("\nExtracted Cookie Information with Scrolling:") + print(json.dumps(response, indent=2)) + + except Exception as e: + print(f"Error occurred: {str(e)}") + + # Example 3: Cookies with pagination + print("\n" + "=" * 60) + print("EXAMPLE 3: Cookies with Pagination") + print("=" * 60) + + website_url = "https://httpbin.org/cookies" + user_prompt = "Extract all cookies from multiple pages" + cookies = {"auth_token": "secret123", "preferences": "dark_mode"} + + try: + # Perform the scraping with cookies and pagination + response = client.smartscraper( + website_url=website_url, + user_prompt=user_prompt, + cookies=cookies, + total_pages=3, + output_schema=CookieInfo, + ) + + # Print the results + print("\nExtracted Cookie Information with Pagination:") + print(json.dumps(response, indent=2)) + + except Exception as e: + print(f"Error occurred: {str(e)}") + + # Close the client + client.close() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py index 555a7d3..155a3f0 100644 --- a/scrapegraph-py/scrapegraph_py/async_client.py +++ b/scrapegraph-py/scrapegraph_py/async_client.py @@ -174,11 +174,12 @@ async def smartscraper( website_url: Optional[str] = None, website_html: Optional[str] = None, headers: Optional[dict[str, str]] = 
None, + cookies: Optional[Dict[str, str]] = None, output_schema: Optional[BaseModel] = None, number_of_scrolls: Optional[int] = None, total_pages: Optional[int] = None, ): - """Send a smartscraper request with optional pagination support""" + """Send a smartscraper request with optional pagination support and cookies""" logger.info("🔍 Starting smartscraper request") if website_url: logger.debug(f"🌐 URL: {website_url}") @@ -186,6 +187,8 @@ async def smartscraper( logger.debug("📄 Using provided HTML content") if headers: logger.debug("🔧 Using custom headers") + if cookies: + logger.debug("🍪 Using cookies for authentication/session management") if number_of_scrolls is not None: logger.debug(f"🔄 Number of scrolls: {number_of_scrolls}") if total_pages is not None: @@ -196,6 +199,7 @@ async def smartscraper( website_url=website_url, website_html=website_html, headers=headers, + cookies=cookies, user_prompt=user_prompt, output_schema=output_schema, number_of_scrolls=number_of_scrolls, diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py index 1a845d4..9cde1f6 100644 --- a/scrapegraph-py/scrapegraph_py/client.py +++ b/scrapegraph-py/scrapegraph_py/client.py @@ -182,11 +182,12 @@ def smartscraper( website_url: Optional[str] = None, website_html: Optional[str] = None, headers: Optional[dict[str, str]] = None, + cookies: Optional[Dict[str, str]] = None, output_schema: Optional[BaseModel] = None, number_of_scrolls: Optional[int] = None, total_pages: Optional[int] = None, ): - """Send a smartscraper request with optional pagination support""" + """Send a smartscraper request with optional pagination support and cookies""" logger.info("🔍 Starting smartscraper request") if website_url: logger.debug(f"🌐 URL: {website_url}") @@ -194,6 +195,8 @@ def smartscraper( logger.debug("📄 Using provided HTML content") if headers: logger.debug("🔧 Using custom headers") + if cookies: + logger.debug("🍪 Using cookies for authentication/session management") if number_of_scrolls is not None: logger.debug(f"🔄 Number of scrolls: {number_of_scrolls}") if total_pages is not None: @@ -204,6 +207,7 @@ def smartscraper( website_url=website_url, website_html=website_html, headers=headers, + cookies=cookies, user_prompt=user_prompt, output_schema=output_schema, number_of_scrolls=number_of_scrolls, diff --git a/scrapegraph-py/scrapegraph_py/models/smartscraper.py b/scrapegraph-py/scrapegraph_py/models/smartscraper.py index 33d233d..b57fb93 100644 --- a/scrapegraph-py/scrapegraph_py/models/smartscraper.py +++ b/scrapegraph-py/scrapegraph_py/models/smartscraper.py @@ -1,6 +1,6 @@ # Models for smartscraper endpoint -from typing import Optional, Type +from typing import Optional, Type, Dict from uuid import UUID from bs4 import BeautifulSoup @@ -28,6 +28,11 @@ class SmartScraperRequest(BaseModel): }, description="Optional headers to send with the request, including cookies and user agent", ) + cookies: Optional[Dict[str, str]] = Field( + None, + example={"session_id": "abc123", "user_token": "xyz789"}, + description="Dictionary of cookies to send with the request for authentication or session management", + ) output_schema: Optional[Type[BaseModel]] = None number_of_scrolls: Optional[conint(ge=0, le=100)] = Field( default=None, diff --git a/scrapegraph-py/test_cookies_integration.py b/scrapegraph-py/test_cookies_integration.py new file mode 100644 index 0000000..9cac46f --- /dev/null +++ b/scrapegraph-py/test_cookies_integration.py @@ -0,0 +1,97 @@ +""" +Test file to verify cookies integration 
functionality. +""" + +import json +from pydantic import BaseModel, Field + +from scrapegraph_py.models.smartscraper import SmartScraperRequest + + +class TestCookieInfo(BaseModel): + """Test model for cookie information.""" + + cookies: dict = Field(description="Dictionary of cookie key-value pairs") + + +def test_cookies_integration(): + """Test that cookies are properly integrated into SmartScraperRequest.""" + + print("🧪 Testing Cookies Integration") + print("=" * 50) + + # Test 1: Basic cookies + print("\n1. Testing basic cookies...") + cookies = {"session_id": "abc123", "auth_token": "xyz789"} + + request = SmartScraperRequest( + user_prompt="Extract cookie information", + website_url="https://httpbin.org/cookies", + cookies=cookies + ) + + data = request.model_dump() + print(f"✅ Cookies included in request: {data.get('cookies')}") + + # Test 2: Cookies with output schema + print("\n2. Testing cookies with output schema...") + + request_with_schema = SmartScraperRequest( + user_prompt="Extract cookie information", + website_url="https://httpbin.org/cookies", + cookies=cookies, + output_schema=TestCookieInfo + ) + + data_with_schema = request_with_schema.model_dump() + print(f"✅ Cookies with schema: {data_with_schema.get('cookies')}") + print(f"✅ Output schema included: {data_with_schema.get('output_schema') is not None}") + + # Test 3: Cookies with scrolling and pagination + print("\n3. Testing cookies with advanced features...") + + request_advanced = SmartScraperRequest( + user_prompt="Extract cookie information from multiple pages", + website_url="https://httpbin.org/cookies", + cookies=cookies, + number_of_scrolls=5, + total_pages=3, + output_schema=TestCookieInfo + ) + + data_advanced = request_advanced.model_dump() + print(f"✅ Advanced request cookies: {data_advanced.get('cookies')}") + print(f"✅ Number of scrolls: {data_advanced.get('number_of_scrolls')}") + print(f"✅ Total pages: {data_advanced.get('total_pages')}") + + # Test 4: Complex cookies scenario + print("\n4. Testing complex cookies scenario...") + + complex_cookies = { + "session_id": "abc123def456", + "user_id": "user789", + "auth_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...", + "preferences": "dark_mode,usd", + "cart_id": "cart101112", + "csrf_token": "csrf_xyz789" + } + + request_complex = SmartScraperRequest( + user_prompt="Extract user profile and preferences", + website_url="https://example.com/dashboard", + cookies=complex_cookies, + headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}, + output_schema=TestCookieInfo + ) + + data_complex = request_complex.model_dump() + print(f"✅ Complex cookies count: {len(data_complex.get('cookies', {}))}") + print(f"✅ Headers included: {data_complex.get('headers') is not None}") + + print("\n" + "=" * 50) + print("✅ All cookies integration tests passed!") + print("=" * 50) + + +if __name__ == "__main__": + test_cookies_integration() \ No newline at end of file