1 change: 1 addition & 0 deletions WebScraper (Google Search API)/.gitignore
@@ -0,0 +1 @@
.env
47 changes: 47 additions & 0 deletions WebScraper (Google Search API)/README.md
@@ -0,0 +1,47 @@
# 🌐 Python Web Surfing
This Python project lets you scrape search results from the web using the ```Google API``` and a ```Google Custom Search Engine ID```, extract useful information, and perform basic data analysis with the ```Gemini API```. It is designed to be reliable, modular, and easy to run from the command line.

---

## ✅ Functionalities Implemented

1. **Extracting Titles, URLs, and Snippets**
- Scrapes and saves the title, URL, and snippet/description from search results.

2. **Taking Dynamic Input (Query from Command Line)**
- Run the scraper with any search query directly from the command line:
```bash
python scraper.py <your query>
```
For example:
```bash
python scraper.py "AI in healthcare"
```

3. **Saving Results to CSV File**
- Results are saved in a separate CSV file for each query.
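Because each CSV filename is derived directly from the query, queries containing characters such as `/` or `:` could produce invalid paths. A minimal sketch of a hypothetical `slugify` helper that could guard against this (this helper is illustrative and not part of the scraper as written):

```python
import re

def slugify(query: str) -> str:
    """Turn an arbitrary search query into a filesystem-safe name."""
    # Replace each run of non-alphanumeric characters with one underscore,
    # trim leading/trailing underscores, and lowercase the result.
    return re.sub(r"[^A-Za-z0-9]+", "_", query).strip("_").lower()

# e.g. slugify("AI in healthcare") -> "ai_in_healthcare"
```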

4. **Running in Headless Mode (Browser in Background)**
- Because the scraper calls the Custom Search API instead of driving a browser, it runs fully headless.

5. **Crawling Multiple Pages**
- The scraper can crawl multiple pages of search results (the free-tier Google API returns at most 10 results per request).
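The Custom Search JSON API paginates with a `start` parameter (the 1-based index of the first result), so fetching several pages means requesting offsets 1, 11, 21, and so on. A small sketch of how those offsets could be computed (the parameter name matches the API; the helper itself is illustrative, not the scraper's exact code):

```python
def page_offsets(pages: int, per_page: int = 10) -> list[int]:
    """1-based `start` values for the Custom Search API, one per page."""
    return [1 + i * per_page for i in range(pages)]

# Each request would then pass start=offset alongside key, cx, q, and num,
# for each offset in page_offsets(3), i.e. 1, 11, 21.
```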

6. **Adding Logs**
- Logs are stored in ```data/logs/```.

7. **Data Summarizer**
- Summarizes all the results that were fetched and stores the summary in the ```data_analysis``` folder.

## ⚡ How to Run
1. Install dependencies:
```bash
pip install -r requirements.txt
```
2. Run the scraper:
```bash
python scraper.py <your query>
```
## 💡 Notes
- Ensure you have a ```Google API``` key, a ```Google Custom Search Engine ID```, and a ```Gemini API``` key set up for the script.
- Logs are automatically created for debugging and tracking scraping activity.
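The script loads these credentials with `python-dotenv`, so they can live in a `.env` file next to the script (the repo's `.gitignore` already excludes it from version control). A minimal sketch with placeholder values, using the variable names the script reads:

```
GOOGLE_API_KEY=your-google-api-key
CUSTOM_SEARCH_ENGINE_ID=your-search-engine-id
GEMINI_API=your-gemini-api-key
```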
@@ -0,0 +1,7 @@
Artificial intelligence is a powerful and disruptive technology with the potential to
fundamentally transform medicine and healthcare delivery. AI systems analyze patient data to
predict health risks, diagnose diseases, and develop personalized treatment plans, thereby
assisting clinicians with decision-making. By creating more efficient workflows and enabling
better self-management of chronic illnesses, AI aims to make healthcare more personalized,
accessible, and effective. Ultimately, the goal is to improve patient care, achieve better
strategic outcomes, and potentially save lives.
4 changes: 4 additions & 0 deletions WebScraper (Google Search API)/requirements.txt
@@ -0,0 +1,4 @@
pandas
python-dotenv
requests
google-generativeai
112 changes: 112 additions & 0 deletions WebScraper (Google Search API)/scraper.py
@@ -0,0 +1,112 @@
import csv
import logging
import sys
import os
import requests
from datetime import datetime
from dotenv import load_dotenv
import pandas as pd
import textwrap

# Silence gRPC log noise before importing the Gemini client
os.environ['GRPC_VERBOSITY'] = 'NONE'
import google.generativeai as genai


# Setup logging
os.makedirs("data/logs", exist_ok=True)
logging.basicConfig(
    filename=f"data/logs/scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

load_dotenv()

API_KEY = os.getenv("GOOGLE_API_KEY")
CX = os.getenv("CUSTOM_SEARCH_ENGINE_ID")
gemini_api = os.getenv("GEMINI_API")


def scrape_google(query, num_results=10):
    url = "https://www.googleapis.com/customsearch/v1"
    params = {"key": API_KEY, "cx": CX, "q": query, "num": num_results}

    logging.info(f"Fetching results for query: {query}")
    try:
        req = requests.get(url, params=params)
        req.raise_for_status()
        data = req.json()
        results = []

        for item in data.get("items", []):
            results.append({
                "Title": item.get("title", ""),
                "URL": item.get("link", ""),
                "Snippet": item.get("snippet", "")
            })

        logging.info(f"Fetched {len(results)} results for query: {query}")
        return results

    except requests.exceptions.RequestException as e:
        logging.error(f"Request failed: {e}")
        return []


def save_results(results, filename):
    if not results:
        logging.warning("No results to save.")
        print("❌ No results to save.")
        return

    os.makedirs("data", exist_ok=True)
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["Title", "URL", "Snippet"])
        writer.writeheader()
        writer.writerows(results)
    logging.info(f"Saved results to {filename}")
    print(f"✅ Saved {len(results)} results to {filename}")


def summarize(query):
    filename = f"./data/{query}.csv"
    df = pd.read_csv(filename)
    texts_combined = "\n\n".join(df["Snippet"].astype(str).tolist())
    PROMPT = f'''
You are an expert text summarizer. I will provide you with multiple short text excerpts.
Your task is to read all of them and produce a single, concise summary that captures the
key ideas, themes, and main points across all excerpts.

Make the summary clear, coherent, and around 3–5 sentences long.

Texts:
{texts_combined}

Output only the final summary.
'''
    genai.configure(api_key=gemini_api)
    model = genai.GenerativeModel('gemini-2.5-pro')
    response = model.generate_content(PROMPT)

    wrapped_text = textwrap.fill(response.text, width=95)

    folder_path = "data_analysis"
    os.makedirs(folder_path, exist_ok=True)
    summary_file_path = os.path.join(folder_path, f"{query}_summary.txt")

    with open(summary_file_path, "w", encoding="utf-8") as f:
        f.write(wrapped_text)

    print(f"✅ Summary saved to {summary_file_path}")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python scraper.py <search query>")
        sys.exit(1)

    # Join all CLI arguments so unquoted multi-word queries also work
    query = " ".join(sys.argv[1:])
    logging.info(f"Starting scrape for query: {query}")

    data = scrape_google(query)
    save_results(data, f"./data/{query}.csv")

    summarize(query)
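The parsing step in `scrape_google` maps each entry of the API's `items` array to the Title/URL/Snippet fields, defaulting missing keys to empty strings. That mapping can be exercised offline with a canned response shaped like the API's JSON; the helper below is a hypothetical extraction of that loop, not part of scraper.py itself:

```python
def parse_items(data: dict) -> list[dict]:
    # Same field mapping as scrape_google: absent keys default to "".
    return [
        {"Title": it.get("title", ""),
         "URL": it.get("link", ""),
         "Snippet": it.get("snippet", "")}
        for it in data.get("items", [])
    ]

# A canned response with no "snippet" key yields an empty Snippet field.
sample = {"items": [{"title": "AI in healthcare", "link": "https://example.com"}]}
```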