scraper.py · 112 lines (86 loc) · 3.31 KB

import csv
import logging
import os
import sys
import textwrap
from datetime import datetime

import pandas as pd
import requests
from dotenv import load_dotenv

# Silence gRPC log noise before the google-generativeai import picks it up.
os.environ["GRPC_VERBOSITY"] = "NONE"
import google.generativeai as genai

# Set up logging to a timestamped file under data/logs/.
os.makedirs("data/logs", exist_ok=True)
logging.basicConfig(
    filename=f"data/logs/scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Load API credentials from a local .env file.
load_dotenv()
API_KEY = os.getenv("GOOGLE_API_KEY")
CX = os.getenv("CUSTOM_SEARCH_ENGINE_ID")
GEMINI_API_KEY = os.getenv("GEMINI_API")
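
# Expected .env layout (key names taken from the os.getenv calls above; the
# values shown are placeholders, not real credentials):
#
#   GOOGLE_API_KEY=<Google Cloud API key>
#   CUSTOM_SEARCH_ENGINE_ID=<Programmable Search Engine cx id>
#   GEMINI_API=<Gemini API key>
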

def scrape_google(query, num_results=10):
    """Query the Google Custom Search JSON API and return a list of result dicts."""
    url = "https://www.googleapis.com/customsearch/v1"
    params = {"key": API_KEY, "cx": CX, "q": query, "num": num_results}
    logging.info(f"Fetching results for query: {query}")
    try:
        req = requests.get(url, params=params)
        req.raise_for_status()
        data = req.json()
        results = []
        for item in data.get("items", []):
            results.append({
                "Title": item.get("title", ""),
                "URL": item.get("link", ""),
                "Snippet": item.get("snippet", ""),
            })
        logging.info(f"Fetched {len(results)} results for query: {query}")
        return results
    except requests.exceptions.RequestException as e:
        logging.error(f"Request failed: {e}")
        return []
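
# Illustrative shape of scrape_google's return value (contents are made up):
#   scrape_google("python web scraping")
#   -> [{"Title": "...", "URL": "https://...", "Snippet": "..."}, ...]
# or [] when the request fails.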

def save_results(results, filename):
    """Write scraped results to a CSV file with Title/URL/Snippet columns."""
    if not results:
        logging.warning("No results to save.")
        print("❌ No results to save.")
        return
    os.makedirs("data", exist_ok=True)
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["Title", "URL", "Snippet"])
        writer.writeheader()
        writer.writerows(results)
    logging.info(f"Saved results to {filename}")
    print(f"✅ Saved {len(results)} results to {filename}")

def summarize(query):
    """Summarize the scraped snippets for a query with the Gemini API."""
    filename = f"./data/{query}.csv"
    df = pd.read_csv(filename)
    texts_combined = "\n\n".join(df["Snippet"].astype(str).tolist())
    prompt = f"""
You are an expert text summarizer. I will provide you with multiple short text excerpts.
Your task is to read all of them and produce a single, concise summary that captures the
key ideas, themes, and main points across all excerpts.
Make the summary clear, coherent, and around 3–5 sentences long.

Texts:
{texts_combined}

Output only the final summary.
"""
    genai.configure(api_key=GEMINI_API_KEY)
    model = genai.GenerativeModel("gemini-2.5-pro")
    response = model.generate_content(prompt)
    # Wrap the summary to 95 characters per line for readability.
    wrapped_text = textwrap.fill(response.text, width=95)
    folder_path = "data_analysis"
    os.makedirs(folder_path, exist_ok=True)
    summary_file_path = os.path.join(folder_path, f"{query}_summary.txt")
    with open(summary_file_path, "w", encoding="utf-8") as f:
        f.write(wrapped_text)
    print(f"✅ Summary saved to {summary_file_path}")

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python scraper.py <search query>")
        sys.exit(1)
    # Join all CLI arguments so multi-word queries form one search string.
    query = " ".join(sys.argv[1:])
    logging.info(f"Starting scrape for query: {query}")
    data = scrape_google(query)
    save_results(data, f"./data/{query}.csv")
    summarize(query)
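
# Example invocation (assumes a .env file as sketched above; the query string
# doubles as the output filename, producing ./data/<query>.csv and
# data_analysis/<query>_summary.txt):
#
#   python scraper.py machine learning trends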