webscraper.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Simple Web Scraper
This script demonstrates a basic web scraper using BeautifulSoup4 and requests.
It fetches a web page, parses its HTML content, and extracts useful information.
"""
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
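# Note: the script relies on the third-party packages `requests` and `beautifulsoup4`
# (the pip name for bs4), e.g. installed with: pip install requests beautifulsoup4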
class WebScraper:
    """A simple web scraper class to fetch and parse web content"""

    def __init__(self, url):
        """Initialize the scraper with a URL

        Args:
            url (str): The URL to scrape
        """
        self.url = url
        self.soup = None
        self.response = None
    def fetch_page(self):
        """Fetch the web page content

        Returns:
            bool: True if the page was successfully fetched, False otherwise
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            self.response = requests.get(self.url, headers=headers, timeout=10)
            self.response.raise_for_status()  # Raise an exception for 4XX/5XX responses
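            # Note: raise_for_status() raises requests.exceptions.HTTPError, a
            # subclass of RequestException, so HTTP error codes are handled by
            # the except clause below.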
            self.soup = BeautifulSoup(self.response.text, 'html.parser')
            return True
        except requests.exceptions.RequestException as e:
            print(f"Error fetching the page: {e}")
            return False
    def get_page_title(self):
        """Extract the page title

        Returns:
            str: The page title or None if not found
        """
        if not self.soup:
            return None
        title_tag = self.soup.find('title')
        return title_tag.text.strip() if title_tag else None
    def get_all_links(self):
        """Extract all links from the page

        Returns:
            list: List of dictionaries containing link text and href
        """
        if not self.soup:
            return []
        links = []
        for link in self.soup.find_all('a', href=True):
            href = link['href']
            # Convert relative URLs to absolute URLs
            if href.startswith('/'):
                base_url = '/'.join(self.url.split('/')[:3])  # http(s)://domain.com
                href = base_url + href
            links.append({
                'text': link.text.strip(),
                'href': href
            })
        return links
    def get_all_images(self):
        """Extract all images from the page

        Returns:
            list: List of dictionaries containing image info
        """
        if not self.soup:
            return []
        images = []
        for img in self.soup.find_all('img', src=True):
            src = img['src']
            # Convert relative URLs to absolute URLs
            if src.startswith('/'):
                base_url = '/'.join(self.url.split('/')[:3])  # http(s)://domain.com
                src = base_url + src
            alt = img.get('alt', '')
            images.append({
                'src': src,
                'alt': alt
            })
        return images
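    # Note on URL resolution: the startswith('/') checks above only handle
    # root-relative URLs. A more general alternative (a sketch, not used by this
    # script) would be urllib.parse.urljoin, e.g. urljoin(self.url, href), which
    # also resolves relative paths like 'page.html' and protocol-relative '//...' URLs.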
    def extract_text_content(self):
        """Extract the main text content from the page

        Returns:
            str: The main text content
        """
        if not self.soup:
            return ""
        # Remove script and style elements
        for script in self.soup(["script", "style"]):
            script.extract()
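        # Note: extract() removes these tags from self.soup in place, so they
        # stay removed for any later calls on this scraper instance.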
        # Get text
        text = self.soup.get_text()
        # Break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # Break multi-headlines into a line each (split on runs of two spaces)
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Drop blank lines
        text = '\n'.join(chunk for chunk in chunks if chunk)
        return text
    def save_to_csv(self, data, filename):
        """Save data to a CSV file

        Args:
            data (list): List of dictionaries with the same keys
            filename (str): Name of the CSV file

        Returns:
            bool: True if the data was successfully saved, False otherwise
        """
        if not data:
            return False
        try:
            with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = data[0].keys()
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                for row in data:
                    writer.writerow(row)
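                # (writer.writerows(data) would be an equivalent one-liner here)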
            return True
        except Exception as e:
            print(f"Error saving to CSV: {e}")
            return False
def main():
    """Main function to demonstrate the scraper"""
    url = "https://news.ycombinator.com/"  # Example: Hacker News
    scraper = WebScraper(url)

    print(f"Fetching content from {url}...")
    if scraper.fetch_page():
        print("Page fetched successfully!")

        title = scraper.get_page_title()
        print(f"\nPage title: {title}")

        print("\nExtracting links...")
        links = scraper.get_all_links()
        print(f"Found {len(links)} links")

        # Display first 5 links
        for i, link in enumerate(links[:5]):
            print(f"{i+1}. {link['text']} - {link['href']}")

        # Save links to CSV
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"links_{timestamp}.csv"
        if scraper.save_to_csv(links, filename):
            print(f"\nLinks saved to {filename}")

        # Get images
        images = scraper.get_all_images()
        print(f"\nFound {len(images)} images")

        # Display first 3 images
        for i, img in enumerate(images[:3]):
            print(f"{i+1}. {img['alt']} - {img['src']}")
    else:
        print("Failed to fetch the page.")
if __name__ == "__main__":
    main()
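# Example usage beyond main() (a minimal sketch; any standard HTML page should work):
#
#     scraper = WebScraper("https://example.com/")
#     if scraper.fetch_page():
#         print(scraper.get_page_title())
#         print(scraper.extract_text_content()[:500])
#         scraper.save_to_csv(scraper.get_all_images(), "images.csv")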