-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWebScaper_ANN_newsagg.py
More file actions
51 lines (43 loc) · 1.51 KB
/
WebScaper_ANN_newsagg.py
File metadata and controls
51 lines (43 loc) · 1.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import datetime
# Path to chromedriver.exe. Written as a raw string so the Windows
# backslashes are taken literally instead of being parsed as (invalid)
# escape sequences such as "\L" or "\c".
path = r"D:\Libraries\Arkiralor's Software Setups\Chromedriver\chromedriver.exe"
# Create the module-wide WebDriver instance used by scrape() and main().
# This runs at import time and passes `path` as the first positional argument
# of webdriver.Chrome.
# NOTE(review): presumably that argument is the chromedriver executable
# location (as in Selenium 3) — confirm against the installed Selenium version.
driver = webdriver.Chrome(path)
# Site URL that main() loads before scraping.
url = 'https://www.animenewsnetwork.com/'
def scrape():
    """Collect the latest headlines from the page currently loaded in ``driver``.

    Waits up to 20 seconds for an element with the class ``latest-headline``
    to be present, then reads the link, heading and excerpt out of every
    element carrying that class.

    Returns:
        list[dict]: One ``{'header', 'link', 'text'}`` dict per headline, in
        the order Selenium returns the elements. Empty when no headline shows
        up before the timeout; in that case the timeout error is printed and
        the browser is shut down.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of this module.
    from selenium.common.exceptions import TimeoutException

    pageInfo = []
    try:
        # wait for search results to be fetched
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'latest-headline'))
        )
    except TimeoutException as ex:
        print(ex)
        driver.quit()
        # The session no longer exists after quit(); querying it would only
        # raise, so report "no results" instead.
        return pageInfo

    # contains the search results
    searchResults = driver.find_elements(By.CLASS_NAME, 'latest-headline')
    for result in searchResults:
        # find_element(By.…) is used instead of the find_element_by_* helpers
        # because it is available on both older and newer Selenium releases.
        link = result.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
        header = result.find_element(By.CSS_SELECTOR, 'h3').text
        text = result.find_element(By.CLASS_NAME, 'latest-article-excerpt').text
        pageInfo.append({'header': header, 'link': link, 'text': text})
    return pageInfo
def main():
    """Open the site, scrape its headlines and save them to a dated CSV.

    The scraped rows are printed, then written through a pandas DataFrame to
    ``otakunews_<date>.csv`` (date as produced by ``str(datetime.date.today())``)
    relative to the current working directory. The browser is shut down when
    the function exits, whether or not loading, scraping or writing succeeded.
    """
    try:
        # Code to open a specific url
        driver.get(url)
        pageInfo = scrape()
        print(pageInfo)
        df = pd.DataFrame(pageInfo)
        fileName = 'otakunews' + '_' + str(datetime.date.today()) + '.csv'
        df.to_csv(fileName)
    finally:
        # Quit even on failure so no browser/chromedriver process is leaked.
        driver.quit()
# Run the scraper only when this file is executed directly as a script.
if __name__ == "__main__":
    main()