This repository was archived by the owner on Sep 18, 2024. It is now read-only.

Commit 6bf6382

Add files via upload
1 parent accb5fa commit 6bf6382

10 files changed

Lines changed: 1064 additions & 0 deletions

02_crawl_from_csv.py

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
#!/usr/bin/env python3
__author__ = "Giacomo Bergami"
__copyright__ = "Copyright 2020, Giacomo Bergami"
__credits__ = ["Giacomo Bergami"]
__license__ = "GPL"
__version__ = "2.0"
__maintainer__ = "Giacomo Bergami"
__email__ = "bergamigiacomo@gmail.com"
__status__ = "Production"
## This script downloads the articles even when the main process only wrote the CSV files without fetching the data from the servers
import sys, argparse, csv, os, json
from os import path
from newsplease import NewsPlease

database = dict()

def craw_from_file(filename):
    print(filename)
    print('-------------------')
    print('-------------------')
    # The last 15 characters ("_YYYY-MM-DD.csv") are stripped, so the source
    # prefix becomes both the database key and the per-source JSON file name.
    dkey = filename[:-15]
    dkey_dbfile = dkey+".json"
    dkey_db = {}
    if path.exists(dkey_dbfile):
        dkey_db = json.load(open(dkey_dbfile))
    dkey_db_nextKey = 1
    if len(dkey_db) > 0:
        dkey_db_nextKey = int(max(dkey_db, key=int))+1
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            if not (dkey in database):
                database[dkey] = set()
            else:
                database[dkey] = set(database[dkey])
            if not (row[0] in database[dkey]):
                try:
                    database[dkey].add(row[0])
                    dkey_db[str(dkey_db_nextKey)] = NewsPlease.from_url(row[0]).get_serializable_dict()
                    print("Article #"+str(dkey_db_nextKey)+" was parsed")
                    dkey_db_nextKey = dkey_db_nextKey+1
                except Exception as e:
                    print("Error: ''"+str(e)+"'' for article at ''"+row[0]+"''! Downloading it as a simple text file...")
    json.dump(dkey_db, open(dkey_dbfile, 'w'))
    dump_database('db.json')


def load_database(filename):
    global database
    if path.exists(filename):
        with open(filename) as json_file:
            data = json.load(json_file)
            for k in data:
                data[k] = set(data[k])
            database = data


def dump_database(filename):
    for k in database:
        database[k] = list(database[k])
    json.dump(database, open(filename, 'w'))


def get_all_csvs():
    files = [f for f in os.listdir('.') if os.path.isfile(f)]
    for f in files:
        if f.endswith('.csv'):
            craw_from_file(f)


if __name__ == '__main__':
    load_database('db.json')
    get_all_csvs()
    dump_database('db.json')
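
A minimal usage sketch, assuming the daily CSVs follow the <source>_$day.csv naming used in newcastle.txt (with $day expanded to a YYYY-MM-DD date) and carry one article URL per row in the first column; both points are inferred from the code above rather than stated in the commit:

    # Illustrative only: how craw_from_file turns a hypothetical daily CSV name
    # into the per-source database key by dropping the 15-character "_YYYY-MM-DD.csv" tail.
    print("ncl_chron_2020-05-01.csv"[:-15])   # -> "ncl_chron"
    # Articles parsed from that CSV would be appended to ncl_chron.json, and the
    # visited URLs tracked under the "ncl_chron" key of db.json.
    # Running the crawl over every CSV in the current folder:
    #   python3 02_crawl_from_csv.py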

crawl_from_csv.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
#!/usr/bin/env python3
__author__ = "Giacomo Bergami"
__copyright__ = "Copyright 2020, Giacomo Bergami"
__credits__ = ["Giacomo Bergami"]
__license__ = "GPL"
__version__ = "2.0"
__maintainer__ = "Giacomo Bergami"
__email__ = "bergamigiacomo@gmail.com"
__status__ = "Production"
# This script can be used to load all the URLs listed in the CSV files into the JSON databases.
# This is useful when "scrape_by_keyword" fails during the processing of the RSS feed.
import scrape_by_keyword
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Scrape by Keyword: scraping the online news via Python.")
    parser.add_argument("--RSSFeedCsv", default=False, type=str,
                        help="Defines the configuration csv file containing the rss feed from the newspapers")
    parser.add_argument('--threads', default=8, type=int, help="The minimum amount of threads to be used in the scraping")
    args = parser.parse_args()
    scrape_by_keyword.rssFeedFile = args.RSSFeedCsv
    scrape_by_keyword.workers = args.threads
    scrape_by_keyword.load_database('db.json')
    if scrape_by_keyword.workers == 0:
        scrape_by_keyword.get_all_csvs()
    else:
        scrape_by_keyword.multi_crawl()
    scrape_by_keyword.dump_database('db.json')
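
A command-line sketch, assuming the companion module scrape_by_keyword (not part of this commit) exposes the rssFeedFile, workers, load_database, get_all_csvs, multi_crawl, and dump_database members referenced above:

    python3 crawl_from_csv.py --RSSFeedCsv newcastle.txt --threads 8

With --threads 0 the script takes the sequential scrape_by_keyword.get_all_csvs() path; any other value routes the crawl through multi_crawl().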

generate_final_db.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
#!/usr/bin/env python3
__author__ = "Giacomo Bergami"
__copyright__ = "Copyright 2020, Giacomo Bergami"
__credits__ = ["Giacomo Bergami"]
__license__ = "GPL"
__version__ = "2.0"
__maintainer__ = "Giacomo Bergami"
__email__ = "bergamigiacomo@gmail.com"
__status__ = "Production"
# This script merges all the per-source JSON databases in the current folder
# (skipping db.json and final_db.json) into a single final_db.json keyed by article URL.
import json, os

if __name__ == '__main__':
    db = {}
    files = [f for f in os.listdir('.') if os.path.isfile(f)]
    for f in files:
        if (f.endswith('.json')) and (not (f == "db.json") and not (f == "final_db.json")):
            print(f)
            try:
                l = json.load(open(f))
                for k in l:
                    db[l[k]["url"]] = l[k]
            except:
                print("Error on "+f)
    print(str(len(db))+" articles were downloaded in total!")
    json.dump(db, open("final_db.json", "w"), indent=4, sort_keys=True)
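
A read-back sketch of the merged output; the per-article field names come from news-please's serializable dictionary, and "title" is used here only as an illustrative key:

    import json

    # final_db.json maps each article URL to its news-please record.
    final_db = json.load(open("final_db.json"))
    print(str(len(final_db)) + " articles in the merged database")
    for url, article in list(final_db.items())[:3]:
        print(url, "->", article.get("title"))   # "title" assumed from the news-please output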

interval_stats.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
#!/usr/bin/env python3
__author__ = "Giacomo Bergami"
__copyright__ = "Copyright 2020, Giacomo Bergami"
__credits__ = ["Giacomo Bergami"]
__license__ = "GPL"
__version__ = "2.0"
__maintainer__ = "Giacomo Bergami"
__email__ = "bergamigiacomo@gmail.com"
__status__ = "Production"
import scrape_by_keyword

for (x, y) in scrape_by_keyword.url_list(scrape_by_keyword.date_global_minimum.strftime('%Y-%m-%d')):
    scrape_by_keyword.NewsThread(x, scrape_by_keyword.phrases, y, scrape_by_keyword.database, scrape_by_keyword.proxies, True)

newcastle.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
www.chroniclelive.co.uk/?service=rss,ncl_chron_$day.csv
https://www.nexus.org.uk/news.xml,nexus_$day.csv
https://www.ncl.ac.uk/data/mobile/rss/pressoffice/pressnews/index.xml,uni_$day.csv
https://www.theguardian.com/uk/newcastle/rss,guardian_$day.csv
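
Each line of this configuration pairs an RSS feed URL with the daily CSV it should produce; the $day placeholder is presumably expanded to the crawl date by scrape_by_keyword, which is not included in this commit.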

plot.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
#!/usr/bin/env python3
__author__ = "Giacomo Bergami"
__copyright__ = "Copyright 2020, Giacomo Bergami"
__credits__ = ["Giacomo Bergami"]
__license__ = "GPL"
__version__ = "2.0"
__maintainer__ = "Giacomo Bergami"
__email__ = "bergamigiacomo@gmail.com"
__status__ = "Production"
# This script plots each per-source JSON database via scrape_by_keyword.plot_jsons
# and then draws a combined "Total" plot over all of them.
import json, os, scrape_by_keyword

if __name__ == '__main__':
    files = [f for f in os.listdir('.') if os.path.isfile(f)]
    ll = []
    for f in files:
        if (f.endswith('.json')) and (not (f == "db.json") and not (f == "final_db.json")):
            ll.extend(scrape_by_keyword.plot_jsons(f, True))
    scrape_by_keyword.plot_list("Total", ll)

reconstruct_db_from_jsons.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
#!/usr/bin/env python3
__author__ = "Giacomo Bergami"
__copyright__ = "Copyright 2020, Giacomo Bergami"
__credits__ = ["Giacomo Bergami"]
__license__ = "GPL"
__version__ = "2.0"
__maintainer__ = "Giacomo Bergami"
__email__ = "bergamigiacomo@gmail.com"
__status__ = "Production"
# This script rebuilds db.json (the per-source sets of already-crawled URLs)
# from the per-source JSON databases found in the current folder.
import json, os

if __name__ == '__main__':
    db = {}
    files = [f for f in os.listdir('.') if os.path.isfile(f)]
    for f in files:
        if (f.endswith('.json')) and (not (f == "db.json") and not (f == "final_db.json")):
            print(f)
            try:
                l = json.load(open(f))
                dkey = f[:-5]
                db[dkey] = set()
                for k in l:
                    db[dkey].add(l[k]["url"])
                db[dkey] = list(db[dkey])
            except:
                print("Error on "+f)
    json.dump(db, open("db.json", "w"))

requirements.txt

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
ago==0.0.95
attrs==23.1.0
Automat==22.10.0
beautifulsoup4==4.12.2
boto3==1.28.75
botocore==1.31.75
bs4==0.0.1
certifi==2023.7.22
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.1
click==8.1.7
colorama==0.4.6
constantly==23.10.4
cryptography==41.0.5
cssselect==1.2.0
dateparser==1.1.8
dotmap==1.3.30
elastic-transport==8.10.0
elasticsearch==8.10.1
faust-cchardet==2.1.19
feedfinder2==0.0.4
feedparser==6.0.10
filelock==3.13.1
hjson==3.1.0
hurry.filesize==0.9
hyperlink==21.0.0
idna==3.4
incremental==22.10.0
itemadapter==0.8.0
itemloaders==1.1.0
jieba3k==0.35.1
jmespath==1.0.1
joblib==1.3.2
langdetect==1.0.9
lxml==4.9.3
news-please==1.5.35
newspaper3k==0.2.8
nltk==3.8.1
packaging==23.2
parsel==1.8.1
Pillow==10.1.0
plac==1.4.1
plotille==5.0.0
Protego==0.3.0
psycopg2-binary==2.9.9
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
PyDispatcher==2.0.7
PyMySQL==1.1.0
pyOpenSSL==23.3.0
python-dateutil==2.8.2
pytz==2023.3.post1
PyYAML==6.0.1
queuelib==1.6.2
readability-lxml==0.8.1
regex==2023.10.3
requests==2.31.0
requests-file==1.5.1
s3transfer==0.7.0
Scrapy==2.11.0
service-identity==23.1.0
sgmllib3k==1.0.0
six==1.16.0
soupsieve==2.5
tinysegmenter==0.3
tldextract==5.0.1
tqdm==4.66.1
Twisted==22.10.0
typing_extensions==4.8.0
tzlocal==5.2
urllib3==2.0.7
w3lib==2.1.2
warcio==1.7.4
zope.interface==6.1
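
The pinned environment can be reproduced with pip install -r requirements.txt; news-please 1.5.35 supplies the NewsPlease class imported by 02_crawl_from_csv.py.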
