This repository was archived by the owner on Sep 18, 2024. It is now read-only.

Commit 6bf6382

Add files via upload
1 parent accb5fa commit 6bf6382

10 files changed

Lines changed: 1064 additions & 0 deletions

02_crawl_from_csv.py

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
#!/usr/bin/env python3
__author__ = "Giacomo Bergami"
__copyright__ = "Copyright 2020, Giacomo Bergami"
__credits__ = ["Giacomo Bergami"]
__license__ = "GPL"
__version__ = "2.0"
__maintainer__ = "Giacomo Bergami"
__email__ = "bergamigiacomo@gmail.com"
__status__ = "Production"
## This script downloads the articles even when the main process only wrote the CSV files without fetching the data from the servers
import sys, argparse, csv, os, json
from os import path
from newsplease import NewsPlease

database = dict()

def craw_from_file(filename):
    print(filename)
    print('-------------------')
    print('-------------------')
    # The last 15 characters ("_YYYY-MM-DD.csv") are stripped, so the source
    # prefix becomes both the database key and the per-source JSON file name.
    dkey = filename[:-15]
    dkey_dbfile = dkey+".json"
    dkey_db = {}
    if path.exists(dkey_dbfile):
        dkey_db = json.load(open(dkey_dbfile))
    dkey_db_nextKey = 1
    if len(dkey_db) > 0:
        dkey_db_nextKey = int(max(dkey_db, key=int))+1
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            if not (dkey in database):
                database[dkey] = set()
            else:
                database[dkey] = set(database[dkey])
            if not (row[0] in database[dkey]):
                try:
                    database[dkey].add(row[0])
                    dkey_db[str(dkey_db_nextKey)] = NewsPlease.from_url(row[0]).get_serializable_dict()
                    print("Article #"+str(dkey_db_nextKey)+" was parsed")
                    dkey_db_nextKey = dkey_db_nextKey+1
                except Exception as e:
                    print("Error: ''"+str(e)+"'' for article at ''"+row[0]+"''! Downloading it as a simple text file...")
    json.dump(dkey_db, open(dkey_dbfile, 'w'))
    dump_database('db.json')


def load_database(filename):
    global database
    if path.exists(filename):
        with open(filename) as json_file:
            data = json.load(json_file)
            for k in data:
                data[k] = set(data[k])
            database = data


def dump_database(filename):
    for k in database:
        database[k] = list(database[k])
    json.dump(database, open(filename, 'w'))


def get_all_csvs():
    files = [f for f in os.listdir('.') if os.path.isfile(f)]
    for f in files:
        if f.endswith('.csv'):
            craw_from_file(f)


if __name__ == '__main__':
    load_database('db.json')
    get_all_csvs()
    dump_database('db.json')
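
A minimal usage sketch, assuming the daily CSVs follow the <source>_$day.csv naming used in newcastle.txt (with $day expanded to a YYYY-MM-DD date) and carry one article URL per row in the first column; both points are inferred from the code above rather than stated in the commit:

    # Illustrative only: how craw_from_file turns a hypothetical daily CSV name
    # into the per-source database key by dropping the 15-character "_YYYY-MM-DD.csv" tail.
    print("ncl_chron_2020-05-01.csv"[:-15])   # -> "ncl_chron"
    # Articles parsed from that CSV would be appended to ncl_chron.json, and the
    # visited URLs tracked under the "ncl_chron" key of db.json.
    # Running the crawl over every CSV in the current folder:
    #   python3 02_crawl_from_csv.py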

crawl_from_csv.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
#!/usr/bin/env python3
__author__ = "Giacomo Bergami"
__copyright__ = "Copyright 2020, Giacomo Bergami"
__credits__ = ["Giacomo Bergami"]
__license__ = "GPL"
__version__ = "2.0"
__maintainer__ = "Giacomo Bergami"
__email__ = "bergamigiacomo@gmail.com"
__status__ = "Production"
# This script can be used to load all the URLs listed in the CSV files into the JSON databases.
# This is useful when "scrape_by_keyword" fails during the processing of the RSS feed.
import scrape_by_keyword
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Scrape by Keyword: scraping the online news via Python.")
    parser.add_argument("--RSSFeedCsv", default=False, type=str,
                        help="Defines the configuration csv file containing the rss feed from the newspapers")
    parser.add_argument('--threads', default=8, type=int, help="The minimum amount of threads to be used in the scraping")
    args = parser.parse_args()
    scrape_by_keyword.rssFeedFile = args.RSSFeedCsv
    scrape_by_keyword.workers = args.threads
    scrape_by_keyword.load_database('db.json')
    if scrape_by_keyword.workers == 0:
        scrape_by_keyword.get_all_csvs()
    else:
        scrape_by_keyword.multi_crawl()
    scrape_by_keyword.dump_database('db.json')
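
A command-line sketch, assuming the companion module scrape_by_keyword (not part of this commit) exposes the rssFeedFile, workers, load_database, get_all_csvs, multi_crawl, and dump_database members referenced above:

    python3 crawl_from_csv.py --RSSFeedCsv newcastle.txt --threads 8

With --threads 0 the script takes the sequential scrape_by_keyword.get_all_csvs() path; any other value routes the crawl through multi_crawl().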

generate_final_db.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
#!/usr/bin/env python3
__author__ = "Giacomo Bergami"
__copyright__ = "Copyright 2020, Giacomo Bergami"
__credits__ = ["Giacomo Bergami"]
__license__ = "GPL"
__version__ = "2.0"
__maintainer__ = "Giacomo Bergami"
__email__ = "bergamigiacomo@gmail.com"
__status__ = "Production"
# This script merges all the per-source JSON databases in the current folder
# (skipping db.json and final_db.json) into a single final_db.json keyed by article URL.
import json, os

if __name__ == '__main__':
    db = {}
    files = [f for f in os.listdir('.') if os.path.isfile(f)]
    for f in files:
        if (f.endswith('.json')) and (not (f == "db.json") and not (f == "final_db.json")):
            print(f)
            try:
                l = json.load(open(f))
                for k in l:
                    db[l[k]["url"]] = l[k]
            except:
                print("Error on "+f)
    print(str(len(db))+" articles were downloaded in total!")
    json.dump(db, open("final_db.json", "w"), indent=4, sort_keys=True)
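
A read-back sketch of the merged output; the per-article field names come from news-please's serializable dictionary, and "title" is used here only as an illustrative key:

    import json

    # final_db.json maps each article URL to its news-please record.
    final_db = json.load(open("final_db.json"))
    print(str(len(final_db)) + " articles in the merged database")
    for url, article in list(final_db.items())[:3]:
        print(url, "->", article.get("title"))   # "title" assumed from the news-please output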

interval_stats.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
#!/usr/bin/env python3
__author__ = "Giacomo Bergami"
__copyright__ = "Copyright 2020, Giacomo Bergami"
__credits__ = ["Giacomo Bergami"]
__license__ = "GPL"
__version__ = "2.0"
__maintainer__ = "Giacomo Bergami"
__email__ = "bergamigiacomo@gmail.com"
__status__ = "Production"
import scrape_by_keyword

for (x, y) in scrape_by_keyword.url_list(scrape_by_keyword.date_global_minimum.strftime('%Y-%m-%d')):
    scrape_by_keyword.NewsThread(x, scrape_by_keyword.phrases, y, scrape_by_keyword.database, scrape_by_keyword.proxies, True)

newcastle.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
www.chroniclelive.co.uk/?service=rss,ncl_chron_$day.csv
https://www.nexus.org.uk/news.xml,nexus_$day.csv
https://www.ncl.ac.uk/data/mobile/rss/pressoffice/pressnews/index.xml,uni_$day.csv
https://www.theguardian.com/uk/newcastle/rss,guardian_$day.csv
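
Each line of this configuration pairs an RSS feed URL with the daily CSV it should produce; the $day placeholder is presumably expanded to the crawl date by scrape_by_keyword, which is not included in this commit.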

plot.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
#!/usr/bin/env python3
__author__ = "Giacomo Bergami"
__copyright__ = "Copyright 2020, Giacomo Bergami"
__credits__ = ["Giacomo Bergami"]
__license__ = "GPL"
__version__ = "2.0"
__maintainer__ = "Giacomo Bergami"
__email__ = "bergamigiacomo@gmail.com"
__status__ = "Production"
# This script plots each per-source JSON database via scrape_by_keyword.plot_jsons
# and then draws a combined "Total" plot over all of them.
import json, os, scrape_by_keyword

if __name__ == '__main__':
    files = [f for f in os.listdir('.') if os.path.isfile(f)]
    ll = []
    for f in files:
        if (f.endswith('.json')) and (not (f == "db.json") and not (f == "final_db.json")):
            ll.extend(scrape_by_keyword.plot_jsons(f, True))
    scrape_by_keyword.plot_list("Total", ll)

reconstruct_db_from_jsons.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
#!/usr/bin/env python3
__author__ = "Giacomo Bergami"
__copyright__ = "Copyright 2020, Giacomo Bergami"
__credits__ = ["Giacomo Bergami"]
__license__ = "GPL"
__version__ = "2.0"
__maintainer__ = "Giacomo Bergami"
__email__ = "bergamigiacomo@gmail.com"
__status__ = "Production"
# This script rebuilds db.json (the per-source sets of already-crawled URLs)
# from the per-source JSON databases found in the current folder.
import json, os

if __name__ == '__main__':
    db = {}
    files = [f for f in os.listdir('.') if os.path.isfile(f)]
    for f in files:
        if (f.endswith('.json')) and (not (f == "db.json") and not (f == "final_db.json")):
            print(f)
            try:
                l = json.load(open(f))
                dkey = f[:-5]
                db[dkey] = set()
                for k in l:
                    db[dkey].add(l[k]["url"])
                db[dkey] = list(db[dkey])
            except:
                print("Error on "+f)
    json.dump(db, open("db.json", "w"))

requirements.txt

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
ago==0.0.95
attrs==23.1.0
Automat==22.10.0
beautifulsoup4==4.12.2
boto3==1.28.75
botocore==1.31.75
bs4==0.0.1
certifi==2023.7.22
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.1
click==8.1.7
colorama==0.4.6
constantly==23.10.4
cryptography==41.0.5
cssselect==1.2.0
dateparser==1.1.8
dotmap==1.3.30
elastic-transport==8.10.0
elasticsearch==8.10.1
faust-cchardet==2.1.19
feedfinder2==0.0.4
feedparser==6.0.10
filelock==3.13.1
hjson==3.1.0
hurry.filesize==0.9
hyperlink==21.0.0
idna==3.4
incremental==22.10.0
itemadapter==0.8.0
itemloaders==1.1.0
jieba3k==0.35.1
jmespath==1.0.1
joblib==1.3.2
langdetect==1.0.9
lxml==4.9.3
news-please==1.5.35
newspaper3k==0.2.8
nltk==3.8.1
packaging==23.2
parsel==1.8.1
Pillow==10.1.0
plac==1.4.1
plotille==5.0.0
Protego==0.3.0
psycopg2-binary==2.9.9
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
PyDispatcher==2.0.7
PyMySQL==1.1.0
pyOpenSSL==23.3.0
python-dateutil==2.8.2
pytz==2023.3.post1
PyYAML==6.0.1
queuelib==1.6.2
readability-lxml==0.8.1
regex==2023.10.3
requests==2.31.0
requests-file==1.5.1
s3transfer==0.7.0
Scrapy==2.11.0
service-identity==23.1.0
sgmllib3k==1.0.0
six==1.16.0
soupsieve==2.5
tinysegmenter==0.3
tldextract==5.0.1
tqdm==4.66.1
Twisted==22.10.0
typing_extensions==4.8.0
tzlocal==5.2
urllib3==2.0.7
w3lib==2.1.2
warcio==1.7.4
zope.interface==6.1
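
The pinned environment can be reproduced with pip install -r requirements.txt; news-please 1.5.35 supplies the NewsPlease class imported by 02_crawl_from_csv.py.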
