#!/usr/bin/env python3
__author__ = "Giacomo Bergami"
__copyright__ = "Copyright 2020, Giacomo Bergami"
__credits__ = ["Giacomo Bergami"]
__license__ = "GPL"
__version__ = "2.0"
__maintainer__ = "Giacomo Bergami"
__email__ = "bergamigiacomo@gmail.com"
__status__ = "Production"
## This script downloads the articles even when the main process only wrote the CSV files without fetching the data from the servers.
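#
# A minimal sketch of the data layout assumed here (inferred from the code
# below; the concrete file names are hypothetical):
#
#   politics_2020-01-01.csv   # input: one article URL per row, first column
#   politics.json             # output: {"1": <article dict>, "2": ...}
#   db.json                   # global index: {"politics": [<crawled URLs>]}
#
# The key ("politics") is the CSV name with its last 15 characters removed,
# which matches a suffix such as "_2020-01-01.csv" (exactly 15 characters).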
import sys, argparse, csv, os, json
from os import path
from newsplease import NewsPlease

# Global index mapping each key to the set of URLs already crawled.
database = dict()

def crawl_from_file(filename):
    print(filename)
    print('-------------------')
    print('-------------------')
    # The key is the CSV name with its 15-character suffix removed.
    dkey = filename[:-15]
    dkey_dbfile = dkey + ".json"
    dkey_db = {}
    if path.exists(dkey_dbfile):
        with open(dkey_dbfile) as json_file:
            dkey_db = json.load(json_file)
    # Resume the article numbering after the last entry already stored.
    dkey_db_nextKey = 1
    if len(dkey_db) > 0:
        dkey_db_nextKey = int(max(dkey_db, key=int)) + 1
    # Track the URLs already crawled for this key as a set.
    if dkey not in database:
        database[dkey] = set()
    else:
        database[dkey] = set(database[dkey])
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            if not row:  # skip empty CSV lines
                continue
            if row[0] not in database[dkey]:
                try:
                    database[dkey].add(row[0])
                    dkey_db[str(dkey_db_nextKey)] = NewsPlease.from_url(row[0]).get_serializable_dict()
                    print("Article #" + str(dkey_db_nextKey) + " was parsed")
                    dkey_db_nextKey = dkey_db_nextKey + 1
                except Exception as e:
                    print("Error '" + str(e) + "' for article at '" + row[0] + "': skipping it")
    with open(dkey_dbfile, 'w') as json_file:
        json.dump(dkey_db, json_file)
    # Checkpoint the global index after every file, so a crash loses at most
    # the CSV currently being processed.
    dump_database('db.json')


def load_database(filename):
    global database
    if path.exists(filename):
        with open(filename) as json_file:
            data = json.load(json_file)
            # JSON stores the URL collections as lists; restore them as sets.
            for k in data:
                data[k] = set(data[k])
            database = data

def dump_database(filename):
    # Sets are not JSON-serialisable: dump a copy with lists instead, leaving
    # the in-memory sets untouched.
    serializable = {k: list(v) for k, v in database.items()}
    with open(filename, 'w') as json_file:
        json.dump(serializable, json_file)

def get_all_csvs():
    # Crawl every CSV file in the current directory.
    files = [f for f in os.listdir('.') if os.path.isfile(f)]
    for f in files:
        if f.endswith('.csv'):
            crawl_from_file(f)


if __name__ == '__main__':
    load_database('db.json')
    get_all_csvs()
    dump_database('db.json')
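
# Usage sketch (the script and CSV names are hypothetical; only the
# 15-character suffix convention is taken from the code above):
#
#   $ ls
#   crawl_missing.py  politics_2020-01-01.csv  sports_2020-01-01.csv
#   $ python3 crawl_missing.py
#
# Afterwards "politics.json", "sports.json" and "db.json" sit next to the
# CSVs; re-running the script skips every URL already recorded in "db.json".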