Skip to content

Commit 413d9d6

Browse files
committed
allows specifying dump files individually
fixes #112
1 parent a062fd6 commit 413d9d6

3 files changed

Lines changed: 63 additions & 20 deletions

File tree

README.md

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,23 +62,41 @@ $ sha256sum -c discogs_*_CHECKSUM.txt
6262

6363
Run `run.py` to convert the dump files to csv.
6464

65+
There are two run modes:
66+
67+
1. You can point it to a directory where the discogs dump files are
68+
and use one or multiple `--export` options to indicate which files to process:
69+
6570
```sh
6671
# ensure the virtual environment is active
6772
(.discogsenv) $ python3 run.py \
6873
--bz2 \ # compresses resulting csv files
6974
--apicounts \ # provides more accurate progress counts
7075
--export artist --export label --export master --export release \
76+
--output csv-dir \ # folder where to output the csv files
7177
dump-dir \ # folder where the data dumps are
72-
csv-dir # folder where to output the csv files
78+
```
79+
80+
2. You can specify the individual files instead:
81+
82+
```sh
83+
# ensure the virtual environment is active
84+
(.discogsenv) $ python3 run.py \
85+
--bz2 \ # compresses resulting csv files
86+
--apicounts \ # provides more accurate progress counts
87+
--output csv-dir \ # folder where to output the csv files
88+
path/to/discogs_20200806_artists.xml.gz path/to/discogs_20200806_labels.xml.gz
7389
```
7490

7591
`run.py` takes the following arguments:
7692

7793
- `--export`: the types of dump files to export: "artist", "label", "master", "release".
7894
It matches the names of the dump files, e.g. "discogs_20200806_*artist*s.xml.gz"
95+
Not needed if the individual files are specified.
7996
- `--bz2`: Compresses output csv files using bz2 compression library.
8097
- `--limit=<lines>`: Limits export to some number of entities
8198
- `--apicounts`: Makes progress report more accurate by getting total amounts from Discogs API.
99+
- `--output`: the folder where to store the csv files; default is the current directory
82100

83101
The exporter provides progress information in real time:
84102

discogsxml2db/exporter.py

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -37,16 +37,19 @@ def _write_rows(writer, entity, name):
3737
class EntityCsvExporter(object):
3838
"""Read a Discogs dump XML file and exports SQL table records as CSV.
3939
"""
40-
def __init__(self, entity, in_dir, out_dir,
40+
def __init__(self, entity, in_file_or_dir, out_dir,
4141
limit=None, bz2=True,
4242
dry_run=False, debug=False, max_hint=None, verbose=False):
4343
self.entity = entity
4444
self.parser = _parsers[entity]()
4545
self.max_hint = max_hint
4646
self.verbose = verbose
4747

48-
lookup = 'discogs_[0-9]*_{}s.xml*'.format(entity)
49-
self.pattern = os.path.join(in_dir, lookup)
48+
if os.path.isfile(in_file_or_dir):
49+
self.pattern = in_file_or_dir
50+
else:
51+
lookup = 'discogs_[0-9]*_{}s.xml*'.format(entity)
52+
self.pattern = os.path.join(in_file_or_dir, lookup)
5053

5154
# where and how the exporter will write to
5255
self.out_dir = out_dir
@@ -287,8 +290,7 @@ def write_track_artists(self, writer, release):
287290

288291

289292
def main(arguments):
290-
in_base = arguments['INPUT']
291-
out_base = arguments['OUTPUT'] or '.'
293+
out_base = arguments['--output'] or '.'
292294
limit = int(arguments['--limit']) if arguments['--limit'] else None
293295
bz2_on = arguments['--bz2']
294296
debug = arguments['--debug']
@@ -312,14 +314,33 @@ def main(arguments):
312314
except Exception:
313315
pass
314316

315-
for entity in arguments['--export']:
316-
expected_count = rough_counts['{}s'.format(entity)]
317-
exporter = _exporters[entity](
318-
in_base,
319-
out_base,
320-
limit=limit,
321-
bz2=bz2_on,
322-
debug=debug,
323-
max_hint=min(expected_count, limit or expected_count),
324-
dry_run=dry_run)
325-
exporter.export()
317+
if arguments['INPUT_DIR']:
318+
# use --export to select the entities
319+
in_base = arguments['INPUT_DIR']
320+
for entity in arguments['--export']:
321+
expected_count = rough_counts['{}s'.format(entity)]
322+
exporter = _exporters[entity](
323+
in_base,
324+
out_base,
325+
limit=limit,
326+
bz2=bz2_on,
327+
debug=debug,
328+
max_hint=min(expected_count, limit or expected_count),
329+
dry_run=dry_run)
330+
exporter.export()
331+
elif arguments["<INPUT_FILE>"]:
332+
for in_file in arguments["<INPUT_FILE>"]:
333+
for entity in _exporters:
334+
# discogs files are named discogs_{date}_{entity}s.xml
335+
if f"_{entity}" in in_file:
336+
expected_count = rough_counts['{}s'.format(entity)]
337+
exporter = _exporters[entity](
338+
in_file,
339+
out_base,
340+
limit=limit,
341+
bz2=bz2_on,
342+
debug=debug,
343+
max_hint=min(expected_count, limit or expected_count),
344+
dry_run=dry_run)
345+
exporter.export()
346+
break

run.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
#!/usr/bin/env python
22
# -*- coding: utf-8 -*-
33
"""Usage:
4-
run.py [--bz2] [--dry-run] [--limit=<lines>] [--debug] [--apicounts] INPUT [OUTPUT] [--export=<entity>]...
4+
run.py [--bz2] [--dry-run] [--limit=<lines>] [--debug] [--apicounts] [--output=<output>] INPUT_DIR [--export=<entity>]...
5+
run.py [--bz2] [--dry-run] [--limit=<lines>] [--debug] [--apicounts] [--output=<output>] <INPUT_FILE> <INPUT_FILE>...
56
67
Options:
78
--bz2 Compress output files using bz2 compression library.
89
--limit=<lines> Limit export to some number of entities
9-
--export=<entity> Limit export to some entities (repeatable)
10+
--export=<entity> Limit export to some entities (repeatable).
11+
Entity is one of: artist, label, master, release.
1012
--debug Turn on debugging prints
1113
--apicounts Check entities counts with Discogs API
12-
--dry-run Do not write
14+
--dry-run Do not write csv files.
1315
1416
"""
1517
import sys
@@ -20,4 +22,6 @@
2022

2123
if __name__ == '__main__':
2224
arguments = docopt(__doc__, version='Discogs-to-SQL exporter')
25+
if arguments["--debug"]:
26+
print(arguments)
2327
sys.exit(main(arguments))

0 commit comments

Comments
 (0)