1- import os
2- import subprocess
3- from typing import NamedTuple
4-
5- import logging
6- from .const import texas_list
7-
8- logger = logging .getLogger (__name__ )
9-
10-
11- class DS (NamedTuple ):
12- files : str | list [str ]
13- save_name : str | None = None
14- credentials : bool = False
15- desc : str | None = None
16-
1+ from ..utils .download import DS
172
183physio = "requires credentials and license from https://physionet.org"
194
205datasets = {
216 # Open Datasets
227 "adult" : DS ("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/" ),
23- # Texas, open with license
24- "texas" : DS (
25- texas_list ,
26- desc = "license: https://www.dshs.texas.gov/THCIC/Hospitals/Download.shtm" ,
27- ),
288 # Physionet
299 "mimic_iv_1_0" : DS (
3010 "https://physionet.org/files/mimiciv/1.0/" , "mimiciv_1_0" , True , physio
@@ -44,98 +24,4 @@ class DS(NamedTuple):
4424 "s3:sdv-datasets" ,
4525 desc = "license MIT (not clear if that applies to data), requires boto3 package" ,
4626 ),
47- }
48-
49-
def download_files(name: str, dir: str, files: list[str]):
    """Download an explicit list of file URLs into ``dir`` with one wget call.

    Args:
        name: Dataset name; used only in log messages.
        dir: Directory prefix passed to ``wget -P``.
        files: URLs to fetch. Must not be empty.

    Raises:
        AssertionError: If ``files`` is empty.
    """
    # Raised explicitly instead of via ``assert`` so that the check is not
    # stripped when Python runs with ``-O``.
    if not files:
        raise AssertionError("Empty file list")

    logger.info(f"Downloading dataset {name} files iteratively with wget.")
    args = ["wget", "-m", "-np", "-nH", "-c", "-P", dir]

    # We have to skip parent dirs manually. The first URL is the template:
    # splitting "scheme://host/a/b/file" on "/" yields four parts that are
    # not parent directories (scheme, empty string, host, file name).
    cut_dirs = len(files[0].split("/")) - 4
    if cut_dirs > 0:
        args.append(f"--cut-dirs={cut_dirs}")

    args.extend(files)
    result = subprocess.run(args)
    # A failing wget is reported but not raised, so the download stays
    # best-effort while no longer failing silently.
    if result.returncode != 0:
        logger.warning(
            f"wget exited with code {result.returncode} for dataset {name}."
        )
65-
66-
67- def download_index (
68- name : str , download_dir : str , url_dir : str , username : str | None = None
69- ):
70- logger .info (f"Downloading dataset { name } through its index listing and wget." )
71- assert url_dir [- 1 ] == "/" , "Url dir should end with a `/`"
72-
73- args = ["wget" , "-m" , "-np" , "-nH" , "-c" , "-P" , download_dir ]
74-
75- # We have to skip parent dirs manually
76- cut_dirs = len (url_dir .split ("/" )) - 4
77- if cut_dirs > 0 :
78- args .append (f"--cut-dirs={ cut_dirs } " )
79-
80- args .append (url_dir )
81- if username :
82- args .extend (["--user" , username , "--ask-password" ])
83- subprocess .run (args )
84-
85-
def download_s3(name: str, download_dir: str, bucket: str):
    """Download every object of an S3 bucket into ``download_dir``.

    Requests are made unsigned. Objects are stored flat under the last path
    component of their key; files that already exist locally are skipped.

    Args:
        name: Dataset name; used only in log messages.
        download_dir: Local directory that receives the files.
        bucket: Name of the S3 bucket.

    Raises:
        AssertionError: If ``boto3``/``botocore`` cannot be imported.
    """
    try:
        import boto3
        from botocore import UNSIGNED
        from botocore.client import Config
    except ImportError as err:
        # Raised explicitly so the check survives ``python -O``; only import
        # failures are translated, other errors propagate unchanged.
        raise AssertionError(
            "Specified dataset requires the aws package 'boto3'"
        ) from err

    logger.info(f"Downloading dataset {name} from s3 using boto3.")
    s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED))
    ds_bucket = s3.Bucket(bucket)

    for s3_object in ds_bucket.objects.all():
        filename = os.path.basename(s3_object.key)
        if not filename:
            # A key ending in "/" has no file name; joining it would yield
            # the directory path itself as download target.
            continue

        fn = os.path.join(download_dir, filename)
        if os.path.isfile(fn):
            logger.info(f"File already downloaded, skipping: {filename}")
            continue

        logger.info(f"Downloading {filename} ({s3_object.size / 1e6:.3f} mb)")
        ds_bucket.download_file(s3_object.key, fn)
107-
108-
109- def main (download_dir : str , names : list [str ], username : str | None ):
110- assert os .path .exists (
111- download_dir
112- ), f'Download path "{ download_dir } " doesn\' t exist.'
113-
114- for name in names :
115- assert name in datasets , f"Dataset { name } not found."
116- ds = datasets [name ]
117-
118- save_name = ds .save_name or name
119- save_path = os .path .join (download_dir , save_name )
120- os .makedirs (save_path , exist_ok = True )
121-
122- if ds .credentials :
123- assert username , f"Dataset requires credentials, use --user <user>"
124-
125- if isinstance (ds .files , list ):
126- download_files (name , save_path , ds .files )
127- else :
128- assert isinstance (ds .files , str )
129- if ds .files .startswith ("s3:" ):
130- download_s3 (name , save_path , ds .files .replace ("s3:" , "" ))
131- else :
132- download_index (
133- name , save_path , ds .files , username if ds .credentials else None
134- )
135-
136-
def get_description():
    """Return a multi-line listing of the available datasets.

    The first line is a fixed header; every following line holds a dataset
    name left-aligned in a 15-character column, a colon, and the entry's
    ``desc`` (empty when the entry has none).
    """
    header = "The following data stores are available:\n"
    # Build the rows first and join once instead of growing a string with +=.
    rows = [f"{name:15s}: {ds.desc or ''}\n" for name, ds in datasets.items()]
    return header + "".join(rows)
27+ }
0 commit comments