3535
3636class Foundry (FoundryBase ):
3737 """Foundry Client Base Class
38- TODO:
39- -------
40- Add Docstring
4138
39+ Foundry object used for all interactions with Foundry datasets and models. Interfaces with MDF Connect Client,
40+ Globus Compute, Globus Auth, Globus Transfer, Globus Search, DLHub, and relevant Globus Endpoints
4241 """
4342
4443 dlhub_client : Any
@@ -55,6 +54,7 @@ def __init__(
5554 ** data
5655 ):
5756 """Initialize a Foundry client
57+
5858 Args:
5959 name (str): Name of the foundry dataset. If not supplied, metadata will not be loaded into
6060 the Foundry object
@@ -166,6 +166,7 @@ def __init__(
166166
167167 def _load (self , name , download = True , globus = True , verbose = False , metadata = None , authorizers = None , interval = None ):
168168 """Load the metadata for a Foundry dataset into the client
169+
169170 Args:
170171 name (str): Name of the foundry dataset
171172 download (bool): If True, download the data associated with the package (default is True)
@@ -228,12 +229,14 @@ def _load(self, name, download=True, globus=True, verbose=False, metadata=None,
228229
229230 def search (self , q = None , limit = None ):
230231 """Search available Foundry datasets
231- q (str): query string to match
232- limit (int): maximum number of results to return
233232
234- Returns
235- -------
236- (pandas.DataFrame): DataFrame with summary list of Foundry data packages including name, title, publication year, and DOI
233+ Args:
234+ q (str): query string to match
235+ limit (int): maximum number of results to return
236+
237+ Returns:
238+ (pandas.DataFrame): DataFrame with summary list of Foundry data packages including name, title, publication
239+ year, and DOI
237240 """
238241 if not q :
239242 q = None
@@ -258,22 +261,24 @@ def search(self, q=None, limit=None):
258261
259262 def list (self ):
260263 """List available Foundry datasets
261- Returns
262- -------
263- (pandas.DataFrame): DataFrame with summary list of Foundry datasets including name, title, publication year, and DOI
264+
265+ Returns:
266+ (pandas.DataFrame): DataFrame with summary list of Foundry datasets including name, title, publication
267+ year, and DOI
264268 """
265269 return self .search ()
266270
267271 def run (self , name , inputs , funcx_endpoint = None , ** kwargs ):
268- """Run a model on data
272+ """Run a model on inputted data
269273
270274 Args:
271- name (str): DLHub model name
272- inputs: Data to send to DLHub as inputs (should be JSON serializable)
273- funcx_endpoint (optional): UUID for the funcx endpoint to run the model on, if not the default (eg River)
275+ name (str): DLHub model name
276+ inputs: Data to send to DLHub as inputs (should be JSON serializable, example types include dict, list,
277+ np.ndarray, etc)
278+ funcx_endpoint (str, optional): UUID for the funcx endpoint to run the model on, if not the default (e.g. River)
274279
275280 Returns:
276- Returns results after invocation via the DLHub service
281+ Results after invocation via the DLHub service
277282 """
278283 if funcx_endpoint is not None :
279284 self .dlhub_client .fx_endpoint = funcx_endpoint
@@ -291,14 +296,13 @@ def load_data(self, source_id=None, globus=True, as_hdf5=False, splits=[]):
291296 subclass Foundry and override the load_data function
292297
293298 Args:
294- inputs (list): List of strings for input columns
295- targets (list): List of strings for output columns
296- source_id (string): Relative path to the source file
299+ source_id (str): Name of the dataset in MDF/Foundry index (``source_name`` + version information)
300+ globus (bool): If True, download using Globus, otherwise, HTTPS
297301 as_hdf5 (bool): If True and dataset is in hdf5 format, keep data in hdf5 format
298302 splits (list): Labels of splits to be loaded
299303
300304 Returns:
301- (dict): a labeled dictionary of tuples
305+ data (dict): a labeled dictionary of tuples
302306 """
303307 data = {}
304308
@@ -328,6 +332,14 @@ def load_data(self, source_id=None, globus=True, as_hdf5=False, splits=[]):
328332 "Metadata not loaded into Foundry object, make sure to call load()" ) from e
329333
330334 def _repr_html_ (self ) -> str :
335+ """Format the Foundry object for notebook rendering as HTML output
336+
337+ Args:
338+ self (Foundry)
339+
340+ Returns:
341+ buf (str): buffer containing the HTML to render
342+ """
331343 if not self .dc :
332344 buf = str (self )
333345 else :
@@ -343,6 +355,16 @@ def _repr_html_(self) -> str:
343355 return buf
344356
345357 def get_citation (self ) -> str :
358+ """Obtain BibTeX citation for the dataset
359+
360+ Uses the dataset currently loaded in the Foundry object described by `self`
361+
362+ Args:
363+ self (Foundry)
364+
365+ Returns:
366+ bibtex (str): The BibTeX citation in string format
367+ """
346368 subjects = [subject ['subject' ] for subject in self .dc ['subjects' ]]
347369 doi_str = f"doi = {{{ self .dc ['identifier' ]['identifier' ]} }}"
348370 url_str = f"url = {{https://doi.org/{ self .dc ['identifier' ]['identifier' ]} }}"
@@ -364,6 +386,7 @@ def publish_dataset(
364386 ** kwargs : Dict [str , Any ],) -> Dict [str , Any ]:
365387 """Submit a dataset for publication; can choose to submit via HTTPS using `https_data_path` or via Globus
366388 Transfer using the `globus_data_source` argument. Only one upload method may be specified.
389+
367390 Args:
368391 foundry_metadata (dict): Dict of metadata describing data package
369392 title (string): Title of data package
@@ -396,11 +419,9 @@ def publish_dataset(
396419 related_dois (list): DOIs related to this dataset,
397420 not including the dataset's own DOI (for example, an associated paper's DOI).
398421
399- Returns
400- -------
401- (dict) MDF Connect Response: Response from MDF Connect to allow tracking
402- of dataset. Contains `source_id`, which can be used to check the
403- status of the submission
422+ Returns:
423+ res (MDF Connect Response): Response from MDF Connect to allow tracking of dataset. Contains
424+ `source_id`, which can be used to check the status of the submission
404425 """
405426 # ensure metadata is properly formatted
406427 self .validate_metadata (foundry_metadata )
@@ -463,23 +484,27 @@ def publish_model(self, title, creators, short_name, servable_type, serv_options
463484 "pytorch",
464485 "tensorflow",
465486 "sklearn")
466- serv_options (dict): the servable_type specific arguments that are necessary for publishing. arguments can be found at
467- https://dlhub-sdk.readthedocs.io/en/latest/source/dlhub_sdk.models.servables.html under the appropriate
468- ``create_model`` signature. use the argument names as keys and their values as the values.
487+ serv_options (dict): the servable_type specific arguments that are necessary for publishing. arguments can
488+ be found at https://dlhub-sdk.readthedocs.io/en/latest/source/dlhub_sdk.models.servables.html
489+ under the appropriate ``create_model`` signature. use the argument names as keys and their values as
490+ the values.
469491 affiliations (list): list of affiliations for each author
470492 paper_doi (str): DOI of a paper that describes the servable
493+
471494 Returns:
472495 (string): task id of this submission, can be used to check for success
496+
473497 Raises:
474498 ValueError: If the given servable_type is not in the list of acceptable types
475499 Exception: If the serv_options are incomplete or the request to publish results in an error
476500 """
477- return self .dlhub_client .easy_publish (title , creators , short_name , servable_type , serv_options , affiliations , paper_doi )
501+ return self .dlhub_client .easy_publish (title , creators , short_name , servable_type , serv_options , affiliations ,
502+ paper_doi )
478503
479504 def check_status (self , source_id , short = False , raw = False ):
480505 """Check the status of your submission.
481506
482- Arguments :
507+ Args :
483508 source_id (str): The ``source_id`` (``source_name`` + version information) of the
484509 submission to check. Returned in the ``res`` result from ``publish()`` via MDF Connect Client.
485510 short (bool): When ``False``, will print a status summary containing
@@ -493,47 +518,38 @@ def check_status(self, source_id, short=False, raw=False):
493518 **Default:** ``False``
494519
495520 Returns:
496- If `` raw`` is `` True``, *dict*: The full status result.
521+ (dict): Brief status result of dataset publication. If `raw` is True, the full status result.
497522 """
498523 return self .connect_client .check_status (source_id , short , raw )
499524
500- # def check_model_status(self, res):
501- # """Check status of model or function publication to DLHub
502- #
503- # TODO: currently broken on DLHub side of things
504- # """
505- # # return self.dlhub_client.get_task_status(res)
506- # pass
507-
508525 def configure (self , ** kwargs ):
509526 """Set Foundry config
510- Keyword Args:
511- file (str): Path to the file containing
512- (default: self.config.metadata_file)
513527
514- dataframe_file (str): filename for the dataframe file default:"foundry_dataframe.json"
515- data_file (str): : filename for the data file default:"foundry.hdf5"
516- destination_endpoint (str): Globus endpoint UUID where Foundry data should move
517- local_cache_dir (str): Where to place collected data default:"./data"
528+ Keyword Args:
529+ file (str): Path to the file containing (default: self.config.metadata_file)
530+ dataframe_file (str): filename for the dataframe file default:"foundry_dataframe.json"
531+ data_file (str): filename for the data file default:"foundry.hdf5"
532+ destination_endpoint (str): Globus endpoint UUID where Foundry data should move
533+ local_cache_dir (str): Where to place collected data default:"./data"
518534
519- Returns
520- -------
521- (Foundry): self: for chaining
535+ Returns:
536+ self (Foundry): for chaining
522537 """
523538 self .config = FoundryConfig (** kwargs )
524539 return self
525540
526- def download (self , globus : bool = True , interval : int = 20 , parallel_https : int = 4 , verbose : bool = False ) -> 'Foundry' :
541+ def download (self , globus : bool = True , interval : int = 20 , parallel_https : int = 4 , verbose : bool = False ) -> \
542+ 'Foundry' :
527543 """Download a Foundry dataset
528544
529545 Args:
530- globus: if True, use Globus to download the data else try HTTPS
531- interval: How often to wait before checking Globus transfer status
532- parallel_https: Number of files to download in parallel if using HTTPS
533- verbose: Produce more debug messages to screen
546+ globus (bool) : if True, use Globus to download the data else try HTTPS
547+ interval (int) : How often to wait before checking Globus transfer status
548+ parallel_https (int) : Number of files to download in parallel if using HTTPS
549+ verbose (bool) : Produce more debug messages to screen
534550
535551 Returns:
536- self, for chaining
552+ self (Foundry): for chaining
537553 """
538554 # Check if the dir already exists
539555 path = os .path .join (self .config .local_cache_dir , self .mdf ["source_id" ])
@@ -622,14 +638,14 @@ def download(self, globus: bool = True, interval: int = 20, parallel_https: int
622638 def get_keys (self , type = None , as_object = False ):
623639 """Get keys for a Foundry dataset
624640
625- Arguments :
641+ Args :
626642 type (str): The type of key to be returned e.g., "input", "target"
627643 as_object (bool): When ``False``, will return a list of keys as strings
628644 When ``True``, will return the full key objects
629645 **Default:** ``False``
630- Returns: (list) String representations of keys or if ``as_object``
631- is False otherwise returns the full key objects.
632-
646+ Returns:
647+ key_list (list): String representations of the keys if ``as_object`` is False; otherwise, the full
648+ key objects
633649 """
634650
635651 if as_object :
@@ -649,7 +665,25 @@ def get_keys(self, type=None, as_object=False):
649665 key_list = key_list + k
650666 return key_list
651667
652- def _load_data (self , file = None , source_id = None , globus = True , as_hdf5 = False ):
668+ def _load_data (self , file = None , source_id = None , as_hdf5 = False ):
669+ """Handle the bulk of loading a dataset logic
670+
671+ Args:
672+ file (str): Relative path to the data file (specified via splits). Supported file types include tabular
673+ (eg JSON, JSON lines, csv) and HDF5
674+ source_id (str): Name of the dataset in MDF/Foundry index (``source_name`` + version information)
675+ as_hdf5 (bool): If True and dataset is in hdf5 format, keep data in hdf5 format
676+
677+ Returns:
678+ (pandas.DataFrame): Tabular dataset formatted as a pandas DataFrame
679+ tmp_data (dict): HDF5 data (if applicable) reformatted into dict form for easy output
680+
681+ Raises:
682+ ValueError: If the path to the data file is not valid, or if tabular data cannot be read
683+ FileNotFoundError: If no file was found at the expected path
684+ NotImplementedError: If file type inputted is not supported
685+
686+ """
653687 # Build the path to access the cached data
654688 if source_id :
655689 path = os .path .join (self .config .local_cache_dir , source_id )
@@ -721,7 +755,8 @@ def _get_inputs_targets(self, split: str = None):
721755 split (string): Split to get inputs and outputs from.
722756 **Default:** ``None``
723757
724- Returns: (Tuple) Tuple of the inputs and outputs
758+ Returns:
759+ (Tuple): Tuple of the inputs and outputs
725760 """
726761 raw = self .load_data (as_hdf5 = False )
727762
@@ -762,10 +797,10 @@ def to_torch(self, split: str = None):
762797 """Convert Foundry Dataset to a PyTorch Dataset
763798
764799 Arguments:
765- split (string): Split to create PyTorch Dataset on.
766- **Default:** ``None``
800+ split (string): Split to create PyTorch Dataset on. Default is None.
767801
768- Returns: (TorchDataset) PyTorch Dataset of all the data from the specified split
802+ Returns:
803+ (TorchDataset): PyTorch Dataset of all the data from the specified split
769804
770805 """
771806 from foundry .loaders .torch_wrapper import TorchDataset
@@ -777,10 +812,10 @@ def to_tensorflow(self, split: str = None):
777812 """Convert Foundry Dataset to a Tensorflow Sequence
778813
779814 Arguments:
780- split (string): Split to create Tensorflow Sequence on.
781- **Default:** ``None``
815+ split (string): Split to create Tensorflow Sequence on. Default is None.
782816
783- Returns: (TensorflowSequence) Tensorflow Sequence of all the data from the specified split
817+ Returns:
818+ (TensorflowSequence): Tensorflow Sequence of all the data from the specified split
784819
785820 """
786821 from foundry .loaders .tf_wrapper import TensorflowSequence
0 commit comments