3535
3636class Foundry (FoundryBase ):
3737 """Foundry Client Base Class
38- TODO:
39- -------
40- Add Docstring
4138
39+ Foundry object used for all interactions with Foundry datasets and models. Interfaces with MDF Connect Client,
40+ Globus Compute, Globus Auth, Globus Transfer, Globus Search, DLHub, and relevant Globus Endpoints
4241 """
4342
4443 dlhub_client : Any
@@ -53,6 +52,7 @@ def __init__(
5352 self , no_browser = False , no_local_server = False , index = "mdf" , authorizers = None , ** data
5453 ):
5554 """Initialize a Foundry client
55+
5656 Args:
5757 no_browser (bool): Whether to open the browser for the Globus Auth URL.
5858 no_local_server (bool): Whether a local server is available.
@@ -147,6 +147,7 @@ def __init__(
147147
148148 def load (self , name , download = True , globus = False , verbose = False , metadata = None , authorizers = None , ** kwargs ):
149149 """Load the metadata for a Foundry dataset into the client
150+
150151 Args:
151152 name (str): Name of the foundry dataset
152153 download (bool): If True, download the data associated with the package (default is True)
@@ -210,12 +211,14 @@ def load(self, name, download=True, globus=False, verbose=False, metadata=None,
210211
211212 def search (self , q = None , limit = None ):
212213 """Search available Foundry datasets
213- q (str): query string to match
214- limit (int): maximum number of results to return
215214
216- Returns
217- -------
218- (pandas.DataFrame): DataFrame with summary list of Foundry data packages including name, title, publication year, and DOI
215+ Args:
216+ q (str): query string to match
217+ limit (int): maximum number of results to return
218+
219+ Returns:
220+ (pandas.DataFrame): DataFrame with summary list of Foundry data packages including name, title, publication
221+ year, and DOI
219222 """
220223 if not q :
221224 q = None
@@ -240,22 +243,24 @@ def search(self, q=None, limit=None):
240243
241244 def list (self ):
242245 """List available Foundry datasets
243- Returns
244- -------
245- (pandas.DataFrame): DataFrame with summary list of Foundry datasets including name, title, publication year, and DOI
246+
247+ Returns:
248+ (pandas.DataFrame): DataFrame with summary list of Foundry datasets including name, title, publication
249+ year, and DOI
246250 """
247251 return self .search ()
248252
249253 def run (self , name , inputs , funcx_endpoint = None , ** kwargs ):
250- """Run a model on data
254+ """Run a model on inputted data
251255
252256 Args:
253- name (str): DLHub model name
254- inputs: Data to send to DLHub as inputs (should be JSON serializable)
255- funcx_endpoint (optional): UUID for the funcx endpoint to run the model on, if not the default (eg River)
257+ name (str): DLHub model name
258+ inputs: Data to send to DLHub as inputs (should be JSON serializable, example types include dict, list,
259+ np.ndarray, etc)
260+ funcx_endpoint (str, optional): UUID for the funcx endpoint to run the model on, if not the default (e.g. River)
256261
257262 Returns:
258- Returns results after invocation via the DLHub service
263+ Results after invocation via the DLHub service
259264 """
260265 if funcx_endpoint is not None :
261266 self .dlhub_client .fx_endpoint = funcx_endpoint
@@ -273,14 +278,13 @@ def load_data(self, source_id=None, globus=True, as_hdf5=False, splits=[]):
273278 subclass Foundry and override the load_data function
274279
275280 Args:
276- inputs (list): List of strings for input columns
277- targets (list): List of strings for output columns
278- source_id (string): Relative path to the source file
281+ source_id (str): Name of the dataset in MDF/Foundry index (``source_name`` + version information)
282+ globus (bool): If True, download using Globus, otherwise, HTTPS
279283 as_hdf5 (bool): If True and dataset is in hdf5 format, keep data in hdf5 format
280284 splits (list): Labels of splits to be loaded
281285
282286 Returns:
283- (dict): a labeled dictionary of tuples
287+ data (dict): a labeled dictionary of tuples
284288 """
285289 data = {}
286290
@@ -310,6 +314,14 @@ def load_data(self, source_id=None, globus=True, as_hdf5=False, splits=[]):
310314 "Metadata not loaded into Foundry object, make sure to call load()" ) from e
311315
312316 def _repr_html_ (self ) -> str :
317+ """Format the Foundry object for notebook rendering as HTML output
318+
319+ Args:
320+ self (Foundry)
321+
322+ Returns:
323+ buf (str): buffer containing the HTML to render
324+ """
313325 if not self .dc :
314326 buf = str (self )
315327 else :
@@ -325,6 +337,16 @@ def _repr_html_(self) -> str:
325337 return buf
326338
327339 def get_citation (self ) -> str :
340+ """Obtain BibTeX citation for the dataset
341+
342+ Uses the dataset currently loaded in the Foundry object described by `self`
343+
344+ Args:
345+ self (Foundry)
346+
347+ Returns:
348+ bibtex (str): The BibTeX citation in string format
349+ """
328350 subjects = [subject ['subject' ] for subject in self .dc ['subjects' ]]
329351 doi_str = f"doi = {{{ self .dc ['identifier' ]['identifier' ]} }}"
330352 url_str = f"url = {{https://doi.org/{ self .dc ['identifier' ]['identifier' ]} }}"
@@ -346,6 +368,7 @@ def publish_dataset(
346368 ** kwargs : Dict [str , Any ],) -> Dict [str , Any ]:
347369 """Submit a dataset for publication; can choose to submit via HTTPS using `https_data_path` or via Globus
348370 Transfer using the `globus_data_source` argument. Only one upload method may be specified.
371+
349372 Args:
350373 foundry_metadata (dict): Dict of metadata describing data package
351374 title (string): Title of data package
@@ -378,11 +401,9 @@ def publish_dataset(
378401 related_dois (list): DOIs related to this dataset,
379402 not including the dataset's own DOI (for example, an associated paper's DOI).
380403
381- Returns
382- -------
383- (dict) MDF Connect Response: Response from MDF Connect to allow tracking
384- of dataset. Contains `source_id`, which can be used to check the
385- status of the submission
404+ Returns:
405+ res (MDF Connect Response): Response from MDF Connect to allow tracking of dataset. Contains
406+ `source_id`, which can be used to check the status of the submission
386407 """
387408 # ensure metadata is properly formatted
388409 self .validate_metadata (foundry_metadata )
@@ -461,23 +482,27 @@ def publish_model(self, title, creators, short_name, servable_type, serv_options
461482 "pytorch",
462483 "tensorflow",
463484 "sklearn")
464- serv_options (dict): the servable_type specific arguments that are necessary for publishing. arguments can be found at
465- https://dlhub-sdk.readthedocs.io/en/latest/source/dlhub_sdk.models.servables.html under the appropriate
466- ``create_model`` signature. use the argument names as keys and their values as the values.
485+ serv_options (dict): the servable_type specific arguments that are necessary for publishing. arguments can
486+ be found at https://dlhub-sdk.readthedocs.io/en/latest/source/dlhub_sdk.models.servables.html
487+ under the appropriate ``create_model`` signature. use the argument names as keys and their values as
488+ the values.
467489 affiliations (list): list of affiliations for each author
468490 paper_doi (str): DOI of a paper that describes the servable
491+
469492 Returns:
470493 (string): task id of this submission, can be used to check for success
494+
471495 Raises:
472496 ValueError: If the given servable_type is not in the list of acceptable types
473497 Exception: If the serv_options are incomplete or the request to publish results in an error
474498 """
475- return self .dlhub_client .easy_publish (title , creators , short_name , servable_type , serv_options , affiliations , paper_doi )
499+ return self .dlhub_client .easy_publish (title , creators , short_name , servable_type , serv_options , affiliations ,
500+ paper_doi )
476501
477502 def check_status (self , source_id , short = False , raw = False ):
478503 """Check the status of your submission.
479504
480- Arguments :
505+ Args :
481506 source_id (str): The ``source_id`` (``source_name`` + version information) of the
482507 submission to check. Returned in the ``res`` result from ``publish_dataset()`` via MDF Connect Client.
483508 short (bool): When ``False``, will print a status summary containing
@@ -491,47 +516,38 @@ def check_status(self, source_id, short=False, raw=False):
491516 **Default:** ``False``
492517
493518 Returns:
494- If `` raw`` is `` True``, *dict*: The full status result.
519+ (dict): Brief status result of dataset publication. If `raw` is True, the full status result.
495520 """
496521 return self .connect_client .check_status (source_id , short , raw )
497522
498- # def check_model_status(self, res):
499- # """Check status of model or function publication to DLHub
500- #
501- # TODO: currently broken on DLHub side of things
502- # """
503- # # return self.dlhub_client.get_task_status(res)
504- # pass
505-
506523 def configure (self , ** kwargs ):
507524 """Set Foundry config
508- Keyword Args:
509- file (str): Path to the file containing
510- (default: self.config.metadata_file)
511525
512- dataframe_file (str): filename for the dataframe file default:"foundry_dataframe.json"
513- data_file (str): : filename for the data file default:"foundry.hdf5"
514- destination_endpoint (str): Globus endpoint UUID where Foundry data should move
515- local_cache_dir (str): Where to place collected data default:"./data"
526+ Keyword Args:
527+ file (str): Path to the metadata file (default: self.config.metadata_file)
528+ dataframe_file (str): Filename for the dataframe file (default: "foundry_dataframe.json")
529+ data_file (str): Filename for the data file (default: "foundry.hdf5")
530+ destination_endpoint (str): Globus endpoint UUID where Foundry data should move
531+ local_cache_dir (str): Where to place collected data (default: "./data")
516532
517- Returns
518- -------
519- (Foundry): self: for chaining
533+ Returns:
534+ self (Foundry): for chaining
520535 """
521536 self .config = FoundryConfig (** kwargs )
522537 return self
523538
524- def download (self , globus : bool = True , interval : int = 20 , parallel_https : int = 4 , verbose : bool = False ) -> 'Foundry' :
539+ def download (self , globus : bool = True , interval : int = 20 , parallel_https : int = 4 , verbose : bool = False ) -> \
540+ 'Foundry' :
525541 """Download a Foundry dataset
526542
527543 Args:
528- globus: if True, use Globus to download the data else try HTTPS
529- interval: How often to wait before checking Globus transfer status
530- parallel_https: Number of files to download in parallel if using HTTPS
531- verbose: Produce more debug messages to screen
544+ globus (bool) : if True, use Globus to download the data else try HTTPS
545+ interval (int) : How often to wait before checking Globus transfer status
546+ parallel_https (int) : Number of files to download in parallel if using HTTPS
547+ verbose (bool) : Produce more debug messages to screen
532548
533549 Returns:
534- self, for chaining
550+ self (Foundry): for chaining
535551 """
536552 # Check if the dir already exists
537553 path = os .path .join (self .config .local_cache_dir , self .mdf ["source_id" ])
@@ -620,14 +636,14 @@ def download(self, globus: bool = True, interval: int = 20, parallel_https: int
620636 def get_keys (self , type = None , as_object = False ):
621637 """Get keys for a Foundry dataset
622638
623- Arguments :
639+ Args :
624640 type (str): The type of key to be returned e.g., "input", "target"
625641 as_object (bool): When ``False``, will return a list of keys as strings
626642 When ``True``, will return the full key objects
627643 **Default:** ``False``
628- Returns: (list) String representations of keys or if ``as_object``
629- is False otherwise returns the full key objects.
630-
644+ Returns:
645+ key_list (list): String representations of the keys if ``as_object`` is False; otherwise the full
646+ key objects
631647 """
632648
633649 if as_object :
@@ -648,6 +664,25 @@ def get_keys(self, type=None, as_object=False):
648664 return key_list
649665
650666 def _load_data (self , file = None , source_id = None , globus = True , as_hdf5 = False ):
667+ """Handle the bulk of the dataset loading logic
668+
669+ Args:
670+ file (str): Relative path to the data file (specified via splits). Supported file types include tabular
671+ (eg JSON, JSON lines, csv) and HDF5
672+ source_id (str): Name of the dataset in MDF/Foundry index (``source_name`` + version information)
673+ globus (bool): If True, download using Globus, otherwise, HTTPS. Necessary for test functionality
674+ as_hdf5 (bool): If True and dataset is in hdf5 format, keep data in hdf5 format
675+
676+ Returns:
677+ (pandas.DataFrame): Tabular dataset formatted as a pandas DataFrame
678+ tmp_data (dict): HDF5 data (if applicable) reformatted into dict form for easy output
679+
680+ Raises:
681+ ValueError: If the path to the data file is invalid, or if tabular data cannot be read
682+ FileNotFoundError: If no file was found at the expected path
683+ NotImplementedError: If file type inputted is not supported
684+
685+ """
651686 # Build the path to access the cached data
652687 if source_id :
653688 path = os .path .join (self .config .local_cache_dir , source_id )
@@ -719,7 +754,8 @@ def _get_inputs_targets(self, split: str = None):
719754 split (string): Split to get inputs and outputs from.
720755 **Default:** ``None``
721756
722- Returns: (Tuple) Tuple of the inputs and outputs
757+ Returns:
758+ (Tuple): Tuple of the inputs and outputs
723759 """
724760 raw = self .load_data (as_hdf5 = False )
725761
@@ -760,10 +796,10 @@ def to_torch(self, split: str = None):
760796 """Convert Foundry Dataset to a PyTorch Dataset
761797
762798 Args:
763- split (string): Split to create PyTorch Dataset on.
764- **Default:** ``None``
799+ split (string): Split to create PyTorch Dataset on. Default is None.
765800
766- Returns: (TorchDataset) PyTorch Dataset of all the data from the specified split
801+ Returns:
802+ (TorchDataset): PyTorch Dataset of all the data from the specified split
767803
768804 """
769805 from foundry .loaders .torch_wrapper import TorchDataset
@@ -775,10 +811,10 @@ def to_tensorflow(self, split: str = None):
775811 """Convert Foundry Dataset to a Tensorflow Sequence
776812
777813 Args:
778- split (string): Split to create Tensorflow Sequence on.
779- **Default:** ``None``
814+ split (string): Split to create Tensorflow Sequence on. Default is None.
780815
781- Returns: (TensorflowSequence) Tensorflow Sequence of all the data from the specified split
816+ Returns:
817+ (TensorflowSequence): Tensorflow Sequence of all the data from the specified split
782818
783819 """
784820 from foundry .loaders .tf_wrapper import TensorflowSequence
0 commit comments