Skip to content

Commit 85e74c7

Browse files
committed
first pass at docstring updates
1 parent 35c2e12 commit 85e74c7

2 files changed

Lines changed: 103 additions & 67 deletions

File tree

docs/how-to-contribute/contributing.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ If you want to contribute, start working through the Foundry codebase, navigate
3434
* Tests should follow the [testing best practices](https://www..org/community/contribute/tests) guide.
37+
* Any contributions should include adequate in-line and function-level documentation; docstrings should be in the [Google docstring](https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings) format
3738

3839
## Pull Request Process
3940

foundry/foundry.py

Lines changed: 102 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,9 @@
3535

3636
class Foundry(FoundryBase):
3737
"""Foundry Client Base Class
38-
TODO:
39-
-------
40-
Add Docstring
4138
39+
Foundry object used for all interactions with Foundry datasets and models. Interfaces with MDF Connect Client,
40+
Globus Compute, Globus Auth, Globus Transfer, Globus Search, DLHub, and relevant Globus Endpoints
4241
"""
4342

4443
dlhub_client: Any
@@ -55,6 +54,7 @@ def __init__(
5554
**data
5655
):
5756
"""Initialize a Foundry client
57+
5858
Args:
5959
name (str): Name of the foundry dataset. If not supplied, metadata will not be loaded into
6060
the Foundry object
@@ -166,6 +166,7 @@ def __init__(
166166

167167
def _load(self, name, download=True, globus=True, verbose=False, metadata=None, authorizers=None, interval=None):
168168
"""Load the metadata for a Foundry dataset into the client
169+
169170
Args:
170171
name (str): Name of the foundry dataset
171172
download (bool): If True, download the data associated with the package (default is True)
@@ -228,12 +229,14 @@ def _load(self, name, download=True, globus=True, verbose=False, metadata=None,
228229

229230
def search(self, q=None, limit=None):
230231
"""Search available Foundry datasets
231-
q (str): query string to match
232-
limit (int): maximum number of results to return
233232
234-
Returns
235-
-------
236-
(pandas.DataFrame): DataFrame with summary list of Foundry data packages including name, title, publication year, and DOI
233+
Args:
234+
q (str): query string to match
235+
limit (int): maximum number of results to return
236+
237+
Returns:
238+
(pandas.DataFrame): DataFrame with summary list of Foundry data packages including name, title, publication
239+
year, and DOI
237240
"""
238241
if not q:
239242
q = None
@@ -258,22 +261,24 @@ def search(self, q=None, limit=None):
258261

259262
def list(self):
260263
"""List available Foundry datasets
261-
Returns
262-
-------
263-
(pandas.DataFrame): DataFrame with summary list of Foundry datasets including name, title, publication year, and DOI
264+
265+
Returns:
266+
(pandas.DataFrame): DataFrame with summary list of Foundry datasets including name, title, publication
267+
year, and DOI
264268
"""
265269
return self.search()
266270

267271
def run(self, name, inputs, funcx_endpoint=None, **kwargs):
268-
"""Run a model on data
272+
"""Run a model on input data
269273
270274
Args:
271-
name (str): DLHub model name
272-
inputs: Data to send to DLHub as inputs (should be JSON serializable)
273-
funcx_endpoint (optional): UUID for the funcx endpoint to run the model on, if not the default (eg River)
275+
name (str): DLHub model name
276+
inputs: Data to send to DLHub as inputs (should be JSON serializable, example types include dict, list,
277+
np.ndarray, etc)
278+
funcx_endpoint (str, optional): UUID for the funcx endpoint to run the model on, if not the default (eg River)
274279
275280
Returns:
276-
Returns results after invocation via the DLHub service
281+
Results after invocation via the DLHub service
277282
"""
278283
if funcx_endpoint is not None:
279284
self.dlhub_client.fx_endpoint = funcx_endpoint
@@ -291,14 +296,13 @@ def load_data(self, source_id=None, globus=True, as_hdf5=False, splits=[]):
291296
subclass Foundry and override the load_data function
292297
293298
Args:
294-
inputs (list): List of strings for input columns
295-
targets (list): List of strings for output columns
296-
source_id (string): Relative path to the source file
299+
source_id (str): Name of the dataset in MDF/Foundry index (``source_name`` + version information)
300+
globus (bool): If True, download using Globus, otherwise, HTTPS
297301
as_hdf5 (bool): If True and dataset is in hdf5 format, keep data in hdf5 format
298302
splits (list): Labels of splits to be loaded
299303
300304
Returns:
301-
(dict): a labeled dictionary of tuples
305+
data (dict): a labeled dictionary of tuples
302306
"""
303307
data = {}
304308

@@ -328,6 +332,14 @@ def load_data(self, source_id=None, globus=True, as_hdf5=False, splits=[]):
328332
"Metadata not loaded into Foundry object, make sure to call load()") from e
329333

330334
def _repr_html_(self) -> str:
335+
"""Format the Foundry object for notebook rendering as HTML output
336+
337+
Args:
338+
self (Foundry)
339+
340+
Returns:
341+
buf (str): buffer containing the HTML to render
342+
"""
331343
if not self.dc:
332344
buf = str(self)
333345
else:
@@ -343,6 +355,16 @@ def _repr_html_(self) -> str:
343355
return buf
344356

345357
def get_citation(self) -> str:
358+
"""Obtain BibTeX citation for the dataset
359+
360+
Uses the dataset currently loaded in the Foundry object described by `self`
361+
362+
Args:
363+
self (Foundry)
364+
365+
Returns:
366+
bibtex (str): The BibTeX citation in string format
367+
"""
346368
subjects = [subject['subject'] for subject in self.dc['subjects']]
347369
doi_str = f"doi = {{{self.dc['identifier']['identifier']}}}"
348370
url_str = f"url = {{https://doi.org/{self.dc['identifier']['identifier']}}}"
@@ -364,6 +386,7 @@ def publish_dataset(
364386
**kwargs: Dict[str, Any],) -> Dict[str, Any]:
365387
"""Submit a dataset for publication; can choose to submit via HTTPS using `https_data_path` or via Globus
366388
Transfer using the `globus_data_source` argument. Only one upload method may be specified.
389+
367390
Args:
368391
foundry_metadata (dict): Dict of metadata describing data package
369392
title (string): Title of data package
@@ -396,11 +419,9 @@ def publish_dataset(
396419
related_dois (list): DOIs related to this dataset,
397420
not including the dataset's own DOI (for example, an associated paper's DOI).
398421
399-
Returns
400-
-------
401-
(dict) MDF Connect Response: Response from MDF Connect to allow tracking
402-
of dataset. Contains `source_id`, which can be used to check the
403-
status of the submission
422+
Returns:
423+
res (MDF Connect Response): Response from MDF Connect to allow tracking of dataset. Contains
424+
`source_id`, which can be used to check the status of the submission
404425
"""
405426
# ensure metadata is properly formatted
406427
self.validate_metadata(foundry_metadata)
@@ -463,23 +484,27 @@ def publish_model(self, title, creators, short_name, servable_type, serv_options
463484
"pytorch",
464485
"tensorflow",
465486
"sklearn")
466-
serv_options (dict): the servable_type specific arguments that are necessary for publishing. arguments can be found at
467-
https://dlhub-sdk.readthedocs.io/en/latest/source/dlhub_sdk.models.servables.html under the appropriate
468-
``create_model`` signature. use the argument names as keys and their values as the values.
487+
serv_options (dict): the servable_type specific arguments that are necessary for publishing. arguments can
488+
be found at https://dlhub-sdk.readthedocs.io/en/latest/source/dlhub_sdk.models.servables.html
489+
under the appropriate ``create_model`` signature. use the argument names as keys and their values as
490+
the values.
469491
affiliations (list): list of affiliations for each author
470492
paper_doi (str): DOI of a paper that describes the servable
493+
471494
Returns:
472495
(string): task id of this submission, can be used to check for success
496+
473497
Raises:
474498
ValueError: If the given servable_type is not in the list of acceptable types
475499
Exception: If the serv_options are incomplete or the request to publish results in an error
476500
"""
477-
return self.dlhub_client.easy_publish(title, creators, short_name, servable_type, serv_options, affiliations, paper_doi)
501+
return self.dlhub_client.easy_publish(title, creators, short_name, servable_type, serv_options, affiliations,
502+
paper_doi)
478503

479504
def check_status(self, source_id, short=False, raw=False):
480505
"""Check the status of your submission.
481506
482-
Arguments:
507+
Args:
483508
source_id (str): The ``source_id`` (``source_name`` + version information) of the
484509
submission to check. Returned in the ``res`` result from ``publish()`` via MDF Connect Client.
485510
short (bool): When ``False``, will print a status summary containing
@@ -493,47 +518,38 @@ def check_status(self, source_id, short=False, raw=False):
493518
**Default:** ``False``
494519
495520
Returns:
496-
If ``raw`` is ``True``, *dict*: The full status result.
521+
(dict): Brief status result of dataset publication. If `raw` is True, the full status result.
497522
"""
498523
return self.connect_client.check_status(source_id, short, raw)
499524

500-
# def check_model_status(self, res):
501-
# """Check status of model or function publication to DLHub
502-
#
503-
# TODO: currently broken on DLHub side of things
504-
# """
505-
# # return self.dlhub_client.get_task_status(res)
506-
# pass
507-
508525
def configure(self, **kwargs):
509526
"""Set Foundry config
510-
Keyword Args:
511-
file (str): Path to the file containing
512-
(default: self.config.metadata_file)
513527
514-
dataframe_file (str): filename for the dataframe file default:"foundry_dataframe.json"
515-
data_file (str): : filename for the data file default:"foundry.hdf5"
516-
destination_endpoint (str): Globus endpoint UUID where Foundry data should move
517-
local_cache_dir (str): Where to place collected data default:"./data"
528+
Keyword Args:
529+
file (str): Path to the file containing (default: self.config.metadata_file)
530+
dataframe_file (str): filename for the dataframe file default:"foundry_dataframe.json"
531+
data_file (str): filename for the data file (default: "foundry.hdf5")
532+
destination_endpoint (str): Globus endpoint UUID where Foundry data should move
533+
local_cache_dir (str): Where to place collected data default:"./data"
518534
519-
Returns
520-
-------
521-
(Foundry): self: for chaining
535+
Returns:
536+
self (Foundry): for chaining
522537
"""
523538
self.config = FoundryConfig(**kwargs)
524539
return self
525540

526-
def download(self, globus: bool = True, interval: int = 20, parallel_https: int = 4, verbose: bool = False) -> 'Foundry':
541+
def download(self, globus: bool = True, interval: int = 20, parallel_https: int = 4, verbose: bool = False) -> \
542+
'Foundry':
527543
"""Download a Foundry dataset
528544
529545
Args:
530-
globus: if True, use Globus to download the data else try HTTPS
531-
interval: How often to wait before checking Globus transfer status
532-
parallel_https: Number of files to download in parallel if using HTTPS
533-
verbose: Produce more debug messages to screen
546+
globus (bool): if True, use Globus to download the data else try HTTPS
547+
interval (int): How often to wait before checking Globus transfer status
548+
parallel_https (int): Number of files to download in parallel if using HTTPS
549+
verbose (bool): Produce more debug messages to screen
534550
535551
Returns:
536-
self, for chaining
552+
self (Foundry): for chaining
537553
"""
538554
# Check if the dir already exists
539555
path = os.path.join(self.config.local_cache_dir, self.mdf["source_id"])
@@ -622,14 +638,14 @@ def download(self, globus: bool = True, interval: int = 20, parallel_https: int
622638
def get_keys(self, type=None, as_object=False):
623639
"""Get keys for a Foundry dataset
624640
625-
Arguments:
641+
Args:
626642
type (str): The type of key to be returned e.g., "input", "target"
627643
as_object (bool): When ``False``, will return a list of keys in as strings
628644
When ``True``, will return the full key objects
629645
**Default:** ``False``
630-
Returns: (list) String representations of keys or if ``as_object``
631-
is False otherwise returns the full key objects.
632-
646+
Returns:
647+
key_list (list): String representations of keys if ``as_object`` is False; otherwise returns the full
648+
key objects
633649
"""
634650

635651
if as_object:
@@ -649,7 +665,25 @@ def get_keys(self, type=None, as_object=False):
649665
key_list = key_list + k
650666
return key_list
651667

652-
def _load_data(self, file=None, source_id=None, globus=True, as_hdf5=False):
668+
def _load_data(self, file=None, source_id=None, as_hdf5=False):
669+
"""Handle the bulk of loading a dataset logic
670+
671+
Args:
672+
file (str): Relative path to the data file (specified via splits). Supported file types include tabular
673+
(eg JSON, JSON lines, csv) and HDF5
674+
source_id (str): Name of the dataset in MDF/Foundry index (``source_name`` + version information)
675+
as_hdf5 (bool): If True and dataset is in hdf5 format, keep data in hdf5 format
676+
677+
Returns:
678+
(Pandas.dataframe): Tabular dataset formatted to Pandas dataframe
679+
tmp_data (dict): HDF5 data (if applicable) reformatted into dict form for easy output
680+
681+
Raises:
682+
ValueError: If the path to the data file is invalid, or if the tabular data cannot be read
683+
FileNotFoundError: If no file was found at the expected path
684+
NotImplementedError: If the input file type is not supported
685+
686+
"""
653687
# Build the path to access the cached data
654688
if source_id:
655689
path = os.path.join(self.config.local_cache_dir, source_id)
@@ -721,7 +755,8 @@ def _get_inputs_targets(self, split: str = None):
721755
split (string): Split to get inputs and outputs from.
722756
**Default:** ``None``
723757
724-
Returns: (Tuple) Tuple of the inputs and outputs
758+
Returns:
759+
(Tuple): Tuple of the inputs and outputs
725760
"""
726761
raw = self.load_data(as_hdf5=False)
727762

@@ -762,10 +797,10 @@ def to_torch(self, split: str = None):
762797
"""Convert Foundry Dataset to a PyTorch Dataset
763798
764799
Arguments:
765-
split (string): Split to create PyTorch Dataset on.
766-
**Default:** ``None``
800+
split (string): Split to create PyTorch Dataset on. Default is None.
767801
768-
Returns: (TorchDataset) PyTorch Dataset of all the data from the specified split
802+
Returns:
803+
(TorchDataset): PyTorch Dataset of all the data from the specified split
769804
770805
"""
771806
from foundry.loaders.torch_wrapper import TorchDataset
@@ -777,10 +812,10 @@ def to_tensorflow(self, split: str = None):
777812
"""Convert Foundry Dataset to a Tensorflow Sequence
778813
779814
Arguments:
780-
split (string): Split to create Tensorflow Sequence on.
781-
**Default:** ``None``
815+
split (string): Split to create Tensorflow Sequence on. Default is None.
782816
783-
Returns: (TensorflowSequence) Tensorflow Sequence of all the data from the specified split
817+
Returns:
818+
(TensorflowSequence): Tensorflow Sequence of all the data from the specified split
784819
785820
"""
786821
from foundry.loaders.tf_wrapper import TensorflowSequence

0 commit comments

Comments
 (0)