
Commit a9707ea

Merge pull request #415 from MLMI2-CSSI/joss_docstring_fixes
docstring updates
2 parents 199395a + 601af0b commit a9707ea

2 files changed: 103 additions & 66 deletions

docs/how-to-contribute/contributing.md

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ If you want to contribute, start working through the Foundry codebase, navigate
 * Tests should follow [testing best practices](https://www..org/community/contribute/tests)
 
 guide.
+* Any contributions should include adequate in-line and function-level documentation; docstrings should be in the [Google docstring](https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings) format
 
 ## Pull Request Process

foundry/foundry.py

Lines changed: 102 additions & 66 deletions
@@ -35,10 +35,9 @@
 
 class Foundry(FoundryBase):
     """Foundry Client Base Class
-    TODO:
-    -------
-    Add Docstring
 
+    Foundry object used for all interactions with Foundry datasets and models. Interfaces with MDF Connect Client,
+    Globus Compute, Globus Auth, Globus Transfer, Globus Search, DLHub, and relevant Globus Endpoints
     """
 
     dlhub_client: Any
@@ -53,6 +52,7 @@ def __init__(
         self, no_browser=False, no_local_server=False, index="mdf", authorizers=None, **data
     ):
         """Initialize a Foundry client
+
         Args:
             no_browser (bool): Whether to open the browser for the Globus Auth URL.
             no_local_server (bool): Whether a local server is available.
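
Taken together, these first hunks describe how the client is constructed. A minimal usage sketch, assuming the `foundry` package is installed and a Globus account is available:

```python
from foundry import Foundry

# Instantiate a client against the default "mdf" index; this starts a Globus
# Auth flow. On a headless machine, set no_browser=True and no_local_server=True.
f = Foundry(no_browser=False, no_local_server=False, index="mdf")
```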
@@ -147,6 +147,7 @@ def __init__(
 
     def load(self, name, download=True, globus=False, verbose=False, metadata=None, authorizers=None, **kwargs):
         """Load the metadata for a Foundry dataset into the client
+
         Args:
             name (str): Name of the foundry dataset
             download (bool): If True, download the data associated with the package (default is True)
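
The `load` docstring above corresponds to a call like the following; the dataset name here is a placeholder, not a real `source_name`:

```python
# Continuing with the client `f` from the sketch above: load metadata (and,
# since download=True, the data itself) for one dataset via HTTPS.
f.load("_hypothetical_bandgaps_v1.1", download=True, globus=False)
```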
@@ -210,12 +211,14 @@ def load(self, name, download=True, globus=False, verbose=False, metadata=None,
 
     def search(self, q=None, limit=None):
         """Search available Foundry datasets
-        q (str): query string to match
-        limit (int): maximum number of results to return
 
-        Returns
-        -------
-        (pandas.DataFrame): DataFrame with summary list of Foundry data packages including name, title, publication year, and DOI
+        Args:
+            q (str): query string to match
+            limit (int): maximum number of results to return
+
+        Returns:
+            (pandas.DataFrame): DataFrame with summary list of Foundry data packages including name, title, publication
+            year, and DOI
         """
         if not q:
             q = None
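
A sketch of the corrected `search` contract; the exact DataFrame column labels are not shown in this diff, so only `head()` is printed:

```python
# Query the Foundry index, capping the result set at 10 datasets.
results = f.search(q="band gap", limit=10)
print(results.head())  # summary includes name, title, publication year, and DOI
```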
@@ -240,22 +243,24 @@ def search(self, q=None, limit=None):
 
     def list(self):
         """List available Foundry datasets
-        Returns
-        -------
-        (pandas.DataFrame): DataFrame with summary list of Foundry datasets including name, title, publication year, and DOI
+
+        Returns:
+            (pandas.DataFrame): DataFrame with summary list of Foundry datasets including name, title, publication
+            year, and DOI
         """
         return self.search()
 
     def run(self, name, inputs, funcx_endpoint=None, **kwargs):
-        """Run a model on data
+        """Run a model on inputted data
 
         Args:
-            name (str): DLHub model name
-            inputs: Data to send to DLHub as inputs (should be JSON serializable)
-            funcx_endpoint (optional): UUID for the funcx endpoint to run the model on, if not the default (eg River)
+            name (str): DLHub model name
+            inputs: Data to send to DLHub as inputs (should be JSON serializable, example types include dict, list,
+                np.ndarray, etc)
+            funcx_endpoint (str) (optional): UUID for the funcx endpoint to run the model on, if not the default (eg River)
 
         Returns:
-            Returns results after invocation via the DLHub service
+            Results after invocation via the DLHub service
         """
         if funcx_endpoint is not None:
             self.dlhub_client.fx_endpoint = funcx_endpoint
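
Usage implied by the `list` and `run` docstrings; the model name and inputs below are placeholders, and the optional funcx endpoint is left at its default:

```python
# Enumerate all datasets (equivalent to search() with no query).
all_datasets = f.list()

# Invoke a published DLHub model; inputs must be JSON serializable.
prediction = f.run("hypothetical_user/hypothetical_model", inputs=[[1.0, 2.0, 3.0]])
```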
@@ -273,14 +278,13 @@ def load_data(self, source_id=None, globus=True, as_hdf5=False, splits=[]):
         subclass Foundry and override the load_data function
 
         Args:
-            inputs (list): List of strings for input columns
-            targets (list): List of strings for output columns
-            source_id (string): Relative path to the source file
+            source_id (str): Name of the dataset in MDF/Foundry index (``source_name`` + version information)
+            globus (bool): If True, download using Globus, otherwise, HTTPS
             as_hdf5 (bool): If True and dataset is in hdf5 format, keep data in hdf5 format
             splits (list): Labels of splits to be loaded
 
         Returns:
-            (dict): a labeled dictionary of tuples
+            data (dict): a labeled dictionary of tuples
         """
         data = {}
 
@@ -310,6 +314,14 @@ def load_data(self, source_id=None, globus=True, as_hdf5=False, splits=[]):
                 "Metadata not loaded into Foundry object, make sure to call load()") from e
 
     def _repr_html_(self) -> str:
+        """Format the Foundry object for notebook rendering as HTML output
+
+        Args:
+            self (Foundry)
+
+        Returns:
+            buf (str): buffer containing the HTML to render
+        """
         if not self.dc:
             buf = str(self)
         else:
@@ -325,6 +337,16 @@ def _repr_html_(self) -> str:
         return buf
 
     def get_citation(self) -> str:
+        """Obtain BibTeX citation for the dataset
+
+        Uses the dataset currently loaded in the Foundry object described by `self`
+
+        Args:
+            self (Foundry)
+
+        Returns:
+            bibtex (str): The BibTeX citation in string format
+        """
         subjects = [subject['subject'] for subject in self.dc['subjects']]
         doi_str = f"doi = {{{self.dc['identifier']['identifier']}}}"
         url_str = f"url = {{https://doi.org/{self.dc['identifier']['identifier']}}}"
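
The new `get_citation` docstring implies no arguments beyond `self`:

```python
# Print a BibTeX entry for the currently load()-ed dataset.
print(f.get_citation())
```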
@@ -346,6 +368,7 @@ def publish_dataset(
             **kwargs: Dict[str, Any],) -> Dict[str, Any]:
         """Submit a dataset for publication; can choose to submit via HTTPS using `https_data_path` or via Globus
         Transfer using the `globus_data_source` argument. Only one upload method may be specified.
+
         Args:
             foundry_metadata (dict): Dict of metadata describing data package
             title (string): Title of data package
@@ -378,11 +401,9 @@
             related_dois (list): DOIs related to this dataset,
                 not including the dataset's own DOI (for example, an associated paper's DOI).
 
-        Returns
-        -------
-        (dict) MDF Connect Response: Response from MDF Connect to allow tracking
-        of dataset. Contains `source_id`, which can be used to check the
-        status of the submission
+        Returns:
+            res (MDF Connect Response): Response from MDF Connect to allow tracking of dataset. Contains
+            `source_id`, which can be used to check the status of the submission
         """
         # ensure metadata is properly formatted
         self.validate_metadata(foundry_metadata)
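
A hedged sketch of a submission consistent with the docstring: exactly one of `https_data_path` or `globus_data_source` is supplied, the metadata dict is a stand-in for the real Foundry schema, and dict-style access to the response is assumed:

```python
foundry_metadata = {}  # placeholder; populate per the Foundry metadata schema
res = f.publish_dataset(
    foundry_metadata,
    title="Hypothetical band gap dataset",
    globus_data_source="<globus-source-url>",  # or https_data_path=..., never both
)
print(res["source_id"])  # pass to check_status() to track the submission
```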
@@ -461,23 +482,27 @@ def publish_model(self, title, creators, short_name, servable_type, serv_options
                 "pytorch",
                 "tensorflow",
                 "sklearn")
-            serv_options (dict): the servable_type specific arguments that are necessary for publishing. arguments can be found at
-                https://dlhub-sdk.readthedocs.io/en/latest/source/dlhub_sdk.models.servables.html under the appropriate
-                ``create_model`` signature. use the argument names as keys and their values as the values.
+            serv_options (dict): the servable_type specific arguments that are necessary for publishing. arguments can
+                be found at https://dlhub-sdk.readthedocs.io/en/latest/source/dlhub_sdk.models.servables.html
+                under the appropriate ``create_model`` signature. use the argument names as keys and their values as
+                the values.
             affiliations (list): list of affiliations for each author
             paper_doi (str): DOI of a paper that describes the servable
+
         Returns:
             (string): task id of this submission, can be used to check for success
+
         Raises:
             ValueError: If the given servable_type is not in the list of acceptable types
             Exception: If the serv_options are incomplete or the request to publish results in an error
         """
-        return self.dlhub_client.easy_publish(title, creators, short_name, servable_type, serv_options, affiliations, paper_doi)
+        return self.dlhub_client.easy_publish(title, creators, short_name, servable_type, serv_options, affiliations,
+                                              paper_doi)
 
     def check_status(self, source_id, short=False, raw=False):
         """Check the status of your submission.
 
-        Arguments:
+        Args:
             source_id (str): The ``source_id`` (``source_name`` + version information) of the
                 submission to check. Returned in the ``res`` result from ``publish()`` via MDF Connect Client.
             short (bool): When ``False``, will print a status summary containing
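
For `publish_model`, a sketch of publishing a scikit-learn servable. The `serv_options` keys must match the linked ``create_model`` signature, so the keys below are illustrative assumptions, as are all names and paths:

```python
task_id = f.publish_model(
    title="Hypothetical band gap regressor",
    creators=["Doe, Jane"],
    short_name="bandgap_rf",
    servable_type="sklearn",
    serv_options={"path": "model.pkl", "n_input_columns": 3},  # assumed keys
)
```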
@@ -491,47 +516,38 @@ def check_status(self, source_id, short=False, raw=False):
                 **Default:** ``False``
 
         Returns:
-            If ``raw`` is ``True``, *dict*: The full status result.
+            (dict): Brief status result of dataset publication. If `raw` is True, the full status result.
         """
         return self.connect_client.check_status(source_id, short, raw)
 
-    # def check_model_status(self, res):
-    #     """Check status of model or function publication to DLHub
-    #
-    #     TODO: currently broken on DLHub side of things
-    #     """
-    #     # return self.dlhub_client.get_task_status(res)
-    #     pass
-
     def configure(self, **kwargs):
         """Set Foundry config
-        Keyword Args:
-            file (str): Path to the file containing
-            (default: self.config.metadata_file)
 
-            dataframe_file (str): filename for the dataframe file default:"foundry_dataframe.json"
-            data_file (str): : filename for the data file default:"foundry.hdf5"
-            destination_endpoint (str): Globus endpoint UUID where Foundry data should move
-            local_cache_dir (str): Where to place collected data default:"./data"
+        Keyword Args:
+            file (str): Path to the file containing (default: self.config.metadata_file)
+            dataframe_file (str): filename for the dataframe file default:"foundry_dataframe.json"
+            data_file (str): : filename for the data file default:"foundry.hdf5"
+            destination_endpoint (str): Globus endpoint UUID where Foundry data should move
+            local_cache_dir (str): Where to place collected data default:"./data"
 
-        Returns
-        -------
-        (Foundry): self: for chaining
+        Returns:
+            self (Foundry): for chaining
         """
         self.config = FoundryConfig(**kwargs)
         return self
 
-    def download(self, globus: bool = True, interval: int = 20, parallel_https: int = 4, verbose: bool = False) -> 'Foundry':
+    def download(self, globus: bool = True, interval: int = 20, parallel_https: int = 4, verbose: bool = False) -> \
+            'Foundry':
         """Download a Foundry dataset
 
         Args:
-            globus: if True, use Globus to download the data else try HTTPS
-            interval: How often to wait before checking Globus transfer status
-            parallel_https: Number of files to download in parallel if using HTTPS
-            verbose: Produce more debug messages to screen
+            globus (bool): if True, use Globus to download the data else try HTTPS
+            interval (int): How often to wait before checking Globus transfer status
+            parallel_https (int): Number of files to download in parallel if using HTTPS
+            verbose (bool): Produce more debug messages to screen
 
         Returns:
-            self, for chaining
+            self (Foundry): for chaining
         """
         # Check if the dir already exists
         path = os.path.join(self.config.local_cache_dir, self.mdf["source_id"])
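
Both `configure` and `download` now document that they return `self`, which supports chaining. A sketch, assuming a dataset has already been `load()`-ed so the client knows its `source_id`:

```python
# Stage data into a custom cache directory via Globus, polling every 20 s.
f.configure(local_cache_dir="./data").download(globus=True, interval=20)
```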
@@ -620,14 +636,14 @@ def download(self, globus: bool = True, interval: int = 20, parallel_https: int
     def get_keys(self, type=None, as_object=False):
         """Get keys for a Foundry dataset
 
-        Arguments:
+        Args:
             type (str): The type of key to be returned e.g., "input", "target"
             as_object (bool): When ``False``, will return a list of keys in as strings
                 When ``True``, will return the full key objects
                 **Default:** ``False``
-        Returns: (list) String representations of keys or if ``as_object``
-        is False otherwise returns the full key objects.
-
+        Returns:
+            key_list (list): String representations of keys or if ``as_object`` is False otherwise returns the full
+            key objects
         """
 
         if as_object:
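
The `get_keys` docstring corresponds to calls such as:

```python
input_keys = f.get_keys(type="input")      # list of column-name strings
target_keys = f.get_keys(type="target")
key_objects = f.get_keys(as_object=True)   # full key objects instead of strings
```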
@@ -648,6 +664,25 @@ def get_keys(self, type=None, as_object=False):
         return key_list
 
     def _load_data(self, file=None, source_id=None, globus=True, as_hdf5=False):
+        """Handle the bulk of loading a dataset logic
+
+        Args:
+            file (str): Relative path to the data file (specified via splits). Supported file types include tabular
+                (eg JSON, JSON lines, csv) and HDF5
+            source_id (str): Name of the dataset in MDF/Foundry index (``source_name`` + version information)
+            globus (bool): If True, download using Globus, otherwise, HTTPS. Necessary for test functionality
+            as_hdf5 (bool): If True and dataset is in hdf5 format, keep data in hdf5 format
+
+        Returns:
+            (Pandas.dataframe): Tabular dataset formatted to Pandas dataframe
+            tmp_data (dict): HDF5 data (if applicable) reformatted into dict form for easy output
+
+        Raises:
+            ValueError: If path to the data file is valid. Or, if tabular data cannot be read
+            FileNotFoundError: If no file was found at the expected path
+            NotImplementedError: If file type inputted is not supported
+
+        """
         # Build the path to access the cached data
         if source_id:
             path = os.path.join(self.config.local_cache_dir, source_id)
@@ -719,7 +754,8 @@ def _get_inputs_targets(self, split: str = None):
             split (string): Split to get inputs and outputs from.
                 **Default:** ``None``
 
-        Returns: (Tuple) Tuple of the inputs and outputs
+        Returns:
+            (Tuple): Tuple of the inputs and outputs
         """
         raw = self.load_data(as_hdf5=False)
 
@@ -760,10 +796,10 @@ def to_torch(self, split: str = None):
         """Convert Foundry Dataset to a PyTorch Dataset
 
         Arguments:
-            split (string): Split to create PyTorch Dataset on.
-                **Default:** ``None``
+            split (string): Split to create PyTorch Dataset on. Default is None.
 
-        Returns: (TorchDataset) PyTorch Dataset of all the data from the specified split
+        Returns:
+            (TorchDataset): PyTorch Dataset of all the data from the specified split
 
         """
         from foundry.loaders.torch_wrapper import TorchDataset
@@ -775,10 +811,10 @@ def to_tensorflow(self, split: str = None):
         """Convert Foundry Dataset to a Tensorflow Sequence
 
         Arguments:
-            split (string): Split to create Tensorflow Sequence on.
-                **Default:** ``None``
+            split (string): Split to create Tensorflow Sequence on. Default is None.
 
-        Returns: (TensorflowSequence) Tensorflow Sequence of all the data from the specified split
+        Returns:
+            (TensorflowSequence): Tensorflow Sequence of all the data from the specified split
 
         """
         from foundry.loaders.tf_wrapper import TensorflowSequence
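
The last two hunks fix the `Returns` blocks for the framework wrappers. A usage sketch, assuming a `train` split exists and `torch` is installed alongside `foundry`:

```python
from torch.utils.data import DataLoader

train_ds = f.to_torch(split="train")          # PyTorch Dataset
loader = DataLoader(train_ds, batch_size=32)  # iterate in minibatches

train_seq = f.to_tensorflow(split="train")    # Keras-compatible Sequence
```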
