From 5411de577cc1556e280581f642b0ba27a47ced86 Mon Sep 17 00:00:00 2001 From: misohu Date: Tue, 23 Apr 2024 15:44:06 +0200 Subject: [PATCH 1/4] Add gpu flag --- src/dss/config.py | 7 ++ src/dss/create_notebook.py | 106 +++++++++--------- src/dss/main.py | 36 ++++-- .../notebook_deployment.yaml.j2 | 5 + src/dss/utils.py | 21 +++- 5 files changed, 110 insertions(+), 65 deletions(-) diff --git a/src/dss/config.py b/src/dss/config.py index 700494d9..a70735bb 100644 --- a/src/dss/config.py +++ b/src/dss/config.py @@ -28,6 +28,13 @@ def format_images_message(images_dict: dict) -> str: RECOMMENDED_IMAGES_MESSAGE = format_images_message(NOTEBOOK_IMAGES_ALIASES) DEFAULT_NOTEBOOK_IMAGE = "kubeflownotebookswg/jupyter-scipy:v1.8.0" +SUPPORTED_GPUS = ["nvidia"] +GPU_DEPLOYMENT_LABEL = { "nvidia": "nvidia.com/gpu"} +NODE_LABELS = { "nvidia": [ + "nvidia.com/gpu.present", + "nvidia.com/gpu.deploy.container-toolkit", + "nvidia.com/gpu.deploy.device-plugin" +]} class DeploymentState(Enum): ACTIVE = "Active" diff --git a/src/dss/create_notebook.py b/src/dss/create_notebook.py index 66c87365..eaddd805 100644 --- a/src/dss/create_notebook.py +++ b/src/dss/create_notebook.py @@ -14,6 +14,8 @@ NOTEBOOK_IMAGES_ALIASES, NOTEBOOK_PVC_NAME, RECOMMENDED_IMAGES_MESSAGE, + NODE_LABELS, + GPU_DEPLOYMENT_LABEL ) from dss.logger import setup_logger from dss.utils import ( @@ -24,51 +26,47 @@ get_mlflow_tracking_uri, get_service_url, wait_for_deployment_ready, + node_has_gpu_labels ) # Set up logger logger = setup_logger("logs/dss.log") -def create_notebook(name: str, image: str, lightkube_client: Client) -> None: +def create_notebook(name: str, image: str, lightkube_client: Client, gpu: Optional[str] = None) -> None: """ - Creates a Notebook server on the Kubernetes cluster. + Creates a Notebook server on the Kubernetes cluster with optional GPU support. Args: name (str): The name of the notebook server. - image (str): The image used for the notebook server. - lightkube_client (Client): The Kubernetes client. + image (str): The Docker image used for the notebook server. + lightkube_client (Client): The Kubernetes client used for server creation. + gpu (Optional[str]): Specifies the GPU type for the notebook if required. Supported: 'nvidia'. + + Raises: + RuntimeError: If there is a failure in notebook creation or GPU label checking. """ - if not does_dss_pvc_exist(lightkube_client) or not does_mlflow_deployment_exist( - lightkube_client - ): - logger.error("Failed to create notebook. DSS was not correctly initialized.\n") - logger.info( - "You might want to run\n" - " dss status to check the current status\n" - " dss logs --all to review all logs\n" - " dss initialize to install dss\n" - ) - return + if gpu and not node_has_gpu_labels(lightkube_client, NODE_LABELS[gpu]): + logger.error(f"Failed to create notebook with {gpu} GPU acceleration.") + logger.info("You are trying to setup notebook backed by GPU but the GPU devices were not properly set up in the Kubernetes cluster. Please refer to this guide http:///setup-gpu for more information on the setup.") + raise RuntimeError() + + if not does_dss_pvc_exist(lightkube_client) or not does_mlflow_deployment_exist(lightkube_client): + logger.error("Failed to create notebook. DSS was not correctly initialized.") + logger.info("You might want to run 'dss status' to check the current status, 'dss logs --all' to review all logs, or 'dss initialize' to install DSS.") + raise RuntimeError() + if does_notebook_exist(name, DSS_NAMESPACE, lightkube_client): - # Assumes that the notebook server is exposed by a service of the same name. - logger.error( - f"Failed to create Notebook. Notebook with name '{name}' already exists.\n" - f"Please specify a different name." - ) - url = get_service_url("name", DSS_NAMESPACE, lightkube_client) + logger.error(f"Notebook with name '{name}' already exists. Please specify a different name.") + url = get_service_url(name, DSS_NAMESPACE, lightkube_client) if url: logger.info(f"To connect to the existing notebook, go to {url}.") return - manifests_file = Path( - Path(__file__).parent, MANIFEST_TEMPLATES_LOCATION, "notebook_deployment.yaml.j2" - ) - + manifests_file = Path(__file__).parent / MANIFEST_TEMPLATES_LOCATION / "notebook_deployment.yaml.j2" image_full_name = _get_notebook_image_name(image) - config = _get_notebook_config(image_full_name, name) + config = _get_notebook_config(image_full_name, name, GPU_DEPLOYMENT_LABEL[gpu] if gpu else None) - # Initialize KubernetesResourceHandler k8s_resource_handler = KubernetesResourceHandler( field_manager=FIELD_MANAGER, labels=DSS_CLI_MANAGER_LABELS, @@ -80,50 +78,52 @@ def create_notebook(name: str, image: str, lightkube_client: Client) -> None: try: k8s_resource_handler.apply() - wait_for_deployment_ready(lightkube_client, namespace=DSS_NAMESPACE, deployment_name=name) - logger.info(f"Success: Notebook {name} created successfully.") except ApiError as err: - logger.error( - f"Failed to create Notebook with error code {err.status.code}." - " Check the debug logs for more details." - ) + logger.error(f"Failed to create Notebook with error code {err.status.code}. Check the debug logs for more details.") logger.debug(f"Failed to create Notebook {name} with error {err}") - return + raise RuntimeError() except TimeoutError as err: logger.error(str(err)) - logger.warn( - f"Timed out while trying to create Notebook {name}.\n" - "Some resources might be left in the cluster. Check the status with `dss list`." - ) - return + logger.warn(f"Timed out while trying to create Notebook {name}. Some resources might be left in the cluster. Check the status with `dss list`.") + raise RuntimeError() except ImagePullBackOffError as err: - logger.error( - f"Timed out while trying to create Notebook {name}.\n" - f"Image {image_full_name} does not exist or is not accessible.\n" - ) - logger.info( - "Note: You might want to use some of these recommended images:\n" - f"{RECOMMENDED_IMAGES_MESSAGE}" - ) - logger.debug(f"Timed out while trying to create Notebook {name} with error {err}.") - return - # Assumes that the notebook server is exposed by a service of the same name. + logger.error(f"Image {image_full_name} does not exist or is not accessible.") + logger.info("Note: You might want to use some of these recommended images:\n" + RECOMMENDED_IMAGES_MESSAGE) + logger.debug(f"Image pull back off error while creating Notebook {name} with error {err}.") + raise RuntimeError() + url = get_service_url(name, DSS_NAMESPACE, lightkube_client) if url: logger.info(f"Access the notebook at {url}.") -def _get_notebook_config(image: str, name: str) -> dict: - mlflow_tracking_uri = get_mlflow_tracking_uri() +def _get_notebook_config(image: str, name: str, gpu: str = None) -> dict: + """ + Generates the configuration dictionary for creating a notebook deployment. + + Args: + image (str): The Docker image to use for the notebook. + name (str): The name of the notebook. + gpu (str, optional): The type of GPU acceleration ('nvidia'), if applicable. + + Returns: + dict: A dictionary with configuration values for the notebook. + """ + # Base configuration for the notebook context = { - "mlflow_tracking_uri": mlflow_tracking_uri, + "mlflow_tracking_uri": get_mlflow_tracking_uri(), "notebook_name": name, "namespace": DSS_NAMESPACE, "notebook_image": image, "pvc_name": NOTEBOOK_PVC_NAME, } + + # Conditionally add GPU configuration if specified + if gpu: + context["gpu"] = gpu + return context diff --git a/src/dss/main.py b/src/dss/main.py index 68b35768..602ced48 100644 --- a/src/dss/main.py +++ b/src/dss/main.py @@ -1,7 +1,7 @@ import click from lightkube.core.exceptions import ApiError -from dss.config import DEFAULT_NOTEBOOK_IMAGE, RECOMMENDED_IMAGES_MESSAGE +from dss.config import DEFAULT_NOTEBOOK_IMAGE, RECOMMENDED_IMAGES_MESSAGE, SUPPORTED_GPUS from dss.create_notebook import create_notebook from dss.initialize import initialize from dss.list import list_notebooks @@ -59,23 +59,37 @@ def initialize_command(kubeconfig: str) -> None: "--kubeconfig", help=f"Path to a Kubernetes config file. Defaults to the value of the KUBECONFIG environment variable, else to '{KUBECONFIG_DEFAULT}'.", # noqa E501 ) -def create_notebook_command(name: str, image: str, kubeconfig: str) -> None: +@click.option("--no-gpu", is_flag=True, help="Create a notebook without GPU support.") +@click.option("--gpu", type=click.Choice(SUPPORTED_GPUS), help="Specify the type of GPU acceleration, e.g., 'nvidia'.") +def create_notebook_command(name: str, image: str, kubeconfig: str, no_gpu: bool, gpu: str) -> None: """Create a Jupyter notebook in DSS and connect it to MLflow. This command also outputs the URL to access the notebook on success. \b """ - logger.info("Executing create command") - if image == DEFAULT_NOTEBOOK_IMAGE: - logger.info( - f"No image is specified. Using default value {DEFAULT_NOTEBOOK_IMAGE}." - " For more information on using a specific image, see dss create --help." - ) + try: + logger.info("Executing create command") + if image == DEFAULT_NOTEBOOK_IMAGE: + logger.info( + f"No image is specified. Using default value {DEFAULT_NOTEBOOK_IMAGE}." + " For more information on using a specific image, see dss create --help." + ) + + # Check mutual exclusivity + if no_gpu and gpu: + logger.error("You cannot specify both --no-gpu and --gpu options.") + raise click.UsageError("Options --no-gpu and --gpu are mutually exclusive.") - kubeconfig = get_default_kubeconfig(kubeconfig) - lightkube_client = get_lightkube_client(kubeconfig) + kubeconfig = get_default_kubeconfig(kubeconfig) + lightkube_client = get_lightkube_client(kubeconfig) - create_notebook(name=name, image=image, lightkube_client=lightkube_client) + create_notebook(name=name, image=image, lightkube_client=lightkube_client, gpu=None if no_gpu else gpu) + except RuntimeError: + click.get_current_context().exit(1) + except Exception as e: + logger.debug(f"Failed to create notebook {name}: {e}.", exc_info=True) + logger.error(f"Failed to create notebook {name}: {str(e)}.") + click.get_current_context().exit(1) create_notebook_command.help += f""" diff --git a/src/dss/manifest_templates/notebook_deployment.yaml.j2 b/src/dss/manifest_templates/notebook_deployment.yaml.j2 index 4eefc0ad..df12c4c9 100644 --- a/src/dss/manifest_templates/notebook_deployment.yaml.j2 +++ b/src/dss/manifest_templates/notebook_deployment.yaml.j2 @@ -23,6 +23,11 @@ spec: - env: - name: MLFLOW_TRACKING_URI value: {{ mlflow_tracking_uri }} + {% if gpu %} + resources: + limits: + {{ gpu }}: 1 + {% endif %} image: {{ notebook_image }} imagePullPolicy: IfNotPresent name: {{ notebook_name }} diff --git a/src/dss/utils.py b/src/dss/utils.py index dea5aa0a..6f180fb7 100644 --- a/src/dss/utils.py +++ b/src/dss/utils.py @@ -1,6 +1,6 @@ import os import time -from typing import Optional +from typing import List, Optional from lightkube import ApiError, Client, KubeConfig from lightkube.resources.apps_v1 import Deployment @@ -32,6 +32,25 @@ def __init__(self, msg: str, *args): self.msg = str(msg) +def node_has_gpu_labels(lightkube_client: Client, labels: List[str]) -> bool: + """ + Check if at least one node in the Kubernetes cluster has all the specified labels. + + Args: + lightkube_client (Client): The Kubernetes client. + labels (List[str]): A list of label keys that must be present on the node. + + Returns: + bool: True if at least one node has all the specified labels, False otherwise. + """ + nodes = lightkube_client.list(Node) + for node in nodes: + node_labels = node.metadata.labels + if all(label in node_labels for label in labels): + return True + return False + + def wait_for_deployment_ready( client: Client, namespace: str, From bbf0d4244172f67ada59f39a2f1b0aad62c4c8cd Mon Sep 17 00:00:00 2001 From: misohu Date: Wed, 24 Apr 2024 10:34:32 +0200 Subject: [PATCH 2/4] Implement the gpu support flag functionality --- src/dss/config.py | 15 +-- src/dss/create_notebook.py | 86 ++++++++++------ src/dss/main.py | 16 ++- src/dss/utils.py | 41 +++++--- tests/integration/test_dss.py | 32 ++++++ tests/unit/test_create_notebook.py | 152 +++++++++++++++++++---------- tox.ini | 7 +- 7 files changed, 238 insertions(+), 111 deletions(-) diff --git a/src/dss/config.py b/src/dss/config.py index a70735bb..73df1e6b 100644 --- a/src/dss/config.py +++ b/src/dss/config.py @@ -29,12 +29,15 @@ def format_images_message(images_dict: dict) -> str: DEFAULT_NOTEBOOK_IMAGE = "kubeflownotebookswg/jupyter-scipy:v1.8.0" SUPPORTED_GPUS = ["nvidia"] -GPU_DEPLOYMENT_LABEL = { "nvidia": "nvidia.com/gpu"} -NODE_LABELS = { "nvidia": [ - "nvidia.com/gpu.present", - "nvidia.com/gpu.deploy.container-toolkit", - "nvidia.com/gpu.deploy.device-plugin" -]} +GPU_DEPLOYMENT_LABEL = {"nvidia": "nvidia.com/gpu"} +NODE_LABELS = { + "nvidia": [ + "nvidia.com/gpu.present", + "nvidia.com/gpu.deploy.container-toolkit", + "nvidia.com/gpu.deploy.device-plugin", + ] +} + class DeploymentState(Enum): ACTIVE = "Active" diff --git a/src/dss/create_notebook.py b/src/dss/create_notebook.py index eaddd805..d3a92ead 100644 --- a/src/dss/create_notebook.py +++ b/src/dss/create_notebook.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import Dict, Optional from charmed_kubeflow_chisme.kubernetes import KubernetesResourceHandler from lightkube import Client @@ -10,12 +11,12 @@ DSS_CLI_MANAGER_LABELS, DSS_NAMESPACE, FIELD_MANAGER, + GPU_DEPLOYMENT_LABEL, MANIFEST_TEMPLATES_LOCATION, + NODE_LABELS, NOTEBOOK_IMAGES_ALIASES, NOTEBOOK_PVC_NAME, RECOMMENDED_IMAGES_MESSAGE, - NODE_LABELS, - GPU_DEPLOYMENT_LABEL ) from dss.logger import setup_logger from dss.utils import ( @@ -25,15 +26,17 @@ does_notebook_exist, get_mlflow_tracking_uri, get_service_url, + node_has_gpu_labels, wait_for_deployment_ready, - node_has_gpu_labels ) # Set up logger logger = setup_logger("logs/dss.log") -def create_notebook(name: str, image: str, lightkube_client: Client, gpu: Optional[str] = None) -> None: +def create_notebook( + name: str, image: str, lightkube_client: Client, gpu: Optional[str] = None +) -> None: """ Creates a Notebook server on the Kubernetes cluster with optional GPU support. @@ -41,31 +44,45 @@ def create_notebook(name: str, image: str, lightkube_client: Client, gpu: Option name (str): The name of the notebook server. image (str): The Docker image used for the notebook server. lightkube_client (Client): The Kubernetes client used for server creation. - gpu (Optional[str]): Specifies the GPU type for the notebook if required. Supported: 'nvidia'. - + gpu (Optional[str]): Specifies the GPU type for the notebook if required. + Raises: RuntimeError: If there is a failure in notebook creation or GPU label checking. """ if gpu and not node_has_gpu_labels(lightkube_client, NODE_LABELS[gpu]): - logger.error(f"Failed to create notebook with {gpu} GPU acceleration.") - logger.info("You are trying to setup notebook backed by GPU but the GPU devices were not properly set up in the Kubernetes cluster. Please refer to this guide http:///setup-gpu for more information on the setup.") + logger.error(f"Failed to create notebook with {gpu} GPU acceleration.\n") + logger.info( + "You are trying to setup notebook backed by GPU but the GPU devices were not properly set up in the Kubernetes cluster. Please refer to this guide http:///setup-gpu for more information on the setup." # noqa E501 + ) raise RuntimeError() - if not does_dss_pvc_exist(lightkube_client) or not does_mlflow_deployment_exist(lightkube_client): - logger.error("Failed to create notebook. DSS was not correctly initialized.") - logger.info("You might want to run 'dss status' to check the current status, 'dss logs --all' to review all logs, or 'dss initialize' to install DSS.") + if not does_dss_pvc_exist(lightkube_client) or not does_mlflow_deployment_exist( + lightkube_client + ): + logger.error("Failed to create notebook. DSS was not correctly initialized.\n") + logger.info( + "You might want to run\n" + " dss status to check the current status\n" + " dss logs --all to review all logs\n" + " dss initialize to install dss\n" + ) raise RuntimeError() if does_notebook_exist(name, DSS_NAMESPACE, lightkube_client): - logger.error(f"Notebook with name '{name}' already exists. Please specify a different name.") + logger.error( + f"Failed to create Notebook. Notebook with name '{name}' already exists.\n" + f"Please specify a different name." + ) url = get_service_url(name, DSS_NAMESPACE, lightkube_client) if url: logger.info(f"To connect to the existing notebook, go to {url}.") - return + raise RuntimeError() - manifests_file = Path(__file__).parent / MANIFEST_TEMPLATES_LOCATION / "notebook_deployment.yaml.j2" + manifests_file = ( + Path(__file__).parent / MANIFEST_TEMPLATES_LOCATION / "notebook_deployment.yaml.j2" + ) image_full_name = _get_notebook_image_name(image) - config = _get_notebook_config(image_full_name, name, GPU_DEPLOYMENT_LABEL[gpu] if gpu else None) + config = _get_notebook_config(image_full_name, name, gpu) k8s_resource_handler = KubernetesResourceHandler( field_manager=FIELD_MANAGER, @@ -78,19 +95,25 @@ def create_notebook(name: str, image: str, lightkube_client: Client, gpu: Option try: k8s_resource_handler.apply() - wait_for_deployment_ready(lightkube_client, namespace=DSS_NAMESPACE, deployment_name=name) + wait_for_deployment_ready( + lightkube_client, namespace=DSS_NAMESPACE, deployment_name=name, timeout_seconds=None + ) logger.info(f"Success: Notebook {name} created successfully.") + if gpu: + logger.info(f"{gpu.title()} GPU attached to notebook.") except ApiError as err: - logger.error(f"Failed to create Notebook with error code {err.status.code}. Check the debug logs for more details.") + logger.error( + f"Failed to create Notebook with error code {err.status.code}." + " Check the debug logs for more details." + ) logger.debug(f"Failed to create Notebook {name} with error {err}") raise RuntimeError() - except TimeoutError as err: - logger.error(str(err)) - logger.warn(f"Timed out while trying to create Notebook {name}. Some resources might be left in the cluster. Check the status with `dss list`.") - raise RuntimeError() except ImagePullBackOffError as err: - logger.error(f"Image {image_full_name} does not exist or is not accessible.") - logger.info("Note: You might want to use some of these recommended images:\n" + RECOMMENDED_IMAGES_MESSAGE) + logger.error(f"Image {image_full_name} does not exist or is not accessible.\n") + logger.info( + "Note: You might want to use some of these recommended images:\n" + + RECOMMENDED_IMAGES_MESSAGE + ) logger.debug(f"Image pull back off error while creating Notebook {name} with error {err}.") raise RuntimeError() @@ -99,17 +122,17 @@ def create_notebook(name: str, image: str, lightkube_client: Client, gpu: Option logger.info(f"Access the notebook at {url}.") -def _get_notebook_config(image: str, name: str, gpu: str = None) -> dict: +def _get_notebook_config(image: str, name: str, gpu: Optional[str] = None) -> Dict[str, str]: """ Generates the configuration dictionary for creating a notebook deployment. Args: image (str): The Docker image to use for the notebook. name (str): The name of the notebook. - gpu (str, optional): The type of GPU acceleration ('nvidia'), if applicable. + gpu (Optional[str]): GPU label for the notebook deployment, if GPU support is enabled. Returns: - dict: A dictionary with configuration values for the notebook. + Dict[str, str]: A dictionary with configuration values for the notebook deployment. """ # Base configuration for the notebook context = { @@ -122,14 +145,19 @@ def _get_notebook_config(image: str, name: str, gpu: str = None) -> dict: # Conditionally add GPU configuration if specified if gpu: - context["gpu"] = gpu + context["gpu"] = GPU_DEPLOYMENT_LABEL[gpu] return context def _get_notebook_image_name(image: str) -> str: """ - Returns the image's full name if the input is a key in `NOTEBOOK_IMAGES_ALIASES` - else it returns the input. + Resolves the full Docker image name from an alias or returns the image if not an alias. + + Args: + image (str): An alias for a notebook image or a full Docker image name. + + Returns: + str: The resolved full Docker image name. """ return NOTEBOOK_IMAGES_ALIASES.get(image, image) diff --git a/src/dss/main.py b/src/dss/main.py index 602ced48..f862c25c 100644 --- a/src/dss/main.py +++ b/src/dss/main.py @@ -60,8 +60,14 @@ def initialize_command(kubeconfig: str) -> None: help=f"Path to a Kubernetes config file. Defaults to the value of the KUBECONFIG environment variable, else to '{KUBECONFIG_DEFAULT}'.", # noqa E501 ) @click.option("--no-gpu", is_flag=True, help="Create a notebook without GPU support.") -@click.option("--gpu", type=click.Choice(SUPPORTED_GPUS), help="Specify the type of GPU acceleration, e.g., 'nvidia'.") -def create_notebook_command(name: str, image: str, kubeconfig: str, no_gpu: bool, gpu: str) -> None: +@click.option( + "--gpu", + type=click.Choice(SUPPORTED_GPUS), + help="Specify the type of GPU acceleration, e.g., 'nvidia'.", +) +def create_notebook_command( + name: str, image: str, kubeconfig: str, no_gpu: bool, gpu: str +) -> None: """Create a Jupyter notebook in DSS and connect it to MLflow. This command also outputs the URL to access the notebook on success. @@ -74,7 +80,7 @@ def create_notebook_command(name: str, image: str, kubeconfig: str, no_gpu: bool f"No image is specified. Using default value {DEFAULT_NOTEBOOK_IMAGE}." " For more information on using a specific image, see dss create --help." ) - + # Check mutual exclusivity if no_gpu and gpu: logger.error("You cannot specify both --no-gpu and --gpu options.") @@ -83,7 +89,9 @@ def create_notebook_command(name: str, image: str, kubeconfig: str, no_gpu: bool kubeconfig = get_default_kubeconfig(kubeconfig) lightkube_client = get_lightkube_client(kubeconfig) - create_notebook(name=name, image=image, lightkube_client=lightkube_client, gpu=None if no_gpu else gpu) + create_notebook( + name=name, image=image, lightkube_client=lightkube_client, gpu=None if no_gpu else gpu + ) except RuntimeError: click.get_current_context().exit(1) except Exception as e: diff --git a/src/dss/utils.py b/src/dss/utils.py index 6f180fb7..ee67787f 100644 --- a/src/dss/utils.py +++ b/src/dss/utils.py @@ -55,21 +55,23 @@ def wait_for_deployment_ready( client: Client, namespace: str, deployment_name: str, - timeout_seconds: int = 180, + timeout_seconds: Optional[int] = 180, interval_seconds: int = 10, ) -> None: """ - Waits for a Kubernetes deployment to be ready. + Waits for a Kubernetes deployment to be ready. Can wait indefinitely if timeout_seconds is None. Args: client (Client): The Kubernetes client. namespace (str): The namespace of the deployment. deployment_name (str): The name of the deployment. - timeout_seconds (int): Timeout in seconds. Defaults to 600. + timeout_seconds (Optional[int]): Timeout in seconds, or None for no timeout. + Defaults to 180. interval_seconds (int): Interval between checks in seconds. Defaults to 10. - Returns: - None + Raises: + ImagePullBackOffError: If there is an issue pulling the deployment image. + TimeoutError: If the timeout is reached before the deployment is ready. """ logger.info( f"Waiting for deployment {deployment_name} in namespace {namespace} to be ready..." @@ -77,32 +79,41 @@ def wait_for_deployment_ready( start_time = time.time() while True: deployment: Deployment = client.get(Deployment, namespace=namespace, name=deployment_name) - if deployment.status.availableReplicas == deployment.spec.replicas: + if deployment.status and deployment.status.availableReplicas == deployment.spec.replicas: logger.info(f"Deployment {deployment_name} in namespace {namespace} is ready") break - elif time.time() - start_time >= timeout_seconds: - # Surround the following block with try-except? - # ----Block-start---- - pod: Pod = list( + try: + pods = list( client.list( Pod, namespace=namespace, labels={"canonical.com/dss-notebook": deployment_name}, ) - )[0] - reason = pod.status.containerStatuses[0].state.waiting.reason + ) + except ApiError as e: + if e.response.status_code == 404: + pods = [] + + if pods: + reason = ( + pods[0].status.containerStatuses[0].state.waiting.reason + if pods[0].status.containerStatuses + and pods[0].status.containerStatuses[0].state.waiting + else "Unknown" + ) if reason in ["ImagePullBackOff", "ErrImagePull"]: raise ImagePullBackOffError( f"Failed to create Deployment {deployment_name} with {reason}" ) - # ----Block-end---- + + if timeout_seconds is not None and time.time() - start_time >= timeout_seconds: raise TimeoutError( f"Timeout waiting for deployment {deployment_name} in namespace {namespace} to be ready" # noqa E501 ) else: time.sleep(interval_seconds) - logger.info( - f"Waiting for deployment {deployment_name} in namespace {namespace} to be ready..." + logger.debug( + f"Still waiting for deployment {deployment_name} in namespace {namespace} to be ready..." # noqa E501 ) diff --git a/tests/integration/test_dss.py b/tests/integration/test_dss.py index 8ae575c6..2bee471a 100644 --- a/tests/integration/test_dss.py +++ b/tests/integration/test_dss.py @@ -75,6 +75,38 @@ def test_initialize_creates_dss(cleanup_after_initialize) -> None: assert "notebooks" in kubectl_result.stdout +def test_create_notebook_gpu_failure(cleanup_after_initialize) -> None: + """ + Tests that `dss create` fails to creates a notebook on machine without GPU + (its expected to be run on GH runner without GPU). + + Must be run after `dss initialize` + """ + + result = subprocess.run( + [ + DSS_NAMESPACE, + "create", + NOTEBOOK_NAME, + "--image", + NOTEBOOK_IMAGE, + "--kubeconfig", + KUBECONFIG, + "--gpu=nvidia", + ], + capture_output=True, + text=True, + timeout=60 * 4, + ) + + # Check if the command executed successfully + assert result.returncode == 1 + assert ( + "You are trying to setup notebook backed by GPU but the GPU devices were not properly set up in the Kubernetes cluster." # noqa E501 + in result.stderr + ) + + def test_create_notebook(cleanup_after_initialize) -> None: """ Tests that `dss create` successfully creates a notebook as expected. diff --git a/tests/unit/test_create_notebook.py b/tests/unit/test_create_notebook.py index e375b40e..6e86e777 100644 --- a/tests/unit/test_create_notebook.py +++ b/tests/unit/test_create_notebook.py @@ -8,6 +8,13 @@ from dss.utils import ImagePullBackOffError +@pytest.fixture +def mock_client() -> MagicMock: + """Mock Kubernetes Client.""" + with patch("dss.list.Client") as mock: + yield mock.return_value + + @pytest.fixture def mock_get_service_url() -> MagicMock: """ @@ -48,6 +55,7 @@ def test_create_notebook_success( mock_get_service_url: MagicMock, mock_resource_handler: MagicMock, mock_logger: MagicMock, + mock_client: MagicMock, ) -> None: """ Test case to verify successful create_notebook call. @@ -56,9 +64,6 @@ def test_create_notebook_success( notebook_image = "test-image" notebook_url = "http://somewhere.com:1234/notebook/namespace/name/lab" - # Mock the behavior of Client - mock_client_instance = MagicMock() - mock_get_service_url.return_value = notebook_url # Mock the behavior of KubernetesResourceHandler @@ -69,14 +74,15 @@ def test_create_notebook_success( "dss.create_notebook.does_notebook_exist", return_value=False ), patch("dss.create_notebook.wait_for_deployment_ready") as mock_wait_for_deployment_ready: # Call the function to test - create_notebook( - name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance - ) + create_notebook(name=notebook_name, image=notebook_image, lightkube_client=mock_client) # Assertions mock_resource_handler_instance.apply.assert_called_once() mock_wait_for_deployment_ready.assert_called_once_with( - mock_client_instance, namespace=DSS_NAMESPACE, deployment_name=notebook_name + mock_client, + namespace=DSS_NAMESPACE, + deployment_name=notebook_name, + timeout_seconds=None, ) mock_logger.info.assert_called_with(f"Access the notebook at {notebook_url}.") @@ -97,9 +103,10 @@ def test_create_notebook_failure_pvc_does_not_exist( "dss.create_notebook.does_notebook_exist", return_value=False ): # Call the function to test - create_notebook( - name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance - ) + with pytest.raises(RuntimeError): + create_notebook( + name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance + ) # Assertions mock_logger.error.assert_called_with( @@ -129,9 +136,10 @@ def test_create_notebook_failure_mlflow_does_not_exist( "dss.create_notebook.does_notebook_exist", return_value=False ): # Call the function to test - create_notebook( - name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance - ) + with pytest.raises(RuntimeError): + create_notebook( + name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance + ) # Assertions mock_logger.error.assert_called_with( @@ -164,9 +172,10 @@ def test_create_notebook_failure_notebook_exists( "dss.create_notebook.does_notebook_exist", return_value=True ): # Call the function to test - create_notebook( - name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance - ) + with pytest.raises(RuntimeError): + create_notebook( + name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance + ) # Assertions mock_logger.error.assert_called_with( @@ -198,9 +207,10 @@ def test_create_notebook_failure_api( "dss.create_notebook.does_notebook_exist", return_value=False ): # Call the function to test - create_notebook( - name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance - ) + with pytest.raises(RuntimeError): + create_notebook( + name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance + ) # Assertions mock_logger.error.assert_called_with( @@ -213,37 +223,6 @@ def test_create_notebook_failure_api( mock_logger.info.assert_not_called() -def test_create_notebook_failure_time_out( - mock_logger: MagicMock, mock_wait_for_deployment_ready: MagicMock -) -> None: - """ - Test case to verify behavior when a TimeoutError is raised. - """ - notebook_name = "test-notebook" - notebook_image = "test-image" - - # Mock the behavior of Client - mock_client_instance = MagicMock() - - # Mock the behavior of wait_for_deployment_ready - mock_wait_for_deployment_ready.side_effect = TimeoutError("test-exception-message") - - with patch("dss.create_notebook.does_dss_pvc_exist", return_value=True), patch( - "dss.create_notebook.does_notebook_exist", return_value=False - ): - # Call the function to test - create_notebook( - name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance - ) - - # Assertions - mock_logger.warn.assert_called_with( - f"Timed out while trying to create Notebook {notebook_name}.\n" - "Some resources might be left in the cluster. Check the status with `dss list`." - ) - mock_logger.info.assert_not_called() - - def test_create_notebook_failure_image_pull( mock_logger: MagicMock, mock_wait_for_deployment_ready: MagicMock ) -> None: @@ -263,13 +242,13 @@ def test_create_notebook_failure_image_pull( "dss.create_notebook.does_notebook_exist", return_value=False ), patch("dss.create_notebook._get_notebook_image_name", return_value=notebook_image): # Call the function to test - create_notebook( - name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance - ) + with pytest.raises(RuntimeError): + create_notebook( + name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance + ) # Assertions mock_logger.error.assert_called_with( - f"Timed out while trying to create Notebook {notebook_name}.\n" f"Image {notebook_image} does not exist or is not accessible.\n" ) mock_logger.info.assert_called_with( @@ -295,3 +274,68 @@ def test_get_notebook_config() -> None: with patch("dss.create_notebook.get_mlflow_tracking_uri", return_value=mlflow_tracking_uri): actual_context = _get_notebook_config(notebook_image, notebook_name) assert actual_context == expected_context + + +def test_create_notebook_success_with_gpu( + mock_get_service_url: MagicMock, + mock_resource_handler: MagicMock, + mock_logger: MagicMock, + mock_client: MagicMock, +) -> None: + """ + Test case to verify successful notebook creation with GPU support. + """ + notebook_name = "gpu-notebook" + notebook_image = "gpu-image" + notebook_url = "http://somewhere.com:1234/notebook/namespace/name/lab" + gpu_type = "nvidia" + + # Set up mocks + mock_get_service_url.return_value = notebook_url + mock_resource_handler_instance = MagicMock() + mock_resource_handler.return_value = mock_resource_handler_instance + with patch("dss.create_notebook.node_has_gpu_labels", return_value=True), patch( + "dss.create_notebook.does_dss_pvc_exist", return_value=True + ), patch("dss.create_notebook.does_notebook_exist", return_value=False), patch( + "dss.create_notebook.wait_for_deployment_ready" + ) as mock_wait_for_deployment_ready: + # Act + create_notebook( + name=notebook_name, image=notebook_image, lightkube_client=mock_client, gpu=gpu_type + ) + + # Assert + mock_resource_handler_instance.apply.assert_called_once() + mock_wait_for_deployment_ready.assert_called_once() + mock_logger.info.assert_called_with(f"Access the notebook at {notebook_url}.") + mock_logger.info.assert_any_call("Nvidia GPU attached to notebook.") + + +def test_create_notebook_failure_no_gpu_labels( + mock_logger: MagicMock, mock_client: MagicMock +) -> None: + """ + Test case to verify failure in notebook creation due to missing GPU labels on the cluster. + """ + notebook_name = "gpu-notebook" + notebook_image = "gpu-image" + gpu_type = "nvidia" + + # Set up mocks + with patch("dss.create_notebook.node_has_gpu_labels", return_value=False), patch( + "dss.create_notebook.does_dss_pvc_exist", return_value=True + ), patch("dss.create_notebook.does_notebook_exist", return_value=False): + # Act & Assert + with pytest.raises(RuntimeError): + create_notebook( + name=notebook_name, + image=notebook_image, + lightkube_client=mock_client, + gpu=gpu_type, + ) + mock_logger.error.assert_called_with( + f"Failed to create notebook with {gpu_type} GPU acceleration.\n" + ) + mock_logger.info.assert_called_with( + "You are trying to setup notebook backed by GPU but the GPU devices were not properly set up in the Kubernetes cluster. Please refer to this guide http:///setup-gpu for more information on the setup." # noqa E501 + ) diff --git a/tox.ini b/tox.ini index 31eb62de..481f2bc5 100644 --- a/tox.ini +++ b/tox.ini @@ -59,11 +59,12 @@ commands = --skip {toxinidir}/./venv \ --skip {toxinidir}/./.mypy_cache \ --skip {toxinidir}/.poc \ + --skip {toxinidir}/./docs \ --skip {toxinidir}/./icon.svg --skip *.json.tmpl # pflake8 wrapper supports config from pyproject.toml - pflake8 {[vars]all_path} - isort --check-only --diff {[vars]all_path} - black --check --diff {[vars]all_path} + pflake8 {[vars]all_path} --exclude={toxinidir}/docs + isort --check-only --diff {[vars]all_path} --skip {toxinidir}/docs + black --check --diff {[vars]all_path} --exclude {toxinidir}/docs deps = -r requirements-lint.txt description = Check code against coding style standards From 14a892a8c0f9c0be8755768ae4e36f83b3d0272c Mon Sep 17 00:00:00 2001 From: misohu Date: Wed, 24 Apr 2024 10:42:25 +0200 Subject: [PATCH 3/4] Expand create help message --- src/dss/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dss/main.py b/src/dss/main.py index f862c25c..d5650752 100644 --- a/src/dss/main.py +++ b/src/dss/main.py @@ -104,6 +104,7 @@ def create_notebook_command( Examples dss create my-notebook --image=pytorch dss create my-notebook --image={DEFAULT_NOTEBOOK_IMAGE} + dss create my-notebook --image=charmedkubeflow/jupyter-pytorch-cuda-full:1.8.0 --gpu=nvidia \b\n{RECOMMENDED_IMAGES_MESSAGE} """ From d72f8e56d9ab5fb07087e251914641066230b66d Mon Sep 17 00:00:00 2001 From: misohu Date: Wed, 24 Apr 2024 14:01:57 +0200 Subject: [PATCH 4/4] Refactor --- tox.ini | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tox.ini b/tox.ini index 481f2bc5..31eb62de 100644 --- a/tox.ini +++ b/tox.ini @@ -59,12 +59,11 @@ commands = --skip {toxinidir}/./venv \ --skip {toxinidir}/./.mypy_cache \ --skip {toxinidir}/.poc \ - --skip {toxinidir}/./docs \ --skip {toxinidir}/./icon.svg --skip *.json.tmpl # pflake8 wrapper supports config from pyproject.toml - pflake8 {[vars]all_path} --exclude={toxinidir}/docs - isort --check-only --diff {[vars]all_path} --skip {toxinidir}/docs - black --check --diff {[vars]all_path} --exclude {toxinidir}/docs + pflake8 {[vars]all_path} + isort --check-only --diff {[vars]all_path} + black --check --diff {[vars]all_path} deps = -r requirements-lint.txt description = Check code against coding style standards