Improve DataFrame assignment and add MLflow artifacts

Jakee4488 · Jakee4488 · commit 0833ed89a287 · 2025-10-29T20:13:47.000Z
Refactored DataProcessor to use .loc and .copy() for safer DataFrame assignments and filtering, preventing pandas SettingWithCopyWarning. Added new MLflow experiment and run artifacts to downloaded_artifacts, including model metrics and a meme image. Minor notebook comment update and added Databricks type stubs.
diff --git a/demo_artifacts/mlflow_experiment.json b/demo_artifacts/mlflow_experiment.json
@@ -1,14 +1,14 @@
 {
-    "_experiment_id": "3837500115335923",
+    "_experiment_id": "2752203067320598",
     "_name": "/Shared/marvel-demo",
-    "_artifact_location": "dbfs:/databricks/mlflow-tracking/3837500115335923",
+    "_artifact_location": "dbfs:/databricks/mlflow-tracking/2752203067320598",
     "_lifecycle_stage": "active",
     "_tags": {
-        "mlflow.ownerId": "7047569734437579",
+        "mlflow.ownerId": "72084928190694",
         "mlflow.experiment.sourceName": "/Shared/marvel-demo",
-        "mlflow.ownerEmail": "maria@marvelousmlops.io",
+        "mlflow.ownerEmail": "jacobbinu4488code@gmail.com",
         "mlflow.experimentType": "MLFLOW_EXPERIMENT"
     },
-    "_creation_time": 1752709929428,
-    "_last_update_time": 1752709929428
-}
+    "_creation_time": 1761518431668,
+    "_last_update_time": 1761518431668
+}
diff --git a/demo_artifacts/run_info.json b/demo_artifacts/run_info.json
@@ -1,12 +1,12 @@
 {
     "info": {
-        "artifact_uri": "dbfs:/databricks/mlflow-tracking/3837500115335923/bbf07cc406e14d8aa6aeccba585a8e6c/artifacts",
-        "end_time": 1752710219114,
-        "experiment_id": "3837500115335923",
+        "artifact_uri": "dbfs:/databricks/mlflow-tracking/2752203067320598/13cb23f9399b4444b79062bec8d69bb9/artifacts",
+        "end_time": 1761518830484,
+        "experiment_id": "2752203067320598",
         "lifecycle_stage": "active",
-        "run_id": "bbf07cc406e14d8aa6aeccba585a8e6c",
+        "run_id": "13cb23f9399b4444b79062bec8d69bb9",
         "run_name": "marvel-demo-run",
-        "start_time": 1752710218386,
+        "start_time": 1761518829254,
         "status": "FINISHED",
         "user_id": ""
     },
@@ -20,15 +20,16 @@
         },
         "tags": {
             "git_sha": "1234567890abcd",
-            "mlflow.databricks.cluster.id": "0716-232806-8b8yqouz-v2n",
-            "mlflow.databricks.cluster.info": "{\"cluster_name\":\"\",\"spark_version\":\"16.4.x-photon-scala2.12\",\"autotermination_minutes\":120}",
+            "mlflow.databricks.cluster.id": "1026-223927-mv3fx41w-v2n",
+            "mlflow.databricks.cluster.info": "{\"cluster_name\":\"\",\"spark_version\":\"17.2.x-photon-scala2.13\",\"autotermination_minutes\":120}",
             "mlflow.databricks.cluster.libraries": "{\"installable\":[],\"redacted\":[]}",
             "mlflow.note.content": "marvel character prediction demo run",
             "mlflow.runColor": "#479a5f",
             "mlflow.runName": "marvel-demo-run",
-            "mlflow.source.name": "/Users/mariavechtomova/Marvelous/marvel-characters/.venv/lib/python3.12/site-packages/ipykernel_launcher.py",
+            "mlflow.source.name": "c:\\Users\\jacob\\Desktop\\DataBricks_Projects\\MLOps_Databricks_Project\\.venv\\Lib\\site-packages\\ipykernel_launcher.py",
             "mlflow.source.type": "LOCAL",
-            "mlflow.user": "maria@marvelousmlops.io"
+            "mlflow.user": "jacobbinu4488code@gmail.com",
+            "mlflow.artifacts.spn": "/WorkspaceInternal/Mlflow/Artifacts/2752203067320598/Runs/13cb23f9399b4444b79062bec8d69bb9/"
         }
     },
     "inputs": {
@@ -38,4 +39,4 @@
     "outputs": {
         "model_outputs": []
     }
-}
+}
diff --git a/downloaded_artifacts/demo_artifacts/logged_model.json b/downloaded_artifacts/demo_artifacts/logged_model.json
@@ -0,0 +1,154 @@
+{
+    "artifact_location": "dbfs:/databricks/mlflow-tracking/2569811775525074/logged_models/m-99dfe5eb37f74d92883800434589c858/artifacts",
+    "creation_timestamp": 1752868911676,
+    "experiment_id": "2569811775525074",
+    "last_updated_timestamp": 1752868922336,
+    "metrics": [
+        {
+            "_key": "score",
+            "_value": 0.7650172860847018,
+            "_timestamp": 1752868943267,
+            "_step": 0,
+            "_model_id": "m-99dfe5eb37f74d92883800434589c858",
+            "_dataset_name": "dataset",
+            "_dataset_digest": "daaa439d",
+            "_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
+        },
+        {
+            "_key": "true_negatives",
+            "_value": 378.0,
+            "_timestamp": 1752868943267,
+            "_step": 0,
+            "_model_id": "m-99dfe5eb37f74d92883800434589c858",
+            "_dataset_name": "dataset",
+            "_dataset_digest": "daaa439d",
+            "_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
+        },
+        {
+            "_key": "false_positives",
+            "_value": 4184.0,
+            "_timestamp": 1752868943267,
+            "_step": 0,
+            "_model_id": "m-99dfe5eb37f74d92883800434589c858",
+            "_dataset_name": "dataset",
+            "_dataset_digest": "daaa439d",
+            "_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
+        },
+        {
+            "_key": "false_negatives",
+            "_value": 166.0,
+            "_timestamp": 1752868943267,
+            "_step": 0,
+            "_model_id": "m-99dfe5eb37f74d92883800434589c858",
+            "_dataset_name": "dataset",
+            "_dataset_digest": "daaa439d",
+            "_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
+        },
+        {
+            "_key": "true_positives",
+            "_value": 13784.0,
+            "_timestamp": 1752868943267,
+            "_step": 0,
+            "_model_id": "m-99dfe5eb37f74d92883800434589c858",
+            "_dataset_name": "dataset",
+            "_dataset_digest": "daaa439d",
+            "_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
+        },
+        {
+            "_key": "example_count",
+            "_value": 18512.0,
+            "_timestamp": 1752868943267,
+            "_step": 0,
+            "_model_id": "m-99dfe5eb37f74d92883800434589c858",
+            "_dataset_name": "dataset",
+            "_dataset_digest": "daaa439d",
+            "_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
+        },
+        {
+            "_key": "accuracy_score",
+            "_value": 0.7650172860847018,
+            "_timestamp": 1752868943267,
+            "_step": 0,
+            "_model_id": "m-99dfe5eb37f74d92883800434589c858",
+            "_dataset_name": "dataset",
+            "_dataset_digest": "daaa439d",
+            "_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
+        },
+        {
+            "_key": "recall_score",
+            "_value": 0.9881003584229391,
+            "_timestamp": 1752868943267,
+            "_step": 0,
+            "_model_id": "m-99dfe5eb37f74d92883800434589c858",
+            "_dataset_name": "dataset",
+            "_dataset_digest": "daaa439d",
+            "_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
+        },
+        {
+            "_key": "precision_score",
+            "_value": 0.7671415850400712,
+            "_timestamp": 1752868943267,
+            "_step": 0,
+            "_model_id": "m-99dfe5eb37f74d92883800434589c858",
+            "_dataset_name": "dataset",
+            "_dataset_digest": "daaa439d",
+            "_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
+        },
+        {
+            "_key": "f1_score",
+            "_value": 0.8637132652421831,
+            "_timestamp": 1752868943267,
+            "_step": 0,
+            "_model_id": "m-99dfe5eb37f74d92883800434589c858",
+            "_dataset_name": "dataset",
+            "_dataset_digest": "daaa439d",
+            "_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
+        },
+        {
+            "_key": "log_loss",
+            "_value": 0.5082923607513762,
+            "_timestamp": 1752868943267,
+            "_step": 0,
+            "_model_id": "m-99dfe5eb37f74d92883800434589c858",
+            "_dataset_name": "dataset",
+            "_dataset_digest": "daaa439d",
+            "_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
+        },
+        {
+            "_key": "roc_auc",
+            "_value": 0.6992098589092691,
+            "_timestamp": 1752868943267,
+            "_step": 0,
+            "_model_id": "m-99dfe5eb37f74d92883800434589c858",
+            "_dataset_name": "dataset",
+            "_dataset_digest": "daaa439d",
+            "_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
+        },
+        {
+            "_key": "precision_recall_auc",
+            "_value": 0.8743929477866188,
+            "_timestamp": 1752868943267,
+            "_step": 0,
+            "_model_id": "m-99dfe5eb37f74d92883800434589c858",
+            "_dataset_name": "dataset",
+            "_dataset_digest": "daaa439d",
+            "_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
+        }
+    ],
+    "model_id": "m-99dfe5eb37f74d92883800434589c858",
+    "model_type": "",
+    "name": "lightgbm-pipeline-model",
+    "params": {},
+    "source_run_id": "4582b093fcb74b7da7adbc8b2dc0324f",
+    "status": 2,
+    "status_message": "",
+    "tags": {
+        "mlflow.databricks.cluster.id": "0718-194323-xh5euuuy-v2n",
+        "mlflow.loggedModel.artifactLocation": "dbfs:/databricks/mlflow-tracking/2569811775525074/logged_models/m-99dfe5eb37f74d92883800434589c858/artifacts",
+        "mlflow.loggedModel.name": "lightgbm-pipeline-model",
+        "mlflow.source.name": "/Users/beskili/Desktop/marvel-characters/.venv/lib/python3.12/site-packages/ipykernel_launcher.py",
+        "mlflow.source.type": "LOCAL",
+        "mlflow.artifacts.spn": "/Mlflow/Artifacts/2569811775525074/LoggedModels/m-99dfe5eb37f74d92883800434589c858/",
+        "mlflow.user": "basak@marvelousmlops.io"
+    }
+}
diff --git a/downloaded_artifacts/demo_artifacts/mlflow_experiment.json b/downloaded_artifacts/demo_artifacts/mlflow_experiment.json
@@ -0,0 +1,14 @@
+{
+    "_experiment_id": "2752203067320598",
+    "_name": "/Shared/marvel-demo",
+    "_artifact_location": "dbfs:/databricks/mlflow-tracking/2752203067320598",
+    "_lifecycle_stage": "active",
+    "_tags": {
+        "mlflow.ownerId": "72084928190694",
+        "mlflow.experiment.sourceName": "/Shared/marvel-demo",
+        "mlflow.ownerEmail": "jacobbinu4488code@gmail.com",
+        "mlflow.experimentType": "MLFLOW_EXPERIMENT"
+    },
+    "_creation_time": 1761518431668,
+    "_last_update_time": 1761518431668
+}
diff --git a/downloaded_artifacts/demo_artifacts/mlflow_meme.jpeg b/downloaded_artifacts/demo_artifacts/mlflow_meme.jpeg
diff --git a/downloaded_artifacts/demo_artifacts/run_info.json b/downloaded_artifacts/demo_artifacts/run_info.json
@@ -0,0 +1,42 @@
+{
+    "info": {
+        "artifact_uri": "dbfs:/databricks/mlflow-tracking/2752203067320598/13cb23f9399b4444b79062bec8d69bb9/artifacts",
+        "end_time": 1761518830484,
+        "experiment_id": "2752203067320598",
+        "lifecycle_stage": "active",
+        "run_id": "13cb23f9399b4444b79062bec8d69bb9",
+        "run_name": "marvel-demo-run",
+        "start_time": 1761518829254,
+        "status": "FINISHED",
+        "user_id": ""
+    },
+    "data": {
+        "metrics": {
+            "metric1": 1.0,
+            "metric2": 2.0
+        },
+        "params": {
+            "type": "marvel_demo"
+        },
+        "tags": {
+            "git_sha": "1234567890abcd",
+            "mlflow.databricks.cluster.id": "1026-223927-mv3fx41w-v2n",
+            "mlflow.databricks.cluster.info": "{\"cluster_name\":\"\",\"spark_version\":\"17.2.x-photon-scala2.13\",\"autotermination_minutes\":120}",
+            "mlflow.databricks.cluster.libraries": "{\"installable\":[],\"redacted\":[]}",
+            "mlflow.note.content": "marvel character prediction demo run",
+            "mlflow.runColor": "#479a5f",
+            "mlflow.runName": "marvel-demo-run",
+            "mlflow.source.name": "c:\\Users\\jacob\\Desktop\\DataBricks_Projects\\MLOps_Databricks_Project\\.venv\\Lib\\site-packages\\ipykernel_launcher.py",
+            "mlflow.source.type": "LOCAL",
+            "mlflow.user": "jacobbinu4488code@gmail.com",
+            "mlflow.artifacts.spn": "/WorkspaceInternal/Mlflow/Artifacts/2752203067320598/Runs/13cb23f9399b4444b79062bec8d69bb9/"
+        }
+    },
+    "inputs": {
+        "model_inputs": [],
+        "dataset_inputs": []
+    },
+    "outputs": {
+        "model_outputs": []
+    }
+}
diff --git a/notebooks/lecture3.mlflow_experiment_tracking.py b/notebooks/lecture3.mlflow_experiment_tracking.py
@@ -96,6 +96,7 @@ def is_databricks() -> bool:
 
 # COMMAND ----------
 # this will fail: not allowed to overwrite value
+# this will work in Databricks notebooks, but the value will not be updated
 mlflow.log_param("type", "marvel_demo2")
 # COMMAND ----------
 mlflow.log_param(key="purpose", value="get_certified")
diff --git a/src/marvel_characters/data_processor.py b/src/marvel_characters/data_processor.py
@@ -31,40 +31,54 @@ def preprocess(self) -> None:
         num_features = self.config.num_features
         target = self.config.target
 
-        self.df.rename(columns={"Height (m)": "Height"}, inplace=True)
-        self.df.rename(columns={"Weight (kg)": "Weight"}, inplace=True)
+        # Use .rename() on self.df directly, which is generally safe.
+        self.df.rename(columns={"Height (m)": "Height", "Weight (kg)": "Weight"}, inplace=True)
 
-        # Universe
-        self.df["Universe"] = self.df["Universe"].fillna("Unknown")
+        # --- Universe ---
+        # Fix: Use .loc to ensure direct assignment to the column.
+        self.df.loc[:, "Universe"] = self.df["Universe"].fillna("Unknown")
         counts = self.df["Universe"].value_counts()
         small_universes = counts[counts < 50].index
-        self.df["Universe"] = self.df["Universe"].replace(small_universes, "Other")
-
-        # Teams
-        self.df["Teams"] = self.df["Teams"].notna().astype("int")
-
-        # Origin
-        self.df["Origin"] = self.df["Origin"].fillna("Unknown")
-
-        # Identity
-        self.df["Identity"] = self.df["Identity"].fillna("Unknown")
-        self.df = self.df[self.df["Identity"].isin(["Public", "Secret", "Unknown"])]
-
-        # Gender
-        self.df["Gender"] = self.df["Gender"].fillna("Unknown")
-        self.df["Gender"] = self.df["Gender"].where(self.df["Gender"].isin(["Male", "Female"]), other="Other")
-
-        # Marital status
+        # Fix: Use .loc to ensure direct assignment to the column.
+        self.df.loc[:, "Universe"] = self.df["Universe"].replace(small_universes, "Other")
+
+        # --- Teams ---
+        # NOTE(review): .loc[:, col] writes in place and may keep the old object dtype; verify Teams ends up int.
+        self.df.loc[:, "Teams"] = self.df["Teams"].notna().astype("int")
+
+        # --- Origin ---
+        # Fix: Use .loc to ensure direct assignment to the column.
+        self.df.loc[:, "Origin"] = self.df["Origin"].fillna("Unknown")
+
+        # --- Identity ---
+        # Fix: Use .loc to ensure direct assignment to the column.
+        self.df.loc[:, "Identity"] = self.df["Identity"].fillna("Unknown")
+        # Filtering returns a slice of the original frame; take an explicit .copy() so later assignments do not trigger SettingWithCopyWarning.
+        self.df = self.df[self.df["Identity"].isin(["Public", "Secret", "Unknown"])].copy()
+
+        # --- Gender ---
+        # Fix: Use .loc to ensure direct assignment to the column.
+        self.df.loc[:, "Gender"] = self.df["Gender"].fillna("Unknown")
+        # Fix: Use .loc to ensure direct assignment to the column.
+        self.df.loc[:, "Gender"] = self.df["Gender"].where(self.df["Gender"].isin(["Male", "Female"]), other="Other")
+
+        # --- Marital status ---
+        # Fix: Rename first to get a clean column name for the next operations.
         self.df.rename(columns={"Marital Status": "Marital_Status"}, inplace=True)
-        self.df["Marital_Status"] = self.df["Marital_Status"].fillna("Unknown")
-        self.df["Marital_Status"] = self.df["Marital_Status"].replace("Widow", "Widowed")
-        self.df = self.df[self.df["Marital_Status"].isin(["Single", "Married", "Widowed", "Engaged", "Unknown"])]
-
-        # Magic
-        self.df["Magic"] = self.df["Origin"].str.lower().apply(lambda x: int("magic" in x))
-
-        # Mutant
-        self.df["Mutant"] = self.df["Origin"].str.lower().apply(lambda x: int("mutate" in x or "mutant" in x))
+        # Fix: Use .loc to ensure direct assignment to the column.
+        self.df.loc[:, "Marital_Status"] = self.df["Marital_Status"].fillna("Unknown")
+        # Fix: Use .loc to ensure direct assignment to the column.
+        self.df.loc[:, "Marital_Status"] = self.df["Marital_Status"].replace("Widow", "Widowed")
+        # Filtering returns a slice of the original frame; take an explicit .copy() so later assignments do not trigger SettingWithCopyWarning.
+        self.df = self.df[self.df["Marital_Status"].isin(["Single", "Married", "Widowed", "Engaged", "Unknown"])].copy()
+
+        # --- Magic ---
+        # Fix: Use .loc to ensure direct assignment to the column.
+        self.df.loc[:, "Magic"] = self.df["Origin"].str.lower().apply(lambda x: int("magic" in x))
+
+        # --- Mutant ---
+        # Fix: Use .loc to ensure direct assignment to the column.
+        self.df.loc[:, "Mutant"] = self.df["Origin"].str.lower().apply(lambda x: int("mutate" in x or "mutant" in x))
 
         # Normalize origin
         def normalize_origin(x: str) -> str:
diff --git a/typings/__builtins__.pyi b/typings/__builtins__.pyi
@@ -0,0 +1,18 @@
+
+from databricks.sdk.runtime import *
+from pyspark.sql.session import SparkSession
+from pyspark.sql.functions import udf as U
+from pyspark.sql.context import SQLContext
+
+udf = U
+spark: SparkSession
+sc = spark.sparkContext
+sqlContext: SQLContext
+sql = sqlContext.sql
+table = sqlContext.table
+getArgument = dbutils.widgets.getArgument
+
+def displayHTML(html): ...
+
+def display(input=None, *args, **kwargs): ...
+