Skip to content

Commit 0833ed8

Browse files
committed
Improve DataFrame assignment and add MLflow artifacts
Refactored DataProcessor to use .loc and .copy() for safer DataFrame assignments and filtering, preventing pandas SettingWithCopy warnings. Added new MLflow experiment and run artifacts to downloaded_artifacts, including model metrics and a meme image. Minor notebook comment update and added Databricks type stubs.
1 parent e97d1fa commit 0833ed8

9 files changed

Lines changed: 291 additions & 47 deletions

File tree

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
{
2-
"_experiment_id": "3837500115335923",
2+
"_experiment_id": "2752203067320598",
33
"_name": "/Shared/marvel-demo",
4-
"_artifact_location": "dbfs:/databricks/mlflow-tracking/3837500115335923",
4+
"_artifact_location": "dbfs:/databricks/mlflow-tracking/2752203067320598",
55
"_lifecycle_stage": "active",
66
"_tags": {
7-
"mlflow.ownerId": "7047569734437579",
7+
"mlflow.ownerId": "72084928190694",
88
"mlflow.experiment.sourceName": "/Shared/marvel-demo",
9-
"mlflow.ownerEmail": "maria@marvelousmlops.io",
9+
"mlflow.ownerEmail": "jacobbinu4488code@gmail.com",
1010
"mlflow.experimentType": "MLFLOW_EXPERIMENT"
1111
},
12-
"_creation_time": 1752709929428,
13-
"_last_update_time": 1752709929428
14-
}
12+
"_creation_time": 1761518431668,
13+
"_last_update_time": 1761518431668
14+
}

demo_artifacts/run_info.json

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
{
22
"info": {
3-
"artifact_uri": "dbfs:/databricks/mlflow-tracking/3837500115335923/bbf07cc406e14d8aa6aeccba585a8e6c/artifacts",
4-
"end_time": 1752710219114,
5-
"experiment_id": "3837500115335923",
3+
"artifact_uri": "dbfs:/databricks/mlflow-tracking/2752203067320598/13cb23f9399b4444b79062bec8d69bb9/artifacts",
4+
"end_time": 1761518830484,
5+
"experiment_id": "2752203067320598",
66
"lifecycle_stage": "active",
7-
"run_id": "bbf07cc406e14d8aa6aeccba585a8e6c",
7+
"run_id": "13cb23f9399b4444b79062bec8d69bb9",
88
"run_name": "marvel-demo-run",
9-
"start_time": 1752710218386,
9+
"start_time": 1761518829254,
1010
"status": "FINISHED",
1111
"user_id": ""
1212
},
@@ -20,15 +20,16 @@
2020
},
2121
"tags": {
2222
"git_sha": "1234567890abcd",
23-
"mlflow.databricks.cluster.id": "0716-232806-8b8yqouz-v2n",
24-
"mlflow.databricks.cluster.info": "{\"cluster_name\":\"\",\"spark_version\":\"16.4.x-photon-scala2.12\",\"autotermination_minutes\":120}",
23+
"mlflow.databricks.cluster.id": "1026-223927-mv3fx41w-v2n",
24+
"mlflow.databricks.cluster.info": "{\"cluster_name\":\"\",\"spark_version\":\"17.2.x-photon-scala2.13\",\"autotermination_minutes\":120}",
2525
"mlflow.databricks.cluster.libraries": "{\"installable\":[],\"redacted\":[]}",
2626
"mlflow.note.content": "marvel character prediction demo run",
2727
"mlflow.runColor": "#479a5f",
2828
"mlflow.runName": "marvel-demo-run",
29-
"mlflow.source.name": "/Users/mariavechtomova/Marvelous/marvel-characters/.venv/lib/python3.12/site-packages/ipykernel_launcher.py",
29+
"mlflow.source.name": "c:\\Users\\jacob\\Desktop\\DataBricks_Projects\\MLOps_Databricks_Project\\.venv\\Lib\\site-packages\\ipykernel_launcher.py",
3030
"mlflow.source.type": "LOCAL",
31-
"mlflow.user": "maria@marvelousmlops.io"
31+
"mlflow.user": "jacobbinu4488code@gmail.com",
32+
"mlflow.artifacts.spn": "/WorkspaceInternal/Mlflow/Artifacts/2752203067320598/Runs/13cb23f9399b4444b79062bec8d69bb9/"
3233
}
3334
},
3435
"inputs": {
@@ -38,4 +39,4 @@
3839
"outputs": {
3940
"model_outputs": []
4041
}
41-
}
42+
}
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
{
2+
"artifact_location": "dbfs:/databricks/mlflow-tracking/2569811775525074/logged_models/m-99dfe5eb37f74d92883800434589c858/artifacts",
3+
"creation_timestamp": 1752868911676,
4+
"experiment_id": "2569811775525074",
5+
"last_updated_timestamp": 1752868922336,
6+
"metrics": [
7+
{
8+
"_key": "score",
9+
"_value": 0.7650172860847018,
10+
"_timestamp": 1752868943267,
11+
"_step": 0,
12+
"_model_id": "m-99dfe5eb37f74d92883800434589c858",
13+
"_dataset_name": "dataset",
14+
"_dataset_digest": "daaa439d",
15+
"_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
16+
},
17+
{
18+
"_key": "true_negatives",
19+
"_value": 378.0,
20+
"_timestamp": 1752868943267,
21+
"_step": 0,
22+
"_model_id": "m-99dfe5eb37f74d92883800434589c858",
23+
"_dataset_name": "dataset",
24+
"_dataset_digest": "daaa439d",
25+
"_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
26+
},
27+
{
28+
"_key": "false_positives",
29+
"_value": 4184.0,
30+
"_timestamp": 1752868943267,
31+
"_step": 0,
32+
"_model_id": "m-99dfe5eb37f74d92883800434589c858",
33+
"_dataset_name": "dataset",
34+
"_dataset_digest": "daaa439d",
35+
"_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
36+
},
37+
{
38+
"_key": "false_negatives",
39+
"_value": 166.0,
40+
"_timestamp": 1752868943267,
41+
"_step": 0,
42+
"_model_id": "m-99dfe5eb37f74d92883800434589c858",
43+
"_dataset_name": "dataset",
44+
"_dataset_digest": "daaa439d",
45+
"_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
46+
},
47+
{
48+
"_key": "true_positives",
49+
"_value": 13784.0,
50+
"_timestamp": 1752868943267,
51+
"_step": 0,
52+
"_model_id": "m-99dfe5eb37f74d92883800434589c858",
53+
"_dataset_name": "dataset",
54+
"_dataset_digest": "daaa439d",
55+
"_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
56+
},
57+
{
58+
"_key": "example_count",
59+
"_value": 18512.0,
60+
"_timestamp": 1752868943267,
61+
"_step": 0,
62+
"_model_id": "m-99dfe5eb37f74d92883800434589c858",
63+
"_dataset_name": "dataset",
64+
"_dataset_digest": "daaa439d",
65+
"_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
66+
},
67+
{
68+
"_key": "accuracy_score",
69+
"_value": 0.7650172860847018,
70+
"_timestamp": 1752868943267,
71+
"_step": 0,
72+
"_model_id": "m-99dfe5eb37f74d92883800434589c858",
73+
"_dataset_name": "dataset",
74+
"_dataset_digest": "daaa439d",
75+
"_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
76+
},
77+
{
78+
"_key": "recall_score",
79+
"_value": 0.9881003584229391,
80+
"_timestamp": 1752868943267,
81+
"_step": 0,
82+
"_model_id": "m-99dfe5eb37f74d92883800434589c858",
83+
"_dataset_name": "dataset",
84+
"_dataset_digest": "daaa439d",
85+
"_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
86+
},
87+
{
88+
"_key": "precision_score",
89+
"_value": 0.7671415850400712,
90+
"_timestamp": 1752868943267,
91+
"_step": 0,
92+
"_model_id": "m-99dfe5eb37f74d92883800434589c858",
93+
"_dataset_name": "dataset",
94+
"_dataset_digest": "daaa439d",
95+
"_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
96+
},
97+
{
98+
"_key": "f1_score",
99+
"_value": 0.8637132652421831,
100+
"_timestamp": 1752868943267,
101+
"_step": 0,
102+
"_model_id": "m-99dfe5eb37f74d92883800434589c858",
103+
"_dataset_name": "dataset",
104+
"_dataset_digest": "daaa439d",
105+
"_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
106+
},
107+
{
108+
"_key": "log_loss",
109+
"_value": 0.5082923607513762,
110+
"_timestamp": 1752868943267,
111+
"_step": 0,
112+
"_model_id": "m-99dfe5eb37f74d92883800434589c858",
113+
"_dataset_name": "dataset",
114+
"_dataset_digest": "daaa439d",
115+
"_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
116+
},
117+
{
118+
"_key": "roc_auc",
119+
"_value": 0.6992098589092691,
120+
"_timestamp": 1752868943267,
121+
"_step": 0,
122+
"_model_id": "m-99dfe5eb37f74d92883800434589c858",
123+
"_dataset_name": "dataset",
124+
"_dataset_digest": "daaa439d",
125+
"_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
126+
},
127+
{
128+
"_key": "precision_recall_auc",
129+
"_value": 0.8743929477866188,
130+
"_timestamp": 1752868943267,
131+
"_step": 0,
132+
"_model_id": "m-99dfe5eb37f74d92883800434589c858",
133+
"_dataset_name": "dataset",
134+
"_dataset_digest": "daaa439d",
135+
"_run_id": "4582b093fcb74b7da7adbc8b2dc0324f"
136+
}
137+
],
138+
"model_id": "m-99dfe5eb37f74d92883800434589c858",
139+
"model_type": "",
140+
"name": "lightgbm-pipeline-model",
141+
"params": {},
142+
"source_run_id": "4582b093fcb74b7da7adbc8b2dc0324f",
143+
"status": 2,
144+
"status_message": "",
145+
"tags": {
146+
"mlflow.databricks.cluster.id": "0718-194323-xh5euuuy-v2n",
147+
"mlflow.loggedModel.artifactLocation": "dbfs:/databricks/mlflow-tracking/2569811775525074/logged_models/m-99dfe5eb37f74d92883800434589c858/artifacts",
148+
"mlflow.loggedModel.name": "lightgbm-pipeline-model",
149+
"mlflow.source.name": "/Users/beskili/Desktop/marvel-characters/.venv/lib/python3.12/site-packages/ipykernel_launcher.py",
150+
"mlflow.source.type": "LOCAL",
151+
"mlflow.artifacts.spn": "/Mlflow/Artifacts/2569811775525074/LoggedModels/m-99dfe5eb37f74d92883800434589c858/",
152+
"mlflow.user": "basak@marvelousmlops.io"
153+
}
154+
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"_experiment_id": "2752203067320598",
3+
"_name": "/Shared/marvel-demo",
4+
"_artifact_location": "dbfs:/databricks/mlflow-tracking/2752203067320598",
5+
"_lifecycle_stage": "active",
6+
"_tags": {
7+
"mlflow.ownerId": "72084928190694",
8+
"mlflow.experiment.sourceName": "/Shared/marvel-demo",
9+
"mlflow.ownerEmail": "jacobbinu4488code@gmail.com",
10+
"mlflow.experimentType": "MLFLOW_EXPERIMENT"
11+
},
12+
"_creation_time": 1761518431668,
13+
"_last_update_time": 1761518431668
14+
}
54.9 KB
Loading
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
{
2+
"info": {
3+
"artifact_uri": "dbfs:/databricks/mlflow-tracking/2752203067320598/13cb23f9399b4444b79062bec8d69bb9/artifacts",
4+
"end_time": 1761518830484,
5+
"experiment_id": "2752203067320598",
6+
"lifecycle_stage": "active",
7+
"run_id": "13cb23f9399b4444b79062bec8d69bb9",
8+
"run_name": "marvel-demo-run",
9+
"start_time": 1761518829254,
10+
"status": "FINISHED",
11+
"user_id": ""
12+
},
13+
"data": {
14+
"metrics": {
15+
"metric1": 1.0,
16+
"metric2": 2.0
17+
},
18+
"params": {
19+
"type": "marvel_demo"
20+
},
21+
"tags": {
22+
"git_sha": "1234567890abcd",
23+
"mlflow.databricks.cluster.id": "1026-223927-mv3fx41w-v2n",
24+
"mlflow.databricks.cluster.info": "{\"cluster_name\":\"\",\"spark_version\":\"17.2.x-photon-scala2.13\",\"autotermination_minutes\":120}",
25+
"mlflow.databricks.cluster.libraries": "{\"installable\":[],\"redacted\":[]}",
26+
"mlflow.note.content": "marvel character prediction demo run",
27+
"mlflow.runColor": "#479a5f",
28+
"mlflow.runName": "marvel-demo-run",
29+
"mlflow.source.name": "c:\\Users\\jacob\\Desktop\\DataBricks_Projects\\MLOps_Databricks_Project\\.venv\\Lib\\site-packages\\ipykernel_launcher.py",
30+
"mlflow.source.type": "LOCAL",
31+
"mlflow.user": "jacobbinu4488code@gmail.com",
32+
"mlflow.artifacts.spn": "/WorkspaceInternal/Mlflow/Artifacts/2752203067320598/Runs/13cb23f9399b4444b79062bec8d69bb9/"
33+
}
34+
},
35+
"inputs": {
36+
"model_inputs": [],
37+
"dataset_inputs": []
38+
},
39+
"outputs": {
40+
"model_outputs": []
41+
}
42+
}

notebooks/lecture3.mlflow_experiment_tracking.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def is_databricks() -> bool:
9696

9797
# COMMAND ----------
9898
# this will fail: not allowed to overwrite value
99+
# this will work in Databricks notebooks, but the value will not be updated
99100
mlflow.log_param("type", "marvel_demo2")
100101
# COMMAND ----------
101102
mlflow.log_param(key="purpose", value="get_certified")

src/marvel_characters/data_processor.py

Lines changed: 44 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -31,40 +31,54 @@ def preprocess(self) -> None:
3131
num_features = self.config.num_features
3232
target = self.config.target
3333

34-
self.df.rename(columns={"Height (m)": "Height"}, inplace=True)
35-
self.df.rename(columns={"Weight (kg)": "Weight"}, inplace=True)
34+
# Use .rename() on self.df directly, which is generally safe.
35+
self.df.rename(columns={"Height (m)": "Height", "Weight (kg)": "Weight"}, inplace=True)
3636

37-
# Universe
38-
self.df["Universe"] = self.df["Universe"].fillna("Unknown")
37+
# --- Universe ---
38+
# Fix: Use .loc to ensure direct assignment to the column.
39+
self.df.loc[:, "Universe"] = self.df["Universe"].fillna("Unknown")
3940
counts = self.df["Universe"].value_counts()
4041
small_universes = counts[counts < 50].index
41-
self.df["Universe"] = self.df["Universe"].replace(small_universes, "Other")
42-
43-
# Teams
44-
self.df["Teams"] = self.df["Teams"].notna().astype("int")
45-
46-
# Origin
47-
self.df["Origin"] = self.df["Origin"].fillna("Unknown")
48-
49-
# Identity
50-
self.df["Identity"] = self.df["Identity"].fillna("Unknown")
51-
self.df = self.df[self.df["Identity"].isin(["Public", "Secret", "Unknown"])]
52-
53-
# Gender
54-
self.df["Gender"] = self.df["Gender"].fillna("Unknown")
55-
self.df["Gender"] = self.df["Gender"].where(self.df["Gender"].isin(["Male", "Female"]), other="Other")
56-
57-
# Marital status
42+
# Fix: Use .loc to ensure direct assignment to the column.
43+
self.df.loc[:, "Universe"] = self.df["Universe"].replace(small_universes, "Other")
44+
45+
# --- Teams ---
46+
# Fix: Use .loc to ensure direct assignment to the column.
47+
self.df.loc[:, "Teams"] = self.df["Teams"].notna().astype("int")
48+
49+
# --- Origin ---
50+
# Fix: Use .loc to ensure direct assignment to the column.
51+
self.df.loc[:, "Origin"] = self.df["Origin"].fillna("Unknown")
52+
53+
# --- Identity ---
54+
# Fix: Use .loc to ensure direct assignment to the column.
55+
self.df.loc[:, "Identity"] = self.df["Identity"].fillna("Unknown")
56+
# Fix: When filtering rows, take an explicit .copy() so later assignments do not trigger pandas SettingWithCopy warnings.
57+
self.df = self.df[self.df["Identity"].isin(["Public", "Secret", "Unknown"])].copy()
58+
59+
# --- Gender ---
60+
# Fix: Use .loc to ensure direct assignment to the column.
61+
self.df.loc[:, "Gender"] = self.df["Gender"].fillna("Unknown")
62+
# Fix: Use .loc to ensure direct assignment to the column.
63+
self.df.loc[:, "Gender"] = self.df["Gender"].where(self.df["Gender"].isin(["Male", "Female"]), other="Other")
64+
65+
# --- Marital status ---
66+
# Fix: Rename first to get a clean column name for the next operations.
5867
self.df.rename(columns={"Marital Status": "Marital_Status"}, inplace=True)
59-
self.df["Marital_Status"] = self.df["Marital_Status"].fillna("Unknown")
60-
self.df["Marital_Status"] = self.df["Marital_Status"].replace("Widow", "Widowed")
61-
self.df = self.df[self.df["Marital_Status"].isin(["Single", "Married", "Widowed", "Engaged", "Unknown"])]
62-
63-
# Magic
64-
self.df["Magic"] = self.df["Origin"].str.lower().apply(lambda x: int("magic" in x))
65-
66-
# Mutant
67-
self.df["Mutant"] = self.df["Origin"].str.lower().apply(lambda x: int("mutate" in x or "mutant" in x))
68+
# Fix: Use .loc to ensure direct assignment to the column.
69+
self.df.loc[:, "Marital_Status"] = self.df["Marital_Status"].fillna("Unknown")
70+
# Fix: Use .loc to ensure direct assignment to the column.
71+
self.df.loc[:, "Marital_Status"] = self.df["Marital_Status"].replace("Widow", "Widowed")
72+
# Fix: When filtering rows, take an explicit .copy() so later assignments do not trigger pandas SettingWithCopy warnings.
73+
self.df = self.df[self.df["Marital_Status"].isin(["Single", "Married", "Widowed", "Engaged", "Unknown"])].copy()
74+
75+
# --- Magic ---
76+
# Fix: Use .loc to ensure direct assignment to the column.
77+
self.df.loc[:, "Magic"] = self.df["Origin"].str.lower().apply(lambda x: int("magic" in x))
78+
79+
# --- Mutant ---
80+
# Fix: Use .loc to ensure direct assignment to the column.
81+
self.df.loc[:, "Mutant"] = self.df["Origin"].str.lower().apply(lambda x: int("mutate" in x or "mutant" in x))
6882

6983
# Normalize origin
7084
def normalize_origin(x: str) -> str:

typings/__builtins__.pyi

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
2+
from databricks.sdk.runtime import *
3+
from pyspark.sql.session import SparkSession
4+
from pyspark.sql.functions import udf as U
5+
from pyspark.sql.context import SQLContext
6+
7+
udf = U
8+
spark: SparkSession
9+
sc = spark.sparkContext
10+
sqlContext: SQLContext
11+
sql = sqlContext.sql
12+
table = sqlContext.table
13+
getArgument = dbutils.widgets.getArgument
14+
15+
def displayHTML(html): ...
16+
17+
def display(input=None, *args, **kwargs): ...
18+

0 commit comments

Comments
 (0)