
Commit cee48ac

feat/spark-4.x-hist: Add Spark History Server (#173)
* Add Spark History Server to the Spark Standalone cluster
* Configure eventLog for Spark Master and Workers
* Set SPARK_PUBLIC_DNS=localhost so Web UI worker links are accessible from the host
* Pin PySpark to 4.0.1 to prevent version mismatches between Driver and Workers
* Update README with spark-submit instructions
1 parent 0668bb8 commit cee48ac

5 files changed

Lines changed: 55 additions & 5 deletions

module5-batch-processing/compose.spark-4.0-standalone.yaml

Lines changed: 24 additions & 3 deletions
@@ -7,12 +7,15 @@ x-spark-common:
   image: *spark-image
   environment:
     &spark-common-env
-    SPARK_NO_DAEMONIZE: true # Forces the process to run in foreground (req. for Docker)
+    SPARK_NO_DAEMONIZE: true # Forces the process to run in foreground (req. for Docker)
+    SPARK_PUBLIC_DNS: localhost # Ensures Web UI links point to localhost instead of container IPs
+    GOOGLE_APPLICATION_CREDENTIALS: "/secrets/gcp_credentials.json"
   volumes:
     &spark-common-vol
-    - vol-spark-extra-jars:/opt/spark/extra-jars/
+    - ./logs/:/opt/spark/logs/
     - ./spark-4.0-standalone.conf:/opt/spark/conf/spark-standalone.conf
     - ~/.gcp/spark_credentials.json:/secrets/gcp_credentials.json
+    - vol-spark-extra-jars:/opt/spark/extra-jars/
   depends_on:
     &spark-common-depends-on
     spark-init:
@@ -77,7 +80,24 @@ services:
     depends_on:
       spark-master:
         condition: service_started
-    restart: on-failure:3
+    restart: on-failure:5
+
+  spark-history-server:
+    <<: *spark-common
+    container_name: spark-history-server
+    command: |
+      /opt/spark/sbin/start-history-server.sh
+      --properties-file /opt/spark/conf/spark-standalone.conf
+    environment:
+      <<: *spark-common-env
+      SPARK_HISTORY_OPTS: >-
+        -Dspark.history.fs.logDirectory=/opt/spark/logs/
+    ports:
+      - '18080:18080'
+    depends_on:
+      spark-master:
+        condition: service_started
+    restart: on-failure:5
 
   hive-db:
     image: *postgres-image
@@ -130,6 +150,7 @@ services:
       - |
         apt-get update && apt-get install curl -y
         curl --create-dirs -O --output-dir /opt/spark/extra-jars/ https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/4.0.2/gcs-connector-4.0.2-shaded.jar
+        chown -R 185:185 /opt/spark/extra-jars/
     volumes:
       - vol-spark-extra-jars:/opt/spark/extra-jars/
 
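A quick way to exercise the new service is a short smoke test. The sketch below is illustrative, assuming the commands run from `module5-batch-processing/` and that `curl` is available on the host:

```shell
# Bring up the standalone cluster together with the new History Server
docker compose -f compose.spark-4.0-standalone.yaml up -d

# Once it is up, the History Server's REST API lists completed applications
curl -s http://localhost:18080/api/v1/applications
```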

module5-batch-processing/pyspark-4.x/README.md

Lines changed: 25 additions & 0 deletions
@@ -37,6 +37,31 @@ pre-commit install
 docker compose -f ../compose.yaml up -d
 ```
 
+**5.** Spark Web UI
+- Spark Master Web UI can be accessed at [http://localhost:4040](http://localhost:4040)
+- Spark History Server can be accessed at [http://localhost:18080](http://localhost:18080)
+
+
+## Spark-submit Application
+
+### Local (Spark Driver running on local machine)
+
+With `--deploy-mode client` (default), the Spark Driver runs locally and doesn't pick up [spark-4.0-standalone.conf](../compose.spark-4.0-standalone.yaml), so the `--conf spark.hadoop.*` options must be set explicitly.
+
+```shell
+spark-submit \
+  --master spark://localhost:7077 \
+  --jars https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/4.0.2/gcs-connector-4.0.2-shaded.jar \
+  --conf spark.eventLog.enabled=true \
+  --conf spark.eventLog.dir=file://$(pwd)/../logs/ \
+  --conf spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem \
+  --conf spark.hadoop.fs.AbstractFileSystem.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS \
+  --conf spark.hadoop.google.cloud.auth.type=APPLICATION_DEFAULT \
+  fhv_zones_gcs.py
+```
+
+> **Note:** `APPLICATION_DEFAULT` is recommended here. `spark.hadoop.*` confs set via `spark-submit` propagate to both the driver and the executors. With `SERVICE_ACCOUNT_JSON_KEYFILE`, the keyfile path must be valid on **both** the local machine (driver) and inside the Docker containers (executors). Since the executors already have their own SA keyfile configured via [spark-4.0-standalone.conf](../spark-4.0-standalone.conf), using `APPLICATION_DEFAULT` lets the driver authenticate with local ADC (`gcloud auth application-default login`) while the executors fall back to their cluster-side SA config.
+
 
 ## Compatibility Matrix for GCS
 
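The note above implies a two-step flow on the host. A minimal sketch, assuming the gcloud CLI is installed and the job is launched from `module5-batch-processing/pyspark-4.x/`:

```shell
# Driver-side auth: create local Application Default Credentials (one-time)
gcloud auth application-default login

# After a run, the event log written to ../logs/ should appear
# in the History Server UI at http://localhost:18080
ls ../logs/
```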

module5-batch-processing/pyspark-4.x/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ readme = "README.md"
 requires-python = ">=3.12,<3.14"
 
 dependencies = [
-    "pyspark[connect]>=4.0.1,<4.1",
+    "pyspark[connect]==4.0.1",
     "pyarrow>=23.0.0,<24.0",
 ]
 
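One way to confirm the pin lines up with the 4.0.1 cluster image is a version probe. This is a sketch, assuming `uv` manages the environment as elsewhere in this module:

```shell
# The client-side PySpark version must match the Workers exactly
uv run python -c "import pyspark; print(pyspark.__version__)"  # expect 4.0.1
```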

module5-batch-processing/pyspark-4.x/uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

module5-batch-processing/spark-4.0-standalone.conf

Lines changed: 4 additions & 0 deletions
@@ -14,6 +14,10 @@ spark.worker.cleanup.interval=600
 spark.shuffle.service.db.enabled=true
 spark.shuffle.service.db.backend=ROCKSDB
 
+# Event Log (History Server)
+spark.eventLog.enabled=true
+spark.eventLog.dir=/opt/spark/logs/
+
 # Classpath
 spark.driver.extraClassPath=/opt/spark/extra-jars/*
 spark.executor.extraClassPath=/opt/spark/extra-jars/*
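Since `./logs/` on the host is bind-mounted to `/opt/spark/logs/` in every container (see the compose diff above), the same event logs should be visible from both sides. A sketch, assuming the stack is running:

```shell
# Host view of the event log directory
ls logs/

# Container view of the same directory (container name from the compose diff)
docker exec spark-history-server ls /opt/spark/logs/
```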
