File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree
Original file line number Diff line number Diff line change
1- FROM apache/spark-py:latest AS spark
1+ FROM apache/spark-py:v3.4.0 AS spark
22
33FROM apache/airflow:$AIRFLOW_VERSION
44
@@ -22,8 +22,9 @@ RUN chown -R airflow /opt/spark
2222ENV SPARK_HOME="/opt/spark"
2323ENV PATH="$PATH:$SPARK_HOME/bin"
2424
25- # Install Postgres driver for Spark
26- RUN curl https://jdbc.postgresql.org/download/postgresql-42.5.0.jar -o /opt/spark/jars/postgresql-42.5.0.jar
25+ # Install Postgres driver and Iceberg for Spark
26+ RUN curl https://jdbc.postgresql.org/download/postgresql-42.5.0.jar -o /opt/spark/jars/postgresql-42.5.0.jar && \
27+ curl -L https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.4_2.12/1.4.2/iceberg-spark-runtime-3.4_2.12-1.4.2.jar -o /opt/spark/jars/iceberg-spark-runtime-3.4_2.12-1.4.2.jar
2728
2829# Install Hadoop
2930RUN curl https://dlcdn.apache.org/hadoop/common/hadoop-3.3.4/hadoop-3.3.4.tar.gz -o hadoop-3.3.4.tar.gz && \
Original file line number Diff line number Diff line change
1- spark.hadoop.hive.exec.dynamic.partition true
2- spark.hadoop.hive.exec.dynamic.partition.mode nonstrict
3- spark.sql.sources.partitionOverwriteMode dynamic
1+ spark.hadoop.hive.exec.dynamic.partition true
2+ spark.hadoop.hive.exec.dynamic.partition.mode nonstrict
3+ spark.sql.sources.partitionOverwriteMode dynamic
4+
5+ spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
6+ spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog
7+ spark.sql.catalog.spark_catalog.type hive
Original file line number Diff line number Diff line change
5454 },
5555 )
5656 ),
57- model_defaults = ModelDefaultsConfig (dialect = "duckdb" ),
57+ model_defaults = ModelDefaultsConfig (dialect = "duckdb" , storage_format = "iceberg" ),
5858)
5959
6060
6161# Due to a 3.7 mypy bug we ignore. Can remove once 3.7 support is dropped.
6262airflow_config_docker = Config ( # type: ignore
6363 default_scheduler = AirflowSchedulerConfig (airflow_url = "http://airflow-webserver:8080/" ),
6464 gateways = GatewayConfig (connection = SparkConnectionConfig ()),
65- model_defaults = ModelDefaultsConfig (dialect = "duckdb" ),
65+ model_defaults = ModelDefaultsConfig (dialect = "duckdb" , storage_format = "iceberg" ),
6666)
6767
6868# A DuckDB config with a physical schema map.
You can’t perform that action at this time.
0 commit comments