Commit bd1cf17

lint
1 parent a48ca80 commit bd1cf17

6 files changed: 130 additions & 13 deletions

infra/bigquery_export_spark/Dockerfile

Lines changed: 5 additions & 3 deletions
@@ -7,7 +7,9 @@ FROM debian:12-slim
 ENV DEBIAN_FRONTEND=noninteractive
 
 # Install utilities required by Spark scripts.
-RUN apt-get update && apt-get install -y procps tini libjemalloc2
+RUN apt-get update && apt-get install -y procps=\* tini=\* libjemalloc2=\* \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
 
 # Enable jemalloc2 as default memory allocator
 ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2
@@ -23,10 +25,10 @@ RUN bash Miniforge3-Linux-x86_64.sh -b -p /opt/miniforge3 \
     && ${CONDA_HOME}/bin/conda config --system --set channel_priority strict
 
 WORKDIR /app
-COPY . .
+COPY requirements.txt .
 
 # Install pip packages.
-RUN ${PYSPARK_PYTHON} -m pip install --no-cache-dir -r app/requirements.txt
+RUN ${PYSPARK_PYTHON} -m pip install --no-cache-dir -r requirements.txt
 
 # Create the 'spark' group/user.
 # The GID and UID must be 1099. Home directory is required.
Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 google-cloud-bigquery==3.23
 google-cloud-storage==2.16
 google-cloud-firestore==2.20.1
+# pyspark==3.5.5
Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
+from pyspark.sql import SparkSession
+from google.cloud import firestore
+from google.cloud import bigquery
+import json
+import time
+
+class FirestoreBatch:
+    def __init__(self):
+        self.firestore = firestore.Client()
+        self.bigquery = bigquery.Client()
+        self.batch_size = 500  # Firestore allows at most 500 writes per batch.
+        self.max_concurrent_batches = 200
+        self.current_batch = []
+        self.pending_batches = []
+        self.spark = SparkSession.builder.appName("FirestoreBatchProcessor").getOrCreate()
+
+    def queue_batch(self, operation):
+        batch = self.firestore.batch()
+
+        for doc in self.current_batch:
+            if operation == "delete":
+                batch.delete(doc.reference)
+            elif operation == "set":
+                doc_ref = self.firestore.collection(self.collection_name).document()
+                batch.set(doc_ref, doc)
+            else:
+                raise ValueError("Invalid operation")
+
+        self.pending_batches.append(batch)  # Commit is deferred to commit_batches().
+        self.current_batch = []
+
+    def commit_batches(self):
+        print(f"Committing {len(self.pending_batches)} batches to {self.collection_name}")
+        for batch in self.pending_batches:
+            try:
+                batch.commit()  # The Python client commits synchronously.
+            except Exception as e:
+                print(f"Error committing batch: {e}")
+                raise
+        self.pending_batches = []
+
+    def final_flush(self, operation):
+        if self.current_batch:
+            self.queue_batch(operation)
+        if self.pending_batches:
+            self.commit_batches()
+
+    def batch_delete(self):
+        print("Starting batch deletion...")
+        start_time = time.time()
+        self.current_batch = []
+        self.pending_batches = []
+        total_docs_deleted = 0
+
+        collection_ref = self.firestore.collection(self.collection_name)
+        if self.collection_type == "report":
+            print(f"Deleting documents from {self.collection_name} for date {self.date}")
+            query = collection_ref.where("date", "==", self.date)
+        elif self.collection_type == "dict":
+            print(f"Deleting documents from {self.collection_name}")
+            query = collection_ref
+        else:
+            raise ValueError("Invalid collection type")
+
+        while True:
+            docs = list(query.limit(self.batch_size * self.max_concurrent_batches).stream())
+            if not docs:  # Deleted docs drop out of the query, so re-running it pages through.
+                break
+
+            for doc in docs:
+                self.current_batch.append(doc)
+                if len(self.current_batch) >= self.batch_size:
+                    self.queue_batch("delete")
+                if len(self.pending_batches) >= self.max_concurrent_batches:
+                    self.commit_batches()
+                total_docs_deleted += 1
+
+        self.final_flush("delete")
+        duration = time.time() - start_time
+        print(f"Deletion complete. Total docs deleted: {total_docs_deleted}. Time: {duration} seconds")
+
+    def stream_from_bigquery(self, query):
+        print("Starting BigQuery to Firestore transfer...")
+        start_time = time.time()
+        total_rows_processed = 0
+
+        df = self.spark.read.format("bigquery").option("query", query).load()
+
+        for row in df.collect():
+            self.current_batch.append(row.asDict())
+            if len(self.current_batch) >= self.batch_size:
+                self.queue_batch("set")
+            if len(self.pending_batches) >= self.max_concurrent_batches:
+                self.commit_batches()
+            total_rows_processed += 1
+
+        self.final_flush("set")
+        duration = time.time() - start_time
+        print(f"Transfer to {self.collection_name} complete. Total rows processed: {total_rows_processed}. Time: {duration} seconds")
+
+    def export(self):
+        export_config = json.loads('{"name": "technologies", "type": "dict", "environment": "dev"}')
+        query = "SELECT * FROM report.tech_report_technologies"
+
+        self.date = export_config.get("date", "")
+        self.collection_name = export_config["name"]
+        self.collection_type = export_config["type"]
+
+        self.batch_delete()
+        self.stream_from_bigquery(query)
+
+if __name__ == "__main__":
+    processor = FirestoreBatch()
+    processor.export()
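
A note on stream_from_bigquery above: when SQL is passed to the Spark BigQuery connector via the query option, the connector materializes the result into a temporary table first, which generally requires viewsEnabled to be true and a materialization dataset to be configured. A minimal sketch of that configuration, assuming spark is the active SparkSession and a pre-created scratch dataset named spark_tmp (the dataset name is hypothetical):

# Sketch only, not part of the commit. "spark_tmp" is a hypothetical
# scratch dataset that must already exist in the project.
spark.conf.set("viewsEnabled", "true")
spark.conf.set("materializationDataset", "spark_tmp")

df = (
    spark.read.format("bigquery")
    .option("query", "SELECT * FROM report.tech_report_technologies")
    .load()
)

Note also that df.collect() pulls the full result set to the driver, so this path assumes report-sized outputs; for larger tables, df.foreachPartition would keep the Firestore writes distributed across executors.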

infra/tf/dataform_export/main.tf

Lines changed: 4 additions & 4 deletions
@@ -49,11 +49,11 @@ resource "google_cloudfunctions2_function" "dataform_export" {
 }
 
 resource "google_bigquery_routine" "run_export_job" {
-  dataset_id   = "reports"
-  routine_id   = "run_export_job"
-  routine_type = "SCALAR_FUNCTION"
+  dataset_id      = "reports"
+  routine_id      = "run_export_job"
+  routine_type    = "SCALAR_FUNCTION"
   definition_body = ""
-  description  = <<EOT
+  description     = <<EOT
 Export data from Google BigQuery.
 Example payload JSON: {"dataform_trigger": "tech_report_complete", "date": "2025-01-01", "name": "adoption", "type": "report"}
 EOT
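
Since definition_body is empty, this routine only does something once it is wired up as a remote function backed by the Cloud Function above. For illustration, a hedged sketch of calling it from Python with google-cloud-bigquery, assuming the function is deployed; the payload mirrors the example in the description, and the project ID "httparchive" is inferred from the service account in main.tf:

# Illustrative caller only, not part of the commit.
from google.cloud import bigquery

client = bigquery.Client(project="httparchive")  # project ID assumed
payload = '{"dataform_trigger": "tech_report_complete", "date": "2025-01-01", "name": "adoption", "type": "report"}'
job = client.query(
    "SELECT reports.run_export_job(@payload)",
    job_config=bigquery.QueryJobConfig(
        query_parameters=[bigquery.ScalarQueryParameter("payload", "STRING", payload)]
    ),
)
print(list(job.result()))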

infra/tf/main.tf

Lines changed: 5 additions & 5 deletions
@@ -27,11 +27,11 @@ provider "google" {
 module "dataform_export" {
   source = "./dataform_export"
 
-  project_number    = local.project_number
-  region            = local.region
-  function_identity = "cloud-function@httparchive.iam.gserviceaccount.com"
-  function_name     = "dataform-export"
-  remote_functions_connection = google_bigquery_connection.remote-functions.id
+  project_number              = local.project_number
+  region                      = local.region
+  function_identity           = "cloud-function@httparchive.iam.gserviceaccount.com"
+  function_name               = "dataform-export"
+  remote_functions_connection = google_bigquery_connection.remote-functions.id
 }
 
 module "dataform_trigger" {

package.json

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
     "@dataform/core": "3.0.14"
   },
   "scripts": {
-    "format": "npx standard --fix; npx markdownlint --ignore-path .gitignore --config package.json --configPointer /markdownlint . --fix; terraform -chdir=infra/tf fmt",
+    "format": "npx standard --fix; npx markdownlint --ignore-path .gitignore --config package.json --configPointer /markdownlint . --fix; terraform -chdir=infra/tf fmt -recursive",
     "lint": "npx standard; npx markdownlint --ignore-path .gitignore --config package.json --configPointer /markdownlint .; dataform compile"
   },
   "standard": {
