Skip to content

Commit c2a8aae

Browse files
Merge pull request #12 from DataLabTechTV/dev
feat: migrate DuckLake catalog from SQLite to PostgreSQL
2 parents 133e481 + 0894362 commit c2a8aae

15 files changed

Lines changed: 355 additions & 103 deletions

File tree

.env.example

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ S3_REGION=eu-west-1
1616
# PostgreSQL
1717
# ==========
1818

19-
PGPASSWORD=datalabtech
19+
PSQL_ROOT_PASSWORD=datalabtech
2020

2121
# ========
2222
# Data Lab
@@ -38,11 +38,23 @@ S3_BACKUPS_PREFIX=backups
3838
# - All paths are relative to local/
3939

4040
ENGINE_DB=engine.duckdb
41+
42+
# Deprecated: the SQLite paths below are only used to migrate from SQLite to PostgreSQL
4143
STAGE_DB=stage.sqlite
4244
SECURE_STAGE_DB=secure_stage.sqlite
4345
GRAPHS_MART_DB=marts/graphs.sqlite
4446
ANALYTICS_MART_DB=marts/analytics.sqlite
4547

48+
PSQL_CATALOG_HOST=docker-shared
49+
PSQL_CATALOG_PORT=5432
50+
PSQL_CATALOG_DB=lakehouse
51+
PSQL_CATALOG_USER=lakehouse
52+
PSQL_CATALOG_PASSWORD=lakehouse
53+
PSQL_CATALOG_STAGE_SCHEMA=stage
54+
PSQL_CATALOG_SECURE_STAGE_SCHEMA=secure_stage
55+
PSQL_CATALOG_GRAPHS_MART_SCHEMA=graphs
56+
PSQL_CATALOG_ANALYTICS_MART_SCHEMA=analytics
57+
4658
# ====
4759
# Kuzu
4860
# ====

README.md

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -181,9 +181,7 @@ s3://lakehouse/
181181
│ └── catalog/
182182
│ ├── YYYY_MM_DD/
183183
│ │ └── HH_mm_SS_sss/
184-
│ │ ├── engine.duckdb
185-
│ │ ├── stage.sqlite
186-
│ │ └── marts/*.sqlite
184+
│ │ └── lakehouse.dump
187185
│ └── manifest.json
188186
├── raw/
189187
│ └── <dataset-name>/
@@ -238,7 +236,7 @@ S3_REGION=eu-west-1
238236
#### PostgreSQL
239237
240238
```bash
241-
PGPASSWORD=datalabtech
239+
PSQL_ROOT_PASSWORD=datalabtech
242240
```
243241
244242
Set this to the `root` user password of your PostgreSQL database. It is only used when deploying your on-premise infrastructure, so that databases and credentials can be provisioned at a later stage; it is not accessed otherwise.
@@ -440,9 +438,7 @@ dlctl backup create
440438
In order to restore a backup, just run:
441439
442440
```bash
443-
dlctl backup restore \
444-
--source "<YYYY-mm-ddTHH:MM:SS.sss>" \
445-
--target "<target-dir>"
441+
dlctl backup restore --source "<YYYY-mm-ddTHH:MM:SS.sss>"
446442
```
447443
448444
Omitting `--source` will restore the latest backup.

dlctl/cli.py

Lines changed: 40 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import os
2+
import subprocess
23
import sys
4+
import tempfile
35
from datetime import datetime
46
from importlib.metadata import version
57
from pathlib import Path
@@ -15,7 +17,7 @@
1517
from ingest.cli import ingest
1618
from ml.cli import ml
1719
from shared.cache import cache_usage, expunge_cache
18-
from shared.settings import LOCAL_DIR, MART_DB_VARS, env
20+
from shared.settings import LOCAL_DIR, MART_SCHEMA_VARS, env
1921
from shared.storage import Storage, StoragePrefix
2022

2123
LOG_FILE = Path(__file__).resolve().parents[1] / "logs/datalab.log"
@@ -86,22 +88,25 @@ def backup():
8688
def backup_create():
8789
log.info("Creating a catalog backup")
8890

89-
source_files = [env.str("ENGINE_DB"), env.str("STAGE_DB")]
90-
source_files += (env.str(varname) for varname in MART_DB_VARS)
91+
os.environ["PGHOST"] = env.str("PSQL_CATALOG_HOST")
92+
os.environ["PGPORT"] = env.str("PSQL_CATALOG_PORT")
93+
os.environ["PGDATABASE"] = env.str("PSQL_CATALOG_DB")
94+
os.environ["PGUSER"] = env.str("PSQL_CATALOG_USER")
95+
os.environ["PGPASSWORD"] = env.str("PSQL_CATALOG_PASSWORD")
9196

92-
for source_file in source_files:
93-
source_path = os.path.join(LOCAL_DIR, source_file)
97+
with tempfile.NamedTemporaryFile(
98+
prefix="datalab-lakehouse-",
99+
suffix=".dump",
100+
) as tmp:
101+
log.debug("Dumping to temporary file: {}", tmp.name)
102+
subprocess.run(["pg_dump", "-Fc", "-f", tmp.name], check=True)
94103

95-
if not os.path.exists(source_path):
96-
log.error("source path doesn't exist: {}", source_path)
97-
return
98-
99-
s = Storage(prefix=StoragePrefix.BACKUPS)
100-
s3_backup_path = s.get_dir("catalog", dated=True)
101-
s.upload_files(LOCAL_DIR, source_files, s3_backup_path)
102-
s.upload_manifest("catalog", latest=s3_backup_path)
104+
s = Storage(prefix=StoragePrefix.BACKUPS)
105+
s3_backup_path = s.get_dir("catalog", dated=True)
106+
s.upload_file(tmp.name, f"{s3_backup_path}/lakehouse.dump")
107+
s.upload_manifest("catalog", latest=s3_backup_path)
103108

104-
log.info("Catalog backup created: {}", s3_backup_path)
109+
log.info("Catalog backup created: {}", s3_backup_path)
105110

106111

107112
@backup.command(name="restore", help="Restore engine and catalog into a directory")
@@ -111,9 +116,13 @@ def backup_create():
111116
type=click.DateTime(formats=["%Y-%m-%dT%H:%M:%S.%f"]),
112117
help="Timestamp for backup source (YYYY-mm-ddTHH:MM:SS.sss)",
113118
)
114-
@click.option("--target", default=LOCAL_DIR, type=click.STRING, help="Target directory")
115-
def backup_restore(source_date: Optional[datetime], target: str):
116-
os.makedirs(target, exist_ok=True)
119+
def backup_restore(source_date: Optional[datetime]):
120+
os.environ["PGHOST"] = env.str("PSQL_CATALOG_HOST")
121+
os.environ["PGPORT"] = env.str("PSQL_CATALOG_PORT")
122+
os.environ["PGUSER"] = env.str("PSQL_CATALOG_USER")
123+
os.environ["PGPASSWORD"] = env.str("PSQL_CATALOG_PASSWORD")
124+
125+
db = env.str("PSQL_CATALOG_DB")
117126

118127
s = Storage(prefix=StoragePrefix.BACKUPS)
119128

@@ -130,8 +139,20 @@ def backup_restore(source_date: Optional[datetime], target: str):
130139
time = source_date.strftime("%H_%M_%S_%f")[:-3]
131140
s3_path = s.to_s3_path(f"{env.str('S3_BACKUPS_PREFIX')}/catalog/{date}/{time}")
132141

133-
log.info("Restoring backup from {} into {}", s3_path, target)
134-
s.download_dir(s3_path, target)
142+
log.info("Restoring backup from {}", s3_path)
143+
144+
with tempfile.NamedTemporaryFile(
145+
prefix="datalab-lakehouse-",
146+
suffix=".dump",
147+
) as tmp:
148+
log.debug("Downloading dump to temporary file: {}", tmp.name)
149+
s.download_file(f"{s3_path}/lakehouse.dump", tmp.name)
150+
151+
log.debug("Restoring dump from temporary file: {}", tmp.name)
152+
subprocess.run(
153+
["pg_restore", "-c", "--if-exists", "-d", db, tmp.name],
154+
check=True,
155+
)
135156

136157

137158
@backup.command(name="ls", help="List catalog backups")

dlctl/dbt_handler.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,6 @@ def mkdirs(self):
3535
engine_db_dir = os.path.dirname(os.path.join(LOCAL_DIR, env.str("ENGINE_DB")))
3636
os.makedirs(engine_db_dir, exist_ok=True)
3737

38-
stage_db_dir = os.path.dirname(os.path.join(LOCAL_DIR, env.str("STAGE_DB")))
39-
os.makedirs(stage_db_dir, exist_ok=True)
40-
41-
for name, value in os.environ.items():
42-
if name.endswith("_MART_DB"):
43-
mart_db_dir = os.path.dirname(os.path.join(LOCAL_DIR, value))
44-
os.makedirs(mart_db_dir, exist_ok=True)
45-
4638
def deps(self):
4739
self.dbt.invoke(["deps"] + self.PROJECT_ARGS)
4840

infra/apps/docker/compose.yml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,31 @@ services:
88
environment:
99
MLFLOW_TRACKING_URI: ${MLFLOW_TRACKING_URI}
1010
KAFKA_BROKER_ENDPOINT: ${KAFKA_BROKER_ENDPOINT}
11+
12+
S3_ENDPOINT: ${S3_ENDPOINT}
13+
S3_USE_SSL: ${S3_USE_SSL}
14+
S3_URL_STYLE: ${S3_URL_STYLE}
15+
S3_ACCESS_KEY_ID: ${S3_ACCESS_KEY_ID}
16+
S3_SECRET_ACCESS_KEY: ${S3_SECRET_ACCESS_KEY}
17+
S3_REGION: ${S3_REGION}
18+
19+
S3_BUCKET: ${S3_BUCKET}
20+
S3_STAGE_PREFIX: ${S3_STAGE_PREFIX}
21+
S3_SECURE_STAGE_PREFIX: ${S3_SECURE_STAGE_PREFIX}
22+
S3_GRAPHS_MART_PREFIX: ${S3_GRAPHS_MART_PREFIX}
23+
S3_ANALYTICS_MART_PREFIX: ${S3_ANALYTICS_MART_PREFIX}
24+
S3_EXPORTS_PREFIX: ${S3_EXPORTS_PREFIX}
25+
S3_BACKUPS_PREFIX: ${S3_BACKUPS_PREFIX}
26+
27+
PSQL_CATALOG_HOST: ${PSQL_CATALOG_HOST}
28+
PSQL_CATALOG_PORT: ${PSQL_CATALOG_PORT}
29+
PSQL_CATALOG_DB: ${PSQL_CATALOG_DB}
30+
PSQL_CATALOG_USER: ${PSQL_CATALOG_USER}
31+
PSQL_CATALOG_PASSWORD: ${PSQL_CATALOG_PASSWORD}
32+
PSQL_CATALOG_STAGE_SCHEMA: ${PSQL_CATALOG_STAGE_SCHEMA}
33+
PSQL_CATALOG_SECURE_STAGE_SCHEMA: ${PSQL_CATALOG_SECURE_STAGE_SCHEMA}
34+
PSQL_CATALOG_GRAPHS_MART_SCHEMA: ${PSQL_CATALOG_GRAPHS_MART_SCHEMA}
35+
PSQL_CATALOG_ANALYTICS_MART_SCHEMA: ${PSQL_CATALOG_ANALYTICS_MART_SCHEMA}
1136
healthcheck:
1237
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
1338
interval: 10s

infra/services/docker/compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ services:
4242
- "5432:5432"
4343
environment:
4444
POSTGRES_USER: root
45-
POSTGRES_PASSWORD: ${PGPASSWORD}
45+
POSTGRES_PASSWORD: ${PSQL_ROOT_PASSWORD}
4646
volumes:
4747
- postgres:/var/lib/postgresql/data
4848
networks:

0 commit comments

Comments
 (0)