Skip to content
This repository was archived by the owner on Mar 23, 2026. It is now read-only.

Commit 55585b9

Browse files
committed
updates necessary for dto rr analysis
1 parent 2eba367 commit 55585b9

9 files changed

Lines changed: 570 additions & 946 deletions

File tree

docs/tutorials/virtual_db_tutorial.ipynb

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -147,13 +147,7 @@
147147
"cell_type": "markdown",
148148
"id": "cell-3",
149149
"metadata": {},
150-
"source": [
151-
"## Initializing VirtualDB\n",
152-
"\n",
153-
"Creating a VirtualDB instance loads and validates the config but does\n",
154-
"**not** download any data yet. Views are registered lazily on the first\n",
155-
"`query()`, `tables()`, or `describe()` call."
156-
]
150+
"source": "## Initializing VirtualDB\n\nCreating a VirtualDB instance loads and validates the config, downloads any\nnecessary data, and registers all views immediately."
157151
},
158152
{
159153
"cell_type": "code",
@@ -5100,4 +5094,4 @@
51005094
},
51015095
"nbformat": 4,
51025096
"nbformat_minor": 5
5103-
}
5097+
}

docs/virtual_db.md

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,27 +25,21 @@ and the [tutorial](tutorials/virtual_db_tutorial.ipynb) for usage examples.
2525

2626
## Advanced Usage
2727

28-
After any public method is called (e.g. `vdb.tables()`), the underlying DuckDB
29-
connection is available as `vdb._db`. You can use `_db` to execute any SQL
30-
on the database, eg creating more views, or creating a table in memory
28+
The underlying DuckDB connection is available as `vdb._conn`. You can use
29+
`_conn` to execute any SQL on the database, e.g., creating more views, or
30+
creating a table in memory.
3131

3232
Custom **views** created this way appear in `tables()`, `describe()`, and
3333
`get_fields()` automatically because those methods query DuckDB's
3434
`information_schema`. Custom **tables** do not appear in `tables()` (which
3535
only lists views), but are fully queryable via `vdb.query()`.
3636

37-
Call at least one public method first to ensure the connection is initialized
38-
before accessing `_db` directly.
39-
4037
Example -- create a materialized analysis table::
4138

42-
# Trigger view registration
43-
vdb.tables()
44-
4539
# Create a persistent in-memory table from a complex query.
4640
# This example selects one "best" Hackett-2020 sample per regulator
4741
# using a priority system: ZEV+P > GEV+P > GEV+M.
48-
vdb._db.execute("""
42+
vdb._conn.execute("""
4943
CREATE OR REPLACE TABLE hackett_analysis_set AS
5044
WITH regulator_tiers AS (
5145
SELECT

docs/virtual_db_configuration.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -255,8 +255,7 @@ for more detailed explanation of comparative datasets and composite IDs.
255255
## Internal Structure
256256

257257
VirtualDB uses an in-memory DuckDB database to construct a layered hierarchy
258-
of SQL views over locally cached Parquet files. Views are created lazily on
259-
first query and are not persisted to disk.
258+
of SQL views over locally cached Parquet files. Views are created on initialization and are not persisted to disk.
260259

261260
### View Hierarchy
262261

tfbpapi/datacard.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,7 @@ def _build_metadata_fields_map(self) -> None:
307307
]
308308
break
309309
else:
310-
self.logger.warning(
310+
self.logger.info(
311311
"No metadata fields found for data config '%s' "
312312
"in repo '%s' -- no embedded metadata_fields and "
313313
"no metadata config with applies_to",

tfbpapi/models.py

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
1010
"""
1111

12+
import logging
1213
from enum import Enum
1314
from functools import cached_property
1415
from pathlib import Path
@@ -29,6 +30,9 @@
2930
FactorAliases: TypeAlias = dict[str, dict[str, list[str | int | float | bool]]]
3031

3132

33+
logger = logging.getLogger(__name__)
34+
35+
3236
class DatasetType(str, Enum):
3337
"""Supported dataset types."""
3438

@@ -761,6 +765,23 @@ def validate_factor_aliases(cls, v: FactorAliases) -> FactorAliases:
761765
)
762766
return v
763767

768+
@model_validator(mode="after")
769+
def validate_repositories_have_datasets(self) -> "MetadataConfig":
770+
"""
771+
Validate that every repository defines at least one dataset.
772+
773+
:return: The validated MetadataConfig instance
774+
:raises ValueError: If any repository has no datasets defined
775+
776+
"""
777+
for repo_id, repo_config in self.repositories.items():
778+
if not repo_config.dataset:
779+
raise ValueError(
780+
f"Repository '{repo_id}' must define at least one "
781+
"dataset under the 'dataset' key."
782+
)
783+
return self
784+
764785
@model_validator(mode="after")
765786
def validate_unique_db_names(self) -> "MetadataConfig":
766787
"""
@@ -791,13 +812,19 @@ def validate_unique_db_names(self) -> "MetadataConfig":
791812

792813
@model_validator(mode="before")
793814
@classmethod
794-
def parse_repositories(cls, data: Any) -> dict[str, Any]:
815+
def parse_config(cls, data: Any) -> dict[str, Any]:
795816
"""
796-
Parse repository configurations from 'repositories' key.
817+
Parse and validate all top-level sections of the VirtualDB configuration.
818+
819+
Handles the four top-level sections: ``repositories`` (required),
820+
``factor_aliases``, ``missing_value_labels``, and ``description``
821+
(all optional). Logs an INFO message for each optional section that
822+
is absent from the configuration.
797823
798824
:param data: Raw configuration data
799-
:return: Processed configuration with parsed repositories
800-
:raises ValueError: If repositories are invalid or missing
825+
:return: Processed configuration dict ready for Pydantic field validation
826+
:raises ValueError: If ``repositories`` is missing or empty, or if
827+
any repository config is invalid
801828
802829
"""
803830
if not isinstance(data, dict):
@@ -811,6 +838,13 @@ def parse_repositories(cls, data: Any) -> dict[str, Any]:
811838
"with at least one repository"
812839
)
813840

841+
for optional_key in ("factor_aliases", "missing_value_labels", "description"):
842+
if not data.get(optional_key):
843+
logger.info(
844+
"No '%s' section found in VirtualDB configuration.",
845+
optional_key,
846+
)
847+
814848
# Parse each repository config
815849
repositories = {}
816850
for repo_id, repo_config in repositories_data.items():

0 commit comments

Comments
 (0)