Skip to content
This repository was archived by the owner on Mar 23, 2026. It is now read-only.

Commit cb4cbba

Browse files
committed
adding tags
1 parent b06aed9 commit cb4cbba

5 files changed

Lines changed: 397 additions & 23 deletions

File tree

docs/tutorials/virtual_db_tutorial.ipynb

Lines changed: 86 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,18 @@
3333
"name": "stdout",
3434
"output_type": "stream",
3535
"text": [
36-
"Config saved to: /tmp/tmp60wlr_qk/vdb_config.yaml\n"
36+
"Config saved to: /tmp/tmpnx5tjaud/vdb_config.yaml\n"
3737
]
3838
}
3939
],
4040
"source": [
4141
"config_yaml = \"\"\"\n",
4242
"repositories:\n",
4343
" BrentLab/harbison_2004:\n",
44+
" tags:\n",
45+
" assay: binding\n",
46+
" method: chip-chip\n",
47+
" organism: yeast\n",
4448
" dataset:\n",
4549
" harbison_2004:\n",
4650
" db_name: harbison\n",
@@ -59,6 +63,10 @@
5963
" field: regulator_symbol\n",
6064
"\n",
6165
" BrentLab/kemmeren_2014:\n",
66+
" tags:\n",
67+
" assay: perturbation\n",
68+
" method: microarray\n",
69+
" organism: yeast\n",
6270
" dataset:\n",
6371
" kemmeren_2014:\n",
6472
" db_name: kemmeren\n",
@@ -75,8 +83,17 @@
7583
" field: regulator_symbol\n",
7684
"\n",
7785
" BrentLab/hackett_2020:\n",
86+
" # Repo-level tags apply to all datasets in this repository\n",
87+
" tags:\n",
88+
" method: test_overwrite\n",
89+
" organism: yeast\n",
7890
" dataset:\n",
7991
" hackett_2020:\n",
92+
" # Dataset-level tags: 'assay' is new,\n",
93+
" # 'method' overrides the repo-level value\n",
94+
" tags:\n",
95+
" assay: perturbation\n",
96+
" method: overexpression\n",
8097
" db_name: hackett\n",
8198
" sample_id:\n",
8299
" field: sample_id\n",
@@ -126,6 +143,56 @@
126143
"print(f\"Config saved to: {temp_config}\")"
127144
]
128145
},
146+
{
147+
"cell_type": "markdown",
148+
"id": "ftysbe3fwz4",
149+
"metadata": {},
150+
"source": [
151+
"## Tags\n",
152+
"\n",
153+
"Tags are arbitrary key/value annotations defined in the configuration. They\n",
154+
"follow the same hierarchy as property mappings: repo-level tags apply to all\n",
155+
"datasets in that repository, and dataset-level tags override repo-level tags\n",
156+
"with the same key.\n",
157+
"\n",
158+
"Use `vdb.get_tags(db_name)` to retrieve the merged tags for\n",
159+
"any dataset."
160+
]
161+
},
162+
{
163+
"cell_type": "code",
164+
"execution_count": 2,
165+
"id": "d0ulj4mzmf7",
166+
"metadata": {},
167+
"outputs": [
168+
{
169+
"ename": "NameError",
170+
"evalue": "name 'vdb' is not defined",
171+
"output_type": "error",
172+
"traceback": [
173+
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
174+
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
175+
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Tags are accessible directly from the VirtualDB instance using the db_name.\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;66;03m# No need to import MetadataConfig or specify repo_id.\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33mharbison tags:\u001b[39m\u001b[33m\"\u001b[39m, \u001b[43mvdb\u001b[49m.get_tags(\u001b[33m\"\u001b[39m\u001b[33mharbison\u001b[39m\u001b[33m\"\u001b[39m))\n\u001b[32m 4\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33mkemmeren tags:\u001b[39m\u001b[33m\"\u001b[39m, vdb.get_tags(\u001b[33m\"\u001b[39m\u001b[33mkemmeren\u001b[39m\u001b[33m\"\u001b[39m))\n\u001b[32m 6\u001b[39m \u001b[38;5;66;03m# Hackett has tags at both levels:\u001b[39;00m\n\u001b[32m 7\u001b[39m \u001b[38;5;66;03m# 'organism' comes from the repo level only,\u001b[39;00m\n\u001b[32m 8\u001b[39m \u001b[38;5;66;03m# 'assay' is added at the dataset level only,\u001b[39;00m\n\u001b[32m 9\u001b[39m \u001b[38;5;66;03m# 'method' is defined at both levels -- the dataset value wins.\u001b[39;00m\n",
176+
"\u001b[31mNameError\u001b[39m: name 'vdb' is not defined"
177+
]
178+
}
179+
],
180+
"source": [
181+
"# Tags are accessible directly from the VirtualDB instance using the db_name.\n",
182+
"# No need to import MetadataConfig or specify repo_id.\n",
183+
"print(\"harbison tags:\", vdb.get_tags(\"harbison\"))\n",
184+
"print(\"kemmeren tags:\", vdb.get_tags(\"kemmeren\"))\n",
185+
"\n",
186+
"# Hackett has tags at both levels:\n",
187+
"# 'organism' comes from the repo level only,\n",
188+
"# 'assay' is added at the dataset level only,\n",
189+
"# 'method' is defined at both levels -- the dataset value wins.\n",
190+
"print(\"hackett tags:\", vdb.get_tags(\"hackett\"))\n",
191+
"\n",
192+
"# Dataset with no tags returns empty dict\n",
193+
"print(\"dto tags:\", vdb.get_tags(\"dto\"))"
194+
]
195+
},
129196
{
130197
"cell_type": "markdown",
131198
"id": "cell-3",
@@ -140,7 +207,7 @@
140207
},
141208
{
142209
"cell_type": "code",
143-
"execution_count": 2,
210+
"execution_count": null,
144211
"id": "cell-4",
145212
"metadata": {},
146213
"outputs": [
@@ -187,7 +254,7 @@
187254
},
188255
{
189256
"cell_type": "code",
190-
"execution_count": 3,
257+
"execution_count": null,
191258
"id": "cell-6",
192259
"metadata": {},
193260
"outputs": [
@@ -236,7 +303,7 @@
236303
},
237304
{
238305
"cell_type": "code",
239-
"execution_count": 4,
306+
"execution_count": null,
240307
"id": "pdebujnqb9q",
241308
"metadata": {},
242309
"outputs": [
@@ -469,7 +536,7 @@
469536
},
470537
{
471538
"cell_type": "code",
472-
"execution_count": 5,
539+
"execution_count": null,
473540
"id": "9deee334",
474541
"metadata": {},
475542
"outputs": [
@@ -806,7 +873,7 @@
806873
},
807874
{
808875
"cell_type": "code",
809-
"execution_count": 6,
876+
"execution_count": null,
810877
"id": "cell-9",
811878
"metadata": {},
812879
"outputs": [
@@ -840,7 +907,7 @@
840907
},
841908
{
842909
"cell_type": "code",
843-
"execution_count": 7,
910+
"execution_count": null,
844911
"id": "1a705f1c",
845912
"metadata": {},
846913
"outputs": [
@@ -1056,7 +1123,7 @@
10561123
},
10571124
{
10581125
"cell_type": "code",
1059-
"execution_count": 8,
1126+
"execution_count": null,
10601127
"id": "cell-17",
10611128
"metadata": {},
10621129
"outputs": [
@@ -1362,7 +1429,7 @@
13621429
},
13631430
{
13641431
"cell_type": "code",
1365-
"execution_count": 9,
1432+
"execution_count": null,
13661433
"id": "cell-19",
13671434
"metadata": {},
13681435
"outputs": [
@@ -1786,7 +1853,7 @@
17861853
},
17871854
{
17881855
"cell_type": "code",
1789-
"execution_count": 10,
1856+
"execution_count": null,
17901857
"id": "cell-21",
17911858
"metadata": {},
17921859
"outputs": [
@@ -2064,7 +2131,7 @@
20642131
},
20652132
{
20662133
"cell_type": "code",
2067-
"execution_count": 11,
2134+
"execution_count": null,
20682135
"id": "cell-22",
20692136
"metadata": {},
20702137
"outputs": [
@@ -2427,7 +2494,7 @@
24272494
},
24282495
{
24292496
"cell_type": "code",
2430-
"execution_count": 12,
2497+
"execution_count": null,
24312498
"id": "cell-23",
24322499
"metadata": {},
24332500
"outputs": [
@@ -2691,7 +2758,7 @@
26912758
},
26922759
{
26932760
"cell_type": "code",
2694-
"execution_count": 13,
2761+
"execution_count": null,
26952762
"id": "f03e942a",
26962763
"metadata": {},
26972764
"outputs": [
@@ -3387,7 +3454,7 @@
33873454
},
33883455
{
33893456
"cell_type": "code",
3390-
"execution_count": 14,
3457+
"execution_count": null,
33913458
"id": "4d869036",
33923459
"metadata": {},
33933460
"outputs": [
@@ -3603,7 +3670,7 @@
36033670
},
36043671
{
36053672
"cell_type": "code",
3606-
"execution_count": 15,
3673+
"execution_count": null,
36073674
"id": "89408d2b",
36083675
"metadata": {},
36093676
"outputs": [
@@ -3631,7 +3698,7 @@
36313698
},
36323699
{
36333700
"cell_type": "code",
3634-
"execution_count": 16,
3701+
"execution_count": null,
36353702
"id": "5a3b802b",
36363703
"metadata": {},
36373704
"outputs": [
@@ -3651,7 +3718,7 @@
36513718
},
36523719
{
36533720
"cell_type": "code",
3654-
"execution_count": 17,
3721+
"execution_count": null,
36553722
"id": "abed8bc2",
36563723
"metadata": {},
36573724
"outputs": [
@@ -4927,7 +4994,7 @@
49274994
},
49284995
{
49294996
"cell_type": "code",
4930-
"execution_count": 18,
4997+
"execution_count": null,
49314998
"id": "cell-25",
49324999
"metadata": {},
49335000
"outputs": [
@@ -5017,7 +5084,7 @@
50175084
},
50185085
{
50195086
"cell_type": "code",
5020-
"execution_count": 19,
5087+
"execution_count": null,
50215088
"id": "cell-26",
50225089
"metadata": {},
50235090
"outputs": [],

docs/virtual_db_configuration.md

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,62 @@ during metadata extraction and query filtering.
147147
2. **Type consistency**: When source data might be extracted with an incorrect type
148148
3. **Performance**: Helps with query optimization and prevents type mismatches
149149
150+
## Tags
151+
152+
Tags are arbitrary string key/value pairs for annotating datasets. They follow
153+
the same hierarchy as property mappings: repo-level tags apply to all datasets
154+
in the repository, dataset-level tags apply only to that dataset, and
155+
dataset-level tags override repo-level tags with the same key.
156+
157+
```yaml
158+
repositories:
159+
BrentLab/harbison_2004:
160+
# Repo-level tags apply to all datasets in this repository
161+
tags:
162+
assay: binding
163+
organism: yeast
164+
dataset:
165+
harbison_2004:
166+
sample_id:
167+
field: sample_id
168+
# Dataset-level tags override repo-level tags with the same key
169+
tags:
170+
assay: chip-chip
171+
172+
BrentLab/kemmeren_2014:
173+
tags:
174+
assay: perturbation
175+
organism: yeast
176+
dataset:
177+
kemmeren_2014:
178+
sample_id:
179+
field: sample_id
180+
```
181+
182+
Access merged tags via `vdb.get_tags(db_name)`, identifying datasets by
183+
their name as it appears in `vdb.tables()`:
184+
185+
```python
186+
from tfbpapi.virtual_db import VirtualDB
187+
188+
vdb = VirtualDB("datasets.yaml")
189+
190+
# Returns {"assay": "chip-chip", "organism": "yeast"}
191+
# (dataset-level assay overrides repo-level)
192+
vdb.get_tags("harbison")
193+
194+
# Returns {"assay": "perturbation", "organism": "yeast"}
195+
vdb.get_tags("kemmeren")
196+
```
197+
198+
The underlying `MetadataConfig` (available as `vdb.config`) exposes the same
199+
data via `(repo_id, config_name)` pairs for programmatic or developer use:
200+
201+
```python
202+
# Equivalent to vdb.get_tags("harbison") above
203+
vdb.config.get_tags("BrentLab/harbison_2004", "harbison_2004")
204+
```
205+
150206
## Comparative Datasets
151207
152208
Comparative datasets differ from other dataset types in that they represent

0 commit comments

Comments
 (0)