-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsemantic_layer.py
More file actions
224 lines (222 loc) · 8.54 KB
/
semantic_layer.py
File metadata and controls
224 lines (222 loc) · 8.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# INTERN NOTE: Semantic layer concept
# Raw table/column names like "order_total_usd" or "is_active_member" are
# meaningful to engineers but ambiguous to LLMs without business context.
# The semantic layer maps each column to its business meaning, valid value
# ranges, and usage rules. When injected into the LLM prompt, it dramatically
# reduces hallucinated SQL (wrong aggregation functions, wrong join keys).
# Think of it as a "data dictionary" that the LLM reads before writing SQL.
# In production, this can be extended with ownership, SLAs, and PII tags.
from typing import Any
SEMANTIC_SCHEMA: list[dict[str, Any]] = [
{
"table_name": "fact_orders",
"description": (
"Central fact table. Each row is one order-item event in the Olist marketplace. "
"Use this table for all revenue, freight, and order-status analyses. "
"Always join to dimension tables via the appropriate *_id foreign key."
),
"columns": [
{
"name": "order_id",
"description": (
"Unique identifier for a customer order. "
"An order can appear multiple times if it contains multiple items."
),
},
{
"name": "user_id",
"description": (
"Foreign key to dim_users. Identifies the purchasing customer."
),
},
{
"name": "product_id",
"description": (
"Foreign key to dim_products. Identifies the specific product purchased."
),
},
{
"name": "seller_id",
"description": (
"Foreign key to dim_sellers. Identifies the seller who fulfilled the item."
),
},
{
"name": "order_total_usd",
"description": (
"Final post-tax revenue in USD for this line item. "
"Always use this for GMV (Gross Merchandise Value) calculations. "
"Never use freight_value_usd as a revenue proxy."
),
},
{
"name": "freight_value_usd",
"description": (
"Shipping cost in USD paid by the customer for this item. "
"Use for logistics cost analysis, not for revenue."
),
},
{
"name": "order_status",
"description": (
"Current fulfillment status. Values: delivered, shipped, canceled, "
"processing, invoiced, unavailable, approved, created. "
"For completed revenue, filter WHERE order_status = 'delivered'."
),
},
{
"name": "created_at",
"description": (
"Timestamp when the order was placed. Use for time-series and date-range queries. "
"Use strftime('%Y-%m', created_at) to group by month in SQLite."
),
},
],
},
{
"table_name": "dim_users",
"description": (
"Customer dimension. One row per unique customer. "
"Use for geographic segmentation, cohort analysis, and active-member filtering."
),
"columns": [
{
"name": "user_id",
"description": "Primary key. Unique customer identifier. Matches fact_orders.user_id.",
},
{
"name": "city",
"description": "Customer's billing city (Brazilian city name in Portuguese).",
},
{
"name": "state",
"description": "Brazilian state abbreviation, e.g. SP, RJ, MG. Use for state-level analysis.",
},
{
"name": "signup_country_code",
"description": (
"ISO 3166-1 alpha-2 country code. Almost always 'BR' for Brazil. "
"Use for international expansion analysis."
),
},
{
"name": "is_active_member",
"description": (
"Boolean flag. True if the customer has placed an order in the last 6 months. "
"Use WHERE is_active_member = 1 to filter to engaged users."
),
},
],
},
{
"table_name": "dim_products",
"description": (
"Product dimension. One row per unique product SKU. "
"Use for category performance, product catalog, and merchandising analysis."
),
"columns": [
{
"name": "product_id",
"description": "Primary key. Unique product identifier. Matches fact_orders.product_id.",
},
{
"name": "category_name",
"description": (
"English product category label, e.g. 'electronics', 'furniture', 'health_beauty'. "
"Use for category-level revenue roll-ups."
),
},
{
"name": "photos_qty",
"description": (
"Number of product photos in the listing. "
"Higher values correlate with higher conversion rates."
),
},
],
},
{
"table_name": "dim_sellers",
"description": (
"Seller dimension. One row per registered Olist seller. "
"Use for seller performance, geographic distribution, and supply analysis."
),
"columns": [
{
"name": "seller_id",
"description": "Primary key. Unique seller identifier. Matches fact_orders.seller_id.",
},
{
"name": "seller_city",
"description": "City where the seller is registered (Brazilian city name).",
},
{
"name": "seller_state",
"description": "Brazilian state abbreviation for the seller's registered address.",
},
],
},
{
"table_name": "dim_geography",
"description": (
"Geolocation lookup table mapping Brazilian zip code prefixes to cities, "
"states, and lat/lng coordinates. Use for distance calculations and map visualizations. "
"Join to dim_users or dim_sellers via city/state, not directly via foreign key."
),
"columns": [
{
"name": "geo_id",
"description": "Surrogate primary key. Auto-incremented integer.",
},
{
"name": "zip_code_prefix",
"description": "First 5 digits of a Brazilian CEP (postal code).",
},
{
"name": "city",
"description": "City name corresponding to this zip prefix.",
},
{
"name": "state",
"description": "State abbreviation corresponding to this zip prefix.",
},
{
"name": "lat",
"description": "Latitude of the zip code centroid (decimal degrees, WGS84).",
},
{
"name": "lng",
"description": "Longitude of the zip code centroid (decimal degrees, WGS84).",
},
],
},
{
"table_name": "dim_reviews",
"description": (
"Customer review dimension. One row per review submitted. "
"Use for NPS analysis, satisfaction scoring, and sentiment filtering. "
"Join to fact_orders via order_id."
),
"columns": [
{
"name": "review_id",
"description": "Primary key. Unique review identifier.",
},
{
"name": "order_id",
"description": "Foreign key to fact_orders. Links the review to its order.",
},
{
"name": "review_score",
"description": (
"Customer satisfaction score from 1 (worst) to 5 (best). "
"Use AVG(review_score) for mean satisfaction, COUNT(*) WHERE review_score <= 2 for complaint rate."
),
},
{
"name": "review_comment",
"description": "Free-text comment left by the customer. May be NULL.",
},
],
},
]