Skip to content

Commit d0657d2

Browse files
authored
[v0/v1 migration] /bulk/info/variable-group (#6199)
## Issue [b/491885197] (https://b.corp.google.com/issues/491885197) ## Description This PR implements the migration of `v1/bulk/info/variable-group` to `v2`. The change is gated behind the `use_v2_api` flag. ## Notes The core of the migration is very simple: a flag-mediated gate that determines which endpoint is called. The complication comes (that makes up most of the diff) comes from the fact that the `v2` endpoint is no longer able to provide a `definition` along with each stat var. This definition is used (relatively rarely) in the natural language search, in order to find sibling stat vars to provide further exploration topics for the user. Because this is no longer available in the `v2` endpoint, the functionality had to be reconstructed via direct v2 calls. A discussion of the methodology, testing, fidelity and latency considerations of this can be found [at this link](https://docs.google.com/document/d/1LEZI_6-2wkwpmv0ELYIKg28wqJJpsyZSZohl7CJ3-HE/edit?resourcekey=0-_UcPAttv9jeT4qas_rIGhw&tab=t.0). (Message if access is required). This document describes how the `definition` functionality is used and analyses the latency and fidelity implications of moving that functionality to Flask and `v2`. ## Testing There are two aspects of the NL search that are affected by the "definitions". These are described in the document as Flow 1 and Flow2. Flow 1 is rarely invoked, but can be seen in the following query (which should produce the same results for "Related" charts populated at the bottom of the results section). * [V2](http://localhost:8080/explore#q=population+of+people+aged+85+and+over+in+Seattle&client=ui_query) * [V1](http://localhost:8080/explore?disable_feature=use_v2_api#q=population+of+people+aged+85+and+over+in+Seattle&client=ui_query) Flow 2 is much more common, and is the primary driver of latency discrepancies between `v1` and `v2`. This flow is invoked on a standard query such as: [Query]() ## Goldens This PR also includes explicit directives to the integration tests to use v1. The goldens would have to be regenerated for v2 at some point before the feature flag is dropped.
1 parent 6c73eb0 commit d0657d2

8 files changed

Lines changed: 487 additions & 35 deletions

File tree

server/integration_tests/explore_test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class ExploreTest(NLWebServerTestCase):
3737
def run_fulfillment(self, test_dir, req_json, failure='', test='', i18n=''):
3838
resp = requests.post(
3939
self.get_server_url() +
40-
f'/api/explore/fulfill?test={test}&i18n={i18n}&client=test_fulfill',
40+
f'/api/explore/fulfill?test={test}&i18n={i18n}&client=test_fulfill&disable_feature=use_v2_api',
4141
json=req_json,
4242
headers=TEST_SURFACE_HEADER).json()
4343
self.handle_response(json.dumps(req_json), resp, test_dir, '', failure)
@@ -56,7 +56,7 @@ def run_detection(self,
5656
for q in queries:
5757
resp = requests.post(
5858
self.get_server_url() +
59-
f'/api/explore/detect?q={q}&test={test}&i18n={i18n}&client=test_detect&idx={idx}&reranker={reranker}',
59+
f'/api/explore/detect?q={q}&test={test}&i18n={i18n}&client=test_detect&idx={idx}&reranker={reranker}&disable_feature=use_v2_api',
6060
json={
6161
'contextHistory': ctx,
6262
'dc': dc,
@@ -85,7 +85,7 @@ def run_detect_and_fulfill(self,
8585
for (index, q) in enumerate(queries):
8686
resp = requests.post(
8787
self.get_server_url() +
88-
f'/api/explore/detect-and-fulfill?q={q}&test={test}&i18n={i18n}&mode={mode}&client=test_detect-and-fulfill&default_place={default_place}&idx={idx}&varThreshold={var_threshold}',
88+
f'/api/explore/detect-and-fulfill?q={q}&test={test}&i18n={i18n}&mode={mode}&client=test_detect-and-fulfill&default_place={default_place}&idx={idx}&varThreshold={var_threshold}&disable_feature=use_v2_api',
8989
json={
9090
'contextHistory': ctx,
9191
'dc': dc,

server/integration_tests/nl_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def run_sequence(self,
5555
print('Issuing ', test_dir, f'query[{i}]', q)
5656
resp = requests.post(
5757
self.get_server_url() +
58-
f'/api/explore/detect-and-fulfill?q={q}&idx={idx}&detector={detector}&test={test}&i18n={i18n}&mode={mode}&client=test',
58+
f'/api/explore/detect-and-fulfill?q={q}&idx={idx}&detector={detector}&test={test}&i18n={i18n}&mode={mode}&client=test&disable_feature=use_v2_api',
5959
json={
6060
'contextHistory': ctx
6161
}).json()

server/lib/nl/common/constants.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121

2222
SPECIAL_PLACE_REPLACEMENTS: Dict[str, str] = {'us': 'United States'}
2323

24+
DEFAULT_STAT_TYPE = "measuredValue"
25+
2426
SPECIAL_DCIDS_TO_PLACES: Dict[str, List[str]] = {
2527
'Earth': ['earth', 'world'],
2628
# Continents
@@ -1003,3 +1005,16 @@
10031005
"dc/evcytmdmc9xgd",
10041006
"WagesTotal_Worker_NAICSNonclassifiable",
10051007
])
1008+
1009+
PROP_TO_SHORT_KEY = {
1010+
"populationType": "pt",
1011+
"measuredProperty": "mp",
1012+
"statType": "st",
1013+
"measurementDenominator": "md",
1014+
"measurementQualifier": "mq"
1015+
}
1016+
1017+
CORE_PROPS = [
1018+
"populationType", "measuredProperty", "statType", "measurementDenominator",
1019+
"measurementQualifier"
1020+
]

server/lib/nl/common/variable.py

Lines changed: 132 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,13 @@
1515

1616
from dataclasses import dataclass
1717
from dataclasses import field
18-
from typing import Dict, List, Set
18+
from typing import Dict, List, Set, Tuple
1919

2020
from flask import current_app
21+
from flask import request
2122

23+
from server.lib.feature_flags import is_feature_enabled
24+
from server.lib.feature_flags import USE_V2_API
2225
import server.lib.fetch as fetch
2326
import server.lib.nl.common.constants as constants
2427
import server.lib.nl.common.topic as topic
@@ -57,7 +60,9 @@ def parse_sv(id: str, sv_definition: str) -> SV:
5760
return res
5861
parts = sv_definition.split(",")
5962
for part in parts:
60-
k, v = part.split("=")
63+
if "=" not in part:
64+
continue
65+
k, v = part.split("=", 1)
6166
if k == "pt":
6267
res.pt = v
6368
elif k == "mp":
@@ -98,7 +103,9 @@ def _is_compatible(sv_obj: SV, new_sv: Dict) -> bool:
98103
new_sv_obj = parse_sv(new_sv['id'], new_sv['definition'])
99104
if new_sv_obj.mp != sv_obj.mp:
100105
return False
101-
if new_sv_obj.st != sv_obj.st:
106+
st_obj = sv_obj.st if sv_obj.st else constants.DEFAULT_STAT_TYPE
107+
st_new = new_sv_obj.st if new_sv_obj.st else constants.DEFAULT_STAT_TYPE
108+
if st_new != st_obj:
102109
return False
103110
if new_sv_obj.pt != sv_obj.pt:
104111
return False
@@ -121,6 +128,74 @@ def limit_extended_svs(sv: str, ext_svs: Set[str], limit: int) -> Dict:
121128
return res
122129

123130

131+
def _fetch_indirect_siblings(
132+
svs_needing_indirect: List[str], sv2svg: Dict[str, str], use_v2: bool
133+
) -> Tuple[Dict[str, str], Dict[str, List[str]], Dict[str, List[Dict]]]:
134+
"""Fetches indirect siblings (parent SVG -> sibling SVGs -> sibling SVs)."""
135+
# Batch 1: Fetch parents for all identified SVGs
136+
svgs_to_expand = list({sv2svg[sv] for sv in svs_needing_indirect})
137+
parents_resp = {}
138+
if svgs_to_expand:
139+
parents_resp = fetch.property_values(svgs_to_expand, "specializationOf",
140+
True)
141+
142+
# Collect parents
143+
svg_to_parent = {}
144+
all_parents = []
145+
for sv in svs_needing_indirect:
146+
svg = sv2svg[sv]
147+
parents = parents_resp.get(svg, [])
148+
if parents:
149+
svg_to_parent[svg] = parents[0]
150+
all_parents.append(parents[0])
151+
152+
# Batch 2: Fetch siblings for all parents
153+
siblings_resp = {}
154+
if all_parents:
155+
siblings_resp = fetch.property_values(all_parents, "specializationOf",
156+
False)
157+
158+
# Collect all sibling SVGs
159+
parent_to_siblings = {}
160+
all_sibling_svgs = set()
161+
for parent in all_parents:
162+
siblings = siblings_resp.get(parent, [])
163+
parent_to_siblings[parent] = siblings
164+
all_sibling_svgs.update(siblings)
165+
166+
# Batch 3: Fetch variable group info for all sibling SVGs
167+
svg_siblings_info = {'data': []}
168+
if all_sibling_svgs:
169+
svg_siblings_info = dc.get_variable_group_info(list(all_sibling_svgs), [])
170+
171+
# Collect all child SVs from sibling groups
172+
all_sibling_child_svs = set()
173+
for item in svg_siblings_info.get('data', []):
174+
for c in item.get('info', {}).get('childStatVars', []):
175+
if 'id' in c:
176+
all_sibling_child_svs.add(c['id'])
177+
178+
# Batch 4: Fetch variable definitions for all child SVs
179+
sibling_sv_definitions = {}
180+
if use_v2 and all_sibling_child_svs:
181+
sibling_sv_definitions = dc.get_variable_definitions(
182+
list(all_sibling_child_svs))
183+
184+
# Populate definitions
185+
for item in svg_siblings_info.get('data', []):
186+
for c in item.get('info', {}).get('childStatVars', []):
187+
if 'id' in c and c['id'] in sibling_sv_definitions:
188+
c['definition'] = sibling_sv_definitions[c['id']]
189+
190+
# Map from sibling SVG to its children for quick lookup
191+
svg_to_children = {}
192+
for item in svg_siblings_info.get('data', []):
193+
svg_to_children[item['node']] = item.get('info',
194+
{}).get('childStatVars', [])
195+
196+
return svg_to_parent, parent_to_siblings, svg_to_children
197+
198+
124199
def extend_svs(svs: List[str]):
125200
"""Extend stat vars by finding siblings.
126201
@@ -151,45 +226,80 @@ def extend_svs(svs: List[str]):
151226
if 'data' not in svginfo:
152227
return {}
153228
for item in svginfo['data']:
154-
svg2childsvs[item['node']] = item['info'].get('childStatVars', [])
229+
children = item['info'].get('childStatVars', [])
230+
svg2childsvs[item['node']] = children
231+
use_v2 = is_feature_enabled(USE_V2_API, app=current_app, request=request)
232+
if use_v2:
233+
all_child_svs = set()
234+
for children in svg2childsvs.values():
235+
all_child_svs.update(c['id'] for c in children if 'id' in c)
236+
237+
sv_definitions = dc.get_variable_definitions(list(all_child_svs))
238+
239+
for children in svg2childsvs.values():
240+
for c in children:
241+
if 'id' in c and c['id'] in sv_definitions:
242+
c['definition'] = sv_definitions[c['id']]
155243

156244
res = {}
157245
# Extended SV member -> Extended SV list
158246
reverse_map = {}
247+
248+
# Maps to store parsed objects (to avoid re-parsing)
249+
sv_to_svg_obj = {}
250+
sv_to_sv_obj = {}
251+
252+
# SVs that need indirect sibling expansion
253+
svs_needing_indirect = []
254+
255+
# First Pass: Parse and identify candidates
159256
for sv, svg in sv2svg.items():
160-
if sv in reverse_map:
161-
res[sv] = reverse_map[sv]
162-
continue
163-
res[sv] = []
164257
svg_obj = parse_svg(svg)
258+
sv_to_svg_obj[sv] = svg_obj
259+
165260
sv_obj = None
166-
for child_sv in svg2childsvs[svg]:
261+
for child_sv in svg2childsvs.get(svg, []):
167262
if child_sv['id'] == sv:
168263
if 'definition' in child_sv:
169264
sv_obj = parse_sv(child_sv['id'], child_sv['definition'])
170265
break
171266
if not sv_obj:
172267
continue
268+
sv_to_sv_obj[sv] = sv_obj
269+
173270
if len(svg_obj.pvs) == len(sv_obj.pvs):
174-
# There are no direct siblings of this sv in the current svg.
175-
# need to look for in-direct siblings
176-
svg_parents = fetch.property_values([svg], "specializationOf", True)[svg]
177-
if not svg_parents:
178-
continue
179-
svg_parent = svg_parents[0]
180-
svg_siblings = fetch.property_values([svg_parent], "specializationOf",
181-
False)[svg_parent]
182-
if not svg_siblings:
271+
svs_needing_indirect.append(sv)
272+
273+
# Extract batch fetching logic into a helper
274+
svg_to_parent, parent_to_siblings, svg_to_children = _fetch_indirect_siblings(
275+
svs_needing_indirect, sv2svg, use_v2)
276+
277+
# Final Pass: Process results
278+
for sv, svg in sv2svg.items():
279+
if sv in reverse_map:
280+
res[sv] = reverse_map[sv]
281+
continue
282+
283+
res[sv] = []
284+
if sv not in sv_to_sv_obj:
285+
continue
286+
sv_obj = sv_to_sv_obj[sv]
287+
svg_obj = sv_to_svg_obj[sv]
288+
289+
if sv in svs_needing_indirect:
290+
# Use batched data for indirect siblings
291+
parent = svg_to_parent.get(svg)
292+
if not parent:
183293
continue
184-
svg_siblings_info = dc.get_variable_group_info(svg_siblings, [])
185-
for item in svg_siblings_info['data']:
186-
for sv_info in item['info'].get('childStatVars', []):
294+
siblings = parent_to_siblings.get(parent, [])
295+
for sib_svg in siblings:
296+
for sv_info in svg_to_children.get(sib_svg, []):
187297
if _is_compatible(sv_obj, sv_info):
188298
res[sv].append(sv_info['id'])
189299
else:
190300
# Can use the direct siblings of this sv, nevertheless perform
191301
# SV compatibility check!
192-
for new_sv_info in svg2childsvs[svg]:
302+
for new_sv_info in svg2childsvs.get(svg, []):
193303
if _is_compatible(sv_obj, new_sv_info):
194304
res[sv].append(new_sv_info['id'])
195305
for sv2 in res[sv]:

0 commit comments

Comments
 (0)