Skip to content

Commit ed349af

Browse files
authored
Implemented an endpoint for Generated Follow Up Questions. (datacommonsorg#5212)
Added a new endpoint to the Flask server which provides follow up questions generated by Gemini based on a query and related topics.
1 parent a597964 commit ed349af

3 files changed

Lines changed: 310 additions & 0 deletions

File tree

server/lib/nl/explore/related.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,15 @@
1414
"""Module for related things."""
1515

1616
from dataclasses import dataclass
17+
import logging
1718
import re
1819
import time
1920
from typing import cast, Dict, List, Set
2021

22+
from flask import current_app
23+
from google import genai
24+
from pydantic import BaseModel
25+
2126
import server.lib.nl.common.topic as topic
2227
import server.lib.nl.common.utils as utils
2328
import server.lib.nl.detection.types as dtypes
@@ -39,6 +44,50 @@ class PlottedOrigVar:
3944
svs: List[Node]
4045

4146

47+
class FollowUpQuestions(BaseModel):
  """The follow up questions generated for a query and its related topics.

  Attributes:
    questions: A list of strings containing questions generated by Gemini based on a query's related topics. Each question corresponds to exactly one related topic.
  """
  questions: List[str]


# Number of attempts made against Gemini before giving up and returning [].
_GEMINI_CALL_RETRIES = 3

# Gemini model used to generate the follow up questions.
_GEMINI_MODEL = "gemini-2.5-pro-preview-06-05"

# Prompt template for generating follow up questions. The {initial_query} and
# {related_topics} placeholders are filled in by generate_follow_up_questions.
_RELATED_QUESTIONS_PROMPT = """
Imagine you are a dynamic, trusted, and factual UI copywriter. Use the following tone of voice guidelines as an approach to this task.

Informative: The primary goal is to present data and facts clearly and directly.
Neutral / objective: The language avoids emotional or subjective statements. The focus is on presenting the numbers without bias, opinions or judgments.
Data-driven and factual: The emphasis is on presenting statistical and factual data supported by source citations.
Concise and purposeful: Aim to explain the connection between the variable and the initial user research question. The sentences are generally short and focused on the key relationship between the variable and the research question, while maintaining neutrality and avoiding implications of direct causation.
Straightforward: The writing is clear and to the point, avoiding jargon or overly complex language. The information is presented in a way that is understandable to an entry level data analyst or data enthusiast.

Write up related follow up questions that the user might find interesting to broaden their research question.
The original research question from the user is: {initial_query}.
The follow up questions should be based on the following list of topics and statistical variables for the same location.
RELATED TOPICS START: {related_topics}. RELATED TOPICS END.
If no related topics are given, do not return anything.

Generate only one question per topic in the related topics list. Make them succinct, general, and simple in relation to the topic provided.

Avoid any other words apart from the generated follow up questions. Your entire response should only be the questions themselves, with each question on a new line.

Avoid asking for data within a specific year or range of years. The questions should be timeless.

Avoid questions that ask about correlations, relationships, or comparisons between the original question and the given topics. Each question must be simple and focus on a single variable.
For example, the question "How does income equity compare to wealth equity in countries with the lowest Gini index?" should instead be a simpler version such as "What is the income equity across countries?"

Avoid questions that ask for a metric about places that meet a certain condition.
For example, the question "What is the sustainability performance of countries with the most greenhouse gas emissions?" should instead be a simpler version such as "What is the sustainability performance of countries?"
If you are referring to a place or entity from the initial query, be sure to explicitly state the place or entity in the generated questions.

"""
4291
def compute_related_things(state: ftypes.PopulateState,
4392
plotted_orig_vars: List[PlottedOrigVar],
4493
explore_peer_groups: Dict[str, Dict[str,
@@ -265,3 +314,41 @@ def _trim(l):
265314
for p in places:
266315
res.append({'dcid': p.dcid, 'name': p.name, 'types': [p.place_type]})
267316
return _trim(res)
317+
318+
319+
def generate_follow_up_questions(query: str,
                                 related_topics: List[str]) -> List[str]:
  """Generates follow up questions based on the initial query and related topics.

  Calls Gemini with _RELATED_QUESTIONS_PROMPT, retrying up to
  _GEMINI_CALL_RETRIES times on any failure.

  Args:
    query: The initial query made by the user.
    related_topics: The topics related to the initial query.

  Returns:
    A list of follow up questions as strings. Empty if either input is
    missing or all Gemini attempts fail.
  """
  if not related_topics or not query:
    return []

  gemini_api_key = current_app.config["LLM_API_KEY"]
  gemini = genai.Client(api_key=gemini_api_key)
  for _ in range(_GEMINI_CALL_RETRIES):
    try:
      gemini_response = gemini.models.generate_content(
          model=_GEMINI_MODEL,
          contents=_RELATED_QUESTIONS_PROMPT.format(
              initial_query=query, related_topics=related_topics),
          config={
              "response_mime_type": "application/json",
              "response_schema": FollowUpQuestions
          })

      # `parsed` is None when the model output fails schema validation;
      # treat that as a retryable failure instead of letting the implicit
      # AttributeError fall through to the generic handler below.
      if gemini_response.parsed is None:
        logging.error(
            '[explore_follow_up_questions]: Initial Query: %s | Related Topics: %s | Gemini response failed schema parsing.',
            query, related_topics)
        continue
      return gemini_response.parsed.questions
    except Exception as e:
      # Broad catch is deliberate: any transport/SDK error should trigger a
      # retry rather than surface to the caller. Lazy %-args avoid eager
      # string formatting when logging is disabled.
      logging.error(
          '[explore_follow_up_questions]: Initial Query: %s | Related Topics: %s | Exception Caught: %s',
          query,
          related_topics,
          e,
          exc_info=True)

  return []

server/routes/explore/api.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,22 +14,26 @@
1414
"""Endpoints for Datacommons NL"""
1515

1616
import copy
17+
import json
1718
import time
1819
from typing import Dict
1920

2021
import flask
2122
from flask import Blueprint
2223
from flask import current_app
2324
from flask import request
25+
from flask import Response
2426

2527
from server.lib.nl.common import serialize
28+
import server.lib.nl.common.bad_words as bad_words
2629
import server.lib.nl.common.constants as constants
2730
import server.lib.nl.common.counters as ctr
2831
import server.lib.nl.common.utils as utils
2932
import server.lib.nl.common.utterance as nl_utterance
3033
import server.lib.nl.config_builder.base as config_builder
3134
import server.lib.nl.detection.detector as nl_detector
3235
from server.lib.nl.detection.utils import create_utterance
36+
from server.lib.nl.explore import related
3337
import server.lib.nl.explore.fulfiller_bridge as nl_fulfillment
3438
from server.lib.nl.explore.params import Clients
3539
from server.lib.nl.explore.params import DCNames
@@ -130,6 +134,40 @@ def detect_and_fulfill():
130134
return _fulfill_with_chart_config(utterance, debug_logs)
131135

132136

137+
#
# The follow up question endpoint that generates questions
# based on the initial query and topics found in the related topics.
#
@bp.route('/follow-up-questions', methods=['POST'])
def follow_up_questions():
  """Generates follow up questions for a query and its related topics.

  Expects a JSON body with:
    q: The initial user query (required).
    relatedTopics: A list of topics related to the query (required).

  Returns:
    200 with {'follow_up_questions': [...]} on success (questions failing
    the NL bad-words check are filtered out), or 400 with {'error': ...}
    when either input is missing.
  """
  # Parse the body exactly once; silent=True returns None (instead of
  # raising) for a missing or malformed JSON body so we can reply with our
  # own clean 400 below.
  request_body = request.get_json(silent=True) or {}
  initial_query = request_body.get('q', '')
  related_topics = request_body.get('relatedTopics', [])

  if not initial_query:
    return Response(json.dumps({'error': 'Missing query in request.'}),
                    400,
                    mimetype="application/json")
  if not related_topics:
    return Response(json.dumps({'error': 'Missing related topics in request.'}),
                    400,
                    mimetype="application/json")

  generated_questions = related.generate_follow_up_questions(
      query=initial_query, related_topics=related_topics)

  # Checks for adversarial questions
  safe_generated_questions = [
      question for question in generated_questions
      if bad_words.is_safe(query=question,
                           bad_words=current_app.config['NL_BAD_WORDS'])
  ]

  return Response(json.dumps({'follow_up_questions': safe_generated_questions}),
                  200,
                  mimetype="application/json")
169+
170+
133171
#
134172
# Given an utterance constructed either from a query or from dcids,
135173
# fulfills it into charts.
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import unittest
16+
from unittest.mock import DEFAULT
17+
from unittest.mock import patch
18+
19+
from server.lib.nl.common.bad_words import EMPTY_BANNED_WORDS
20+
from server.lib.nl.explore.related import generate_follow_up_questions
21+
from web_app import app
22+
23+
24+
class TestFollowUpQuestions(unittest.TestCase):
  """Tests for the follow-up-questions endpoint and its Gemini helper."""

  @patch('server.routes.explore.api.related.generate_follow_up_questions',
         autospec=True)
  def test_follow_up_questions_typical(self, mock):
    """Endpoint returns the generated questions verbatim on the happy path."""
    query = "What is the rate of education in El Paso?"
    related_topics = [
        "Educational Attachment", "School Type", "Housing", "Commute"
    ]

    expected_questions = [
        'What is the school dropout rate in El Paso?',
        'What is the distribution of students by school type in El Paso?',
        'What is the rate of homeownership in El Paso?',
        'What is the average commute time in El Paso?'
    ]
    mock.return_value = expected_questions

    # NOTE(review): bad_words.is_safe is not mocked here — presumably the
    # test app's NL_BAD_WORDS config passes all of these questions; verify.
    resp = app.test_client().post('api/explore/follow-up-questions',
                                  json={
                                      'q': query,
                                      'relatedTopics': related_topics
                                  })

    assert resp.status_code == 200
    assert resp.json['follow_up_questions'] == expected_questions

  @patch('server.routes.explore.api.bad_words.is_safe', autospec=True)
  @patch('server.routes.explore.api.related.generate_follow_up_questions',
         autospec=True)
  def test_follow_up_questions_excludes_adversarial(self, mock_gemini,
                                                    mock_safe):
    """Questions flagged unsafe by bad_words.is_safe are filtered out."""
    query = "What is the rate of education in El Paso?"
    related_topics = [
        "Educational Attachment", "School Type", "Housing", "Commute"
    ]

    expected_questions = [
        'What is the school dropout rate in El Paso?',
        'What is the distribution of students by school type in El Paso?',
        'What is the rate of homeownership in El Paso?',
        'What is the average commute time in El Paso?'
    ]
    mock_gemini.return_value = expected_questions
    #Labels do not match the actual adversarial label of each query, it is simply for testing purposes.
    # is_safe is called once per question in order, so only questions at
    # indices 1 and 3 survive the filter.
    mock_safe.side_effect = [False, True, False, True]

    resp = app.test_client().post('api/explore/follow-up-questions',
                                  json={
                                      'q': query,
                                      'relatedTopics': related_topics
                                  })

    assert resp.status_code == 200
    # expected_questions[1:4:2] == indices 1 and 3, matching the True labels.
    assert resp.json['follow_up_questions'] == expected_questions[1:4:2]

  def test_follow_up_questions_empty_related_topics(self):
    """Missing relatedTopics yields a 400 with a specific error message."""
    query = "What is the rate of education in El Paso?"
    related_topics = []
    resp = app.test_client().post('api/explore/follow-up-questions',
                                  json={
                                      'q': query,
                                      'relatedTopics': related_topics
                                  })

    assert resp.status_code == 400
    assert resp.json['error'] == 'Missing related topics in request.'

  def test_follow_up_questions_empty_query(self):
    """Missing query yields a 400 with a specific error message."""
    query = ""
    related_topics = ["Housing", "Commute"]
    resp = app.test_client().post('api/explore/follow-up-questions',
                                  json={
                                      'q': query,
                                      'relatedTopics': related_topics
                                  })

    assert resp.status_code == 400
    assert resp.json['error'] == 'Missing query in request.'

  @patch('server.routes.explore.api.related.generate_follow_up_questions',
         autospec=True)
  def test_follow_up_questions_error_gemini_call(self, mock):
    """A failed Gemini call (helper returns []) still yields a 200 with []."""
    query = "What is the rate of education in El Paso?"
    related_topics = [
        "Educational Attachment", "School Type", "Housing", "Commute"
    ]
    expected_questions = []

    mock.return_value = expected_questions
    # Use empty banned words so the bad-words filter is a no-op here.
    app.config['NL_BAD_WORDS'] = EMPTY_BANNED_WORDS
    resp = app.test_client().post('api/explore/follow-up-questions',
                                  json={
                                      'q': query,
                                      'relatedTopics': related_topics
                                  })

    assert resp.status_code == 200
    assert resp.json['follow_up_questions'] == expected_questions

  @patch('google.genai.Client', autospec=True)
  def test_generate_follow_up_questions_typical(self, mock_gemini):
    """Helper returns the parsed questions from a successful Gemini call."""
    query = "What is the rate of education in El Paso?"
    related_topics = [
        "Educational Attachment", "School Type", "Housing", "Commute"
    ]
    expected_questions = [
        'What is the school dropout rate in El Paso?',
        'What is the distribution of students by school type in El Paso?',
        'What is the rate of homeownership in El Paso?',
        'What is the average commute time in El Paso?'
    ]
    mock_gemini.return_value.models.generate_content.return_value.parsed.questions = expected_questions
    # App context is needed because the helper reads current_app.config.
    app.config['LLM_API_KEY'] = ""
    with app.app_context():
      assert expected_questions == generate_follow_up_questions(
          query=query, related_topics=related_topics)

  @patch('google.genai.Client', autospec=True)
  def test_generate_follow_up_questions_retry_once(self, mock_gemini):
    """Helper retries after one failed call and succeeds on the second."""
    query = "What is the rate of education in El Paso?"
    related_topics = [
        "Educational Attachment", "School Type", "Housing", "Commute"
    ]
    expected_questions = [
        'What is the school dropout rate in El Paso?',
        'What is the distribution of students by school type in El Paso?',
        'What is the rate of homeownership in El Paso?',
        'What is the average commute time in El Paso?'
    ]
    mock_gemini.return_value.models.generate_content.return_value.parsed.questions = expected_questions
    # First call returns None (a failure the helper must retry past);
    # DEFAULT makes the second call fall back to return_value above.
    mock_gemini.return_value.models.generate_content.side_effect = [
        None, DEFAULT
    ]
    app.config['LLM_API_KEY'] = ""
    with app.app_context():
      assert expected_questions == generate_follow_up_questions(
          query=query, related_topics=related_topics)

  @patch('google.genai.Client', autospec=True)
  def test_generate_follow_up_questions_error_request(self, mock_gemini):
    """Helper returns [] after exhausting all retries (3 failures)."""
    query = "What is the rate of education in Mountain View?"
    related_topics = ["Education"]
    # All _GEMINI_CALL_RETRIES attempts return None -> every attempt fails.
    mock_gemini.return_value.models.generate_content.side_effect = [
        None, None, None
    ]
    app.config['LLM_API_KEY'] = ""
    with app.app_context():
      assert [] == generate_follow_up_questions(query=query,
                                                related_topics=related_topics)

  def test_generate_follow_up_questions_empty_related_topics(self):
    """Helper short-circuits to [] when related_topics is empty."""
    query = "What is the rate of education in Mountain View?"
    related_topics = []
    assert [] == generate_follow_up_questions(query=query,
                                              related_topics=related_topics)

  def test_generate_follow_up_questions_empty_query(self):
    """Helper short-circuits to [] when the query is empty."""
    query = ""
    related_topics = ["Housing", "Commute"]
    assert [] == generate_follow_up_questions(query=query,
                                              related_topics=related_topics)

0 commit comments

Comments
 (0)