Skip to content

Commit ed349af

Browse files
authored
Implemented an endpoint for Generated Follow Up Questions. (datacommonsorg#5212)
Added a new endpoint to the Flask server which provides follow up questions generated by Gemini based on a query and related topics.
1 parent a597964 commit ed349af

3 files changed

Lines changed: 310 additions & 0 deletions

File tree

server/lib/nl/explore/related.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,15 @@
1414
"""Module for related things."""
1515

1616
from dataclasses import dataclass
17+
import logging
1718
import re
1819
import time
1920
from typing import cast, Dict, List, Set
2021

22+
from flask import current_app
23+
from google import genai
24+
from pydantic import BaseModel
25+
2126
import server.lib.nl.common.topic as topic
2227
import server.lib.nl.common.utils as utils
2328
import server.lib.nl.detection.types as dtypes
@@ -39,6 +44,50 @@ class PlottedOrigVar:
3944
svs: List[Node]
4045

4146

47+
class FollowUpQuestions(BaseModel):
  """The follow up questions generated for a query and its related topics.

  Attributes:
    questions: A list of strings containing questions generated by Gemini based on a query's related topics. Each question corresponds to exactly one related topic.
  """
  questions: List[str]


# Number of attempts made against Gemini before giving up and returning [].
_GEMINI_CALL_RETRIES = 3

# Gemini model used to generate the follow up questions.
_GEMINI_MODEL = "gemini-2.5-pro-preview-06-05"

# Prompt template for generating follow up questions. The {initial_query} and
# {related_topics} placeholders are filled in by generate_follow_up_questions.
_RELATED_QUESTIONS_PROMPT = """
Imagine you are a dynamic, trusted, and factual UI copywriter. Use the following tone of voice guidelines as an approach to this task.

Informative: The primary goal is to present data and facts clearly and directly.
Neutral / objective: The language avoids emotional or subjective statements. The focus is on presenting the numbers without bias, opinions or judgments.
Data-driven and factual: The emphasis is on presenting statistical and factual data supported by source citations.
Concise and purposeful: Aim to explain the connection between the variable and the initial user research question. The sentences are generally short and focused on the key relationship between the variable and the research question, while maintaining neutrality and avoiding implications of direct causation.
Straightforward: The writing is clear and to the point, avoiding jargon or overly complex language. The information is presented in a way that is understandable to an entry level data analyst or data enthusiast.

Write up related follow up questions that the user might find interesting to broaden their research question.
The original research question from the user is: {initial_query}.
The follow up questions should be based on the following list of topics and statistical variables for the same location.
RELATED TOPICS START: {related_topics}. RELATED TOPICS END.
If no related topics are given, do not return anything.

Generate only one question per topic in the related topics list. Make them succinct, general, and simple in relation to the topic provided.

Avoid any other words apart from the generated follow up questions. Your entire response should only be the questions themselves, with each question on a new line.

Avoid asking for data within a specific year or range of years. The questions should be timeless.

Avoid questions that ask about correlations, relationships, or comparisons between the original question and the given topics. Each question must be simple and focus on a single variable.
For example, the question "How does income equity compare to wealth equity in countries with the lowest Gini index?" should instead be a simpler version such as "What is the income equity across countries?"

Avoid questions that ask for a metric about places that meet a certain condition.
For example, the question "What is the sustainability performance of countries with the most greenhouse gas emissions?" should instead be a simpler version such as "What is the sustainability performance of countries?"
If you are referring to a place or entity from the initial query, be sure to explicitly state the place or entity in the generated questions.

"""
4291
def compute_related_things(state: ftypes.PopulateState,
4392
plotted_orig_vars: List[PlottedOrigVar],
4493
explore_peer_groups: Dict[str, Dict[str,
@@ -265,3 +314,41 @@ def _trim(l):
265314
for p in places:
266315
res.append({'dcid': p.dcid, 'name': p.name, 'types': [p.place_type]})
267316
return _trim(res)
317+
318+
319+
def generate_follow_up_questions(query: str,
                                 related_topics: List[str]) -> List[str]:
  """Generates follow up questions based on the initial query and related topics.

  Calls Gemini with _RELATED_QUESTIONS_PROMPT, retrying up to
  _GEMINI_CALL_RETRIES times on any failure.

  Args:
    query: The initial query made by the user.
    related_topics: The topics related to the initial query.

  Returns:
    A list of follow up questions as strings. Empty if either input is
    missing or all Gemini attempts fail.
  """
  if not related_topics or not query:
    return []

  gemini_api_key = current_app.config["LLM_API_KEY"]
  gemini = genai.Client(api_key=gemini_api_key)
  for _ in range(_GEMINI_CALL_RETRIES):
    try:
      gemini_response = gemini.models.generate_content(
          model=_GEMINI_MODEL,
          contents=_RELATED_QUESTIONS_PROMPT.format(
              initial_query=query, related_topics=related_topics),
          config={
              "response_mime_type": "application/json",
              "response_schema": FollowUpQuestions
          })

      # `parsed` is None when the model output fails schema validation;
      # treat that as a retryable failure instead of letting the implicit
      # AttributeError fall through to the generic handler below.
      if gemini_response.parsed is None:
        logging.error(
            '[explore_follow_up_questions]: Initial Query: %s | Related Topics: %s | Gemini response failed schema parsing.',
            query, related_topics)
        continue
      return gemini_response.parsed.questions
    except Exception as e:
      # Broad catch is deliberate: any transport/SDK error should trigger a
      # retry rather than surface to the caller. Lazy %-args avoid eager
      # string formatting when logging is disabled.
      logging.error(
          '[explore_follow_up_questions]: Initial Query: %s | Related Topics: %s | Exception Caught: %s',
          query,
          related_topics,
          e,
          exc_info=True)

  return []

server/routes/explore/api.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,22 +14,26 @@
1414
"""Endpoints for Datacommons NL"""
1515

1616
import copy
17+
import json
1718
import time
1819
from typing import Dict
1920

2021
import flask
2122
from flask import Blueprint
2223
from flask import current_app
2324
from flask import request
25+
from flask import Response
2426

2527
from server.lib.nl.common import serialize
28+
import server.lib.nl.common.bad_words as bad_words
2629
import server.lib.nl.common.constants as constants
2730
import server.lib.nl.common.counters as ctr
2831
import server.lib.nl.common.utils as utils
2932
import server.lib.nl.common.utterance as nl_utterance
3033
import server.lib.nl.config_builder.base as config_builder
3134
import server.lib.nl.detection.detector as nl_detector
3235
from server.lib.nl.detection.utils import create_utterance
36+
from server.lib.nl.explore import related
3337
import server.lib.nl.explore.fulfiller_bridge as nl_fulfillment
3438
from server.lib.nl.explore.params import Clients
3539
from server.lib.nl.explore.params import DCNames
@@ -130,6 +134,40 @@ def detect_and_fulfill():
130134
return _fulfill_with_chart_config(utterance, debug_logs)
131135

132136

137+
#
# The follow up question endpoint that generates questions
# based on the initial query and topics found in the related topics.
#
@bp.route('/follow-up-questions', methods=['POST'])
def follow_up_questions():
  """Generates follow up questions for a query and its related topics.

  Expects a JSON body with:
    q: The initial user query (required).
    relatedTopics: A list of topics related to the query (required).

  Returns:
    200 with {'follow_up_questions': [...]} on success (questions failing
    the NL bad-words check are filtered out), or 400 with {'error': ...}
    when either input is missing.
  """
  # Parse the body exactly once; silent=True returns None (instead of
  # raising) for a missing or malformed JSON body so we can reply with our
  # own clean 400 below.
  request_body = request.get_json(silent=True) or {}
  initial_query = request_body.get('q', '')
  related_topics = request_body.get('relatedTopics', [])

  if not initial_query:
    return Response(json.dumps({'error': 'Missing query in request.'}),
                    400,
                    mimetype="application/json")
  if not related_topics:
    return Response(json.dumps({'error': 'Missing related topics in request.'}),
                    400,
                    mimetype="application/json")

  generated_questions = related.generate_follow_up_questions(
      query=initial_query, related_topics=related_topics)

  # Checks for adversarial questions
  safe_generated_questions = [
      question for question in generated_questions
      if bad_words.is_safe(query=question,
                           bad_words=current_app.config['NL_BAD_WORDS'])
  ]

  return Response(json.dumps({'follow_up_questions': safe_generated_questions}),
                  200,
                  mimetype="application/json")
169+
170+
133171
#
134172
# Given an utterance constructed either from a query or from dcids,
135173
# fulfills it into charts.
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import unittest
16+
from unittest.mock import DEFAULT
17+
from unittest.mock import patch
18+
19+
from server.lib.nl.common.bad_words import EMPTY_BANNED_WORDS
20+
from server.lib.nl.explore.related import generate_follow_up_questions
21+
from web_app import app
22+
23+
24+
class TestFollowUpQuestions(unittest.TestCase):
  """Tests for the follow-up-questions endpoint and its Gemini helper."""

  @patch('server.routes.explore.api.related.generate_follow_up_questions',
         autospec=True)
  def test_follow_up_questions_typical(self, mock):
    """Endpoint returns the generated questions verbatim on the happy path."""
    query = "What is the rate of education in El Paso?"
    related_topics = [
        "Educational Attachment", "School Type", "Housing", "Commute"
    ]

    expected_questions = [
        'What is the school dropout rate in El Paso?',
        'What is the distribution of students by school type in El Paso?',
        'What is the rate of homeownership in El Paso?',
        'What is the average commute time in El Paso?'
    ]
    mock.return_value = expected_questions

    # NOTE(review): bad_words.is_safe is not mocked here — presumably the
    # test app's NL_BAD_WORDS config passes all of these questions; verify.
    resp = app.test_client().post('api/explore/follow-up-questions',
                                  json={
                                      'q': query,
                                      'relatedTopics': related_topics
                                  })

    assert resp.status_code == 200
    assert resp.json['follow_up_questions'] == expected_questions

  @patch('server.routes.explore.api.bad_words.is_safe', autospec=True)
  @patch('server.routes.explore.api.related.generate_follow_up_questions',
         autospec=True)
  def test_follow_up_questions_excludes_adversarial(self, mock_gemini,
                                                    mock_safe):
    """Questions flagged unsafe by bad_words.is_safe are filtered out."""
    query = "What is the rate of education in El Paso?"
    related_topics = [
        "Educational Attachment", "School Type", "Housing", "Commute"
    ]

    expected_questions = [
        'What is the school dropout rate in El Paso?',
        'What is the distribution of students by school type in El Paso?',
        'What is the rate of homeownership in El Paso?',
        'What is the average commute time in El Paso?'
    ]
    mock_gemini.return_value = expected_questions
    #Labels do not match the actual adversarial label of each query, it is simply for testing purposes.
    # is_safe is called once per question in order, so only questions at
    # indices 1 and 3 survive the filter.
    mock_safe.side_effect = [False, True, False, True]

    resp = app.test_client().post('api/explore/follow-up-questions',
                                  json={
                                      'q': query,
                                      'relatedTopics': related_topics
                                  })

    assert resp.status_code == 200
    # expected_questions[1:4:2] == indices 1 and 3, matching the True labels.
    assert resp.json['follow_up_questions'] == expected_questions[1:4:2]

  def test_follow_up_questions_empty_related_topics(self):
    """Missing relatedTopics yields a 400 with a specific error message."""
    query = "What is the rate of education in El Paso?"
    related_topics = []
    resp = app.test_client().post('api/explore/follow-up-questions',
                                  json={
                                      'q': query,
                                      'relatedTopics': related_topics
                                  })

    assert resp.status_code == 400
    assert resp.json['error'] == 'Missing related topics in request.'

  def test_follow_up_questions_empty_query(self):
    """Missing query yields a 400 with a specific error message."""
    query = ""
    related_topics = ["Housing", "Commute"]
    resp = app.test_client().post('api/explore/follow-up-questions',
                                  json={
                                      'q': query,
                                      'relatedTopics': related_topics
                                  })

    assert resp.status_code == 400
    assert resp.json['error'] == 'Missing query in request.'

  @patch('server.routes.explore.api.related.generate_follow_up_questions',
         autospec=True)
  def test_follow_up_questions_error_gemini_call(self, mock):
    """A failed Gemini call (helper returns []) still yields a 200 with []."""
    query = "What is the rate of education in El Paso?"
    related_topics = [
        "Educational Attachment", "School Type", "Housing", "Commute"
    ]
    expected_questions = []

    mock.return_value = expected_questions
    # Use empty banned words so the bad-words filter is a no-op here.
    app.config['NL_BAD_WORDS'] = EMPTY_BANNED_WORDS
    resp = app.test_client().post('api/explore/follow-up-questions',
                                  json={
                                      'q': query,
                                      'relatedTopics': related_topics
                                  })

    assert resp.status_code == 200
    assert resp.json['follow_up_questions'] == expected_questions

  @patch('google.genai.Client', autospec=True)
  def test_generate_follow_up_questions_typical(self, mock_gemini):
    """Helper returns the parsed questions from a successful Gemini call."""
    query = "What is the rate of education in El Paso?"
    related_topics = [
        "Educational Attachment", "School Type", "Housing", "Commute"
    ]
    expected_questions = [
        'What is the school dropout rate in El Paso?',
        'What is the distribution of students by school type in El Paso?',
        'What is the rate of homeownership in El Paso?',
        'What is the average commute time in El Paso?'
    ]
    mock_gemini.return_value.models.generate_content.return_value.parsed.questions = expected_questions
    # App context is needed because the helper reads current_app.config.
    app.config['LLM_API_KEY'] = ""
    with app.app_context():
      assert expected_questions == generate_follow_up_questions(
          query=query, related_topics=related_topics)

  @patch('google.genai.Client', autospec=True)
  def test_generate_follow_up_questions_retry_once(self, mock_gemini):
    """Helper retries after one failed call and succeeds on the second."""
    query = "What is the rate of education in El Paso?"
    related_topics = [
        "Educational Attachment", "School Type", "Housing", "Commute"
    ]
    expected_questions = [
        'What is the school dropout rate in El Paso?',
        'What is the distribution of students by school type in El Paso?',
        'What is the rate of homeownership in El Paso?',
        'What is the average commute time in El Paso?'
    ]
    mock_gemini.return_value.models.generate_content.return_value.parsed.questions = expected_questions
    # First call returns None (a failure the helper must retry past);
    # DEFAULT makes the second call fall back to return_value above.
    mock_gemini.return_value.models.generate_content.side_effect = [
        None, DEFAULT
    ]
    app.config['LLM_API_KEY'] = ""
    with app.app_context():
      assert expected_questions == generate_follow_up_questions(
          query=query, related_topics=related_topics)

  @patch('google.genai.Client', autospec=True)
  def test_generate_follow_up_questions_error_request(self, mock_gemini):
    """Helper returns [] after exhausting all retries (3 failures)."""
    query = "What is the rate of education in Mountain View?"
    related_topics = ["Education"]
    # All _GEMINI_CALL_RETRIES attempts return None -> every attempt fails.
    mock_gemini.return_value.models.generate_content.side_effect = [
        None, None, None
    ]
    app.config['LLM_API_KEY'] = ""
    with app.app_context():
      assert [] == generate_follow_up_questions(query=query,
                                                related_topics=related_topics)

  def test_generate_follow_up_questions_empty_related_topics(self):
    """Helper short-circuits to [] when related_topics is empty."""
    query = "What is the rate of education in Mountain View?"
    related_topics = []
    assert [] == generate_follow_up_questions(query=query,
                                              related_topics=related_topics)

  def test_generate_follow_up_questions_empty_query(self):
    """Helper short-circuits to [] when the query is empty."""
    query = ""
    related_topics = ["Housing", "Commute"]
    assert [] == generate_follow_up_questions(query=query,
                                              related_topics=related_topics)

0 commit comments

Comments
 (0)