Skip to content

Commit 50dd2f7

Browse files
jsb2092 and claude committed
Add duplicate detection for AI question generation
Two-phase duplicate detection: (1) Pre-generation: tells the AI provider (Claude or OpenAI) about existing questions to avoid. (2) Post-generation: filters out questions that are at least 85% similar to existing ones. The frontend now sends course_id and tag_name for duplicate detection. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 2a5a595 commit 50dd2f7

2 files changed

Lines changed: 63 additions & 10 deletions

File tree

questionbank/ai_tools/views.py

Lines changed: 56 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
from rest_framework import status
44
from rest_framework.parsers import MultiPartParser, FormParser
55
from django.conf import settings
6-
from questions.models import Question
6+
from questions.models import Question, Course
77
from questions.serializers import QuestionDetailSerializer
88
import json
99
import io
10+
import difflib
1011

1112

1213
class GenerateQuestionsView(APIView):
@@ -19,20 +20,38 @@ def post(self, request):
1920
count = min(int(request.data.get('count', 5)), 20) # Max 20 at a time
2021
difficulty = request.data.get('difficulty', 'medium')
2122
examples = request.data.get('examples', []) # Example questions for style
23+
course_id = request.data.get('course_id') # For duplicate detection
24+
tag_name = request.data.get('tag_name') # For duplicate detection
2225

2326
if not content:
2427
return Response({'error': 'Content is required'}, status=status.HTTP_400_BAD_REQUEST)
2528

29+
# Fetch existing questions for duplicate detection
30+
existing_questions = []
31+
if course_id:
32+
qs = Question.objects.filter(course_id=course_id, deleted_at__isnull=True)
33+
if tag_name:
34+
qs = qs.filter(tags__name=tag_name)
35+
existing_questions = list(qs.values_list('text', flat=True)[:100]) # Limit to 100 for prompt size
36+
2637
try:
27-
print(f"[AI Generate] Provider: {provider}, Type: {question_type}, Count: {count}, Content length: {len(content)}")
38+
print(f"[AI Generate] Provider: {provider}, Type: {question_type}, Count: {count}, Content length: {len(content)}, Existing: {len(existing_questions)}")
2839

2940
if provider == 'claude':
30-
questions = self._generate_with_claude(content, question_type, count, difficulty, examples)
41+
questions = self._generate_with_claude(content, question_type, count, difficulty, examples, existing_questions)
3142
elif provider == 'openai':
32-
questions = self._generate_with_openai(content, question_type, count, difficulty, examples)
43+
questions = self._generate_with_openai(content, question_type, count, difficulty, examples, existing_questions)
3344
else:
3445
return Response({'error': 'Invalid provider'}, status=status.HTTP_400_BAD_REQUEST)
3546

47+
# Post-generation duplicate filtering
48+
if existing_questions:
49+
original_count = len(questions)
50+
questions = self._filter_duplicates(questions, existing_questions)
51+
filtered_count = original_count - len(questions)
52+
if filtered_count > 0:
53+
print(f"[AI Generate] Filtered {filtered_count} duplicate questions")
54+
3655
print(f"[AI Generate] Success: {len(questions)} questions generated")
3756
return Response({'questions': questions})
3857
except Exception as e:
@@ -41,7 +60,25 @@ def post(self, request):
4160
traceback.print_exc()
4261
return Response({'error': str(e)}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
4362

44-
def _build_prompt(self, content, question_type, count, difficulty, examples):
63+
def _filter_duplicates(self, new_questions, existing_texts, threshold=0.85):
64+
"""Filter out questions that are too similar to existing ones."""
65+
unique_questions = []
66+
for q in new_questions:
67+
new_text = q.get('text', '').lower().strip()
68+
is_duplicate = False
69+
for existing_text in existing_texts:
70+
existing_lower = existing_text.lower().strip()
71+
# Use sequence matcher for similarity
72+
similarity = difflib.SequenceMatcher(None, new_text, existing_lower).ratio()
73+
if similarity >= threshold:
74+
print(f"[Duplicate] Skipping question (similarity={similarity:.2f}): {new_text[:50]}...")
75+
is_duplicate = True
76+
break
77+
if not is_duplicate:
78+
unique_questions.append(q)
79+
return unique_questions
80+
81+
def _build_prompt(self, content, question_type, count, difficulty, examples, existing_questions=None):
4582
type_instructions = {
4683
'multipleChoice': 'Create multiple choice questions with exactly 4 options (1 correct, 3 wrong).',
4784
'trueFalse': 'Create true/false questions.',
@@ -55,6 +92,15 @@ def _build_prompt(self, content, question_type, count, difficulty, examples):
5592
for i, ex in enumerate(examples[:3], 1):
5693
example_text += f"\nExample {i}:\n{ex}\n"
5794

95+
# Add existing questions to avoid duplicates
96+
existing_text = ""
97+
if existing_questions:
98+
existing_text = "\n\nIMPORTANT: The following questions already exist. DO NOT create questions that are similar to these:\n"
99+
for i, eq in enumerate(existing_questions[:30], 1): # Limit to 30 to save tokens
100+
truncated = eq[:200] + "..." if len(eq) > 200 else eq
101+
existing_text += f"- {truncated}\n"
102+
existing_text += "\nCreate DIFFERENT questions that cover other aspects of the material.\n"
103+
58104
# Handle mixed question types
59105
if question_type == 'mixed':
60106
type_instruction = """Create a MIX of different question types. Include a variety of:
@@ -120,7 +166,7 @@ def _build_prompt(self, content, question_type, count, difficulty, examples):
120166
prompt = f"""Generate {count} {difficulty} difficulty questions based on the following content.
121167
122168
{type_instruction}
123-
169+
{existing_text}
124170
Return your response as a JSON array with this structure:
125171
{json_format}
126172
@@ -159,14 +205,14 @@ def _parse_json_response(self, response_text):
159205
preview = response_text[:500] if len(response_text) > 500 else response_text
160206
raise ValueError(f"Failed to parse AI response as JSON: {e}. Response preview: {preview}")
161207

162-
def _generate_with_claude(self, content, question_type, count, difficulty, examples):
208+
def _generate_with_claude(self, content, question_type, count, difficulty, examples, existing_questions=None):
163209
import anthropic
164210

165211
if not settings.ANTHROPIC_API_KEY:
166212
raise ValueError("Anthropic API key not configured")
167213

168214
client = anthropic.Anthropic(api_key=settings.ANTHROPIC_API_KEY)
169-
prompt = self._build_prompt(content, question_type, count, difficulty, examples)
215+
prompt = self._build_prompt(content, question_type, count, difficulty, examples, existing_questions)
170216

171217
print(f"[Claude] Sending request, prompt length: {len(prompt)}")
172218

@@ -195,14 +241,14 @@ def _generate_with_claude(self, content, question_type, count, difficulty, examp
195241

196242
return self._parse_json_response(response_text)
197243

198-
def _generate_with_openai(self, content, question_type, count, difficulty, examples):
244+
def _generate_with_openai(self, content, question_type, count, difficulty, examples, existing_questions=None):
199245
from openai import OpenAI
200246

201247
if not settings.OPENAI_API_KEY:
202248
raise ValueError("OpenAI API key not configured")
203249

204250
client = OpenAI(api_key=settings.OPENAI_API_KEY)
205-
prompt = self._build_prompt(content, question_type, count, difficulty, examples)
251+
prompt = self._build_prompt(content, question_type, count, difficulty, examples, existing_questions)
206252

207253
response = client.chat.completions.create(
208254
model="gpt-4o",

questionbank/static/js/app.js

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3129,12 +3129,19 @@
31293129
lucide.createIcons();
31303130

31313131
try {
3132+
// Get course and tag for duplicate detection
3133+
const courseCode = document.getElementById('ai-course').value;
3134+
const tagName = document.getElementById('ai-tag').value;
3135+
const course = courseCode ? courses.find(c => c.code === courseCode) : null;
3136+
31323137
const res = await api('ai/generate/', 'POST', {
31333138
provider: document.getElementById('ai-provider').value,
31343139
content: content,
31353140
type: document.getElementById('ai-type').value,
31363141
difficulty: document.getElementById('ai-difficulty').value,
31373142
count: parseInt(document.getElementById('ai-count').value),
3143+
course_id: course?.id, // For duplicate detection
3144+
tag_name: tagName && tagName !== '__new__' ? tagName : null, // For duplicate detection
31383145
});
31393146
if (res.error) throw new Error(res.error);
31403147
// Append to existing questions instead of replacing

0 commit comments

Comments
 (0)