-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtest-html-cleaning.sh
More file actions
executable file
·50 lines (40 loc) · 1.8 KB
/
test-html-cleaning.sh
File metadata and controls
executable file
·50 lines (40 loc) · 1.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/bin/bash
#
# Manual smoke-test helper for HTML cleaning in the RAG preprocessing step.
# It clears the stored chunks, shows a sample HTML input with the expected
# cleaned text, and prints the steps to verify the result inside the app.
#
# Optional environment overrides (defaults match the original hard-coded values):
#   NOTERAITY_DB_USER  role passed to `psql -U`  (default: akashswamy)
#   NOTERAITY_DB_NAME  database passed to `psql -d` (default: noteraity_vectors)

echo "Testing HTML cleaning in RAG preprocessing..."
echo "============================================="

# Clear existing chunks to force re-indexing
DB_USER="${NOTERAITY_DB_USER:-akashswamy}"
DB_NAME="${NOTERAITY_DB_NAME:-noteraity_vectors}"

echo "1. Clearing existing chunks from database..."
# psql's own output is discarded to keep this walkthrough readable, so the exit
# status is the only signal left: report success only when the DELETE really ran.
# ON_ERROR_STOP makes a failing statement surface as a non-zero status.
if psql -U "$DB_USER" -d "$DB_NAME" -v ON_ERROR_STOP=1 \
  -c "DELETE FROM note_chunks;" > /dev/null 2>&1; then
  echo " ✅ Chunks cleared"
else
  # Best-effort step: warn on stderr but keep going, the remaining output is
  # still useful as instructions.
  echo " ⚠️ Could not clear note_chunks in '$DB_NAME' as '$DB_USER' (is psql installed and PostgreSQL running?)" >&2
fi
echo ""
echo "2. Testing HTML content processing..."

# Create a test HTML string similar to what the user showed
TEST_HTML='<p>My Bio: Joe Sweany</p><p>Age: 33</p><p>Location: St Nalbans, Watford, near London</p><hr><p>Asdfasdf </p>'

# Save it to a uniquely named temporary file. mktemp is used instead of a fixed
# /tmp/test-html.html so the write cannot clobber (or be redirected through) a
# pre-existing file at a predictable path. The file is intentionally left in
# place so it can be opened or pasted into a note by hand.
if TEST_HTML_FILE=$(mktemp "${TMPDIR:-/tmp}/test-html.XXXXXX"); then
  # printf rather than echo: the sample is data and must be written verbatim.
  printf '%s\n' "$TEST_HTML" > "$TEST_HTML_FILE"
else
  # Not fatal: nothing later in this script reads the file back.
  TEST_HTML_FILE=""
  echo " ⚠️ Could not create a temporary file for the sample HTML" >&2
fi

echo " Input HTML:"
echo " $TEST_HTML"
if [ -n "$TEST_HTML_FILE" ]; then
  echo " Saved to: $TEST_HTML_FILE"
fi
echo ""

# The preprocessing should remove all HTML tags
echo " Expected output (no HTML tags):"
echo " My Bio: Joe Sweany Age: 33 Location: St Nalbans, Watford, near London Asdfasdf"
echo ""
# Step 3: manual verification walkthrough to perform inside the running app.
# Printed as one literal here-document (quoted delimiter, so nothing in the
# text is expanded); the trailing empty line separates it from the next section.
cat <<'EOF'
3. Instructions to test in the app:
 a. Start the app with: npm run tauri:dev
 b. Open or create a note with HTML content
 c. Save the note (triggers auto-indexing)
 d. Click the database icon in the note header
 e. Check the 'Chunks' tab in the debug modal
 f. Verify that chunks contain NO HTML tags

EOF
# Step 4: summary of the preprocessing fixes this walkthrough is meant to confirm.
echo "4. What's been fixed:"
# Name the entities literally (&nbsp;, &amp;); inside double quotes the
# ampersand and semicolon are plain characters, so no escaping is needed.
echo " ✅ HTML entities (&nbsp;, &amp;, etc.) are now decoded"
echo " ✅ All HTML tags are stripped properly"
echo " ✅ Text is extracted recursively from nested elements"
echo " ✅ Links are preserved in [url] format"
echo " ✅ Double preprocessing issue fixed"
echo ""
# Step 5: the console-log lines to look for when confirming the fix.
# A single printf emits each argument on its own line, in order.
printf '%s\n' \
  "5. To verify the fix works, check console logs for:" \
  " 📝 Preprocessing note... (shows char count reduction)" \
  " 📦 Created X chunks for note..." \
  " 🔄 Generating embedding for chunk..." \
  " ✅ Successfully generated embedding..."