Skip to content

Commit c8b3b4b

Browse files
fix(application_html_formatters): sanitize zalgo text before save
1 parent f64a999 commit c8b3b4b

4 files changed

Lines changed: 74 additions & 1 deletion

File tree

app/helpers/application_html_formatters_helper.rb

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,15 @@ def self.build_html_pipeline(custom_options)
145145
{ node_whitelist: [node] }
146146
end.freeze
147147

148+
# Sanitize transformer that caps every run of Unicode combining marks (\p{M}) at three.
# Runs longer than that are treated as Zalgo text and cut down to their first three marks;
# legitimate accents (e.g. "Café", "Niño") and stacks of up to three marks are left alone.
#
# @param env [Hash] transformer environment; only env[:node] is read.
# @return [void] the text node is updated in place; non-text nodes are ignored.
ZALGO_TEXT_TRANSFORMER = lambda do |env|
  node = env[:node]
  return unless node.text?

  text = node.content
  # Only rewrite nodes that really contain an over-long run, so clean text is never mutated.
  return unless text.match?(/\p{M}{4}/)

  node.content = text.gsub(/(\p{M}{3})\p{M}+/, '\1')
end.freeze
156+
148157
# - Allow whitelisting of base64 encoded images for HTML text.
149158
# TODO: Remove 'data' from whitelisted protocols once we disable Base64 encoding
150159
IMAGE_WHITELIST_TRANSFORMER = lambda do |env|
@@ -179,7 +188,7 @@ def self.build_html_pipeline(custom_options)
179188
'margin-bottom', 'margin-left', 'margin-right', 'margin-top', 'text-align',
180189
'width', 'list-style-type'
181190
] }
182-
list[:transformers] |= [VIDEO_WHITELIST_TRANSFORMER, IMAGE_WHITELIST_TRANSFORMER].freeze
191+
list[:transformers] |= [ZALGO_TEXT_TRANSFORMER, VIDEO_WHITELIST_TRANSFORMER, IMAGE_WHITELIST_TRANSFORMER].freeze
183192
list
184193
end.freeze
185194

app/models/course/assessment/live_feedback/message.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,10 @@ class Course::Assessment::LiveFeedback::Message < ApplicationRecord
1414
validates :content, exclusion: { in: [nil] }
1515
validates :creator_id, presence: true
1616
validates :created_at, presence: true
17+
18+
before_save :sanitize_text
19+
20+
# before_save hook: swaps +content+ for the result of the shared CKEditor rich-text
# sanitizer so the cleaned value is what gets persisted.
# NOTE(review): presumably +content+ holds HTML; confirm that plain-text messages are not
# altered in unwanted ways by HTML sanitization.
def sanitize_text
  sanitizer = ApplicationController.helpers
  self.content = sanitizer.sanitize_ckeditor_rich_text(content)
end
1723
end

app/models/course/assessment/live_feedback_comment.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,10 @@ class Course::Assessment::LiveFeedbackComment < ApplicationRecord
44

55
validates :line_number, presence: true
66
validates :comment, presence: true
7+
8+
before_save :sanitize_text
9+
10+
# before_save hook: swaps +comment+ for the result of the shared CKEditor rich-text
# sanitizer so the cleaned value is what gets persisted.
# NOTE(review): presumably +comment+ holds HTML; confirm that plain-text comments are not
# altered in unwanted ways by HTML sanitization.
def sanitize_text
  sanitizer = ApplicationController.helpers
  self.comment = sanitizer.sanitize_ckeditor_rich_text(comment)
end
713
end

spec/helpers/application_formatters_helper_spec.rb

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,58 @@ def hello:
258258
expect(helper.sanitize('<script/>')).to be_empty
259259
end
260260
end
261+
262+
# Covers the Zalgo-text handling of the rich-text sanitizer: runs of up to three combining
# marks must survive, longer runs must be cut down to their first three marks.
describe '#sanitize_ckeditor_rich_text' do
  it 'leaves plain text without combining marks unchanged' do
    html = '<p>Hello World</p>'
    expect(helper.sanitize_ckeditor_rich_text(html)).to include('Hello World')
  end

  it 'preserves single combining marks (normal accented characters)' do
    html = "<p>Cafe\u0301 and Nin\u0303o</p>"
    result = helper.sanitize_ckeditor_rich_text(html)
    expect(result).to include("e\u0301") # é via combining mark
    expect(result).to include("n\u0303") # ñ via combining mark
  end

  # The sample is Vietnamese written with decomposed (stacked) diacritics.
  it 'preserves multiple combining marks (e.g. stacked Vietnamese diacritics)' do
    html = "<p>Ta\u0302\u0301t ca\u0309 mo\u0323i ngu\u031bo\u031b\u0300i sinh ra \u0111e\u0302\u0300u</p>"
    result = helper.sanitize_ckeditor_rich_text(html)
    expect(result).to include("\u0302\u0301") # 2 combining marks on 'a'
    expect(result).to include("\u0309") # 1 combining mark on 'a'
    expect(result).to include("\u0323") # 1 combining mark on 'o'
    expect(result).to include("\u031b\u0300") # 2 combining marks on 'o'
    expect(result).to include("\u0302\u0300") # 2 combining marks on 'e'
  end

  it 'preserves text with exactly 3 combining marks' do
    html = "<p>e\u0300\u0301\u0302</p>"
    result = helper.sanitize_ckeditor_rich_text(html)
    expect(result).to match(/\p{M}{3}/)
    expect(result).not_to match(/\p{M}{4}/)
  end

  it 'collapses 4 combining marks down to 3' do
    html = "<p>e\u0300\u0301\u0302\u0303</p>"
    result = helper.sanitize_ckeditor_rich_text(html)
    # The first three marks must survive; only the excess is dropped.
    expect(result).to include("e\u0300\u0301\u0302")
    expect(result).not_to match(/\p{M}{4,}/)
  end

  it 'collapses Zalgo-style text with many combining marks down to 3' do
    # 10 combining marks on a single base character
    zalgo = "e#{(0x0300..0x0309).map { |cp| cp.chr(Encoding::UTF_8) }.join}"
    html = "<p>#{zalgo}</p>"
    result = helper.sanitize_ckeditor_rich_text(html)
    expect(result).to include("e\u0300\u0301\u0302")
    expect(result).not_to match(/\p{M}{4,}/)
  end

  it 'handles multiple Zalgo sequences in the same text node' do
    combining_run = (0x0300..0x0309).map { |cp| cp.chr(Encoding::UTF_8) }.join
    html = "<p>a#{combining_run} and b#{combining_run}</p>"
    result = helper.sanitize_ckeditor_rich_text(html)
    # Each run is trimmed independently to its own first three marks.
    expect(result).to include("a\u0300\u0301\u0302 and b\u0300\u0301\u0302")
    expect(result).not_to match(/\p{M}{4,}/)
  end
end
261313
end
262314

263315
describe 'user display helper' do

0 commit comments

Comments
 (0)