From 8cb51b85506a1a9d0acd938ab0308782cdfbca27 Mon Sep 17 00:00:00 2001
From: Gray Gilmore
Date: Wed, 6 May 2026 14:04:34 -0700
Subject: [PATCH] Add Unicode whitespace filters

---
 lib/liquid/standardfilters.rb            | 60 ++++++++++++++++++++++++
 test/integration/standard_filter_test.rb | 49 +++++++++++++++++++
 2 files changed, 109 insertions(+)

diff --git a/lib/liquid/standardfilters.rb b/lib/liquid/standardfilters.rb
index ed6141566..771585c5f 100644
--- a/lib/liquid/standardfilters.rb
+++ b/lib/liquid/standardfilters.rb
@@ -36,6 +36,16 @@ module StandardFilters
       %r{}m,
     )
     STRIP_HTML_TAGS = /<.*?>/m
+    UNICODE_WHITESPACE_LEFT = /\A[[:space:]]+/
+    UNICODE_WHITESPACE_RIGHT = /[[:space:]]+\z/
+    UNICODE_WHITESPACE_EDGES = Regexp.union(UNICODE_WHITESPACE_LEFT, UNICODE_WHITESPACE_RIGHT)
+    UNICODE_WHITESPACE_RUNS = /[[:space:]]+/
+    private_constant(
+      :UNICODE_WHITESPACE_EDGES,
+      :UNICODE_WHITESPACE_LEFT,
+      :UNICODE_WHITESPACE_RIGHT,
+      :UNICODE_WHITESPACE_RUNS,
+    )
 
     class << self
       def try_coerce_encoding(input, encoding:)
@@ -315,6 +325,56 @@ def squish(input)
       Utils.to_s(input).strip.gsub(/\s+/, ' ')
     end
 
+    # @liquid_public_docs
+    # @liquid_type filter
+    # @liquid_category string
+    # @liquid_summary
+    #   Removes leading and trailing whitespace, Unicode whitespace included, from a string.
+    # @liquid_syntax string | strip_whitespace
+    # @liquid_return [string]
+    def strip_whitespace(input)
+      padded = Utils.to_s(input).gsub(UNICODE_WHITESPACE_EDGES, ' ')
+      padded.strip
+    end
+
+    # @liquid_public_docs
+    # @liquid_type filter
+    # @liquid_category string
+    # @liquid_summary
+    #   Removes leading whitespace, Unicode whitespace included, from a string.
+    # @liquid_syntax string | lstrip_whitespace
+    # @liquid_return [string]
+    def lstrip_whitespace(input)
+      padded = Utils.to_s(input).gsub(UNICODE_WHITESPACE_LEFT, ' ')
+      padded.lstrip
+    end
+
+    # @liquid_public_docs
+    # @liquid_type filter
+    # @liquid_category string
+    # @liquid_summary
+    #   Removes trailing whitespace, Unicode whitespace included, from a string.
+    # @liquid_syntax string | rstrip_whitespace
+    # @liquid_return [string]
+    def rstrip_whitespace(input)
+      padded = Utils.to_s(input).gsub(UNICODE_WHITESPACE_RIGHT, ' ')
+      padded.rstrip
+    end
+
+    # @liquid_public_docs
+    # @liquid_type filter
+    # @liquid_category string
+    # @liquid_summary
+    #   Collapses each run of whitespace, Unicode included, to one ASCII space and trims both ends.
+    # @liquid_syntax string | squish_whitespace
+    # @liquid_return [string]
+    def squish_whitespace(input)
+      return if input.nil?
+
+      collapsed = Utils.to_s(input).gsub(UNICODE_WHITESPACE_RUNS, ' ')
+      collapsed.strip
+    end
+
     # @liquid_public_docs
     # @liquid_type filter
     # @liquid_category string
diff --git a/test/integration/standard_filter_test.rb b/test/integration/standard_filter_test.rb
index 94097ae1f..8d3f42a6e 100644
--- a/test/integration/standard_filter_test.rb
+++ b/test/integration/standard_filter_test.rb
@@ -171,6 +171,19 @@ def test_squish_filter
     assert_equal("", Liquid::Template.parse('{{ " " | squish }}').render)
   end
 
+  def test_squish_whitespace_filter
+    padding = "\u00A0\u202F\u2009\u2007"
+    messy = "#{padding}foo\u202F\u2009bar\t\n\u2007boo#{padding}"
+    zero_width = "\u200Bfoo\u200B"
+    template = "{{ source | squish_whitespace }}"
+
+    assert_template_result("foo bar boo", template, { 'source' => messy })
+    assert_template_result("", "{{ nil | squish_whitespace }}")
+    assert_template_result("", template, { 'source' => padding })
+    # U+200B (zero width space) is not Unicode whitespace, so it is left alone.
+    assert_template_result(zero_width, template, { 'source' => zero_width })
+  end
+
   def test_escape
     assert_equal('<strong>', @filters.escape(''))
     assert_equal('1', @filters.escape(1))
@@ -705,16 +718,52 @@ def test_strip
     assert_template_result('ab c', "{{ source | strip }}", { 'source' => " \tab c \n \t" })
   end
 
+  def test_strip_whitespace
+    padding = "\u00A0\u202F\u2009\u2007"
+    template = "{{ source | strip_whitespace }}"
+
+    assert_template_result('ab c', template, { 'source' => "#{padding}ab c#{padding}" })
+
+    # Interior Unicode whitespace and U+200B (not whitespace) must pass through unchanged.
+    ["a\u00A0b\u202Fc", "\u200Bfoo\u200B"].each do |unchanged|
+      assert_template_result(unchanged, template, { 'source' => unchanged })
+    end
+  end
+
   def test_lstrip
     assert_template_result('ab c ', "{{ source | lstrip }}", { 'source' => " ab c " })
     assert_template_result("ab c \n \t", "{{ source | lstrip }}", { 'source' => " \tab c \n \t" })
   end
 
+  def test_lstrip_whitespace
+    padding = "\u00A0\u202F\u2009\u2007"
+    template = "{{ source | lstrip_whitespace }}"
+
+    assert_template_result("ab c#{padding}", template, { 'source' => "#{padding}ab c#{padding}" })
+
+    # Interior Unicode whitespace and U+200B (not whitespace) must pass through unchanged.
+    ["a\u00A0b\u202Fc", "\u200Bfoo\u200B"].each do |unchanged|
+      assert_template_result(unchanged, template, { 'source' => unchanged })
+    end
+  end
+
   def test_rstrip
     assert_template_result(" ab c", "{{ source | rstrip }}", { 'source' => " ab c " })
     assert_template_result(" \tab c", "{{ source | rstrip }}", { 'source' => " \tab c \n \t" })
   end
 
+  def test_rstrip_whitespace
+    padding = "\u00A0\u202F\u2009\u2007"
+    template = "{{ source | rstrip_whitespace }}"
+
+    assert_template_result("#{padding}ab c", template, { 'source' => "#{padding}ab c#{padding}" })
+
+    # Interior Unicode whitespace and U+200B (not whitespace) must pass through unchanged.
+    ["a\u00A0b\u202Fc", "\u200Bfoo\u200B"].each do |unchanged|
+      assert_template_result(unchanged, template, { 'source' => unchanged })
+    end
+  end
+
   def test_strip_newlines
     assert_template_result('abc', "{{ source | strip_newlines }}", { 'source' => "a\nb\nc" })
     assert_template_result('abc', "{{ source | strip_newlines }}", { 'source' => "a\r\nb\nc" })