Skip to content

Commit

Permalink
FIX: Handle all UTF-8 characters (discourse#21344)
Browse files Browse the repository at this point in the history
Watched words were converted to regular expressions containing \W, which
handled only ASCII characters. Using [^[:word:]] instead ensures that
UTF-8 characters are also handled correctly.
  • Loading branch information
nbianca authored May 15, 2023
1 parent 23a146a commit 9a27803
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 33 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
export function createWatchedWordRegExp(word) {
const caseFlag = word.case_sensitive ? "" : "i";
return new RegExp(word.regexp, `${caseFlag}g`);
return new RegExp(word.regexp, `${caseFlag}gu`);
}

export function toWatchedWord(regexp) {
Expand Down
6 changes: 3 additions & 3 deletions app/serializers/site_serializer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def topic_featured_link_allowed_category_ids
end

def censored_regexp
WordWatcher.serializable_word_matcher_regexp(:censor)
WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js)
end

def custom_emoji_translation
Expand All @@ -221,11 +221,11 @@ def include_shared_drafts_category_id?
end

def watched_words_replace
WordWatcher.word_matcher_regexps(:replace)
WordWatcher.word_matcher_regexps(:replace, engine: :js)
end

def watched_words_link
WordWatcher.word_matcher_regexps(:link)
WordWatcher.word_matcher_regexps(:link, engine: :js)
end

def categories
Expand Down
2 changes: 1 addition & 1 deletion app/serializers/watched_word_serializer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ class WatchedWordSerializer < ApplicationSerializer
attributes :id, :word, :regexp, :replacement, :action, :case_sensitive

def regexp
WordWatcher.word_to_regexp(word, whole: true)
WordWatcher.word_to_regexp(word)
end

def action
Expand Down
54 changes: 31 additions & 23 deletions app/services/word_watcher.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,34 +44,31 @@ def self.get_cached_words(action)
end
end

def self.serializable_word_matcher_regexp(action)
word_matcher_regexp_list(action).map { |r| { r.source => { case_sensitive: !r.casefold? } } }
def self.serializable_word_matcher_regexp(action, engine: :ruby)
word_matcher_regexp_list(action, engine: engine).map do |r|
{ r.source => { case_sensitive: !r.casefold? } }
end
end

# This regexp is run in miniracer, and the client JS app
# Make sure it is compatible with major browsers when changing
# hint: non-chrome browsers do not support 'lookbehind'
def self.word_matcher_regexp_list(action, raise_errors: false)
def self.word_matcher_regexp_list(action, engine: :ruby, raise_errors: false)
words = get_cached_words(action)
return [] if words.blank?

grouped_words = { case_sensitive: [], case_insensitive: [] }

words.each do |w, attrs|
word = word_to_regexp(w)
word = "(#{word})" if SiteSetting.watched_words_regular_expressions?

words.each do |word, attrs|
word = word_to_regexp(word, whole: SiteSetting.watched_words_regular_expressions?)
group_key = attrs[:case_sensitive] ? :case_sensitive : :case_insensitive
grouped_words[group_key] << word
end

regexps = grouped_words.select { |_, w| w.present? }.transform_values { |w| w.join("|") }

if !SiteSetting.watched_words_regular_expressions?
regexps.transform_values! do |regexp|
regexp = "(#{regexp})"
"(?:\\W|^)#{regexp}(?=\\W|$)"
end
regexps.transform_values! { |regexp| wrap_regexp(regexp, engine: engine) }
end

regexps.map { |c, regexp| Regexp.new(regexp, c == :case_sensitive ? nil : Regexp::IGNORECASE) }
Expand All @@ -80,29 +77,42 @@ def self.word_matcher_regexp_list(action, raise_errors: false)
[] # Admin will be alerted via admin_dashboard_data.rb
end

def self.word_matcher_regexps(action)
def self.word_matcher_regexps(action, engine: :ruby)
if words = get_cached_words(action)
words.map { |w, opts| [word_to_regexp(w, whole: true), opts] }.to_h
words.map { |word, attrs| [word_to_regexp(word, engine: engine), attrs] }.to_h
end
end

def self.word_to_regexp(word, whole: false)
def self.word_to_regexp(word, engine: :ruby, whole: true)
if SiteSetting.watched_words_regular_expressions?
# Strip ruby regexp format if present
# Strip Ruby regexp format if present
regexp = word.start_with?("(?-mix:") ? word[7..-2] : word
regexp = "(#{regexp})" if whole
return regexp
end

regexp = Regexp.escape(word).gsub("\\*", '\S*')
# Escape regular expression. Avoid using Regexp.escape because it escapes
# more characters than it should (for example, whitespaces)
regexp = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1')

if whole && !SiteSetting.watched_words_regular_expressions?
regexp = "(?:\\W|^)(#{regexp})(?=\\W|$)"
end
# Handle wildcards
regexp = regexp.gsub("\\*", '\S*')

regexp = wrap_regexp(regexp, engine: engine) if whole

regexp
end

# Wraps +regexp+ so that it only matches when delimited on both sides by a
# boundary character or by the start/end of a line.
#
# The leading boundary is consumed by a non-capturing group, the wrapped
# pattern becomes capture group 1, and the trailing boundary is checked with
# a lookahead (so it is not consumed).
#
# The boundary character class depends on the target regexp engine:
# * +:js+   - "\P{L}" (Unicode property escape: anything that is not a letter)
# * +:ruby+ - "[^[:word:]]" (negated POSIX bracket expression)
# * other   - "\W"
#
# @param regexp [String] regexp source to wrap (interpolated as-is)
# @param engine [Symbol] regexp flavour to generate for; defaults to +:ruby+
# @return [String] the wrapped regexp source
def self.wrap_regexp(regexp, engine: :ruby)
  boundary =
    case engine
    when :js then "\\P{L}"
    when :ruby then "[^[:word:]]"
    else "\\W"
    end

  "(?:#{boundary}|^)(#{regexp})(?=#{boundary}|$)"
end

def self.word_matcher_regexp_key(action)
"watched-words-list:v#{CACHE_VERSION}:#{action}"
end
Expand Down Expand Up @@ -212,10 +222,8 @@ def word_matches_for_action?(action, all_matches: false)
end

def word_matches?(word, case_sensitive: false)
Regexp.new(
WordWatcher.word_to_regexp(word, whole: true),
case_sensitive ? nil : Regexp::IGNORECASE,
).match?(@raw)
options = case_sensitive ? nil : Regexp::IGNORECASE
Regexp.new(WordWatcher.word_to_regexp(word), options).match?(@raw)
end

def self.replace_text_with_regexp(text, regexp, replacement)
Expand Down
6 changes: 3 additions & 3 deletions lib/pretty_text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -204,9 +204,9 @@ def self.markdown(text, opts = {})
__optInput.emojiUnicodeReplacer = __emojiUnicodeReplacer;
__optInput.emojiDenyList = #{Emoji.denied.to_json};
__optInput.lookupUploadUrls = __lookupUploadUrls;
__optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor).to_json};
__optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace).to_json};
__optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link).to_json};
__optInput.censoredRegexp = #{WordWatcher.serializable_word_matcher_regexp(:censor, engine: :js).to_json};
__optInput.watchedWordsReplace = #{WordWatcher.word_matcher_regexps(:replace, engine: :js).to_json};
__optInput.watchedWordsLink = #{WordWatcher.word_matcher_regexps(:link, engine: :js).to_json};
__optInput.additionalOptions = #{Site.markdown_additional_options.to_json};
JS

Expand Down
8 changes: 8 additions & 0 deletions spec/integration/watched_words_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,14 @@ def should_block_post(manager)
should_block_post(manager)
end

# Regression check for non-ASCII word boundaries: "abc" is registered as a
# blocked watched word, but in the raw text "abcódef" it is immediately
# followed by the letter "ó", i.e. it is only a fragment of a longer word.
# The post is therefore expected to be created successfully.
it "should handle UTF-8 characters" do
block_word = Fabricate(:watched_word, action: WatchedWord.actions[:block], word: "abc")
manager =
NewPostManager.new(tl2_user, title: "Hello world", raw: "abcódef", topic_id: topic.id)

expect(manager.perform).to be_success
end

it "should block the post from admin" do
manager =
NewPostManager.new(
Expand Down
4 changes: 2 additions & 2 deletions spec/services/word_watcher_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@

expect(regexps).to be_an(Array)
expect(regexps.map(&:inspect)).to contain_exactly(
"/(?:\\W|^)(#{word1}|#{word2})(?=\\W|$)/i",
"/(?:\\W|^)(#{word3}|#{word4})(?=\\W|$)/",
"/(?:[^[:word:]]|^)(#{word1}|#{word2})(?=[^[:word:]]|$)/i",
"/(?:[^[:word:]]|^)(#{word3}|#{word4})(?=[^[:word:]]|$)/",
)
end

Expand Down

0 comments on commit 9a27803

Please sign in to comment.