From b6aabdc857c6ae9a5f077a8c99dd82798ddd941d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 12 Oct 2023 14:15:48 -0400 Subject: [PATCH 1/3] automata/meta: force some prefilter inlining In some ad hoc profiling, I noticed an extra function call that really didn't need to be there. --- regex-automata/src/meta/strategy.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index ea6c6ab57..5b96d888a 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -353,6 +353,7 @@ impl Pre<()> { // strategy when len(patterns)==1 if the number of literals is large. In that // case, literal extraction gives up and will return an infinite set.) impl Strategy for Pre

{ + #[cfg_attr(feature = "perf-inline", inline(always))] fn group_info(&self) -> &GroupInfo { &self.group_info } @@ -378,6 +379,7 @@ impl Strategy for Pre

{ self.pre.memory_usage() } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option { if input.is_done() { return None; @@ -393,6 +395,7 @@ impl Strategy for Pre

{ .map(|sp| Match::new(PatternID::ZERO, sp)) } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_half( &self, cache: &mut Cache, @@ -401,10 +404,12 @@ impl Strategy for Pre

{ self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end())) } + #[cfg_attr(feature = "perf-inline", inline(always))] fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { self.search(cache, input).is_some() } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, cache: &mut Cache, @@ -421,6 +426,7 @@ impl Strategy for Pre

{ Some(m.pattern()) } + #[cfg_attr(feature = "perf-inline", inline(always))] fn which_overlapping_matches( &self, cache: &mut Cache, From a94d2807927db2b9a48d67bd62c6ace4ebbbd834 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 12 Oct 2023 14:16:20 -0400 Subject: [PATCH 2/3] syntax: loosen ASCII compatible rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, patterns like `(?-u:☃)` were banned under the logic that Unicode scalar values shouldn't be available unless Unicode mode is enabled. But since patterns are required to be UTF-8, there really isn't any difficulty in just interpreting Unicode literals as their corresponding UTF-8 encoding. Note though that Unicode character classes, even things like `(?-u:[☃])`, remain banned. We probably could make character classes work too, but it's unclear how that plays with ASCII compatible mode requiring that a single byte is the fundamental atom of matching (where as Unicode mode requires that Unicode scalar values are the fundamental atom of matching). --- regex-syntax/src/hir/translate.rs | 46 +++++++------------------------ src/bytes.rs | 4 +-- 2 files changed, 12 insertions(+), 38 deletions(-) diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 2eff6318c..313a1e9e8 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -388,17 +388,10 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? { Either::Right(byte) => self.push_byte(byte), - Either::Left(ch) => { - if !self.flags().unicode() && ch.len_utf8() > 1 { - return Err( - self.error(x.span, ErrorKind::UnicodeNotAllowed) - ); - } - match self.case_fold_char(x.span, ch)? { - None => self.push_char(ch), - Some(expr) => self.push(HirFrame::Expr(expr)), - } - } + Either::Left(ch) => match self.case_fold_char(x.span, ch)? { + None => self.push_char(ch), + Some(expr) => self.push(HirFrame::Expr(expr)), + }, }, Ast::Dot(ref span) => { self.push(HirFrame::Expr(self.hir_dot(**span)?)); @@ -872,8 +865,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { })?; Ok(Some(Hir::class(hir::Class::Unicode(cls)))) } else { - if c.len_utf8() > 1 { - return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + if !c.is_ascii() { + return Ok(None); } // If case folding won't do anything, then don't bother trying. match c { @@ -1211,9 +1204,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { match self.ast_literal_to_scalar(ast)? { Either::Right(byte) => Ok(byte), Either::Left(ch) => { - let cp = u32::from(ch); - if cp <= 0x7F { - Ok(u8::try_from(cp).unwrap()) + if ch.is_ascii() { + Ok(u8::try_from(ch).unwrap()) } else { // We can't feasibly support Unicode in // byte oriented classes. Byte classes don't @@ -1661,16 +1653,7 @@ mod tests { assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a")); assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF")); - assert_eq!( - t_err("(?-u)☃"), - TestError { - kind: hir::ErrorKind::UnicodeNotAllowed, - span: Span::new( - Position::new(5, 1, 6), - Position::new(8, 1, 7) - ), - } - ); + assert_eq!(t("(?-u)☃"), hir_lit("☃")); assert_eq!( t_err(r"(?-u)\xFF"), TestError { @@ -1748,16 +1731,7 @@ mod tests { ); assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF")); - assert_eq!( - t_err("(?i-u)β"), - TestError { - kind: hir::ErrorKind::UnicodeNotAllowed, - span: Span::new( - Position::new(6, 1, 7), - Position::new(8, 1, 8), - ), - } - ); + assert_eq!(t("(?i-u)β"), hir_lit("β"),); } #[test] diff --git a/src/bytes.rs b/src/bytes.rs index 3f53a3ea5..383ac4a5b 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -68,8 +68,8 @@ bytes: 1. The `u` flag can be disabled even when disabling it might cause the regex to match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in "ASCII compatible" mode. -2. In ASCII compatible mode, neither Unicode scalar values nor Unicode -character classes are allowed. +2. In ASCII compatible mode, Unicode character classes are not allowed. Literal +Unicode scalar values outside of character classes are allowed. 3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`) revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps to `[[:digit:]]` and `\s` maps to `[[:space:]]`. From f6f73f2d19a801e29760d6b24773a120fb0ff213 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 13 Oct 2023 14:52:09 -0400 Subject: [PATCH 3/3] automata/meta: tweak reverse suffix prefilter strategy Previously, we were only use the reverse suffix optimization if it found a non-empty longest common suffix *and* if the prefilter thought itself was fast. This was a heuristic used in the old regex crate before we grew the "is prefilter fast" heuristic. We change this optimization to just use the "is prefilter fast" heuristic instead of requiring a non-empty longest common suffix. This is, after all, what the inner literal optimization does. And in the inner literal case, one should probably be even more conservative because of the extra work that needs to be done. So if things are going okay with the inner literal optimization, then we should be fine with the reverse suffix optimization doing essentially the same thing. --- regex-automata/src/meta/strategy.rs | 37 ++++++++++------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 5b96d888a..4cb3b29b9 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -1167,34 +1167,21 @@ impl ReverseSuffix { return Err(core); } let kind = core.info.config().get_match_kind(); - let suffixes = crate::util::prefilter::suffixes(kind, hirs); - let lcs = match suffixes.longest_common_suffix() { - None => { - debug!( - "skipping reverse suffix optimization because \ - a longest common suffix could not be found", - ); - return Err(core); - } - Some(lcs) if lcs.is_empty() => { - debug!( - "skipping reverse suffix optimization because \ - the longest common suffix is the empty string", - ); - return Err(core); - } - Some(lcs) => lcs, + let suffixseq = crate::util::prefilter::suffixes(kind, hirs); + let Some(suffixes) = suffixseq.literals() else { + debug!( + "skipping reverse suffix optimization because \ + the extract suffix sequence is not finite", + ); + return Err(core); }; - let pre = match Prefilter::new(kind, &[lcs]) { - Some(pre) => pre, - None => { - debug!( - "skipping reverse suffix optimization because \ + let Some(pre) = Prefilter::new(kind, suffixes) else { + debug!( + "skipping reverse suffix optimization because \ a prefilter could not be constructed from the \ longest common suffix", - ); - return Err(core); - } + ); + return Err(core); }; if !pre.is_fast() { debug!(