From b6aabdc857c6ae9a5f077a8c99dd82798ddd941d Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Thu, 12 Oct 2023 14:15:48 -0400
Subject: [PATCH 1/3] automata/meta: force some prefilter inlining

In some ad hoc profiling, I noticed an extra function call that really
didn't need to be there.
---
 regex-automata/src/meta/strategy.rs | 6 ++++++
 1 file changed, 6 insertions(+)
diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs
index ea6c6ab57..5b96d888a 100644
--- a/regex-automata/src/meta/strategy.rs
+++ b/regex-automata/src/meta/strategy.rs
@@ -353,6 +353,7 @@ impl Pre<()> {
 // strategy when len(patterns)==1 if the number of literals is large. In that
 // case, literal extraction gives up and will return an infinite set.)
 impl<P: PrefilterI> Strategy for Pre<P> {
+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn group_info(&self) -> &GroupInfo {
         &self.group_info
     }
@@ -378,6 +379,7 @@ impl<P: PrefilterI> Strategy for Pre<P> {
         self.pre.memory_usage()
     }
 
+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
         if input.is_done() {
             return None;
@@ -393,6 +395,7 @@ impl<P: PrefilterI> Strategy for Pre<P> {
             .map(|sp| Match::new(PatternID::ZERO, sp))
     }
 
+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn search_half(
         &self,
         cache: &mut Cache,
@@ -401,10 +404,12 @@ impl<P: PrefilterI> Strategy for Pre<P> {
         self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end()))
     }
 
+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool {
         self.search(cache, input).is_some()
     }
 
+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn search_slots(
         &self,
         cache: &mut Cache,
@@ -421,6 +426,7 @@ impl<P: PrefilterI> Strategy for Pre<P> {
         Some(m.pattern())
     }
 
+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn which_overlapping_matches(
         &self,
         cache: &mut Cache,

From a94d2807927db2b9a48d67bd62c6ace4ebbbd834 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Thu, 12 Oct 2023 14:16:20 -0400
Subject: [PATCH 2/3] syntax: loosen ASCII compatible rules
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, patterns like `(?-u:☃)` were banned under the logic that
Unicode scalar values shouldn't be available unless Unicode mode is
enabled. But since patterns are required to be UTF-8, there really isn't
any difficulty in just interpreting Unicode literals as their
corresponding UTF-8 encoding.

Note though that Unicode character classes, even things like
`(?-u:[☃])`, remain banned. We probably could make character classes
work too, but it's unclear how that plays with ASCII compatible mode
requiring that a single byte is the fundamental atom of matching (where
as Unicode mode requires that Unicode scalar values are the fundamental
atom of matching).
---
 regex-syntax/src/hir/translate.rs | 46 +++++++------------------------
 src/bytes.rs                      |  4 +--
 2 files changed, 12 insertions(+), 38 deletions(-)

diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
index 2eff6318c..313a1e9e8 100644
--- a/regex-syntax/src/hir/translate.rs
+++ b/regex-syntax/src/hir/translate.rs
@@ -388,17 +388,10 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
             }
             Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? {
                 Either::Right(byte) => self.push_byte(byte),
-                Either::Left(ch) => {
-                    if !self.flags().unicode() && ch.len_utf8() > 1 {
-                        return Err(
-                            self.error(x.span, ErrorKind::UnicodeNotAllowed)
-                        );
-                    }
-                    match self.case_fold_char(x.span, ch)? {
-                        None => self.push_char(ch),
-                        Some(expr) => self.push(HirFrame::Expr(expr)),
-                    }
-                }
+                Either::Left(ch) => match self.case_fold_char(x.span, ch)? {
+                    None => self.push_char(ch),
+                    Some(expr) => self.push(HirFrame::Expr(expr)),
+                },
             },
             Ast::Dot(ref span) => {
                 self.push(HirFrame::Expr(self.hir_dot(**span)?));
@@ -872,8 +865,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
             })?;
             Ok(Some(Hir::class(hir::Class::Unicode(cls))))
         } else {
-            if c.len_utf8() > 1 {
-                return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
+            if !c.is_ascii() {
+                return Ok(None);
             }
             // If case folding won't do anything, then don't bother trying.
             match c {
@@ -1211,9 +1204,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         match self.ast_literal_to_scalar(ast)? {
             Either::Right(byte) => Ok(byte),
             Either::Left(ch) => {
-                let cp = u32::from(ch);
-                if cp <= 0x7F {
-                    Ok(u8::try_from(cp).unwrap())
+                if ch.is_ascii() {
+                    Ok(u8::try_from(ch).unwrap())
                 } else {
                     // We can't feasibly support Unicode in
                     // byte oriented classes. Byte classes don't
@@ -1661,16 +1653,7 @@ mod tests {
         assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
         assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));
 
-        assert_eq!(
-            t_err("(?-u)☃"),
-            TestError {
-                kind: hir::ErrorKind::UnicodeNotAllowed,
-                span: Span::new(
-                    Position::new(5, 1, 6),
-                    Position::new(8, 1, 7)
-                ),
-            }
-        );
+        assert_eq!(t("(?-u)☃"), hir_lit("☃"));
         assert_eq!(
             t_err(r"(?-u)\xFF"),
             TestError {
@@ -1748,16 +1731,7 @@ mod tests {
         );
         assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));
 
-        assert_eq!(
-            t_err("(?i-u)β"),
-            TestError {
-                kind: hir::ErrorKind::UnicodeNotAllowed,
-                span: Span::new(
-                    Position::new(6, 1, 7),
-                    Position::new(8, 1, 8),
-                ),
-            }
-        );
+        assert_eq!(t("(?i-u)β"), hir_lit("β"),);
     }
 
     #[test]
diff --git a/src/bytes.rs b/src/bytes.rs
index 3f53a3ea5..383ac4a5b 100644
--- a/src/bytes.rs
+++ b/src/bytes.rs
@@ -68,8 +68,8 @@ bytes:
 1. The `u` flag can be disabled even when disabling it might cause the regex to
 match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in
 "ASCII compatible" mode.
-2. In ASCII compatible mode, neither Unicode scalar values nor Unicode
-character classes are allowed.
+2. In ASCII compatible mode, Unicode character classes are not allowed. Literal
+Unicode scalar values outside of character classes are allowed.
 3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`)
 revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps
 to `[[:digit:]]` and `\s` maps to `[[:space:]]`.

From f6f73f2d19a801e29760d6b24773a120fb0ff213 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Fri, 13 Oct 2023 14:52:09 -0400
Subject: [PATCH 3/3] automata/meta: tweak reverse suffix prefilter strategy

Previously, we were only use the reverse suffix optimization if it found
a non-empty longest common suffix *and* if the prefilter thought itself
was fast. This was a heuristic used in the old regex crate before we
grew the "is prefilter fast" heuristic. We change this optimization to
just use the "is prefilter fast" heuristic instead of requiring a
non-empty longest common suffix.

This is, after all, what the inner literal optimization does. And in the
inner literal case, one should probably be even more conservative
because of the extra work that needs to be done. So if things are going
okay with the inner literal optimization, then we should be fine with
the reverse suffix optimization doing essentially the same thing.
---
 regex-automata/src/meta/strategy.rs | 37 ++++++++++-------------------
 1 file changed, 12 insertions(+), 25 deletions(-)

diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs
index 5b96d888a..4cb3b29b9 100644
--- a/regex-automata/src/meta/strategy.rs
+++ b/regex-automata/src/meta/strategy.rs
@@ -1167,34 +1167,21 @@ impl ReverseSuffix {
             return Err(core);
         }
         let kind = core.info.config().get_match_kind();
-        let suffixes = crate::util::prefilter::suffixes(kind, hirs);
-        let lcs = match suffixes.longest_common_suffix() {
-            None => {
-                debug!(
-                    "skipping reverse suffix optimization because \
-                     a longest common suffix could not be found",
-                );
-                return Err(core);
-            }
-            Some(lcs) if lcs.is_empty() => {
-                debug!(
-                    "skipping reverse suffix optimization because \
-                     the longest common suffix is the empty string",
-                );
-                return Err(core);
-            }
-            Some(lcs) => lcs,
+        let suffixseq = crate::util::prefilter::suffixes(kind, hirs);
+        let Some(suffixes) = suffixseq.literals() else {
+            debug!(
+                "skipping reverse suffix optimization because \
+                 the extract suffix sequence is not finite",
+            );
+            return Err(core);
         };
-        let pre = match Prefilter::new(kind, &[lcs]) {
-            Some(pre) => pre,
-            None => {
-                debug!(
-                    "skipping reverse suffix optimization because \
+        let Some(pre) = Prefilter::new(kind, suffixes) else {
+            debug!(
+                "skipping reverse suffix optimization because \
                      a prefilter could not be constructed from the \
                      longest common suffix",
-                );
-                return Err(core);
-            }
+            );
+            return Err(core);
         };
         if !pre.is_fast() {
             debug!(