diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index ea6c6ab57..4cb3b29b9 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -353,6 +353,7 @@ impl Pre<()> { // strategy when len(patterns)==1 if the number of literals is large. In that // case, literal extraction gives up and will return an infinite set.) impl Strategy for Pre

{ + #[cfg_attr(feature = "perf-inline", inline(always))] fn group_info(&self) -> &GroupInfo { &self.group_info } @@ -378,6 +379,7 @@ impl Strategy for Pre

{ self.pre.memory_usage() } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option { if input.is_done() { return None; @@ -393,6 +395,7 @@ impl Strategy for Pre

{ .map(|sp| Match::new(PatternID::ZERO, sp)) } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_half( &self, cache: &mut Cache, @@ -401,10 +404,12 @@ impl Strategy for Pre

{ self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end())) } + #[cfg_attr(feature = "perf-inline", inline(always))] fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { self.search(cache, input).is_some() } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, cache: &mut Cache, @@ -421,6 +426,7 @@ impl Strategy for Pre

{ Some(m.pattern()) } + #[cfg_attr(feature = "perf-inline", inline(always))] fn which_overlapping_matches( &self, cache: &mut Cache, @@ -1161,34 +1167,21 @@ impl ReverseSuffix { return Err(core); } let kind = core.info.config().get_match_kind(); - let suffixes = crate::util::prefilter::suffixes(kind, hirs); - let lcs = match suffixes.longest_common_suffix() { - None => { - debug!( - "skipping reverse suffix optimization because \ - a longest common suffix could not be found", - ); - return Err(core); - } - Some(lcs) if lcs.is_empty() => { - debug!( - "skipping reverse suffix optimization because \ - the longest common suffix is the empty string", - ); - return Err(core); - } - Some(lcs) => lcs, + let suffixseq = crate::util::prefilter::suffixes(kind, hirs); + let Some(suffixes) = suffixseq.literals() else { + debug!( + "skipping reverse suffix optimization because \ + the extract suffix sequence is not finite", + ); + return Err(core); }; - let pre = match Prefilter::new(kind, &[lcs]) { - Some(pre) => pre, - None => { - debug!( - "skipping reverse suffix optimization because \ + let Some(pre) = Prefilter::new(kind, suffixes) else { + debug!( + "skipping reverse suffix optimization because \ a prefilter could not be constructed from the \ longest common suffix", - ); - return Err(core); - } + ); + return Err(core); }; if !pre.is_fast() { debug!( diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 2eff6318c..313a1e9e8 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -388,17 +388,10 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? { Either::Right(byte) => self.push_byte(byte), - Either::Left(ch) => { - if !self.flags().unicode() && ch.len_utf8() > 1 { - return Err( - self.error(x.span, ErrorKind::UnicodeNotAllowed) - ); - } - match self.case_fold_char(x.span, ch)? { - None => self.push_char(ch), - Some(expr) => self.push(HirFrame::Expr(expr)), - } - } + Either::Left(ch) => match self.case_fold_char(x.span, ch)? { + None => self.push_char(ch), + Some(expr) => self.push(HirFrame::Expr(expr)), + }, }, Ast::Dot(ref span) => { self.push(HirFrame::Expr(self.hir_dot(**span)?)); @@ -872,8 +865,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { })?; Ok(Some(Hir::class(hir::Class::Unicode(cls)))) } else { - if c.len_utf8() > 1 { - return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + if !c.is_ascii() { + return Ok(None); } // If case folding won't do anything, then don't bother trying. match c { @@ -1211,9 +1204,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { match self.ast_literal_to_scalar(ast)? { Either::Right(byte) => Ok(byte), Either::Left(ch) => { - let cp = u32::from(ch); - if cp <= 0x7F { - Ok(u8::try_from(cp).unwrap()) + if ch.is_ascii() { + Ok(u8::try_from(ch).unwrap()) } else { // We can't feasibly support Unicode in // byte oriented classes. Byte classes don't @@ -1661,16 +1653,7 @@ mod tests { assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a")); assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF")); - assert_eq!( - t_err("(?-u)☃"), - TestError { - kind: hir::ErrorKind::UnicodeNotAllowed, - span: Span::new( - Position::new(5, 1, 6), - Position::new(8, 1, 7) - ), - } - ); + assert_eq!(t("(?-u)☃"), hir_lit("☃")); assert_eq!( t_err(r"(?-u)\xFF"), TestError { @@ -1748,16 +1731,7 @@ mod tests { ); assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF")); - assert_eq!( - t_err("(?i-u)β"), - TestError { - kind: hir::ErrorKind::UnicodeNotAllowed, - span: Span::new( - Position::new(6, 1, 7), - Position::new(8, 1, 8), - ), - } - ); + assert_eq!(t("(?i-u)β"), hir_lit("β"),); } #[test] diff --git a/src/bytes.rs b/src/bytes.rs index 3f53a3ea5..383ac4a5b 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -68,8 +68,8 @@ bytes: 1. The `u` flag can be disabled even when disabling it might cause the regex to match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in "ASCII compatible" mode. -2. In ASCII compatible mode, neither Unicode scalar values nor Unicode -character classes are allowed. +2. In ASCII compatible mode, Unicode character classes are not allowed. Literal +Unicode scalar values outside of character classes are allowed. 3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`) revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps to `[[:digit:]]` and `\s` maps to `[[:space:]]`.