regress/api.rs
1use crate::classicalbacktrack;
2use crate::emit;
3use crate::exec;
4use crate::indexing;
5use crate::insn::CompiledRegex;
6use crate::optimizer;
7use crate::parse;
8use crate::types::MAX_CAPTURE_GROUPS;
9use std::iter::FusedIterator;
10
11#[cfg(feature = "utf16")]
12use crate::{
13 classicalbacktrack::MatchAttempter,
14 indexing::{InputIndexer, Ucs2Input, Utf16Input},
15};
16
17#[cfg(feature = "backend-pikevm")]
18use crate::pikevm;
19use crate::util::to_char_sat;
20
21use core::{fmt, str::FromStr};
22#[cfg(feature = "std")]
23#[cfg(not(feature = "std"))]
24use {
25 alloc::{string::String, vec::Vec},
26 hashbrown::{HashMap, hash_map::Iter},
27};
28
29pub use parse::Error;
30
/// Flags used to control regex parsing.
///
/// The default flags are case-sensitive, not-multiline, and optimizing.
/// Every field defaults to `false`.
#[derive(Debug, Copy, Clone, Default)]
pub struct Flags {
    /// If set, make the regex case-insensitive.
    /// Equivalent to the 'i' flag in JavaScript.
    pub icase: bool,

    /// If set, ^ and $ match at line separators, not just the input boundaries.
    /// Equivalent to the 'm' flag in JavaScript.
    pub multiline: bool,

    /// If set, . matches at line separators as well as any other character.
    /// Equivalent to the 's' flag in JavaScript.
    pub dot_all: bool,

    /// If set, disable regex IR passes.
    pub no_opt: bool,

    /// If set, the regex is interpreted as a Unicode regex.
    /// Equivalent to the 'u' flag in JavaScript.
    pub unicode: bool,

    /// If set, the regex is interpreted as a UnicodeSets regex.
    /// Equivalent to the 'v' flag in JavaScript.
    pub unicode_sets: bool,
}
58
59impl Flags {
60 /// Construct a Flags from a Unicode codepoints iterator, using JavaScript field names.
61 /// 'i' means to ignore case, 'm' means multiline, 'u' means unicode.
62 /// Note the 'g' flag implies a stateful regex and is not supported.
63 /// Other flags are not implemented and are ignored.
64 #[inline]
65 pub fn new<T: Iterator<Item = u32>>(chars: T) -> Self {
66 let mut result = Self::default();
67 for c in chars {
68 match to_char_sat(c) {
69 'm' => {
70 result.multiline = true;
71 }
72 'i' => {
73 result.icase = true;
74 }
75 's' => {
76 result.dot_all = true;
77 }
78 'u' => {
79 result.unicode = true;
80 }
81 'v' => {
82 result.unicode_sets = true;
83 }
84 _ => {
85 // Silently skip unsupported flags.
86 }
87 }
88 }
89 result
90 }
91}
92
93impl From<&str> for Flags {
94 /// Construct a Flags from a string, using JavaScript field names.
95 ///
96 /// See also: [`Flags::new`].
97 #[inline]
98 fn from(s: &str) -> Self {
99 Self::new(s.chars().map(u32::from))
100 }
101}
102
103impl fmt::Display for Flags {
104 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
105 if self.multiline {
106 f.write_str("m")?;
107 }
108 if self.icase {
109 f.write_str("i")?;
110 }
111 if self.dot_all {
112 f.write_str("s")?;
113 }
114 if self.unicode {
115 f.write_str("u")?;
116 }
117 Ok(())
118 }
119}
120
/// Range is used to express the extent of a match, as indexes into the input
/// string.
///
/// This is an alias for `core::ops::Range<usize>`, so `start` is inclusive
/// and `end` is exclusive.
pub type Range = core::ops::Range<usize>;

/// An iterator type which yields `Match`es found in a string.
///
/// This is `exec::Matches` driven by `backends::DefaultExecutor`; `'r` is the
/// borrow of the regex and `'t` the borrow of the searched text.
pub type Matches<'r, 't> = exec::Matches<backends::DefaultExecutor<'r, 't>>;

/// An iterator type which yields `Match`es found in a string, supporting ASCII
/// only.
///
/// This is `exec::Matches` driven by `backends::DefaultAsciiExecutor`.
pub type AsciiMatches<'r, 't> = exec::Matches<backends::DefaultAsciiExecutor<'r, 't>>;
131
/// A Match represents a portion of a string which was found to match a Regex.
///
/// A `Match` stores only index ranges, not the text itself; pass the searched
/// string back to resolve a range into a slice.
#[derive(Debug, Clone)]
pub struct Match {
    /// The total range of the match. Note this may be empty, if the regex
    /// matched an empty string.
    pub range: Range,

    /// The list of captures. This has length equal to the number of capturing
    /// groups in the regex. For each capture, if the value is None, that group
    /// did not match (for example, it was in a not-taken branch of an
    /// alternation). If the value is Some, the group did match with the
    /// enclosed range.
    pub captures: Vec<Option<Range>>,

    // A list of capture group names. This is either:
    //   - Empty, if there were no named capture groups.
    //   - A list of names with length `captures.len()`, corresponding to the
    //     capture group names in order. Groups without names have an empty string.
    // Entry `i` names `captures[i]`.
    pub(crate) group_names: Box<[Box<str>]>,
}
152
153impl Match {
154 /// Access a group by index, using the convention of Python's group()
155 /// function. Index 0 is the total match, index 1 is the first capture
156 /// group.
157 #[inline]
158 pub fn group(&self, idx: usize) -> Option<Range> {
159 if idx == 0 {
160 Some(self.range.clone())
161 } else if idx <= self.captures.len() {
162 self.captures[idx - 1].clone()
163 } else {
164 None
165 }
166 }
167
168 /// Access a named group by name.
169 #[inline]
170 pub fn named_group(&self, name: &str) -> Option<Range> {
171 // Empty strings are used as sentinels to indicate unnamed group.
172 if name.is_empty() {
173 return None;
174 }
175 let pos = self.group_names.iter().position(|s| s.as_ref() == name)?;
176 self.captures[pos].clone()
177 }
178
179 /// Return an iterator over the named groups of a Match.
180 #[inline]
181 pub fn named_groups(&self) -> NamedGroups<'_> {
182 NamedGroups::new(self)
183 }
184
185 /// Returns the range over the starting and ending byte offsets of the match in the haystack.
186 ///
187 /// This is a convenience function to work around
188 /// the fact that Range does not support Copy.
189 #[inline]
190 pub fn range(&self) -> Range {
191 self.range.clone()
192 }
193
194 /// Returns the starting byte offset of the match in the haystack.
195 #[inline]
196 pub fn start(&self) -> usize {
197 self.range.start
198 }
199
200 /// Returns the ending byte offset of the match in the haystack.
201 #[inline]
202 pub fn end(&self) -> usize {
203 self.range.end
204 }
205
206 /// Returns the matched text as a string slice.
207 ///
208 /// # Examples
209 ///
210 /// ```rust
211 /// use regress::Regex;
212 ///
213 /// let re = Regex::new(r"\d+").unwrap();
214 /// let text = "Price: $123";
215 /// let m = re.find(text).unwrap();
216 /// assert_eq!(m.as_str(text), "123");
217 /// ```
218 #[inline]
219 pub fn as_str<'t>(&self, text: &'t str) -> &'t str {
220 &text[self.range()]
221 }
222
223 /// Return an iterator over a Match. The first returned value is the total
224 /// match, and subsequent values represent the capture groups.
225 #[inline]
226 pub fn groups(&self) -> Groups<'_> {
227 Groups::new(self)
228 }
229}
230
231/// An iterator over the capture groups of a [`Match`]
232///
233/// This struct is created by the [`groups`] method on [`Match`].
234///
235/// [`Match`]: ../struct.Match.html
236/// [`groups`]: ../struct.Match.html#method.groups
237#[derive(Clone)]
238pub struct Groups<'m> {
239 mat: &'m Match,
240
241 // The next group index to return, where 0 references the total match.
242 next_group_idx: usize,
243
244 // The maximum group index to return, with a +1 for the implicit total match.
245 // For example, in a regex with 1 capture group, this will be 2.
246 max: usize,
247}
248
249impl<'m> Groups<'m> {
250 #[inline]
251 fn new(mat: &'m Match) -> Self {
252 Self {
253 mat,
254 next_group_idx: 0,
255 max: mat.captures.len() + 1, // +1 for the total match
256 }
257 }
258}
259
260impl Iterator for Groups<'_> {
261 type Item = Option<Range>;
262
263 #[inline]
264 fn next(&mut self) -> Option<Self::Item> {
265 let i = self.next_group_idx;
266 if i < self.max {
267 self.next_group_idx += 1;
268 Some(self.mat.group(i))
269 } else {
270 None
271 }
272 }
273
274 fn size_hint(&self) -> (usize, Option<usize>) {
275 let size = self.max.saturating_sub(self.next_group_idx);
276 (size, Some(size))
277 }
278}
279
280impl<'m> ExactSizeIterator for Groups<'m> {}
281impl<'m> FusedIterator for Groups<'m> {}
282
283/// An iterator over the named capture groups of a [`Match`]
284///
285/// This struct is created by the [`named_groups`] method on [`Match`].
286///
287/// [`Match`]: ../struct.Match.html
288/// [`named_groups`]: ../struct.Match.html#method.named_groups
289#[derive(Clone)]
290pub struct NamedGroups<'m> {
291 mat: &'m Match,
292
293 // The next group name index to return.
294 // Note unlike `Groups` this does NOT include the implicit total match.
295 // That is, group 0 is the first capture group, NOT the total match.
296 next_group_idx: usize,
297}
298
299impl<'m> NamedGroups<'m> {
300 #[inline]
301 fn new(mat: &'m Match) -> Self {
302 Self {
303 mat,
304 next_group_idx: 0,
305 }
306 }
307}
308
309impl<'m> Iterator for NamedGroups<'m> {
310 type Item = (&'m str, Option<Range>);
311
312 #[inline]
313 fn next(&mut self) -> Option<Self::Item> {
314 // Increment next_group_idx until we find a non-empty name.
315 debug_assert!(self.next_group_idx <= self.mat.group_names.len());
316 let end = self.mat.group_names.len();
317 let mut idx = self.next_group_idx;
318 while idx < end && self.mat.group_names[idx].is_empty() {
319 idx += 1;
320 }
321 if idx == end {
322 return None;
323 }
324 let name = self.mat.group_names[idx].as_ref();
325 let range = self.mat.captures[idx].clone();
326 self.next_group_idx = idx + 1;
327 Some((name, range))
328 }
329
330 fn size_hint(&self) -> (usize, Option<usize>) {
331 let size = self.mat.group_names[self.next_group_idx..]
332 .iter()
333 .filter(|s| !s.is_empty())
334 .count();
335
336 (size, Some(size))
337 }
338}
339
340impl<'m> ExactSizeIterator for NamedGroups<'m> {}
341impl<'m> FusedIterator for NamedGroups<'m> {}
342
343/// A Regex is the compiled version of a pattern.
344#[derive(Debug, Clone)]
345pub struct Regex {
346 cr: CompiledRegex,
347}
348
349impl From<CompiledRegex> for Regex {
350 fn from(cr: CompiledRegex) -> Self {
351 Self { cr }
352 }
353}
354
355impl Regex {
356 /// Construct a regex by parsing `pattern` using the default flags.
357 /// An Error may be returned if the syntax is invalid.
358 /// Note that this is rather expensive; prefer to cache a Regex which is
359 /// intended to be used more than once.
360 #[inline]
361 pub fn new(pattern: &str) -> Result<Regex, Error> {
362 Self::with_flags(pattern, Flags::default())
363 }
364
365 /// Construct a regex by parsing `pattern` with `flags`.
366 /// An Error may be returned if the syntax is invalid.
367 //
368 /// Note it is preferable to cache a Regex which is intended to be used more
369 /// than once, as the parse may be expensive. For example:
370 #[inline]
371 pub fn with_flags<F>(pattern: &str, flags: F) -> Result<Regex, Error>
372 where
373 F: Into<Flags>,
374 {
375 Self::from_unicode(pattern.chars().map(u32::from), flags)
376 }
377
378 /// Construct a regex by parsing `pattern` with `flags`, where
379 /// `pattern` is an iterator of `u32` Unicode codepoints.
380 /// An Error may be returned if the syntax is invalid.
381 /// This allows parsing regular expressions from exotic strings in
382 /// other encodings, such as UTF-16 or UTF-32.
383 pub fn from_unicode<I, F>(pattern: I, flags: F) -> Result<Regex, Error>
384 where
385 I: Iterator<Item = u32> + Clone,
386 F: Into<Flags>,
387 {
388 let flags = flags.into();
389 let mut ire = parse::try_parse(pattern, flags)?;
390 if !flags.no_opt {
391 optimizer::optimize(&mut ire);
392 }
393 let cr = emit::emit(&ire);
394 Ok(Regex { cr })
395 }
396
397 /// Searches `text` to find the first match.
398 #[inline]
399 pub fn find(&self, text: &str) -> Option<Match> {
400 self.find_iter(text).next()
401 }
402
403 /// Searches `text`, returning an iterator over non-overlapping matches.
404 /// Note that the resulting Iterator borrows both the regex `'r` and the
405 /// input string as `'t`.
406 #[inline]
407 pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
408 self.find_from(text, 0)
409 }
410
411 /// Returns an iterator for matches found in 'text' starting at byte index
412 /// `start`. Note this may be different from passing a sliced `text` in
413 /// the case of lookbehind assertions.
414 /// Example:
415 ///
416 /// ```rust
417 /// use regress::Regex;
418 /// let text = "xyxy";
419 /// let re = Regex::new(r"(?<=x)y").unwrap();
420 /// let t1 = re.find(&text[1..]).unwrap().range();
421 /// assert!(t1 == (2..3));
422 /// let t2 = re.find_from(text, 1).next().unwrap().range();
423 /// assert!(t2 == (1..2));
424 /// ```
425 #[inline]
426 pub fn find_from<'r, 't>(&'r self, text: &'t str, start: usize) -> Matches<'r, 't> {
427 backends::find(self, text, start)
428 }
429
430 /// Searches `text` to find the first match.
431 /// The input text is expected to be ascii-only: only ASCII case-folding is
432 /// supported.
433 #[inline]
434 pub fn find_ascii(&self, text: &str) -> Option<Match> {
435 self.find_iter_ascii(text).next()
436 }
437
438 /// Searches `text`, returning an iterator over non-overlapping matches.
439 /// The input text is expected to be ascii-only: only ASCII case-folding is
440 /// supported.
441 #[inline]
442 pub fn find_iter_ascii<'r, 't>(&'r self, text: &'t str) -> AsciiMatches<'r, 't> {
443 self.find_from_ascii(text, 0)
444 }
445
446 /// Returns an iterator for matches found in 'text' starting at byte index
447 /// `start`.
448 #[inline]
449 pub fn find_from_ascii<'r, 't>(&'r self, text: &'t str, start: usize) -> AsciiMatches<'r, 't> {
450 backends::find(self, text, start)
451 }
452
453 /// Returns an iterator for matches found in 'text' starting at index `start`.
454 #[cfg(feature = "utf16")]
455 pub fn find_from_utf16<'r, 't>(
456 &'r self,
457 text: &'t [u16],
458 start: usize,
459 ) -> exec::Matches<super::classicalbacktrack::BacktrackExecutor<'r, indexing::Utf16Input<'t>>>
460 {
461 let input = Utf16Input::new(text, self.cr.flags.unicode);
462 exec::Matches::new(
463 super::classicalbacktrack::BacktrackExecutor::new(
464 input,
465 MatchAttempter::new(&self.cr, input.left_end()),
466 ),
467 start,
468 )
469 }
470
471 /// Returns an iterator for matches found in 'text' starting at index `start`.
472 #[cfg(feature = "utf16")]
473 pub fn find_from_ucs2<'r, 't>(
474 &'r self,
475 text: &'t [u16],
476 start: usize,
477 ) -> exec::Matches<super::classicalbacktrack::BacktrackExecutor<'r, indexing::Ucs2Input<'t>>>
478 {
479 let input = Ucs2Input::new(text, self.cr.flags.unicode);
480 exec::Matches::new(
481 super::classicalbacktrack::BacktrackExecutor::new(
482 input,
483 MatchAttempter::new(&self.cr, input.left_end()),
484 ),
485 start,
486 )
487 }
488
489 /// Replaces the first match of the regex in `text` with the replacement string.
490 ///
491 /// The replacement string may contain capture group references in the form `$1`, `$2`, etc.,
492 /// where `$1` refers to the first capture group, `$2` to the second, and so on.
493 /// `$0` refers to the entire match. Use `$$` to insert a literal `$`.
494 ///
495 /// If no match is found, the original text is returned unchanged.
496 ///
497 /// # Examples
498 ///
499 /// ```rust
500 /// use regress::Regex;
501 ///
502 /// let re = Regex::new(r"(\w+)\s+(\w+)").unwrap();
503 /// let result = re.replace("hello world", "$2 $1");
504 /// assert_eq!(result, "world hello");
505 ///
506 /// let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
507 /// let result = re.replace("2023-12-25", "$2/$3/$1");
508 /// assert_eq!(result, "12/25/2023");
509 /// ```
510 pub fn replace(&self, text: &str, replacement: &str) -> String {
511 match self.find(text) {
512 Some(m) => {
513 let mut result = String::with_capacity(text.len());
514 result.push_str(&text[..m.start()]);
515 self.expand_replacement(&m, text, replacement, &mut result);
516 result.push_str(&text[m.end()..]);
517 result
518 }
519 None => text.to_string(),
520 }
521 }
522
523 /// Replaces all matches of the regex in `text` with the replacement string.
524 ///
525 /// The replacement string may contain capture group references in the form `$1`, `$2`, etc.,
526 /// where `$1` refers to the first capture group, `$2` to the second, and so on.
527 /// `$0` refers to the entire match. Use `$$` to insert a literal `$`.
528 ///
529 /// # Examples
530 ///
531 /// ```rust
532 /// use regress::Regex;
533 ///
534 /// let re = Regex::new(r"(\w+)\s+(\w+)").unwrap();
535 /// let result = re.replace_all("hello world foo bar", "$2-$1");
536 /// assert_eq!(result, "world-hello bar-foo");
537 ///
538 /// let re = Regex::new(r"\b(\w)(\w+)").unwrap();
539 /// let result = re.replace_all("hello world", "$1.$2");
540 /// assert_eq!(result, "h.ello w.orld");
541 /// ```
542 pub fn replace_all(&self, text: &str, replacement: &str) -> String {
543 let mut result = String::with_capacity(text.len());
544 let mut last_end = 0;
545
546 for m in self.find_iter(text) {
547 result.push_str(&text[last_end..m.start()]);
548 self.expand_replacement(&m, text, replacement, &mut result);
549 last_end = m.end();
550 }
551
552 result.push_str(&text[last_end..]);
553 result
554 }
555
556 /// Replaces the first match of the regex in `text` using a closure.
557 ///
558 /// The closure receives a `&Match` and should return the replacement string.
559 /// This is useful for dynamic replacements that depend on the match details.
560 ///
561 /// If no match is found, the original text is returned unchanged.
562 ///
563 /// # Examples
564 ///
565 /// ```rust
566 /// use regress::Regex;
567 ///
568 /// let re = Regex::new(r"\d+").unwrap();
569 /// let text = "Price: $123";
570 /// let result = re.replace_with(text, |m| {
571 /// let num: i32 = m.as_str(text).parse().unwrap();
572 /// format!("{}", num * 2)
573 /// });
574 /// assert_eq!(result, "Price: $246");
575 /// ```
576 pub fn replace_with<F>(&self, text: &str, replacement: F) -> String
577 where
578 F: FnOnce(&Match) -> String,
579 {
580 match self.find(text) {
581 Some(m) => {
582 let mut result = String::with_capacity(text.len());
583 result.push_str(&text[..m.start()]);
584 result.push_str(&replacement(&m));
585 result.push_str(&text[m.end()..]);
586 result
587 }
588 None => text.to_string(),
589 }
590 }
591
592 /// Replaces all matches of the regex in `text` using a closure.
593 ///
594 /// The closure receives a `&Match` and should return the replacement string.
595 /// This is useful for dynamic replacements that depend on the match details.
596 ///
597 /// # Examples
598 ///
599 /// ```rust
600 /// use regress::Regex;
601 ///
602 /// let re = Regex::new(r"\d+").unwrap();
603 /// let text = "Items: 5, 10, 15";
604 /// let result = re.replace_all_with(text, |m| {
605 /// let num: i32 = m.as_str(text).parse().unwrap();
606 /// format!("[{}]", num * 10)
607 /// });
608 /// assert_eq!(result, "Items: [50], [100], [150]");
609 /// ```
610 pub fn replace_all_with<F>(&self, text: &str, replacement: F) -> String
611 where
612 F: Fn(&Match) -> String,
613 {
614 let mut result = String::with_capacity(text.len());
615 let mut last_end = 0;
616
617 for m in self.find_iter(text) {
618 result.push_str(&text[last_end..m.start()]);
619 result.push_str(&replacement(&m));
620 last_end = m.end();
621 }
622
623 result.push_str(&text[last_end..]);
624 result
625 }
626
627 /// Helper method to expand replacement strings with capture group substitutions.
628 fn expand_replacement(&self, m: &Match, text: &str, replacement: &str, output: &mut String) {
629 let mut chars = replacement.chars().peekable();
630
631 while let Some(ch) = chars.next() {
632 if ch == '$' {
633 match chars.peek() {
634 Some('$') => {
635 // $$ -> literal $
636 chars.next();
637 output.push('$');
638 }
639 Some(&digit) if digit.is_ascii_digit() => {
640 // Parse the group number
641 let mut group_num = 0;
642 while let Some(&digit) = chars.peek() {
643 if digit.is_ascii_digit() {
644 chars.next();
645 group_num = group_num * 10 + (digit as u32 - '0' as u32) as usize;
646 // Limit to reasonable group numbers to avoid overflow
647 if group_num > MAX_CAPTURE_GROUPS {
648 break;
649 }
650 } else {
651 break;
652 }
653 }
654
655 // Get the matched text for this group
656 if let Some(range) = m.group(group_num) {
657 output.push_str(&text[range]);
658 }
659 // If group doesn't exist or didn't match, add nothing
660 }
661 Some('{') => {
662 // Handle ${name} syntax for named groups
663 chars.next(); // consume '{'
664 let mut name = String::new();
665 let mut found_closing_brace = false;
666
667 for ch in chars.by_ref() {
668 if ch == '}' {
669 found_closing_brace = true;
670 break;
671 }
672 name.push(ch);
673 }
674
675 if found_closing_brace {
676 if let Some(range) = m.named_group(&name) {
677 output.push_str(&text[range]);
678 }
679 } else {
680 // Malformed ${...}, treat as literal
681 output.push_str("${");
682 output.push_str(&name);
683 }
684 }
685 _ => {
686 // Just a $ at end or followed by non-digit
687 output.push('$');
688 }
689 }
690 } else {
691 output.push(ch);
692 }
693 }
694 }
695}
696
697impl FromStr for Regex {
698 type Err = Error;
699
700 /// Attempts to parse a string into a regular expression
701 #[inline]
702 fn from_str(s: &str) -> Result<Self, Error> {
703 Self::new(s)
704 }
705}
706
// Pattern trait implementation for str::find, str::contains, etc.
#[cfg(feature = "pattern")]
mod pattern_impl {
    use super::*;
    use core::str::pattern::{Pattern, ReverseSearcher, SearchStep, Searcher};

    /// A searcher for a regex pattern.
    ///
    /// Forward (`next`) and reverse (`next_back`) searching each keep their
    /// own cursor. In both directions the reported `Match` and `Reject`
    /// steps are adjacent and together cover the whole haystack, including
    /// the text on either side of an empty match.
    pub struct RegexSearcher<'r, 't> {
        haystack: &'t str,
        regex: &'r Regex,
        // Forward cursor: everything before this offset has been reported.
        current_pos: usize,
        done: bool,
        // True when the previous forward step was an empty match located at
        // `current_pos`; that match must not be reported a second time.
        empty_match_reported: bool,
        // For reverse searching: everything from this offset on has been
        // reported.
        reverse_pos: usize,
        reverse_done: bool,
        // True when the previous reverse step was an empty match located at
        // `reverse_pos`.
        reverse_empty_match_reported: bool,
    }

    impl<'r, 't> RegexSearcher<'r, 't> {
        fn new(regex: &'r Regex, haystack: &'t str) -> Self {
            Self {
                haystack,
                regex,
                current_pos: 0,
                done: false,
                empty_match_reported: false,
                reverse_pos: haystack.len(),
                reverse_done: false,
                reverse_empty_match_reported: false,
            }
        }

        // Whether `m` is the empty match at `pos` that has already been reported.
        fn is_reported_empty(m: &super::Match, pos: usize, skip_empty_at_pos: bool) -> bool {
            skip_empty_at_pos && m.start() == pos && m.end() == pos
        }

        // Find the first match at or after `pos`. When `skip_empty_at_pos` is
        // set, an empty match located exactly at `pos` is passed over.
        fn find_first_match_from(
            &self,
            pos: usize,
            skip_empty_at_pos: bool,
        ) -> Option<super::Match> {
            let mut matches = self.regex.find_from(self.haystack, pos);
            Iterator::find(&mut matches, |m| {
                !Self::is_reported_empty(m, pos, skip_empty_at_pos)
            })
        }

        // Find the last match ending at or before `pos`, by walking all
        // matches from the start of the haystack. When `skip_empty_at_pos`
        // is set, an empty match located exactly at `pos` is passed over.
        fn find_last_match_before(
            &self,
            pos: usize,
            skip_empty_at_pos: bool,
        ) -> Option<super::Match> {
            let mut last_match = None;
            for m in self.regex.find_from(self.haystack, 0) {
                if m.end() > pos {
                    break;
                }
                if !Self::is_reported_empty(&m, pos, skip_empty_at_pos) {
                    last_match = Some(m);
                }
            }
            last_match
        }
    }

    // SAFETY: every index returned is 0, `haystack.len()`, or the start/end
    // of a match reported by `Regex::find_from` on `haystack`. This relies on
    // the regex engine reporting match ranges on UTF-8 boundaries of the
    // `&str` it searched.
    unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> {
        fn haystack(&self) -> &'t str {
            self.haystack
        }

        fn next(&mut self) -> SearchStep {
            if self.done {
                return SearchStep::Done;
            }

            let pos = self.current_pos;
            match self.find_first_match_from(pos, self.empty_match_reported) {
                Some(m) if m.start() > pos => {
                    // Report the gap first; the match itself is found again
                    // (now starting exactly at the cursor) on the next call.
                    self.current_pos = m.start();
                    self.empty_match_reported = false;
                    SearchStep::Reject(pos, m.start())
                }
                Some(m) => {
                    // The cursor stays put after an empty match; the flag
                    // makes the next call look past it, so the following
                    // character is reported as a Reject instead of being
                    // skipped silently.
                    self.current_pos = m.end();
                    self.empty_match_reported = m.start() == m.end();
                    SearchStep::Match(m.start(), m.end())
                }
                None => {
                    // No more matches, reject remaining text if any
                    self.done = true;
                    let end = self.haystack.len();
                    if pos < end {
                        self.current_pos = end;
                        SearchStep::Reject(pos, end)
                    } else {
                        SearchStep::Done
                    }
                }
            }
        }
    }

    // SAFETY: same argument as for `Searcher`: all indices are 0,
    // `haystack.len()`, or match boundaries reported by the regex engine.
    unsafe impl<'r, 't> ReverseSearcher<'t> for RegexSearcher<'r, 't> {
        fn next_back(&mut self) -> SearchStep {
            if self.reverse_done {
                return SearchStep::Done;
            }

            let pos = self.reverse_pos;
            match self.find_last_match_before(pos, self.reverse_empty_match_reported) {
                Some(m) if m.end() < pos => {
                    // Report the gap between the match and the cursor first.
                    self.reverse_pos = m.end();
                    self.reverse_empty_match_reported = false;
                    SearchStep::Reject(m.end(), pos)
                }
                Some(m) => {
                    // As in `next`, the cursor does not jump over a character
                    // after an empty match, so a match ending at the cursor
                    // is still found and nothing is left unreported.
                    self.reverse_pos = m.start();
                    self.reverse_empty_match_reported = m.start() == m.end();
                    SearchStep::Match(m.start(), m.end())
                }
                None => {
                    // No more matches, reject remaining text if any
                    self.reverse_done = true;
                    if pos > 0 {
                        self.reverse_pos = 0;
                        SearchStep::Reject(0, pos)
                    } else {
                        SearchStep::Done
                    }
                }
            }
        }
    }

    impl<'r> Pattern for &'r Regex {
        type Searcher<'a> = RegexSearcher<'r, 'a>;

        fn into_searcher(self, haystack: &str) -> Self::Searcher<'_> {
            RegexSearcher::new(self, haystack)
        }
    }
}
874
875#[cfg(feature = "pattern")]
876pub use pattern_impl::*;
877
878// Support for using regress with different regex backends.
879// Currently there is only the classical backtracking, and PikeVM.
880#[doc(hidden)]
881pub mod backends {
882 use super::Regex;
883 use super::exec;
884 use super::indexing;
885 pub use crate::emit::emit;
886 pub use crate::optimizer::optimize;
887 pub use crate::parse::try_parse;
888
889 /// An Executor using the classical backtracking algorithm.
890 pub type BacktrackExecutor<'r, 't> =
891 super::classicalbacktrack::BacktrackExecutor<'r, indexing::Utf8Input<'t>>;
892
893 /// A Executor using the PikeVM executor.
894 #[cfg(feature = "backend-pikevm")]
895 pub type PikeVMExecutor<'r, 't> = super::pikevm::PikeVMExecutor<'r, indexing::Utf8Input<'t>>;
896
897 /// An alias type to the default Executor.
898 pub type DefaultExecutor<'r, 't> = BacktrackExecutor<'r, 't>;
899
900 /// An alias type to the default executor's ASCII form.
901 pub type DefaultAsciiExecutor<'r, 't> =
902 <DefaultExecutor<'r, 't> as exec::Executor<'r, 't>>::AsAscii;
903
904 /// Searches `text`, returning an iterator over non-overlapping matches.
905 pub fn find<'r, 't, Executor: exec::Executor<'r, 't>>(
906 re: &'r Regex,
907 text: &'t str,
908 start: usize,
909 ) -> exec::Matches<Executor> {
910 exec::Matches::new(Executor::new(&re.cr, text), start)
911 }
912
913 /// Searches `text`, returning an iterator over non-overlapping matches.
914 /// This is a convenience method to avoid E0223.
915 pub fn find_ascii<'r, 't, Executor: exec::Executor<'r, 't>>(
916 re: &'r Regex,
917 text: &'t str,
918 start: usize,
919 ) -> exec::Matches<Executor::AsAscii> {
920 find::<Executor::AsAscii>(re, text, start)
921 }
922}
923
/// Escapes all special regex characters in a string to make it a literal match.
///
/// Returns a copy of `text` in which each of the characters
/// `\ ^ $ . | ? * + ( ) [ ] { }` is preceded by a backslash, so the result
/// can be used as a literal pattern in a regular expression. All other
/// characters are copied unchanged.
///
/// # Example
///
/// ```
/// use regress::escape;
///
/// let escaped = escape("Hello. How are you?");
/// assert_eq!(escaped, "Hello\\. How are you\\?");
///
/// let escaped = escape("$100 + tax (15%)");
/// assert_eq!(escaped, "\\$100 \\+ tax \\(15%\\)");
/// ```
pub fn escape(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());

    for ch in text.chars() {
        // Characters that have special meaning in regex get a backslash first.
        let is_special = matches!(
            ch,
            '\\' | '^' | '$' | '.' | '|' | '?' | '*' | '+' | '(' | ')' | '[' | ']' | '{' | '}'
        );
        if is_special {
            escaped.push('\\');
        }
        escaped.push(ch);
    }

    escaped
}