regress/
api.rs

1use crate::classicalbacktrack;
2use crate::emit;
3use crate::exec;
4use crate::indexing;
5use crate::insn::CompiledRegex;
6use crate::optimizer;
7use crate::parse;
8use crate::types::MAX_CAPTURE_GROUPS;
9use std::iter::FusedIterator;
10
11#[cfg(feature = "utf16")]
12use crate::{
13    classicalbacktrack::MatchAttempter,
14    indexing::{InputIndexer, Ucs2Input, Utf16Input},
15};
16
17#[cfg(feature = "backend-pikevm")]
18use crate::pikevm;
19use crate::util::to_char_sat;
20
21use core::{fmt, str::FromStr};
22#[cfg(feature = "std")]
23#[cfg(not(feature = "std"))]
24use {
25    alloc::{string::String, vec::Vec},
26    hashbrown::{HashMap, hash_map::Iter},
27};
28
29pub use parse::Error;
30
31/// Flags used to control regex parsing.
32/// The default flags are case-sensitive, not-multiline, and optimizing.
33#[derive(Debug, Copy, Clone, Default)]
34pub struct Flags {
35    /// If set, make the regex case-insensitive.
36    /// Equivalent to the 'i' flag in JavaScript.
37    pub icase: bool,
38
39    /// If set, ^ and $ match at line separators, not just the input boundaries.
40    /// Equivalent to the 'm' flag in JavaScript.
41    pub multiline: bool,
42
43    /// If set, . matches at line separators as well as any other character.
44    /// Equivalent to the 'm' flag in JavaScript.
45    pub dot_all: bool,
46
47    /// If set, disable regex IR passes.
48    pub no_opt: bool,
49
50    /// If set, the regex is interpreted as a Unicode regex.
51    /// Equivalent to the 'u' flag in JavaScript.
52    pub unicode: bool,
53
54    /// If set, the regex is interpreted as a UnicodeSets regex.
55    /// Equivalent to the 'v' flag in JavaScript.
56    pub unicode_sets: bool,
57}
58
59impl Flags {
60    /// Construct a Flags from a Unicode codepoints iterator, using JavaScript field names.
61    /// 'i' means to ignore case, 'm' means multiline, 'u' means unicode.
62    /// Note the 'g' flag implies a stateful regex and is not supported.
63    /// Other flags are not implemented and are ignored.
64    #[inline]
65    pub fn new<T: Iterator<Item = u32>>(chars: T) -> Self {
66        let mut result = Self::default();
67        for c in chars {
68            match to_char_sat(c) {
69                'm' => {
70                    result.multiline = true;
71                }
72                'i' => {
73                    result.icase = true;
74                }
75                's' => {
76                    result.dot_all = true;
77                }
78                'u' => {
79                    result.unicode = true;
80                }
81                'v' => {
82                    result.unicode_sets = true;
83                }
84                _ => {
85                    // Silently skip unsupported flags.
86                }
87            }
88        }
89        result
90    }
91}
92
93impl From<&str> for Flags {
94    /// Construct a Flags from a string, using JavaScript field names.
95    ///
96    /// See also: [`Flags::new`].
97    #[inline]
98    fn from(s: &str) -> Self {
99        Self::new(s.chars().map(u32::from))
100    }
101}
102
103impl fmt::Display for Flags {
104    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
105        if self.multiline {
106            f.write_str("m")?;
107        }
108        if self.icase {
109            f.write_str("i")?;
110        }
111        if self.dot_all {
112            f.write_str("s")?;
113        }
114        if self.unicode {
115            f.write_str("u")?;
116        }
117        Ok(())
118    }
119}
120
121/// Range is used to express the extent of a match, as indexes into the input
122/// string.
123pub type Range = core::ops::Range<usize>;
124
125/// An iterator type which yields `Match`es found in a string.
126pub type Matches<'r, 't> = exec::Matches<backends::DefaultExecutor<'r, 't>>;
127
128/// An iterator type which yields `Match`es found in a string, supporting ASCII
129/// only.
130pub type AsciiMatches<'r, 't> = exec::Matches<backends::DefaultAsciiExecutor<'r, 't>>;
131
132/// A Match represents a portion of a string which was found to match a Regex.
133#[derive(Debug, Clone)]
134pub struct Match {
135    /// The total range of the match. Note this may be empty, if the regex
136    /// matched an empty string.
137    pub range: Range,
138
139    /// The list of captures. This has length equal to the number of capturing
140    /// groups in the regex. For each capture, if the value is None, that group
141    /// did not match (for example, it was in a not-taken branch of an
142    /// alternation). If the value is Some, the group did match with the
143    /// enclosed range.
144    pub captures: Vec<Option<Range>>,
145
146    // A list of capture group names. This is either:
147    //   - Empty, if there were no named capture groups.
148    //   - A list of names with length `captures.len()`, corresponding to the
149    //     capture group names in order. Groups without names have an empty string.
150    pub(crate) group_names: Box<[Box<str>]>,
151}
152
153impl Match {
154    /// Access a group by index, using the convention of Python's group()
155    /// function. Index 0 is the total match, index 1 is the first capture
156    /// group.
157    #[inline]
158    pub fn group(&self, idx: usize) -> Option<Range> {
159        if idx == 0 {
160            Some(self.range.clone())
161        } else if idx <= self.captures.len() {
162            self.captures[idx - 1].clone()
163        } else {
164            None
165        }
166    }
167
168    /// Access a named group by name.
169    #[inline]
170    pub fn named_group(&self, name: &str) -> Option<Range> {
171        // Empty strings are used as sentinels to indicate unnamed group.
172        if name.is_empty() {
173            return None;
174        }
175        let pos = self.group_names.iter().position(|s| s.as_ref() == name)?;
176        self.captures[pos].clone()
177    }
178
179    /// Return an iterator over the named groups of a Match.
180    #[inline]
181    pub fn named_groups(&self) -> NamedGroups<'_> {
182        NamedGroups::new(self)
183    }
184
185    /// Returns the range over the starting and ending byte offsets of the match in the haystack.
186    ///
187    /// This is a convenience function to work around
188    /// the fact that Range does not support Copy.
189    #[inline]
190    pub fn range(&self) -> Range {
191        self.range.clone()
192    }
193
194    /// Returns the starting byte offset of the match in the haystack.
195    #[inline]
196    pub fn start(&self) -> usize {
197        self.range.start
198    }
199
200    /// Returns the ending byte offset of the match in the haystack.
201    #[inline]
202    pub fn end(&self) -> usize {
203        self.range.end
204    }
205
206    /// Returns the matched text as a string slice.
207    ///
208    /// # Examples
209    ///
210    /// ```rust
211    /// use regress::Regex;
212    ///
213    /// let re = Regex::new(r"\d+").unwrap();
214    /// let text = "Price: $123";
215    /// let m = re.find(text).unwrap();
216    /// assert_eq!(m.as_str(text), "123");
217    /// ```
218    #[inline]
219    pub fn as_str<'t>(&self, text: &'t str) -> &'t str {
220        &text[self.range()]
221    }
222
223    /// Return an iterator over a Match. The first returned value is the total
224    /// match, and subsequent values represent the capture groups.
225    #[inline]
226    pub fn groups(&self) -> Groups<'_> {
227        Groups::new(self)
228    }
229}
230
231/// An iterator over the capture groups of a [`Match`]
232///
233/// This struct is created by the [`groups`] method on [`Match`].
234///
235/// [`Match`]: ../struct.Match.html
236/// [`groups`]: ../struct.Match.html#method.groups
237#[derive(Clone)]
238pub struct Groups<'m> {
239    mat: &'m Match,
240
241    // The next group index to return, where 0 references the total match.
242    next_group_idx: usize,
243
244    // The maximum group index to return, with a +1 for the implicit total match.
245    // For example, in a regex with 1 capture group, this will be 2.
246    max: usize,
247}
248
249impl<'m> Groups<'m> {
250    #[inline]
251    fn new(mat: &'m Match) -> Self {
252        Self {
253            mat,
254            next_group_idx: 0,
255            max: mat.captures.len() + 1, // +1 for the total match
256        }
257    }
258}
259
260impl Iterator for Groups<'_> {
261    type Item = Option<Range>;
262
263    #[inline]
264    fn next(&mut self) -> Option<Self::Item> {
265        let i = self.next_group_idx;
266        if i < self.max {
267            self.next_group_idx += 1;
268            Some(self.mat.group(i))
269        } else {
270            None
271        }
272    }
273
274    fn size_hint(&self) -> (usize, Option<usize>) {
275        let size = self.max.saturating_sub(self.next_group_idx);
276        (size, Some(size))
277    }
278}
279
280impl<'m> ExactSizeIterator for Groups<'m> {}
281impl<'m> FusedIterator for Groups<'m> {}
282
283/// An iterator over the named capture groups of a [`Match`]
284///
285/// This struct is created by the [`named_groups`] method on [`Match`].
286///
287/// [`Match`]: ../struct.Match.html
288/// [`named_groups`]: ../struct.Match.html#method.named_groups
289#[derive(Clone)]
290pub struct NamedGroups<'m> {
291    mat: &'m Match,
292
293    // The next group name index to return.
294    // Note unlike `Groups` this does NOT include the implicit total match.
295    // That is, group 0 is the first capture group, NOT the total match.
296    next_group_idx: usize,
297}
298
299impl<'m> NamedGroups<'m> {
300    #[inline]
301    fn new(mat: &'m Match) -> Self {
302        Self {
303            mat,
304            next_group_idx: 0,
305        }
306    }
307}
308
309impl<'m> Iterator for NamedGroups<'m> {
310    type Item = (&'m str, Option<Range>);
311
312    #[inline]
313    fn next(&mut self) -> Option<Self::Item> {
314        // Increment next_group_idx until we find a non-empty name.
315        debug_assert!(self.next_group_idx <= self.mat.group_names.len());
316        let end = self.mat.group_names.len();
317        let mut idx = self.next_group_idx;
318        while idx < end && self.mat.group_names[idx].is_empty() {
319            idx += 1;
320        }
321        if idx == end {
322            return None;
323        }
324        let name = self.mat.group_names[idx].as_ref();
325        let range = self.mat.captures[idx].clone();
326        self.next_group_idx = idx + 1;
327        Some((name, range))
328    }
329
330    fn size_hint(&self) -> (usize, Option<usize>) {
331        let size = self.mat.group_names[self.next_group_idx..]
332            .iter()
333            .filter(|s| !s.is_empty())
334            .count();
335
336        (size, Some(size))
337    }
338}
339
340impl<'m> ExactSizeIterator for NamedGroups<'m> {}
341impl<'m> FusedIterator for NamedGroups<'m> {}
342
343/// A Regex is the compiled version of a pattern.
344#[derive(Debug, Clone)]
345pub struct Regex {
346    cr: CompiledRegex,
347}
348
349impl From<CompiledRegex> for Regex {
350    fn from(cr: CompiledRegex) -> Self {
351        Self { cr }
352    }
353}
354
355impl Regex {
356    /// Construct a regex by parsing `pattern` using the default flags.
357    /// An Error may be returned if the syntax is invalid.
358    /// Note that this is rather expensive; prefer to cache a Regex which is
359    /// intended to be used more than once.
360    #[inline]
361    pub fn new(pattern: &str) -> Result<Regex, Error> {
362        Self::with_flags(pattern, Flags::default())
363    }
364
365    /// Construct a regex by parsing `pattern` with `flags`.
366    /// An Error may be returned if the syntax is invalid.
367    //
368    /// Note it is preferable to cache a Regex which is intended to be used more
369    /// than once, as the parse may be expensive. For example:
370    #[inline]
371    pub fn with_flags<F>(pattern: &str, flags: F) -> Result<Regex, Error>
372    where
373        F: Into<Flags>,
374    {
375        Self::from_unicode(pattern.chars().map(u32::from), flags)
376    }
377
378    /// Construct a regex by parsing `pattern` with `flags`, where
379    /// `pattern` is an iterator of `u32` Unicode codepoints.
380    /// An Error may be returned if the syntax is invalid.
381    /// This allows parsing regular expressions from exotic strings in
382    /// other encodings, such as UTF-16 or UTF-32.
383    pub fn from_unicode<I, F>(pattern: I, flags: F) -> Result<Regex, Error>
384    where
385        I: Iterator<Item = u32> + Clone,
386        F: Into<Flags>,
387    {
388        let flags = flags.into();
389        let mut ire = parse::try_parse(pattern, flags)?;
390        if !flags.no_opt {
391            optimizer::optimize(&mut ire);
392        }
393        let cr = emit::emit(&ire);
394        Ok(Regex { cr })
395    }
396
397    /// Searches `text` to find the first match.
398    #[inline]
399    pub fn find(&self, text: &str) -> Option<Match> {
400        self.find_iter(text).next()
401    }
402
403    /// Searches `text`, returning an iterator over non-overlapping matches.
404    /// Note that the resulting Iterator borrows both the regex `'r` and the
405    /// input string as `'t`.
406    #[inline]
407    pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
408        self.find_from(text, 0)
409    }
410
411    /// Returns an iterator for matches found in 'text' starting at byte index
412    /// `start`. Note this may be different from passing a sliced `text` in
413    /// the case of lookbehind assertions.
414    /// Example:
415    ///
416    ///  ```rust
417    ///   use regress::Regex;
418    ///   let text = "xyxy";
419    ///   let re = Regex::new(r"(?<=x)y").unwrap();
420    ///   let t1 = re.find(&text[1..]).unwrap().range();
421    ///   assert!(t1 == (2..3));
422    ///   let t2 = re.find_from(text, 1).next().unwrap().range();
423    ///   assert!(t2 == (1..2));
424    ///   ```
425    #[inline]
426    pub fn find_from<'r, 't>(&'r self, text: &'t str, start: usize) -> Matches<'r, 't> {
427        backends::find(self, text, start)
428    }
429
430    /// Searches `text` to find the first match.
431    /// The input text is expected to be ascii-only: only ASCII case-folding is
432    /// supported.
433    #[inline]
434    pub fn find_ascii(&self, text: &str) -> Option<Match> {
435        self.find_iter_ascii(text).next()
436    }
437
438    /// Searches `text`, returning an iterator over non-overlapping matches.
439    /// The input text is expected to be ascii-only: only ASCII case-folding is
440    /// supported.
441    #[inline]
442    pub fn find_iter_ascii<'r, 't>(&'r self, text: &'t str) -> AsciiMatches<'r, 't> {
443        self.find_from_ascii(text, 0)
444    }
445
446    /// Returns an iterator for matches found in 'text' starting at byte index
447    /// `start`.
448    #[inline]
449    pub fn find_from_ascii<'r, 't>(&'r self, text: &'t str, start: usize) -> AsciiMatches<'r, 't> {
450        backends::find(self, text, start)
451    }
452
453    /// Returns an iterator for matches found in 'text' starting at index `start`.
454    #[cfg(feature = "utf16")]
455    pub fn find_from_utf16<'r, 't>(
456        &'r self,
457        text: &'t [u16],
458        start: usize,
459    ) -> exec::Matches<super::classicalbacktrack::BacktrackExecutor<'r, indexing::Utf16Input<'t>>>
460    {
461        let input = Utf16Input::new(text, self.cr.flags.unicode);
462        exec::Matches::new(
463            super::classicalbacktrack::BacktrackExecutor::new(
464                input,
465                MatchAttempter::new(&self.cr, input.left_end()),
466            ),
467            start,
468        )
469    }
470
471    /// Returns an iterator for matches found in 'text' starting at index `start`.
472    #[cfg(feature = "utf16")]
473    pub fn find_from_ucs2<'r, 't>(
474        &'r self,
475        text: &'t [u16],
476        start: usize,
477    ) -> exec::Matches<super::classicalbacktrack::BacktrackExecutor<'r, indexing::Ucs2Input<'t>>>
478    {
479        let input = Ucs2Input::new(text, self.cr.flags.unicode);
480        exec::Matches::new(
481            super::classicalbacktrack::BacktrackExecutor::new(
482                input,
483                MatchAttempter::new(&self.cr, input.left_end()),
484            ),
485            start,
486        )
487    }
488
489    /// Replaces the first match of the regex in `text` with the replacement string.
490    ///
491    /// The replacement string may contain capture group references in the form `$1`, `$2`, etc.,
492    /// where `$1` refers to the first capture group, `$2` to the second, and so on.
493    /// `$0` refers to the entire match. Use `$$` to insert a literal `$`.
494    ///
495    /// If no match is found, the original text is returned unchanged.
496    ///
497    /// # Examples
498    ///
499    /// ```rust
500    /// use regress::Regex;
501    ///
502    /// let re = Regex::new(r"(\w+)\s+(\w+)").unwrap();
503    /// let result = re.replace("hello world", "$2 $1");
504    /// assert_eq!(result, "world hello");
505    ///
506    /// let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
507    /// let result = re.replace("2023-12-25", "$2/$3/$1");
508    /// assert_eq!(result, "12/25/2023");
509    /// ```
510    pub fn replace(&self, text: &str, replacement: &str) -> String {
511        match self.find(text) {
512            Some(m) => {
513                let mut result = String::with_capacity(text.len());
514                result.push_str(&text[..m.start()]);
515                self.expand_replacement(&m, text, replacement, &mut result);
516                result.push_str(&text[m.end()..]);
517                result
518            }
519            None => text.to_string(),
520        }
521    }
522
523    /// Replaces all matches of the regex in `text` with the replacement string.
524    ///
525    /// The replacement string may contain capture group references in the form `$1`, `$2`, etc.,
526    /// where `$1` refers to the first capture group, `$2` to the second, and so on.
527    /// `$0` refers to the entire match. Use `$$` to insert a literal `$`.
528    ///
529    /// # Examples
530    ///
531    /// ```rust
532    /// use regress::Regex;
533    ///
534    /// let re = Regex::new(r"(\w+)\s+(\w+)").unwrap();
535    /// let result = re.replace_all("hello world foo bar", "$2-$1");
536    /// assert_eq!(result, "world-hello bar-foo");
537    ///
538    /// let re = Regex::new(r"\b(\w)(\w+)").unwrap();
539    /// let result = re.replace_all("hello world", "$1.$2");
540    /// assert_eq!(result, "h.ello w.orld");
541    /// ```
542    pub fn replace_all(&self, text: &str, replacement: &str) -> String {
543        let mut result = String::with_capacity(text.len());
544        let mut last_end = 0;
545
546        for m in self.find_iter(text) {
547            result.push_str(&text[last_end..m.start()]);
548            self.expand_replacement(&m, text, replacement, &mut result);
549            last_end = m.end();
550        }
551
552        result.push_str(&text[last_end..]);
553        result
554    }
555
556    /// Replaces the first match of the regex in `text` using a closure.
557    ///
558    /// The closure receives a `&Match` and should return the replacement string.
559    /// This is useful for dynamic replacements that depend on the match details.
560    ///
561    /// If no match is found, the original text is returned unchanged.
562    ///
563    /// # Examples
564    ///
565    /// ```rust
566    /// use regress::Regex;
567    ///
568    /// let re = Regex::new(r"\d+").unwrap();
569    /// let text = "Price: $123";
570    /// let result = re.replace_with(text, |m| {
571    ///     let num: i32 = m.as_str(text).parse().unwrap();
572    ///     format!("{}", num * 2)
573    /// });
574    /// assert_eq!(result, "Price: $246");
575    /// ```
576    pub fn replace_with<F>(&self, text: &str, replacement: F) -> String
577    where
578        F: FnOnce(&Match) -> String,
579    {
580        match self.find(text) {
581            Some(m) => {
582                let mut result = String::with_capacity(text.len());
583                result.push_str(&text[..m.start()]);
584                result.push_str(&replacement(&m));
585                result.push_str(&text[m.end()..]);
586                result
587            }
588            None => text.to_string(),
589        }
590    }
591
592    /// Replaces all matches of the regex in `text` using a closure.
593    ///
594    /// The closure receives a `&Match` and should return the replacement string.
595    /// This is useful for dynamic replacements that depend on the match details.
596    ///
597    /// # Examples
598    ///
599    /// ```rust
600    /// use regress::Regex;
601    ///
602    /// let re = Regex::new(r"\d+").unwrap();
603    /// let text = "Items: 5, 10, 15";
604    /// let result = re.replace_all_with(text, |m| {
605    ///     let num: i32 = m.as_str(text).parse().unwrap();
606    ///     format!("[{}]", num * 10)
607    /// });
608    /// assert_eq!(result, "Items: [50], [100], [150]");
609    /// ```
610    pub fn replace_all_with<F>(&self, text: &str, replacement: F) -> String
611    where
612        F: Fn(&Match) -> String,
613    {
614        let mut result = String::with_capacity(text.len());
615        let mut last_end = 0;
616
617        for m in self.find_iter(text) {
618            result.push_str(&text[last_end..m.start()]);
619            result.push_str(&replacement(&m));
620            last_end = m.end();
621        }
622
623        result.push_str(&text[last_end..]);
624        result
625    }
626
627    /// Helper method to expand replacement strings with capture group substitutions.
628    fn expand_replacement(&self, m: &Match, text: &str, replacement: &str, output: &mut String) {
629        let mut chars = replacement.chars().peekable();
630
631        while let Some(ch) = chars.next() {
632            if ch == '$' {
633                match chars.peek() {
634                    Some('$') => {
635                        // $$ -> literal $
636                        chars.next();
637                        output.push('$');
638                    }
639                    Some(&digit) if digit.is_ascii_digit() => {
640                        // Parse the group number
641                        let mut group_num = 0;
642                        while let Some(&digit) = chars.peek() {
643                            if digit.is_ascii_digit() {
644                                chars.next();
645                                group_num = group_num * 10 + (digit as u32 - '0' as u32) as usize;
646                                // Limit to reasonable group numbers to avoid overflow
647                                if group_num > MAX_CAPTURE_GROUPS {
648                                    break;
649                                }
650                            } else {
651                                break;
652                            }
653                        }
654
655                        // Get the matched text for this group
656                        if let Some(range) = m.group(group_num) {
657                            output.push_str(&text[range]);
658                        }
659                        // If group doesn't exist or didn't match, add nothing
660                    }
661                    Some('{') => {
662                        // Handle ${name} syntax for named groups
663                        chars.next(); // consume '{'
664                        let mut name = String::new();
665                        let mut found_closing_brace = false;
666
667                        for ch in chars.by_ref() {
668                            if ch == '}' {
669                                found_closing_brace = true;
670                                break;
671                            }
672                            name.push(ch);
673                        }
674
675                        if found_closing_brace {
676                            if let Some(range) = m.named_group(&name) {
677                                output.push_str(&text[range]);
678                            }
679                        } else {
680                            // Malformed ${...}, treat as literal
681                            output.push_str("${");
682                            output.push_str(&name);
683                        }
684                    }
685                    _ => {
686                        // Just a $ at end or followed by non-digit
687                        output.push('$');
688                    }
689                }
690            } else {
691                output.push(ch);
692            }
693        }
694    }
695}
696
697impl FromStr for Regex {
698    type Err = Error;
699
700    /// Attempts to parse a string into a regular expression
701    #[inline]
702    fn from_str(s: &str) -> Result<Self, Error> {
703        Self::new(s)
704    }
705}
706
707// Pattern trait implementation for str::find, str::contains, etc.
708#[cfg(feature = "pattern")]
709mod pattern_impl {
710    use super::*;
711    use core::str::pattern::{Pattern, ReverseSearcher, SearchStep, Searcher};
712
713    /// A searcher for a regex pattern.
714    pub struct RegexSearcher<'r, 't> {
715        haystack: &'t str,
716        regex: &'r Regex,
717        current_pos: usize,
718        done: bool,
719        // For reverse searching
720        reverse_pos: usize,
721        reverse_done: bool,
722    }
723
724    impl<'r, 't> RegexSearcher<'r, 't> {
725        fn new(regex: &'r Regex, haystack: &'t str) -> Self {
726            Self {
727                haystack,
728                regex,
729                current_pos: 0,
730                done: false,
731                reverse_pos: haystack.len(),
732                reverse_done: false,
733            }
734        }
735
736        fn find_last_match_before(&self, pos: usize) -> Option<super::Match> {
737            // Find all matches up to the given position and return the last one
738            let mut last_match = None;
739            for m in self.regex.find_from(self.haystack, 0) {
740                if m.end() <= pos {
741                    last_match = Some(m);
742                } else {
743                    break;
744                }
745            }
746            last_match
747        }
748    }
749
750    unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> {
751        fn haystack(&self) -> &'t str {
752            self.haystack
753        }
754
755        fn next(&mut self) -> SearchStep {
756            if self.done {
757                return SearchStep::Done;
758            }
759
760            // Try to find the next match starting from current position
761            if let Some(m) = self.regex.find_from(self.haystack, self.current_pos).next() {
762                let match_start = m.start();
763                let match_end = m.end();
764
765                // Handle any gap between current position and match start
766                if self.current_pos < match_start {
767                    let reject_end = match_start;
768                    let reject_start = self.current_pos;
769                    self.current_pos = match_start;
770                    return SearchStep::Reject(reject_start, reject_end);
771                }
772
773                // Return the match
774                self.current_pos = match_end;
775
776                // Handle zero-width matches to avoid infinite loops
777                if match_start == match_end {
778                    // For zero-width matches, we need to advance at least one byte
779                    // to avoid infinite loops
780                    if match_end < self.haystack.len() {
781                        // Find the next character boundary
782                        let mut next_pos = match_end + 1;
783                        while next_pos < self.haystack.len()
784                            && !self.haystack.is_char_boundary(next_pos)
785                        {
786                            next_pos += 1;
787                        }
788                        self.current_pos = next_pos;
789                    } else {
790                        // We're at the end of the string
791                        self.done = true;
792                    }
793                }
794
795                SearchStep::Match(match_start, match_end)
796            } else {
797                // No more matches, reject remaining text if any
798                if self.current_pos < self.haystack.len() {
799                    let reject_start = self.current_pos;
800                    let reject_end = self.haystack.len();
801                    self.current_pos = self.haystack.len();
802                    self.done = true;
803                    SearchStep::Reject(reject_start, reject_end)
804                } else {
805                    self.done = true;
806                    SearchStep::Done
807                }
808            }
809        }
810    }
811
812    unsafe impl<'r, 't> ReverseSearcher<'t> for RegexSearcher<'r, 't> {
813        fn next_back(&mut self) -> SearchStep {
814            if self.reverse_done {
815                return SearchStep::Done;
816            }
817
818            // Try to find the last match before current reverse position
819            if let Some(m) = self.find_last_match_before(self.reverse_pos) {
820                let match_start = m.start();
821                let match_end = m.end();
822
823                // Handle any gap between match end and current reverse position
824                if match_end < self.reverse_pos {
825                    let reject_start = match_end;
826                    let reject_end = self.reverse_pos;
827                    self.reverse_pos = match_end;
828                    return SearchStep::Reject(reject_start, reject_end);
829                }
830
831                // Return the match
832                self.reverse_pos = match_start;
833
834                // Handle zero-width matches
835                if match_start == match_end {
836                    // For zero-width matches, move back by one character
837                    if match_start > 0 {
838                        let mut prev_pos = match_start - 1;
839                        while prev_pos > 0 && !self.haystack.is_char_boundary(prev_pos) {
840                            prev_pos -= 1;
841                        }
842                        self.reverse_pos = prev_pos;
843                    } else {
844                        // We're at the beginning of the string
845                        self.reverse_done = true;
846                    }
847                }
848
849                SearchStep::Match(match_start, match_end)
850            } else {
851                // No more matches, reject remaining text if any
852                if self.reverse_pos > 0 {
853                    let reject_start = 0;
854                    let reject_end = self.reverse_pos;
855                    self.reverse_pos = 0;
856                    self.reverse_done = true;
857                    SearchStep::Reject(reject_start, reject_end)
858                } else {
859                    self.reverse_done = true;
860                    SearchStep::Done
861                }
862            }
863        }
864    }
865
866    impl<'r> Pattern for &'r Regex {
867        type Searcher<'a> = RegexSearcher<'r, 'a>;
868
869        fn into_searcher(self, haystack: &str) -> Self::Searcher<'_> {
870            RegexSearcher::new(self, haystack)
871        }
872    }
873}
874
875#[cfg(feature = "pattern")]
876pub use pattern_impl::*;
877
878// Support for using regress with different regex backends.
879// Currently there is only the classical backtracking, and PikeVM.
880#[doc(hidden)]
881pub mod backends {
882    use super::Regex;
883    use super::exec;
884    use super::indexing;
885    pub use crate::emit::emit;
886    pub use crate::optimizer::optimize;
887    pub use crate::parse::try_parse;
888
889    /// An Executor using the classical backtracking algorithm.
890    pub type BacktrackExecutor<'r, 't> =
891        super::classicalbacktrack::BacktrackExecutor<'r, indexing::Utf8Input<'t>>;
892
893    /// A Executor using the PikeVM executor.
894    #[cfg(feature = "backend-pikevm")]
895    pub type PikeVMExecutor<'r, 't> = super::pikevm::PikeVMExecutor<'r, indexing::Utf8Input<'t>>;
896
897    /// An alias type to the default Executor.
898    pub type DefaultExecutor<'r, 't> = BacktrackExecutor<'r, 't>;
899
900    /// An alias type to the default executor's ASCII form.
901    pub type DefaultAsciiExecutor<'r, 't> =
902        <DefaultExecutor<'r, 't> as exec::Executor<'r, 't>>::AsAscii;
903
904    /// Searches `text`, returning an iterator over non-overlapping matches.
905    pub fn find<'r, 't, Executor: exec::Executor<'r, 't>>(
906        re: &'r Regex,
907        text: &'t str,
908        start: usize,
909    ) -> exec::Matches<Executor> {
910        exec::Matches::new(Executor::new(&re.cr, text), start)
911    }
912
913    /// Searches `text`, returning an iterator over non-overlapping matches.
914    /// This is a convenience method to avoid E0223.
915    pub fn find_ascii<'r, 't, Executor: exec::Executor<'r, 't>>(
916        re: &'r Regex,
917        text: &'t str,
918        start: usize,
919    ) -> exec::Matches<Executor::AsAscii> {
920        find::<Executor::AsAscii>(re, text, start)
921    }
922}
923
924/// Escapes all special regex characters in a string to make it a literal match.
925///
926/// This function takes a string and returns a new string with all special
927/// regex characters escaped with backslashes, so the resulting string can be
928/// used as a literal pattern in a regular expression.
929///
930/// # Example
931///
932/// ```
933/// use regress::escape;
934///
935/// let escaped = escape("Hello. How are you?");
936/// assert_eq!(escaped, "Hello\\. How are you\\?");
937///
938/// let escaped = escape("$100 + tax (15%)");
939/// assert_eq!(escaped, "\\$100 \\+ tax \\(15%\\)");
940/// ```
941pub fn escape(text: &str) -> String {
942    let mut result = String::with_capacity(text.len());
943
944    for c in text.chars() {
945        match c {
946            // Characters that have special meaning in regex and need escaping
947            '\\' | '^' | '$' | '.' | '|' | '?' | '*' | '+' | '(' | ')' | '[' | ']' | '{' | '}' => {
948                result.push('\\');
949                result.push(c);
950            }
951            // All other characters are literal
952            _ => result.push(c),
953        }
954    }
955
956    result
957}
regress/api.rs

regress/
api.rs