regress/
parse.rs

1//! Parser from regex patterns to IR
2
3use crate::{
4    api, charclasses,
5    codepointset::{CodePointSet, Interval, interval_contains},
6    ir,
7    types::{
8        BracketContents, CaptureGroupID, CaptureGroupName, CharacterClassType, MAX_CAPTURE_GROUPS,
9        MAX_LOOPS,
10    },
11    unicode::{
12        self, PropertyEscapeKind, unicode_property_from_str, unicode_property_name_from_str,
13    },
14    unicodetables::{id_continue_ranges, id_start_ranges},
15    util::to_char_sat,
16};
17use core::{fmt, iter::Peekable};
18#[cfg(feature = "std")]
19use std::collections::HashMap;
20#[cfg(not(feature = "std"))]
21use {
22    alloc::{
23        boxed::Box,
24        string::{String, ToString},
25        vec::Vec,
26    },
27    hashbrown::HashMap,
28};
29
30/// Represents an error encountered during regex compilation.
31///
32/// The text contains a human-readable error message.
33#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
34pub struct Error {
35    pub text: String,
36}
37
38impl fmt::Display for Error {
39    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
40        f.write_str(&self.text)
41    }
42}
43
44#[cfg(feature = "std")]
45impl std::error::Error for Error {}
46
47enum ClassAtom {
48    CodePoint(u32),
49    CharacterClass {
50        class_type: CharacterClassType,
51        positive: bool,
52    },
53    Range {
54        iv: CodePointSet,
55        negate: bool,
56    },
57}
58
59/// Represents the result of a class set.
60#[derive(Debug, Clone)]
61struct ClassSet {
62    codepoints: CodePointSet,
63    alternatives: ClassSetAlternativeStrings,
64}
65
66impl ClassSet {
67    fn new() -> Self {
68        ClassSet {
69            codepoints: CodePointSet::new(),
70            alternatives: ClassSetAlternativeStrings::new(),
71        }
72    }
73
74    fn node(self, icase: bool, negate_set: bool) -> ir::Node {
75        let codepoints = if icase {
76            unicode::add_icase_code_points(self.codepoints)
77        } else {
78            self.codepoints
79        };
80        if codepoints.is_empty() && self.alternatives.0.is_empty() {
81            ir::Node::Bracket(BracketContents {
82                invert: negate_set,
83                cps: codepoints,
84            })
85        } else if codepoints.is_empty() {
86            make_alt(self.alternatives.to_nodes(icase))
87        } else if self.alternatives.0.is_empty() {
88            ir::Node::Bracket(BracketContents {
89                invert: negate_set,
90                cps: codepoints,
91            })
92        } else {
93            let mut nodes = self.alternatives.to_nodes(icase);
94            nodes.push(ir::Node::Bracket(BracketContents {
95                invert: negate_set,
96                cps: codepoints,
97            }));
98            make_alt(nodes)
99        }
100    }
101
102    fn union_operand(&mut self, operand: ClassSetOperand) {
103        match operand {
104            ClassSetOperand::ClassSetCharacter(c) => {
105                self.codepoints.add_one(c);
106            }
107            ClassSetOperand::CharacterClassEscape(cps) => {
108                self.codepoints.add_set(cps);
109            }
110            ClassSetOperand::Class(class) => {
111                self.codepoints.add_set(class.codepoints);
112                self.alternatives.extend(class.alternatives);
113            }
114            ClassSetOperand::ClassStringDisjunction(s) => {
115                self.alternatives.extend(s);
116            }
117        }
118    }
119
120    fn intersect_operand(&mut self, operand: ClassSetOperand) {
121        match operand {
122            ClassSetOperand::ClassSetCharacter(c) => {
123                if self.codepoints.contains(c) {
124                    self.codepoints =
125                        CodePointSet::from_sorted_disjoint_intervals(Vec::from([Interval {
126                            first: c,
127                            last: c,
128                        }]));
129                } else {
130                    self.codepoints.clear();
131                }
132                let mut found_alternative = false;
133                for alternative in &mut self.alternatives.0 {
134                    if alternative.len() == 1 && alternative[0] == c {
135                        found_alternative = true;
136                        break;
137                    }
138                }
139                self.alternatives.0.clear();
140                if found_alternative {
141                    self.alternatives.0.push(Vec::from([c]));
142                }
143            }
144            ClassSetOperand::CharacterClassEscape(cps) => {
145                self.codepoints.intersect(cps.intervals());
146                let mut retained = Vec::new();
147                for alternative in &self.alternatives.0 {
148                    if alternative.len() == 1 && cps.contains(alternative[0]) {
149                        retained.push(alternative.clone());
150                    }
151                }
152                self.alternatives.0 = retained;
153            }
154            ClassSetOperand::Class(class) => {
155                let mut retained_codepoints = CodePointSet::new();
156                for alternative in &class.alternatives.0 {
157                    if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
158                        retained_codepoints.add_one(alternative[0]);
159                    }
160                }
161                let mut retained_alternatives = ClassSetAlternativeStrings::new();
162                for alternative in &self.alternatives.0 {
163                    if alternative.len() == 1 && class.codepoints.contains(alternative[0]) {
164                        retained_alternatives.0.push(alternative.clone());
165                    }
166                }
167                self.codepoints.intersect(class.codepoints.intervals());
168                self.codepoints.add_set(retained_codepoints);
169                self.alternatives.intersect(class.alternatives);
170                self.alternatives.extend(retained_alternatives);
171            }
172            ClassSetOperand::ClassStringDisjunction(s) => {
173                let mut retained = CodePointSet::new();
174                for alternative in &s.0 {
175                    if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
176                        retained.add_one(alternative[0]);
177                    }
178                }
179                self.codepoints = retained;
180                self.alternatives.intersect(s);
181            }
182        }
183    }
184
185    fn subtract_operand(&mut self, operand: ClassSetOperand) {
186        match operand {
187            ClassSetOperand::ClassSetCharacter(c) => {
188                self.codepoints.remove(&[Interval { first: c, last: c }]);
189                self.alternatives
190                    .remove(ClassSetAlternativeStrings(Vec::from([Vec::from([c])])));
191            }
192            ClassSetOperand::CharacterClassEscape(cps) => {
193                self.codepoints.remove(cps.intervals());
194                let mut to_remove = ClassSetAlternativeStrings::new();
195                for alternative in &self.alternatives.0 {
196                    if alternative.len() == 1 && cps.contains(alternative[0]) {
197                        to_remove.0.push(alternative.clone());
198                    }
199                }
200                self.alternatives.remove(to_remove);
201            }
202            ClassSetOperand::Class(class) => {
203                let mut codepoints_removed = CodePointSet::new();
204                for alternative in &class.alternatives.0 {
205                    if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
206                        codepoints_removed.add_one(alternative[0]);
207                    }
208                }
209                let mut alternatives_removed = ClassSetAlternativeStrings::new();
210                for alternative in &self.alternatives.0 {
211                    if alternative.len() == 1 && class.codepoints.contains(alternative[0]) {
212                        alternatives_removed.0.push(alternative.clone());
213                    }
214                }
215                self.codepoints.remove(codepoints_removed.intervals());
216                self.codepoints.remove(class.codepoints.intervals());
217                self.alternatives.remove(alternatives_removed);
218                self.alternatives.remove(class.alternatives);
219            }
220            ClassSetOperand::ClassStringDisjunction(s) => {
221                let mut to_remove = CodePointSet::new();
222                for alternative in &s.0 {
223                    if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
224                        to_remove.add_one(alternative[0]);
225                    }
226                }
227                self.codepoints.remove(to_remove.intervals());
228                self.alternatives.remove(s);
229            }
230        }
231    }
232}
233
234/// Represents all different types of class set operands.
235#[derive(Debug, Clone)]
236enum ClassSetOperand {
237    ClassSetCharacter(u32),
238    CharacterClassEscape(CodePointSet),
239    Class(ClassSet),
240    ClassStringDisjunction(ClassSetAlternativeStrings),
241}
242
243#[derive(Debug, Clone)]
244struct ClassSetAlternativeStrings(Vec<Vec<u32>>);
245
246impl ClassSetAlternativeStrings {
247    fn new() -> Self {
248        ClassSetAlternativeStrings(Vec::new())
249    }
250
251    fn extend(&mut self, other: Self) {
252        self.0.extend(other.0);
253    }
254
255    fn intersect(&mut self, other: Self) {
256        let mut retained = Vec::new();
257        for string in &self.0 {
258            for other_string in &other.0 {
259                if string == other_string {
260                    retained.push(string.clone());
261                    break;
262                }
263            }
264        }
265        self.0 = retained;
266    }
267
268    fn remove(&mut self, other: Self) {
269        self.0.retain(|string| !other.0.contains(string));
270    }
271
272    fn to_nodes(&self, icase: bool) -> Vec<ir::Node> {
273        let mut nodes = Vec::new();
274        for string in &self.0 {
275            nodes.push(ir::Node::Cat(
276                string
277                    .iter()
278                    .map(|cp| ir::Node::Char { c: *cp, icase })
279                    .collect::<Vec<_>>(),
280            ));
281        }
282        nodes
283    }
284}
285
286fn error<S, T>(text: S) -> Result<T, Error>
287where
288    S: ToString,
289{
290    Err(Error {
291        text: text.to_string(),
292    })
293}
294
295fn make_cat(nodes: ir::NodeList) -> ir::Node {
296    match nodes.len() {
297        0 => ir::Node::Empty,
298        1 => nodes.into_iter().next().unwrap(),
299        _ => ir::Node::Cat(nodes),
300    }
301}
302
303fn make_alt(nodes: ir::NodeList) -> ir::Node {
304    let mut mright = None;
305    for node in nodes.into_iter().rev() {
306        match mright {
307            None => mright = Some(node),
308            Some(right) => mright = Some(ir::Node::Alt(Box::new(node), Box::new(right))),
309        }
310    }
311    mright.unwrap_or(ir::Node::Empty)
312}
313
314/// \return a CodePointSet for a given character escape (positive or negative).
315/// See ES9 21.2.2.12.
316fn codepoints_from_class(ct: CharacterClassType, positive: bool) -> CodePointSet {
317    let mut cps;
318    match ct {
319        CharacterClassType::Digits => {
320            cps = CodePointSet::from_sorted_disjoint_intervals(charclasses::DIGITS.to_vec())
321        }
322        CharacterClassType::Words => {
323            cps = CodePointSet::from_sorted_disjoint_intervals(charclasses::WORD_CHARS.to_vec())
324        }
325        CharacterClassType::Spaces => {
326            cps = CodePointSet::from_sorted_disjoint_intervals(charclasses::WHITESPACE.to_vec());
327            for &iv in charclasses::LINE_TERMINATOR.iter() {
328                cps.add(iv)
329            }
330        }
331    };
332    if !positive {
333        cps = cps.inverted()
334    }
335    cps
336}
337
338/// \return a Bracket for a given character escape (positive or negative).
339fn make_bracket_class(ct: CharacterClassType, positive: bool) -> ir::Node {
340    ir::Node::Bracket(BracketContents {
341        invert: false,
342        cps: codepoints_from_class(ct, positive),
343    })
344}
345
346fn add_class_atom(bc: &mut BracketContents, atom: ClassAtom) {
347    match atom {
348        ClassAtom::CodePoint(c) => bc.cps.add_one(c),
349        ClassAtom::CharacterClass {
350            class_type,
351            positive,
352        } => {
353            bc.cps.add_set(codepoints_from_class(class_type, positive));
354        }
355        ClassAtom::Range { iv, negate } => {
356            if negate {
357                bc.cps.add_set(iv.inverted());
358            } else {
359                bc.cps.add_set(iv);
360            }
361        }
362    }
363}
364
365struct LookaroundParams {
366    negate: bool,
367    backwards: bool,
368}
369
370/// Represents the state used to parse a regex.
371struct Parser<I>
372where
373    I: Iterator<Item = u32>,
374{
375    /// The remaining input.
376    input: Peekable<I>,
377
378    /// Flags used.
379    flags: api::Flags,
380
381    /// Number of loops.
382    loop_count: u32,
383
384    /// Number of capturing groups.
385    group_count: CaptureGroupID,
386
387    /// Maximum number of capturing groups.
388    group_count_max: u32,
389
390    /// Named capture group references.
391    named_group_indices: HashMap<CaptureGroupName, u32>,
392
393    /// Whether a lookbehind was encountered.
394    has_lookbehind: bool,
395}
396
397impl<I> Parser<I>
398where
399    I: Iterator<Item = u32> + Clone,
400{
401    /// Consume a character, returning it.
402    fn consume<C: Into<u32>>(&mut self, c: C) -> u32 {
403        let nc = self.input.next();
404        core::debug_assert!(nc == Some(c.into()), "char was not next");
405        nc.unwrap()
406    }
407
408    /// If our contents begin with the char c, consume it from our contents
409    /// and return true. Otherwise return false.
410    fn try_consume<C: Into<u32>>(&mut self, c: C) -> bool {
411        self.input.next_if_eq(&c.into()).is_some()
412    }
413
414    /// If our contents begin with the string \p s, consume it from our contents
415    /// and return true. Otherwise return false.
416    fn try_consume_str(&mut self, s: &str) -> bool {
417        let mut cursor = self.input.clone();
418        for c1 in s.chars() {
419            if cursor.next() != Some(c1 as u32) {
420                return false;
421            }
422        }
423        self.input = cursor;
424        true
425    }
426
427    /// Fold a character if icase.
428    fn fold_if_icase(&self, c: u32) -> u32 {
429        if self.flags.icase {
430            unicode::fold_code_point(c, self.flags.unicode)
431        } else {
432            c
433        }
434    }
435
436    /// Peek at the next character.
437    fn peek(&mut self) -> Option<u32> {
438        self.input.peek().copied()
439    }
440
441    /// \return the next character.
442    fn next(&mut self) -> Option<u32> {
443        self.input.next()
444    }
445
446    fn try_parse(&mut self) -> Result<ir::Regex, Error> {
447        self.parse_capture_groups()?;
448
449        // Parse a catenation. If we consume everything, it's success. If there's
450        // something left, it's an error (for example, an excess closing paren).
451        let body = self.consume_disjunction()?;
452        match self.input.peek().copied() {
453            Some(c) if c == ')' as u32 => error("Unbalanced parenthesis"),
454            Some(c) => error(format!(
455                "Unexpected char: {}",
456                char::from_u32(c)
457                    .map(String::from)
458                    .unwrap_or_else(|| format!("\\u{c:04X}"))
459            )),
460            None => self.finalize(ir::Regex {
461                node: make_cat(vec![body, ir::Node::Goal]),
462                flags: self.flags,
463            }),
464        }
465    }
466
467    /// ES6 21.2.2.3 Disjunction.
468    fn consume_disjunction(&mut self) -> Result<ir::Node, Error> {
469        let mut terms = vec![self.consume_term()?];
470        while self.try_consume('|') {
471            terms.push(self.consume_term()?)
472        }
473        Ok(make_alt(terms))
474    }
475
476    /// ES6 21.2.2.5 Term.
477    fn consume_term(&mut self) -> Result<ir::Node, Error> {
478        let mut result: Vec<ir::Node> = Vec::new();
479        loop {
480            let start_group = self.group_count;
481            let mut start_offset = result.len();
482            let mut quantifier_allowed = true;
483
484            let nc = self.peek();
485            if nc.is_none() {
486                return Ok(make_cat(result));
487            }
488            let c = nc.unwrap();
489            match to_char_sat(c) {
490                // A concatenation is terminated by closing parens or vertical bar (alternations).
491                ')' | '|' => break,
492                // Term :: Assertion :: ^
493                '^' => {
494                    self.consume('^');
495                    result.push(ir::Node::Anchor {
496                        anchor_type: ir::AnchorType::StartOfLine,
497                        multiline: self.flags.multiline,
498                    });
499                    quantifier_allowed = false;
500                }
501                // Term :: Assertion :: $
502                '$' => {
503                    self.consume('$');
504                    result.push(ir::Node::Anchor {
505                        anchor_type: ir::AnchorType::EndOfLine,
506                        multiline: self.flags.multiline,
507                    });
508                    quantifier_allowed = false;
509                }
510
511                '\\' => {
512                    self.consume('\\');
513                    let Some(c) = self.peek() else {
514                        return error("Incomplete escape");
515                    };
516                    match to_char_sat(c) {
517                        // Term :: Assertion :: \b
518                        'b' => {
519                            self.consume('b');
520                            result.push(ir::Node::WordBoundary { invert: false });
521                        }
522                        // Term :: Assertion :: \B
523                        'B' => {
524                            self.consume('B');
525                            result.push(ir::Node::WordBoundary { invert: true });
526                        }
527                        // Term :: Atom :: \ AtomEscape :: CharacterEscape :: c AsciiLetter
528                        // Term :: ExtendedAtom :: \ [lookahead = c]
529                        'c' if !self.flags.unicode => {
530                            self.consume('c');
531                            if self
532                                .peek()
533                                .and_then(char::from_u32)
534                                .map(|c| c.is_ascii_alphabetic())
535                                == Some(true)
536                            {
537                                result.push(ir::Node::Char {
538                                    c: self.next().expect("char was not next") % 32,
539                                    icase: self.flags.icase,
540                                });
541                            } else {
542                                start_offset += 1;
543                                result.push(ir::Node::Char {
544                                    c: u32::from('\\'),
545                                    icase: self.flags.icase,
546                                });
547                                result.push(ir::Node::Char {
548                                    c: u32::from('c'),
549                                    icase: self.flags.icase,
550                                });
551                            }
552                        }
553                        // Term :: Atom :: \ AtomEscape
554                        _ => {
555                            result.push(self.consume_atom_escape()?);
556                        }
557                    }
558                }
559
560                // Term :: Atom :: .
561                '.' => {
562                    self.consume('.');
563                    result.push(if self.flags.dot_all {
564                        ir::Node::MatchAny
565                    } else {
566                        ir::Node::MatchAnyExceptLineTerminator
567                    });
568                }
569
570                '(' => {
571                    if self.try_consume_str("(?=") {
572                        // Positive lookahead.
573                        quantifier_allowed = !self.flags.unicode;
574                        result.push(self.consume_lookaround_assertion(LookaroundParams {
575                            negate: false,
576                            backwards: false,
577                        })?);
578                    } else if self.try_consume_str("(?!") {
579                        // Negative lookahead.
580                        quantifier_allowed = !self.flags.unicode;
581                        result.push(self.consume_lookaround_assertion(LookaroundParams {
582                            negate: true,
583                            backwards: false,
584                        })?);
585                    } else if self.try_consume_str("(?<=") {
586                        // Positive lookbehind.
587                        quantifier_allowed = false;
588                        self.has_lookbehind = true;
589                        result.push(self.consume_lookaround_assertion(LookaroundParams {
590                            negate: false,
591                            backwards: true,
592                        })?);
593                    } else if self.try_consume_str("(?<!") {
594                        // Negative lookbehind.
595                        quantifier_allowed = false;
596                        self.has_lookbehind = true;
597                        result.push(self.consume_lookaround_assertion(LookaroundParams {
598                            negate: true,
599                            backwards: true,
600                        })?);
601                    } else if self.try_consume_str("(?:") {
602                        // Non-capturing group.
603                        result.push(self.consume_disjunction()?);
604                    } else if let Some(group) = self.try_consume_modifier_group()? {
605                        result.push(group);
606                    } else {
607                        // Capturing group.
608                        self.consume('(');
609                        let group = self.group_count;
610                        if self.group_count as usize >= MAX_CAPTURE_GROUPS {
611                            return error("Capture group count limit exceeded");
612                        }
613                        self.group_count += 1;
614
615                        // Parse capture group name.
616                        if self.try_consume_str("?") {
617                            let group_name = if let Some(group_name) =
618                                self.try_consume_named_capture_group_name()
619                            {
620                                group_name
621                            } else {
622                                return error("Invalid token at named capture group identifier");
623                            };
624                            let contents = self.consume_disjunction()?;
625                            result.push(ir::Node::NamedCaptureGroup(
626                                Box::new(contents),
627                                group,
628                                group_name,
629                            ))
630                        } else {
631                            let contents = self.consume_disjunction()?;
632                            result.push(ir::Node::CaptureGroup(Box::new(contents), group))
633                        }
634                    }
635                    if !self.try_consume(')') {
636                        return error("Unbalanced parenthesis");
637                    }
638                }
639
640                // CharacterClass :: ClassContents :: ClassSetExpression
641                '[' if self.flags.unicode_sets => {
642                    self.consume('[');
643                    let negate_set = self.try_consume('^');
644                    result.push(
645                        self.consume_class_set_expression(negate_set)?
646                            .node(self.flags.icase, negate_set),
647                    );
648                }
649
650                '[' => {
651                    result.push(self.consume_bracket()?);
652                }
653
654                // Term :: ExtendedAtom :: InvalidBracedQuantifier
655                '{' if !self.flags.unicode => {
656                    if self.try_consume_braced_quantifier().is_some() {
657                        return error("Invalid braced quantifier");
658                    }
659
660                    // Term :: ExtendedAtom :: ExtendedPatternCharacter
661                    result.push(ir::Node::Char {
662                        c: self.consume(c),
663                        icase: self.flags.icase,
664                    })
665                }
666
667                // Term :: Atom :: PatternCharacter :: SourceCharacter but not ^ $ \ . * + ? ( ) [ ] { } |
668                '*' | '+' | '?' | ']' | '{' | '}' if self.flags.unicode => {
669                    return error("Invalid atom character");
670                }
671
672                // Term :: ExtendedAtom :: SourceCharacter but not ^ $ \ . * + ? ( ) [ |
673                '*' | '+' | '?' => {
674                    return error("Invalid atom character");
675                }
676
677                // Term :: Atom :: PatternCharacter
678                // Term :: ExtendedAtom :: ExtendedPatternCharacter
679                _ => {
680                    self.consume(c);
681                    result.push(ir::Node::Char {
682                        c: self.fold_if_icase(c),
683                        icase: self.flags.icase,
684                    })
685                }
686            }
687
688            // We just parsed a term; try parsing a quantifier.
689            if let Some(quant) = self.try_consume_quantifier()? {
690                if !quantifier_allowed {
691                    return error("Quantifier not allowed here");
692                }
693                // Validate the quantifier.
694                // Note we don't want to do this as part of parsing the quantiifer in some cases
695                // an incomplete quantifier is not recognized as a quantifier, e.g. `/{3/` is
696                // valid.
697                if matches!(quant.max, Some(max) if quant.min > max) {
698                    return error("Invalid quantifier");
699                }
700                let quantifee = result.split_off(start_offset);
701                if self.loop_count as usize >= MAX_LOOPS {
702                    return error("Loop count limit exceeded");
703                }
704                self.loop_count += 1;
705                result.push(ir::Node::Loop {
706                    loopee: Box::new(make_cat(quantifee)),
707                    quant,
708                    enclosed_groups: start_group..self.group_count,
709                });
710            }
711        }
712        Ok(make_cat(result))
713    }
714
715    fn try_consume_modifier_group(&mut self) -> Result<Option<ir::Node>, Error> {
716        let mut cursor = self.input.clone();
717        if cursor.next() != Some('(' as u32) {
718            return Ok(None);
719        }
720        if cursor.next() != Some('?' as u32) {
721            return Ok(None);
722        }
723        let Some(mut current) = cursor.next() else {
724            return Ok(None);
725        };
726
727        if to_char_sat(current) == '<' {
728            return Ok(None);
729        }
730
731        let mut seen_hyphen = false;
732        let mut saw_flag = false;
733        let mut icase_override: Option<bool> = None;
734        let mut multiline_override: Option<bool> = None;
735        let mut dot_all_override: Option<bool> = None;
736
737        loop {
738            let ch = to_char_sat(current);
739            match ch {
740                'i' | 'm' | 's' => {
741                    let value = !seen_hyphen;
742                    let target = match ch {
743                        'i' => &mut icase_override,
744                        'm' => &mut multiline_override,
745                        's' => &mut dot_all_override,
746                        _ => unreachable!(),
747                    };
748                    if target.is_some() {
749                        return error("Invalid group modifier");
750                    }
751                    *target = Some(value);
752                    saw_flag = true;
753                }
754                '-' => {
755                    if seen_hyphen {
756                        return error("Invalid group modifier");
757                    }
758                    seen_hyphen = true;
759                }
760                ':' => {
761                    if !saw_flag {
762                        return error("Invalid group modifier");
763                    }
764                    self.input = cursor;
765                    let mut new_flags = self.flags;
766                    if let Some(value) = icase_override {
767                        new_flags.icase = value;
768                    }
769                    if let Some(value) = multiline_override {
770                        new_flags.multiline = value;
771                    }
772                    if let Some(value) = dot_all_override {
773                        new_flags.dot_all = value;
774                    }
775                    let saved_flags = self.flags;
776                    self.flags = new_flags;
777                    let contents = match self.consume_disjunction() {
778                        Ok(node) => node,
779                        Err(err) => {
780                            self.flags = saved_flags;
781                            return Err(err);
782                        }
783                    };
784                    self.flags = saved_flags;
785                    return Ok(Some(contents));
786                }
787                _ => {
788                    return error("Invalid group modifier");
789                }
790            }
791
792            current = match cursor.next() {
793                Some(next) => next,
794                None => return error("Invalid group modifier"),
795            };
796        }
797    }
798
799    /// ES6 21.2.2.13 CharacterClass.
800    fn consume_bracket(&mut self) -> Result<ir::Node, Error> {
801        self.consume('[');
802        let invert = self.try_consume('^');
803        let mut result = BracketContents {
804            invert,
805            cps: CodePointSet::default(),
806        };
807
808        loop {
809            match self.peek().map(to_char_sat) {
810                None => {
811                    return error("Unbalanced bracket");
812                }
813                Some(']') => {
814                    self.consume(']');
815                    if self.flags.icase {
816                        result.cps = unicode::add_icase_code_points(result.cps);
817                    }
818                    return Ok(ir::Node::Bracket(result));
819                }
820                _ => {}
821            }
822
823            // Parse a code point or character class.
824            let Some(first) = self.try_consume_bracket_class_atom()? else {
825                continue;
826            };
827
828            // Check for a dash; we may have a range.
829            if !self.try_consume('-') {
830                add_class_atom(&mut result, first);
831                continue;
832            }
833
834            let Some(second) = self.try_consume_bracket_class_atom()? else {
835                // No second atom. For example: [a-].
836                add_class_atom(&mut result, first);
837                add_class_atom(&mut result, ClassAtom::CodePoint(u32::from('-')));
838                continue;
839            };
840
841            // Ranges must also be in order: z-a is invalid.
842            // ES6 21.2.2.15.1 "If i > j, throw a SyntaxError exception"
843            if let (ClassAtom::CodePoint(c1), ClassAtom::CodePoint(c2)) = (&first, &second) {
844                if c1 > c2 {
845                    return error(
846                        "Range values reversed, start char code is greater than end char code.",
847                    );
848                }
849                result.cps.add(Interval {
850                    first: *c1,
851                    last: *c2,
852                });
853
854                continue;
855            }
856
857            if self.flags.unicode {
858                return error("Invalid character range");
859            }
860
861            // If it does not match a range treat as any match single characters.
862            add_class_atom(&mut result, first);
863            add_class_atom(&mut result, ClassAtom::CodePoint(u32::from('-')));
864            add_class_atom(&mut result, second);
865        }
866    }
867
868    fn try_consume_bracket_class_atom(&mut self) -> Result<Option<ClassAtom>, Error> {
869        let c = self.peek();
870        if c.is_none() {
871            return Ok(None);
872        }
873        let c = c.unwrap();
874        match to_char_sat(c) {
875            // End of bracket.
876            ']' => Ok(None),
877
878            // ClassEscape
879            '\\' => {
880                self.consume('\\');
881                let ec = if let Some(ec) = self.peek() {
882                    ec
883                } else {
884                    return error("Unterminated escape");
885                };
886                match to_char_sat(ec) {
887                    // ClassEscape :: b
888                    'b' => {
889                        self.consume('b');
890                        Ok(Some(ClassAtom::CodePoint(u32::from('\x08'))))
891                    }
892                    // ClassEscape :: [+UnicodeMode] -
893                    '-' if self.flags.unicode => {
894                        self.consume('-');
895                        Ok(Some(ClassAtom::CodePoint(u32::from('-'))))
896                    }
897                    'c' if !self.flags.unicode => {
898                        let input = self.input.clone();
899                        self.consume('c');
900                        match self.peek().map(to_char_sat) {
901                            // ClassEscape :: [~UnicodeMode] c ClassControlLetter
902                            Some('0'..='9' | '_') => {
903                                let next = self.next().expect("char was not next");
904                                Ok(Some(ClassAtom::CodePoint(next & 0x1F)))
905                            }
906                            // CharacterEscape :: c AsciiLetter
907                            Some('a'..='z' | 'A'..='Z') => {
908                                let next = self.next().expect("char was not next");
909                                Ok(Some(ClassAtom::CodePoint(next % 32)))
910                            }
911                            // ClassAtomNoDash :: \ [lookahead = c]
912                            _ => {
913                                self.input = input;
914                                Ok(Some(ClassAtom::CodePoint(u32::from('\\'))))
915                            }
916                        }
917                    }
918                    // ClassEscape :: CharacterClassEscape :: d
919                    'd' => {
920                        self.consume('d');
921                        Ok(Some(ClassAtom::CharacterClass {
922                            class_type: CharacterClassType::Digits,
923                            positive: true,
924                        }))
925                    }
926                    // ClassEscape :: CharacterClassEscape :: D
927                    'D' => {
928                        self.consume('D');
929                        Ok(Some(ClassAtom::CharacterClass {
930                            class_type: CharacterClassType::Digits,
931                            positive: false,
932                        }))
933                    }
934                    // ClassEscape :: CharacterClassEscape :: s
935                    's' => {
936                        self.consume('s');
937                        Ok(Some(ClassAtom::CharacterClass {
938                            class_type: CharacterClassType::Spaces,
939                            positive: true,
940                        }))
941                    }
942                    // ClassEscape :: CharacterClassEscape :: S
943                    'S' => {
944                        self.consume('S');
945                        Ok(Some(ClassAtom::CharacterClass {
946                            class_type: CharacterClassType::Spaces,
947                            positive: false,
948                        }))
949                    }
950                    // ClassEscape :: CharacterClassEscape :: w
951                    'w' => {
952                        self.consume('w');
953                        Ok(Some(ClassAtom::CharacterClass {
954                            class_type: CharacterClassType::Words,
955                            positive: true,
956                        }))
957                    }
958                    // ClassEscape :: CharacterClassEscape :: W
959                    'W' => {
960                        self.consume('W');
961                        Ok(Some(ClassAtom::CharacterClass {
962                            class_type: CharacterClassType::Words,
963                            positive: false,
964                        }))
965                    }
966                    // ClassEscape :: CharacterClassEscape :: [+UnicodeMode] p{ UnicodePropertyValueExpression }
967                    // ClassEscape :: CharacterClassEscape :: [+UnicodeMode] P{ UnicodePropertyValueExpression }
968                    'p' | 'P' if self.flags.unicode => {
969                        self.consume(ec);
970                        let negate = ec == 'P' as u32;
971                        match self.try_consume_unicode_property_escape()? {
972                            PropertyEscapeKind::CharacterClass(s) => Ok(Some(ClassAtom::Range {
973                                iv: CodePointSet::from_sorted_disjoint_intervals(s.to_vec()),
974                                negate,
975                            })),
976                            PropertyEscapeKind::StringSet(_) => error("Invalid property escape"),
977                        }
978                    }
979                    // ClassEscape :: CharacterEscape
980                    _ => {
981                        let cc = self.consume_character_escape()?;
982                        Ok(Some(ClassAtom::CodePoint(cc)))
983                    }
984                }
985            }
986
987            _ => Ok(Some(ClassAtom::CodePoint(self.consume(c)))),
988        }
989    }
990
991    // CharacterClass :: ClassContents :: ClassSetExpression
992    fn consume_class_set_expression(&mut self, negate_set: bool) -> Result<ClassSet, Error> {
993        let mut result = ClassSet::new();
994
995        let first = match self.peek() {
996            Some(0x5D /* ] */) => {
997                self.consume(']');
998                return Ok(result);
999            }
1000            Some(_) => self.consume_class_set_operand(negate_set)?,
1001            None => {
1002                return error("Unbalanced class set bracket");
1003            }
1004        };
1005
1006        enum ClassSetOperator {
1007            Union,
1008            Intersection,
1009            Subtraction,
1010        }
1011
1012        let op = match self.peek() {
1013            Some(0x5D /* ] */) => {
1014                self.consume(']');
1015                result.union_operand(first);
1016                return Ok(result);
1017            }
1018            Some(0x26 /* & */) => {
1019                self.consume('&');
1020                if self.peek() == Some(0x26 /* & */) {
1021                    self.consume('&');
1022                    result.union_operand(first.clone());
1023                    ClassSetOperator::Intersection
1024                } else {
1025                    result.codepoints.add_one(0x26 /* & */);
1026                    ClassSetOperator::Union
1027                }
1028            }
1029            Some(0x2D /* - */) => {
1030                self.consume('-');
1031                if self.peek() == Some(0x2D /* - */) {
1032                    self.consume('-');
1033                    result.union_operand(first.clone());
1034                    ClassSetOperator::Subtraction
1035                } else {
1036                    match first {
1037                        ClassSetOperand::ClassSetCharacter(first) => {
1038                            let ClassSetOperand::ClassSetCharacter(last) =
1039                                self.consume_class_set_operand(negate_set)?
1040                            else {
1041                                return error("Invalid class set range");
1042                            };
1043                            if first > last {
1044                                return error("Invalid class set range");
1045                            }
1046                            result.codepoints.add(Interval { first, last });
1047                        }
1048                        _ => {
1049                            return error("Invalid class set range");
1050                        }
1051                    };
1052                    ClassSetOperator::Union
1053                }
1054            }
1055            Some(_) => {
1056                result.union_operand(first.clone());
1057                ClassSetOperator::Union
1058            }
1059            None => {
1060                return error("Unbalanced class set bracket");
1061            }
1062        };
1063
1064        match op {
1065            ClassSetOperator::Union => {
1066                loop {
1067                    let operand = match self.peek() {
1068                        Some(0x5D /* ] */) => {
1069                            self.consume(']');
1070                            return Ok(result);
1071                        }
1072                        Some(_) => self.consume_class_set_operand(negate_set)?,
1073                        None => return error("Unbalanced class set bracket"),
1074                    };
1075                    if self.peek() == Some(0x2D /* - */) {
1076                        self.consume('-');
1077                        match operand {
1078                            ClassSetOperand::ClassSetCharacter(first) => {
1079                                let ClassSetOperand::ClassSetCharacter(last) =
1080                                    self.consume_class_set_operand(negate_set)?
1081                                else {
1082                                    return error("Invalid class set range");
1083                                };
1084                                if first > last {
1085                                    return error("Invalid class set range");
1086                                }
1087                                result.codepoints.add(Interval { first, last });
1088                            }
1089                            _ => {
1090                                return error("Invalid class set range");
1091                            }
1092                        };
1093                    } else {
1094                        result.union_operand(operand);
1095                    }
1096                }
1097            }
1098            // ClassIntersection :: ClassSetOperand && [lookahead ≠ &]
1099            ClassSetOperator::Intersection => {
1100                loop {
1101                    let operand = self.consume_class_set_operand(negate_set)?;
1102                    result.intersect_operand(operand);
1103                    match self.next() {
1104                        Some(0x5D /* ] */) => return Ok(result),
1105                        Some(0x26 /* & */) => {}
1106                        Some(_) => return error("Unexpected character in class set intersection"),
1107                        _ => return error("Unbalanced class set bracket"),
1108                    }
1109                    if self.next() != Some(0x26 /* & */) {
1110                        return error("Unbalanced class set bracket");
1111                    }
1112                }
1113            }
1114            // ClassSubtraction :: ClassSubtraction -- ClassSetOperand
1115            ClassSetOperator::Subtraction => {
1116                loop {
1117                    let operand = self.consume_class_set_operand(negate_set)?;
1118                    result.subtract_operand(operand);
1119                    match self.next() {
1120                        Some(0x5D /* ] */) => return Ok(result),
1121                        Some(0x2D /* - */) => {}
1122                        Some(_) => return error("Unexpected character in class set subtraction"),
1123                        _ => return error("Unbalanced class set bracket"),
1124                    }
1125                    if self.next() != Some(0x2D /* - */) {
1126                        return error("Unbalanced class set bracket");
1127                    }
1128                }
1129            }
1130        }
1131    }
1132
1133    fn consume_class_set_operand(&mut self, negate_set: bool) -> Result<ClassSetOperand, Error> {
1134        use ClassSetOperand::*;
1135        let Some(cp) = self.peek() else {
1136            return error("Empty class set operand");
1137        };
1138        match cp {
1139            // ClassSetOperand :: NestedClass :: [ [lookahead ≠ ^] ClassContents[+UnicodeMode, +UnicodeSetsMode] ]
1140            // ClassSetOperand :: NestedClass :: [^ ClassContents[+UnicodeMode, +UnicodeSetsMode] ]
1141            0x5B /* [ */ => {
1142                self.consume('[');
1143                let negate_set = self.try_consume('^');
1144                let result = self.consume_class_set_expression(negate_set)?;
1145                if negate_set {
1146                    result.codepoints.inverted();
1147                }
1148                Ok(Class(result))
1149            }
1150            // ClassSetOperand :: NestedClass :: \ CharacterClassEscape
1151            // ClassSetOperand :: ClassStringDisjunction
1152            // ClassSetOperand :: ClassSetCharacter :: \...
1153            // ClassSetRange :: ClassSetCharacter :: \...
1154            0x5C /* \ */ => {
1155                self.consume('\\');
1156                let Some(cp) = self.peek() else {
1157                    return error("Incomplete class set escape");
1158                };
1159                match cp {
1160                    // ClassStringDisjunction  \q{ ClassStringDisjunctionContents }
1161                    0x71 /* q */ => {
1162                        self.consume('q');
1163                        if !self.try_consume('{') {
1164                            return error("Invalid class set escape: expected {");
1165                        }
1166                        let mut alternatives = Vec::new();
1167                        let mut alternative = Vec::new();
1168                        loop {
1169                            match self.peek() {
1170                                Some(0x7D /* } */) => {
1171                                    self.consume('}');
1172                                    if !alternative.is_empty() {
1173                                        alternatives.push(alternative.clone());
1174                                        alternative.clear();
1175                                    }
1176                                    break;
1177                                }
1178                                Some(0x7C /* | */) => {
1179                                    self.consume('|');
1180                                    if !alternative.is_empty() {
1181                                        alternatives.push(alternative.clone());
1182                                        alternative.clear();
1183                                    }
1184                                }
1185                                Some(_) => {
1186                                    alternative.push(self.consume_class_set_character()?);
1187                                }
1188                                None => {
1189                                    return error("Unbalanced class set string disjunction");
1190                                }
1191                            }
1192                        }
1193                        Ok(ClassStringDisjunction(ClassSetAlternativeStrings(alternatives)))
1194                    }
1195                    // CharacterClassEscape :: d
1196                    0x64 /* d */ => {
1197                        self.consume('d');
1198                        Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Digits, true)))
1199                    }
1200                    // CharacterClassEscape :: D
1201                    0x44 /* D */ => {
1202                        self.consume('D');
1203                        Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Digits, false)))
1204                    }
1205                    // CharacterClassEscape :: s
1206                    0x73 /* s */ => {
1207                        self.consume('s');
1208                        Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Spaces, true)))
1209                    }
1210                    // CharacterClassEscape :: S
1211                    0x53 /* S */ => {
1212                        self.consume('S');
1213                        Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Spaces, false)))
1214                    }
1215                    // CharacterClassEscape :: w
1216                    0x77 /* w */ => {
1217                        self.consume('w');
1218                        Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Words, true)))
1219                    }
1220                    // CharacterClassEscape :: W
1221                    0x57 /* W */ => {
1222                        self.consume('W');
1223                        Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Words, false)))
1224                    }
1225                    // CharacterClassEscape :: [+UnicodeMode] p{ UnicodePropertyValueExpression }
1226                    0x70 /* p */ => {
1227                        self.consume('p');
1228                        match self.try_consume_unicode_property_escape()? {
1229                            PropertyEscapeKind::CharacterClass(intervals) => {
1230                                Ok(CharacterClassEscape(CodePointSet::from_sorted_disjoint_intervals(
1231                                    intervals.to_vec(),
1232                                )))
1233                            }
1234                            PropertyEscapeKind::StringSet(_) if negate_set => error("Invalid character escape"),
1235                            PropertyEscapeKind::StringSet(strings) => {
1236                                Ok(ClassStringDisjunction(ClassSetAlternativeStrings(strings.iter().map(|s| s.to_vec()).collect())))
1237                            }
1238                        }
1239                    }
1240                    // CharacterClassEscape :: [+UnicodeMode] P{ UnicodePropertyValueExpression }
1241                    0x50 /* P */ => {
1242                        self.consume('P');
1243                        match self.try_consume_unicode_property_escape()? {
1244                            PropertyEscapeKind::CharacterClass(s) => {
1245                                Ok(CharacterClassEscape(CodePointSet::from_sorted_disjoint_intervals(
1246                                    s.to_vec(),
1247                                ).inverted()))
1248                            }
1249                            PropertyEscapeKind::StringSet(_) => error("Invalid character escape"),
1250                        }
1251                    }
1252                    // ClassSetCharacter:: \b
1253                    0x62 /* b */ => {
1254                        Ok(ClassSetCharacter(self.consume(cp)))
1255                    }
1256                    // ClassSetCharacter:: \ ClassSetReservedPunctuator
1257                    _ if Self::is_class_set_reserved_punctuator(cp) => Ok(ClassSetCharacter(self.consume(cp))),
1258                    // ClassSetCharacter:: \ CharacterEscape[+UnicodeMode]
1259                    _ => Ok(ClassSetCharacter(self.consume_character_escape()?))
1260                }
1261            }
1262            // ClassSetOperand :: ClassSetCharacter
1263            // ClassSetRange :: ClassSetCharacter
1264            _ => Ok(ClassSetCharacter(self.consume_class_set_character()?)),
1265        }
1266    }
1267
1268    // ClassSetCharacter
1269    fn consume_class_set_character(&mut self) -> Result<u32, Error> {
1270        let Some(cp) = self.next() else {
1271            return error("Incomplete class set character");
1272        };
1273        match cp {
1274            0x5C /* \ */ => {
1275                let Some(cp) = self.peek() else {
1276                    return error("Incomplete class set escape");
1277                };
1278                match cp {
1279                    // \b
1280                    0x62 /* b */ => {
1281                        Ok(self.consume(cp))
1282                    }
1283                    // \ ClassSetReservedPunctuator
1284                    _ if Self::is_class_set_reserved_punctuator(cp) => Ok(self.consume(cp)),
1285                    // \ CharacterEscape[+UnicodeMode]
1286                    _ => Ok(self.consume_character_escape()?)
1287                }
1288            }
1289            // [lookahead ∉ ClassSetReservedDoublePunctuator] SourceCharacter but not ClassSetSyntaxCharacter
1290            0x28 /* ( */ | 0x29 /* ) */ | 0x7B /* { */ | 0x7D /* } */ | 0x2F /* / */
1291            | 0x2D /* - */ | 0x7C /* | */ => error("Invalid class set character"),
1292            _ => {
1293                if Self::is_class_set_reserved_double_punctuator(cp)
1294                    && let Some(cp) = self.peek()
1295                        && Self::is_class_set_reserved_double_punctuator(cp) {
1296                            return error("Invalid class set character");
1297                        }
1298                Ok(cp)
1299            }
1300        }
1301    }
1302
1303    // ClassSetReservedPunctuator
1304    fn is_class_set_reserved_punctuator(cp: u32) -> bool {
1305        match cp {
1306            0x26 /* & */ | 0x2D /* - */ | 0x21 /* ! */ | 0x23 /* # */ | 0x25 /* % */
1307            | 0x2C /* , */ | 0x3A /* : */ | 0x3B /* ; */ | 0x3C /* < */ | 0x3D /* = */
1308            | 0x3E /* > */ | 0x40 /* @ */ | 0x60 /* ` */ | 0x7E /* ~ */ => true,
1309            _ => false,
1310        }
1311    }
1312
1313    fn is_class_set_reserved_double_punctuator(cp: u32) -> bool {
1314        match cp {
1315            0x26 /* & */ | 0x21 /* ! */ | 0x23 /* # */ | 0x24 /* $ */ | 0x25 /* % */
1316            | 0x2A /* * */ | 0x2B /* + */ | 0x2C /* , */ | 0x2E /* . */ | 0x3A /* : */
1317            | 0x3B /* ; */ | 0x3C /* < */ | 0x3D /* = */ | 0x3E /* > */ | 0x3F /* ? */
1318            | 0x40 /* @ */ | 0x5E /* ^ */ | 0x60 /* ` */ | 0x7E /* ~ */ => true,
1319            _ => false,
1320        }
1321    }
1322
1323    fn try_consume_quantifier(&mut self) -> Result<Option<ir::Quantifier>, Error> {
1324        if let Some(mut quant) = self.try_consume_quantifier_prefix()? {
1325            quant.greedy = !self.try_consume('?');
1326            Ok(Some(quant))
1327        } else {
1328            Ok(None)
1329        }
1330    }
1331
1332    fn try_consume_quantifier_prefix(&mut self) -> Result<Option<ir::Quantifier>, Error> {
1333        let nc = self.peek();
1334        if nc.is_none() {
1335            return Ok(None);
1336        }
1337        let c = nc.unwrap();
1338        match char::from_u32(c) {
1339            Some('+') => {
1340                self.consume('+');
1341                Ok(Some(ir::Quantifier {
1342                    min: 1,
1343                    max: None,
1344                    greedy: true,
1345                }))
1346            }
1347            Some('*') => {
1348                self.consume('*');
1349                Ok(Some(ir::Quantifier {
1350                    min: 0,
1351                    max: None,
1352                    greedy: true,
1353                }))
1354            }
1355            Some('?') => {
1356                self.consume('?');
1357                Ok(Some(ir::Quantifier {
1358                    min: 0,
1359                    max: Some(1),
1360                    greedy: true,
1361                }))
1362            }
1363            Some('{') => {
1364                if let Some(quantifier) = self.try_consume_braced_quantifier() {
1365                    Ok(Some(quantifier))
1366                } else if self.flags.unicode {
1367                    // if there was a brace '{' that doesn't parse into a valid quantifier,
1368                    // it's not valid with the unicode flag
1369                    error("Invalid quantifier")
1370                } else {
1371                    Ok(None)
1372                }
1373            }
1374            _ => Ok(None),
1375        }
1376    }
1377
1378    fn try_consume_braced_quantifier(&mut self) -> Option<ir::Quantifier> {
1379        // if parsed input is actually invalid, keep the previous one for rollback
1380        let pre_input = self.input.clone();
1381        self.consume('{');
1382        let optmin = self.try_consume_decimal_integer_literal();
1383        let Some(optmin) = optmin else {
1384            // not a valid quantifier, rollback consumption
1385            self.input = pre_input;
1386            return None;
1387        };
1388        let mut quant = ir::Quantifier {
1389            min: optmin,
1390            max: Some(optmin),
1391            greedy: true,
1392        };
1393        if self.try_consume(',') {
1394            let max = self.try_consume_decimal_integer_literal();
1395            // Either like {3,4} in which case we want to set the max;
1396            // or like {3,} in which case the max should be None to indicate unbounded.
1397            quant.max = max;
1398        } else {
1399            // Like {3}.
1400        }
1401        if !self.try_consume('}') {
1402            // not a valid quantifier, rollback consumption
1403            self.input = pre_input;
1404            return None;
1405        }
1406        Some(quant)
1407    }
1408
1409    /// ES6 11.8.3 DecimalIntegerLiteral.
1410    /// If the value would overflow, usize::MAX is returned.
1411    /// All decimal digits are consumed regardless.
1412    fn try_consume_decimal_integer_literal(&mut self) -> Option<usize> {
1413        let mut result: usize = 0;
1414        let mut char_count = 0;
1415        while let Some(c) = self.peek() {
1416            if let Some(digit) = char::from_u32(c).and_then(|c| char::to_digit(c, 10)) {
1417                self.consume(c);
1418                char_count += 1;
1419                result = result.saturating_mul(10);
1420                result = result.saturating_add(digit as usize);
1421            } else {
1422                break;
1423            }
1424        }
1425        if char_count > 0 { Some(result) } else { None }
1426    }
1427
1428    fn consume_lookaround_assertion(
1429        &mut self,
1430        params: LookaroundParams,
1431    ) -> Result<ir::Node, Error> {
1432        let start_group = self.group_count;
1433        let contents = self.consume_disjunction()?;
1434        let end_group = self.group_count;
1435        Ok(ir::Node::LookaroundAssertion {
1436            negate: params.negate,
1437            backwards: params.backwards,
1438            start_group,
1439            end_group,
1440            contents: Box::new(contents),
1441        })
1442    }
1443
1444    fn consume_character_escape(&mut self) -> Result<u32, Error> {
1445        let c = self.next().expect("Should have a character");
1446        let ch = to_char_sat(c);
1447        match ch {
1448            // CharacterEscape :: ControlEscape :: f
1449            'f' => Ok(0xC),
1450            // CharacterEscape :: ControlEscape :: n
1451            'n' => Ok(0xA),
1452            // CharacterEscape :: ControlEscape :: r
1453            'r' => Ok(0xD),
1454            // CharacterEscape :: ControlEscape :: t
1455            't' => Ok(0x9),
1456            // CharacterEscape :: ControlEscape :: v
1457            'v' => Ok(0xB),
1458            // CharacterEscape :: c AsciiLetter
1459            'c' => {
1460                if let Some(nc) = self.next().and_then(char::from_u32)
1461                    && (nc.is_ascii_lowercase() || nc.is_ascii_uppercase())
1462                {
1463                    return Ok((nc as u32) % 32);
1464                }
1465                error("Invalid character escape")
1466            }
1467            // CharacterEscape :: 0 [lookahead ∉ DecimalDigit]
1468            '0' if self
1469                .peek()
1470                .and_then(char::from_u32)
1471                .map(|c: char| c.is_ascii_digit())
1472                != Some(true) =>
1473            {
1474                Ok(0x0)
1475            }
1476            // CharacterEscape :: HexEscapeSequence :: x HexDigit HexDigit
1477            'x' => {
1478                let hex_to_digit = |c: char| c.to_digit(16);
1479                let x1 = self.next().and_then(char::from_u32).and_then(hex_to_digit);
1480                let x2 = self.next().and_then(char::from_u32).and_then(hex_to_digit);
1481                match (x1, x2) {
1482                    (Some(x1), Some(x2)) => Ok(x1 * 16 + x2),
1483                    // CharacterEscape :: IdentityEscape :: SourceCharacterIdentityEscape
1484                    _ if !self.flags.unicode => Ok(c),
1485                    _ => error("Invalid character escape"),
1486                }
1487            }
1488            // CharacterEscape :: RegExpUnicodeEscapeSequence
1489            'u' => {
1490                if let Some(c) = self.try_escape_unicode_sequence() {
1491                    Ok(c)
1492                } else if !self.flags.unicode {
1493                    // CharacterEscape :: IdentityEscape :: SourceCharacterIdentityEscape
1494                    Ok(c)
1495                } else {
1496                    error("Invalid unicode escape")
1497                }
1498            }
1499            // CharacterEscape :: [~UnicodeMode] LegacyOctalEscapeSequence
1500            '0'..='7' if !self.flags.unicode => {
1501                let Some(c1) = self.peek() else {
1502                    return Ok(c - '0' as u32);
1503                };
1504                let ch1 = to_char_sat(c1);
1505
1506                match ch {
1507                    // 0 [lookahead ∈ { 8, 9 }]
1508                    '0' if ('8'..='9').contains(&ch1) => Ok(0x0),
1509                    // NonZeroOctalDigit [lookahead ∉ OctalDigit]
1510                    _ if !('0'..='7').contains(&ch1) => Ok(c - '0' as u32),
1511                    // FourToSeven OctalDigit
1512                    '4'..='7' => {
1513                        self.consume(c1);
1514                        Ok((c - '0' as u32) * 8 + c1 - '0' as u32)
1515                    }
1516                    // ZeroToThree OctalDigit [lookahead ∉ OctalDigit]
1517                    // ZeroToThree OctalDigit OctalDigit
1518                    '0'..='3' => {
1519                        self.consume(c1);
1520                        if self.peek().map(|c2| ('0'..='7').contains(&to_char_sat(c2)))
1521                            == Some(true)
1522                        {
1523                            let c2 = self.next().expect("char was not next");
1524                            Ok((c - '0' as u32) * 64 + (c1 - '0' as u32) * 8 + c2 - '0' as u32)
1525                        } else {
1526                            Ok((c - '0' as u32) * 8 + c1 - '0' as u32)
1527                        }
1528                    }
1529                    _ => unreachable!(),
1530                }
1531            }
1532            // CharacterEscape :: IdentityEscape :: [+UnicodeMode] SyntaxCharacter
1533            // CharacterEscape :: IdentityEscape :: [+UnicodeMode] /
1534            '^' | '$' | '\\' | '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|'
1535            | '/' => Ok(c),
1536            // CharacterEscape :: IdentityEscape :: SourceCharacterIdentityEscape
1537            _ if !self.flags.unicode => Ok(c),
1538            _ => error("Invalid character escape"),
1539        }
1540    }
1541
1542    // AtomEscape
1543    fn consume_atom_escape(&mut self) -> Result<ir::Node, Error> {
1544        let Some(c) = self.peek() else {
1545            return error("Incomplete escape");
1546        };
1547        match to_char_sat(c) {
1548            'd' | 'D' => {
1549                self.consume(c);
1550                Ok(make_bracket_class(
1551                    CharacterClassType::Digits,
1552                    c == 'd' as u32,
1553                ))
1554            }
1555
1556            's' | 'S' => {
1557                self.consume(c);
1558                Ok(make_bracket_class(
1559                    CharacterClassType::Spaces,
1560                    c == 's' as u32,
1561                ))
1562            }
1563
1564            'w' | 'W' => {
1565                self.consume(c);
1566                Ok(make_bracket_class(
1567                    CharacterClassType::Words,
1568                    c == 'w' as u32,
1569                ))
1570            }
1571
1572            // ClassEscape :: CharacterClassEscape :: [+UnicodeMode] p{ UnicodePropertyValueExpression }
1573            // ClassEscape :: CharacterClassEscape :: [+UnicodeMode] P{ UnicodePropertyValueExpression }
1574            'p' | 'P' if self.flags.unicode || self.flags.unicode_sets => {
1575                self.consume(c);
1576                let negate = c == 'P' as u32;
1577                let property_escape = self.try_consume_unicode_property_escape()?;
1578                match property_escape {
1579                    PropertyEscapeKind::CharacterClass(s) => {
1580                        Ok(ir::Node::Bracket(BracketContents {
1581                            invert: negate,
1582                            cps: CodePointSet::from_sorted_disjoint_intervals(s.to_vec()),
1583                        }))
1584                    }
1585                    PropertyEscapeKind::StringSet(_) if negate => error("Invalid character escape"),
1586                    PropertyEscapeKind::StringSet(strings) => Ok(make_alt(
1587                        strings
1588                            .iter()
1589                            .map(|s| {
1590                                ir::Node::Cat(
1591                                    s.iter()
1592                                        .map(|c| ir::Node::Char {
1593                                            c: *c,
1594                                            icase: self.flags.icase,
1595                                        })
1596                                        .collect(),
1597                                )
1598                            })
1599                            .collect(),
1600                    )),
1601                }
1602            }
1603
1604            // [+UnicodeMode] DecimalEscape
1605            // Note: This is a backreference.
1606            '1'..='9' if self.flags.unicode => {
1607                let group = self.try_consume_decimal_integer_literal().unwrap();
1608                if group <= self.group_count_max as usize {
1609                    Ok(ir::Node::BackRef(group as u32))
1610                } else {
1611                    error("Invalid character escape")
1612                }
1613            }
1614
1615            // [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber of DecimalEscape
1616            //    is ≤ CountLeftCapturingParensWithin(the Pattern containing DecimalEscape)
1617            // Note: This could be either a backreference, a legacy octal escape or an identity escape.
1618            '1'..='9' => {
1619                let input = self.input.clone();
1620                let group = self.try_consume_decimal_integer_literal().unwrap();
1621
1622                if group <= self.group_count_max as usize {
1623                    Ok(ir::Node::BackRef(group as u32))
1624                } else {
1625                    self.input = input;
1626                    let c = self.consume_character_escape()?;
1627                    Ok(ir::Node::Char {
1628                        c: self.fold_if_icase(c),
1629                        icase: self.flags.icase,
1630                    })
1631                }
1632            }
1633
1634            // [+NamedCaptureGroups] k GroupName
1635            'k' if self.flags.unicode || !self.named_group_indices.is_empty() => {
1636                self.consume('k');
1637
1638                // The sequence `\k` must be the start of a backreference to a named capture group.
1639                if let Some(group_name) = self.try_consume_named_capture_group_name() {
1640                    if let Some(index) = self.named_group_indices.get(&group_name) {
1641                        Ok(ir::Node::BackRef(*index + 1))
1642                    } else {
1643                        error(format!(
1644                            "Backreference to invalid named capture group: {}",
1645                            &group_name
1646                        ))
1647                    }
1648                } else {
1649                    error("Unexpected end of named backreference")
1650                }
1651            }
1652
1653            // [~NamedCaptureGroups] k GroupName
1654            'k' => {
1655                self.consume('k');
1656                Ok(ir::Node::Char {
1657                    c: self.fold_if_icase(c),
1658                    icase: self.flags.icase,
1659                })
1660            }
1661
1662            _ => {
1663                let c = self.consume_character_escape()?;
1664                Ok(ir::Node::Char {
1665                    c: self.fold_if_icase(c),
1666                    icase: self.flags.icase,
1667                })
1668            }
1669        }
1670    }
1671
1672    #[allow(clippy::branches_sharing_code)]
1673    fn try_escape_unicode_sequence(&mut self) -> Option<u32> {
1674        let mut orig_input = self.input.clone();
1675
1676        // Support \u{X..X} (Unicode CodePoint)
1677        if self.try_consume('{') {
1678            let mut s = String::new();
1679            loop {
1680                match self.next().and_then(char::from_u32) {
1681                    Some('}') => break,
1682                    Some(c) => s.push(c),
1683                    None => {
1684                        // Surrogates not supported in code point escapes.
1685                        self.input = orig_input;
1686                        return None;
1687                    }
1688                }
1689            }
1690
1691            match u32::from_str_radix(&s, 16) {
1692                Ok(u) => {
1693                    if u > 0x10_FFFF {
1694                        self.input = orig_input;
1695                        None
1696                    } else {
1697                        Some(u)
1698                    }
1699                }
1700                _ => {
1701                    self.input = orig_input;
1702                    None
1703                }
1704            }
1705        } else {
1706            // Hex4Digits
1707            let mut s = String::new();
1708            for _ in 0..4 {
1709                if let Some(c) = self.next().and_then(char::from_u32) {
1710                    s.push(c);
1711                } else {
1712                    // Surrogates are not hex digits.
1713                    self.input = orig_input;
1714                    return None;
1715                }
1716            }
1717            match u16::from_str_radix(&s, 16) {
1718                Ok(u) => {
1719                    if (0xD800..=0xDBFF).contains(&u) {
1720                        // Found a high surrogate. Try to parse a low surrogate next
1721                        // to see if we can rebuild the original `char`
1722
1723                        if !self.try_consume_str("\\u") {
1724                            return Some(u as u32);
1725                        }
1726                        orig_input = self.input.clone();
1727
1728                        // A poor man's try block to handle the backtracking
1729                        // in a single place instead of every time we want to return.
1730                        // This allows us to use `?` within the inner block without returning
1731                        // from the entire parent function.
1732                        let result = (|| {
1733                            let mut s = String::new();
1734                            for _ in 0..4 {
1735                                let c = self.next().and_then(char::from_u32)?;
1736                                s.push(c);
1737                            }
1738
1739                            let uu = u16::from_str_radix(&s, 16).ok()?;
1740                            let ch = char::decode_utf16([u, uu]).next()?.ok()?;
1741                            Some(u32::from(ch))
1742                        })();
1743
1744                        result.or_else(|| {
1745                            self.input = orig_input;
1746                            Some(u as u32)
1747                        })
1748                    } else {
1749                        // If `u` is not a surrogate or is a low surrogate we can directly return it,
1750                        // since all paired low surrogates should have been handled above.
1751                        Some(u as u32)
1752                    }
1753                }
1754                _ => {
1755                    self.input = orig_input;
1756                    None
1757                }
1758            }
1759        }
1760    }
1761
1762    fn try_consume_named_capture_group_name(&mut self) -> Option<String> {
1763        if !self.try_consume('<') {
1764            return None;
1765        }
1766
1767        let orig_input = self.input.clone();
1768        let mut group_name = String::new();
1769
1770        if let Some(mut c) = self.next().and_then(char::from_u32) {
1771            if c == '\\' && self.try_consume('u') {
1772                if let Some(escaped) = self.try_escape_unicode_sequence().and_then(char::from_u32) {
1773                    c = escaped;
1774                } else {
1775                    self.input = orig_input;
1776                    return None;
1777                }
1778            }
1779
1780            if interval_contains(id_start_ranges(), c.into()) || c == '$' || c == '_' {
1781                group_name.push(c);
1782            } else {
1783                self.input = orig_input;
1784                return None;
1785            }
1786        } else {
1787            self.input = orig_input;
1788            return None;
1789        }
1790
1791        loop {
1792            if let Some(mut c) = self.next().and_then(char::from_u32) {
1793                if c == '\\' && self.try_consume('u') {
1794                    if let Some(escaped) =
1795                        self.try_escape_unicode_sequence().and_then(char::from_u32)
1796                    {
1797                        c = escaped;
1798                    } else {
1799                        self.input = orig_input;
1800                        return None;
1801                    }
1802                }
1803
1804                if c == '>' {
1805                    break;
1806                }
1807
1808                if interval_contains(id_continue_ranges(), c.into()) || c == '$' || c == '_' || c == '\u{200C}' /* <ZWNJ> */ || c == '\u{200D}'
1809                /* <ZWJ> */
1810                {
1811                    group_name.push(c);
1812                } else {
1813                    self.input = orig_input;
1814                    return None;
1815                }
1816            } else {
1817                self.input = orig_input;
1818                return None;
1819            }
1820        }
1821
1822        Some(group_name)
1823    }
1824
1825    // Quickly parse all capture groups.
1826    fn parse_capture_groups(&mut self) -> Result<(), Error> {
1827        let orig_input = self.input.clone();
1828
1829        loop {
1830            match self.next().map(to_char_sat) {
1831                Some('\\') => {
1832                    self.next();
1833                    continue;
1834                }
1835                Some('[') => loop {
1836                    match self.next().map(to_char_sat) {
1837                        Some('\\') => {
1838                            self.next();
1839                            continue;
1840                        }
1841                        Some(']') => break,
1842                        Some(_) => continue,
1843                        None => break,
1844                    }
1845                },
1846                Some('(') => {
1847                    if self.try_consume_str("?")
1848                        && let Some(name) = self.try_consume_named_capture_group_name()
1849                        && self
1850                            .named_group_indices
1851                            .insert(name, self.group_count_max)
1852                            .is_some()
1853                    {
1854                        return error("Duplicate capture group name");
1855                    }
1856                    self.group_count_max = if self.group_count_max + 1 > MAX_CAPTURE_GROUPS as u32 {
1857                        MAX_CAPTURE_GROUPS as u32
1858                    } else {
1859                        self.group_count_max + 1
1860                    };
1861                }
1862                Some(_) => continue,
1863                None => break,
1864            }
1865        }
1866
1867        self.input = orig_input;
1868
1869        Ok(())
1870    }
1871
1872    fn try_consume_unicode_property_escape(&mut self) -> Result<PropertyEscapeKind, Error> {
1873        if !self.try_consume('{') {
1874            return error("Invalid character at property escape start");
1875        }
1876
1877        let mut buffer = String::new();
1878        let mut name = None;
1879
1880        while let Some(c) = self.peek().and_then(char::from_u32) {
1881            match c {
1882                '}' => {
1883                    self.consume(c);
1884                    let Some(value) =
1885                        unicode_property_from_str(&buffer, name, self.flags.unicode_sets)
1886                    else {
1887                        break;
1888                    };
1889                    return Ok(value);
1890                }
1891                '=' if name.is_none() => {
1892                    self.consume(c);
1893                    let Some(n) = unicode_property_name_from_str(&buffer) else {
1894                        break;
1895                    };
1896                    name = Some(n);
1897                    buffer.clear();
1898                }
1899                c if c.is_ascii_alphanumeric() || c == '_' => {
1900                    self.consume(c);
1901                    buffer.push(c);
1902                }
1903                _ => break,
1904            }
1905        }
1906
1907        error("Invalid property name")
1908    }
1909
1910    fn finalize(&self, mut re: ir::Regex) -> Result<ir::Regex, Error> {
1911        debug_assert!(self.loop_count <= MAX_LOOPS as u32);
1912        debug_assert!(self.group_count as usize <= MAX_CAPTURE_GROUPS);
1913        if self.has_lookbehind {
1914            ir::walk_mut(
1915                false,
1916                re.flags.unicode,
1917                &mut re.node,
1918                &mut ir::Node::reverse_cats,
1919            );
1920        }
1921        Ok(re)
1922    }
1923}
1924
1925/// Try parsing a given pattern.
1926/// Return the resulting IR regex, or an error.
1927pub fn try_parse<I>(pattern: I, flags: api::Flags) -> Result<ir::Regex, Error>
1928where
1929    I: Iterator<Item = u32> + Clone,
1930{
1931    let mut p = Parser {
1932        input: pattern.peekable(),
1933        flags,
1934        loop_count: 0,
1935        group_count: 0,
1936        named_group_indices: HashMap::new(),
1937        group_count_max: 0,
1938        has_lookbehind: false,
1939    };
1940    p.try_parse()
1941}