1use crate::{
4 api, charclasses,
5 codepointset::{CodePointSet, Interval, interval_contains},
6 ir,
7 types::{
8 BracketContents, CaptureGroupID, CaptureGroupName, CharacterClassType, MAX_CAPTURE_GROUPS,
9 MAX_LOOPS,
10 },
11 unicode::{
12 self, PropertyEscapeKind, unicode_property_from_str, unicode_property_name_from_str,
13 },
14 unicodetables::{id_continue_ranges, id_start_ranges},
15 util::to_char_sat,
16};
17use core::{fmt, iter::Peekable};
18#[cfg(feature = "std")]
19use std::collections::HashMap;
20#[cfg(not(feature = "std"))]
21use {
22 alloc::{
23 boxed::Box,
24 string::{String, ToString},
25 vec::Vec,
26 },
27 hashbrown::HashMap,
28};
29
30#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
34pub struct Error {
35 pub text: String,
36}
37
38impl fmt::Display for Error {
39 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
40 f.write_str(&self.text)
41 }
42}
43
44#[cfg(feature = "std")]
45impl std::error::Error for Error {}
46
47enum ClassAtom {
48 CodePoint(u32),
49 CharacterClass {
50 class_type: CharacterClassType,
51 positive: bool,
52 },
53 Range {
54 iv: CodePointSet,
55 negate: bool,
56 },
57}
58
59#[derive(Debug, Clone)]
61struct ClassSet {
62 codepoints: CodePointSet,
63 alternatives: ClassSetAlternativeStrings,
64}
65
66impl ClassSet {
67 fn new() -> Self {
68 ClassSet {
69 codepoints: CodePointSet::new(),
70 alternatives: ClassSetAlternativeStrings::new(),
71 }
72 }
73
74 fn node(self, icase: bool, negate_set: bool) -> ir::Node {
75 let codepoints = if icase {
76 unicode::add_icase_code_points(self.codepoints)
77 } else {
78 self.codepoints
79 };
80 if codepoints.is_empty() && self.alternatives.0.is_empty() {
81 ir::Node::Bracket(BracketContents {
82 invert: negate_set,
83 cps: codepoints,
84 })
85 } else if codepoints.is_empty() {
86 make_alt(self.alternatives.to_nodes(icase))
87 } else if self.alternatives.0.is_empty() {
88 ir::Node::Bracket(BracketContents {
89 invert: negate_set,
90 cps: codepoints,
91 })
92 } else {
93 let mut nodes = self.alternatives.to_nodes(icase);
94 nodes.push(ir::Node::Bracket(BracketContents {
95 invert: negate_set,
96 cps: codepoints,
97 }));
98 make_alt(nodes)
99 }
100 }
101
102 fn union_operand(&mut self, operand: ClassSetOperand) {
103 match operand {
104 ClassSetOperand::ClassSetCharacter(c) => {
105 self.codepoints.add_one(c);
106 }
107 ClassSetOperand::CharacterClassEscape(cps) => {
108 self.codepoints.add_set(cps);
109 }
110 ClassSetOperand::Class(class) => {
111 self.codepoints.add_set(class.codepoints);
112 self.alternatives.extend(class.alternatives);
113 }
114 ClassSetOperand::ClassStringDisjunction(s) => {
115 self.alternatives.extend(s);
116 }
117 }
118 }
119
120 fn intersect_operand(&mut self, operand: ClassSetOperand) {
121 match operand {
122 ClassSetOperand::ClassSetCharacter(c) => {
123 if self.codepoints.contains(c) {
124 self.codepoints =
125 CodePointSet::from_sorted_disjoint_intervals(Vec::from([Interval {
126 first: c,
127 last: c,
128 }]));
129 } else {
130 self.codepoints.clear();
131 }
132 let mut found_alternative = false;
133 for alternative in &mut self.alternatives.0 {
134 if alternative.len() == 1 && alternative[0] == c {
135 found_alternative = true;
136 break;
137 }
138 }
139 self.alternatives.0.clear();
140 if found_alternative {
141 self.alternatives.0.push(Vec::from([c]));
142 }
143 }
144 ClassSetOperand::CharacterClassEscape(cps) => {
145 self.codepoints.intersect(cps.intervals());
146 let mut retained = Vec::new();
147 for alternative in &self.alternatives.0 {
148 if alternative.len() == 1 && cps.contains(alternative[0]) {
149 retained.push(alternative.clone());
150 }
151 }
152 self.alternatives.0 = retained;
153 }
154 ClassSetOperand::Class(class) => {
155 let mut retained_codepoints = CodePointSet::new();
156 for alternative in &class.alternatives.0 {
157 if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
158 retained_codepoints.add_one(alternative[0]);
159 }
160 }
161 let mut retained_alternatives = ClassSetAlternativeStrings::new();
162 for alternative in &self.alternatives.0 {
163 if alternative.len() == 1 && class.codepoints.contains(alternative[0]) {
164 retained_alternatives.0.push(alternative.clone());
165 }
166 }
167 self.codepoints.intersect(class.codepoints.intervals());
168 self.codepoints.add_set(retained_codepoints);
169 self.alternatives.intersect(class.alternatives);
170 self.alternatives.extend(retained_alternatives);
171 }
172 ClassSetOperand::ClassStringDisjunction(s) => {
173 let mut retained = CodePointSet::new();
174 for alternative in &s.0 {
175 if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
176 retained.add_one(alternative[0]);
177 }
178 }
179 self.codepoints = retained;
180 self.alternatives.intersect(s);
181 }
182 }
183 }
184
185 fn subtract_operand(&mut self, operand: ClassSetOperand) {
186 match operand {
187 ClassSetOperand::ClassSetCharacter(c) => {
188 self.codepoints.remove(&[Interval { first: c, last: c }]);
189 self.alternatives
190 .remove(ClassSetAlternativeStrings(Vec::from([Vec::from([c])])));
191 }
192 ClassSetOperand::CharacterClassEscape(cps) => {
193 self.codepoints.remove(cps.intervals());
194 let mut to_remove = ClassSetAlternativeStrings::new();
195 for alternative in &self.alternatives.0 {
196 if alternative.len() == 1 && cps.contains(alternative[0]) {
197 to_remove.0.push(alternative.clone());
198 }
199 }
200 self.alternatives.remove(to_remove);
201 }
202 ClassSetOperand::Class(class) => {
203 let mut codepoints_removed = CodePointSet::new();
204 for alternative in &class.alternatives.0 {
205 if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
206 codepoints_removed.add_one(alternative[0]);
207 }
208 }
209 let mut alternatives_removed = ClassSetAlternativeStrings::new();
210 for alternative in &self.alternatives.0 {
211 if alternative.len() == 1 && class.codepoints.contains(alternative[0]) {
212 alternatives_removed.0.push(alternative.clone());
213 }
214 }
215 self.codepoints.remove(codepoints_removed.intervals());
216 self.codepoints.remove(class.codepoints.intervals());
217 self.alternatives.remove(alternatives_removed);
218 self.alternatives.remove(class.alternatives);
219 }
220 ClassSetOperand::ClassStringDisjunction(s) => {
221 let mut to_remove = CodePointSet::new();
222 for alternative in &s.0 {
223 if alternative.len() == 1 && self.codepoints.contains(alternative[0]) {
224 to_remove.add_one(alternative[0]);
225 }
226 }
227 self.codepoints.remove(to_remove.intervals());
228 self.alternatives.remove(s);
229 }
230 }
231 }
232}
233
234#[derive(Debug, Clone)]
236enum ClassSetOperand {
237 ClassSetCharacter(u32),
238 CharacterClassEscape(CodePointSet),
239 Class(ClassSet),
240 ClassStringDisjunction(ClassSetAlternativeStrings),
241}
242
243#[derive(Debug, Clone)]
244struct ClassSetAlternativeStrings(Vec<Vec<u32>>);
245
246impl ClassSetAlternativeStrings {
247 fn new() -> Self {
248 ClassSetAlternativeStrings(Vec::new())
249 }
250
251 fn extend(&mut self, other: Self) {
252 self.0.extend(other.0);
253 }
254
255 fn intersect(&mut self, other: Self) {
256 let mut retained = Vec::new();
257 for string in &self.0 {
258 for other_string in &other.0 {
259 if string == other_string {
260 retained.push(string.clone());
261 break;
262 }
263 }
264 }
265 self.0 = retained;
266 }
267
268 fn remove(&mut self, other: Self) {
269 self.0.retain(|string| !other.0.contains(string));
270 }
271
272 fn to_nodes(&self, icase: bool) -> Vec<ir::Node> {
273 let mut nodes = Vec::new();
274 for string in &self.0 {
275 nodes.push(ir::Node::Cat(
276 string
277 .iter()
278 .map(|cp| ir::Node::Char { c: *cp, icase })
279 .collect::<Vec<_>>(),
280 ));
281 }
282 nodes
283 }
284}
285
286fn error<S, T>(text: S) -> Result<T, Error>
287where
288 S: ToString,
289{
290 Err(Error {
291 text: text.to_string(),
292 })
293}
294
295fn make_cat(nodes: ir::NodeList) -> ir::Node {
296 match nodes.len() {
297 0 => ir::Node::Empty,
298 1 => nodes.into_iter().next().unwrap(),
299 _ => ir::Node::Cat(nodes),
300 }
301}
302
303fn make_alt(nodes: ir::NodeList) -> ir::Node {
304 let mut mright = None;
305 for node in nodes.into_iter().rev() {
306 match mright {
307 None => mright = Some(node),
308 Some(right) => mright = Some(ir::Node::Alt(Box::new(node), Box::new(right))),
309 }
310 }
311 mright.unwrap_or(ir::Node::Empty)
312}
313
314fn codepoints_from_class(ct: CharacterClassType, positive: bool) -> CodePointSet {
317 let mut cps;
318 match ct {
319 CharacterClassType::Digits => {
320 cps = CodePointSet::from_sorted_disjoint_intervals(charclasses::DIGITS.to_vec())
321 }
322 CharacterClassType::Words => {
323 cps = CodePointSet::from_sorted_disjoint_intervals(charclasses::WORD_CHARS.to_vec())
324 }
325 CharacterClassType::Spaces => {
326 cps = CodePointSet::from_sorted_disjoint_intervals(charclasses::WHITESPACE.to_vec());
327 for &iv in charclasses::LINE_TERMINATOR.iter() {
328 cps.add(iv)
329 }
330 }
331 };
332 if !positive {
333 cps = cps.inverted()
334 }
335 cps
336}
337
338fn make_bracket_class(ct: CharacterClassType, positive: bool) -> ir::Node {
340 ir::Node::Bracket(BracketContents {
341 invert: false,
342 cps: codepoints_from_class(ct, positive),
343 })
344}
345
346fn add_class_atom(bc: &mut BracketContents, atom: ClassAtom) {
347 match atom {
348 ClassAtom::CodePoint(c) => bc.cps.add_one(c),
349 ClassAtom::CharacterClass {
350 class_type,
351 positive,
352 } => {
353 bc.cps.add_set(codepoints_from_class(class_type, positive));
354 }
355 ClassAtom::Range { iv, negate } => {
356 if negate {
357 bc.cps.add_set(iv.inverted());
358 } else {
359 bc.cps.add_set(iv);
360 }
361 }
362 }
363}
364
365struct LookaroundParams {
366 negate: bool,
367 backwards: bool,
368}
369
370struct Parser<I>
372where
373 I: Iterator<Item = u32>,
374{
375 input: Peekable<I>,
377
378 flags: api::Flags,
380
381 loop_count: u32,
383
384 group_count: CaptureGroupID,
386
387 group_count_max: u32,
389
390 named_group_indices: HashMap<CaptureGroupName, u32>,
392
393 has_lookbehind: bool,
395}
396
397impl<I> Parser<I>
398where
399 I: Iterator<Item = u32> + Clone,
400{
401 fn consume<C: Into<u32>>(&mut self, c: C) -> u32 {
403 let nc = self.input.next();
404 core::debug_assert!(nc == Some(c.into()), "char was not next");
405 nc.unwrap()
406 }
407
408 fn try_consume<C: Into<u32>>(&mut self, c: C) -> bool {
411 self.input.next_if_eq(&c.into()).is_some()
412 }
413
414 fn try_consume_str(&mut self, s: &str) -> bool {
417 let mut cursor = self.input.clone();
418 for c1 in s.chars() {
419 if cursor.next() != Some(c1 as u32) {
420 return false;
421 }
422 }
423 self.input = cursor;
424 true
425 }
426
427 fn fold_if_icase(&self, c: u32) -> u32 {
429 if self.flags.icase {
430 unicode::fold_code_point(c, self.flags.unicode)
431 } else {
432 c
433 }
434 }
435
436 fn peek(&mut self) -> Option<u32> {
438 self.input.peek().copied()
439 }
440
441 fn next(&mut self) -> Option<u32> {
443 self.input.next()
444 }
445
446 fn try_parse(&mut self) -> Result<ir::Regex, Error> {
447 self.parse_capture_groups()?;
448
449 let body = self.consume_disjunction()?;
452 match self.input.peek().copied() {
453 Some(c) if c == ')' as u32 => error("Unbalanced parenthesis"),
454 Some(c) => error(format!(
455 "Unexpected char: {}",
456 char::from_u32(c)
457 .map(String::from)
458 .unwrap_or_else(|| format!("\\u{c:04X}"))
459 )),
460 None => self.finalize(ir::Regex {
461 node: make_cat(vec![body, ir::Node::Goal]),
462 flags: self.flags,
463 }),
464 }
465 }
466
467 fn consume_disjunction(&mut self) -> Result<ir::Node, Error> {
469 let mut terms = vec![self.consume_term()?];
470 while self.try_consume('|') {
471 terms.push(self.consume_term()?)
472 }
473 Ok(make_alt(terms))
474 }
475
476 fn consume_term(&mut self) -> Result<ir::Node, Error> {
478 let mut result: Vec<ir::Node> = Vec::new();
479 loop {
480 let start_group = self.group_count;
481 let mut start_offset = result.len();
482 let mut quantifier_allowed = true;
483
484 let nc = self.peek();
485 if nc.is_none() {
486 return Ok(make_cat(result));
487 }
488 let c = nc.unwrap();
489 match to_char_sat(c) {
490 ')' | '|' => break,
492 '^' => {
494 self.consume('^');
495 result.push(ir::Node::Anchor {
496 anchor_type: ir::AnchorType::StartOfLine,
497 multiline: self.flags.multiline,
498 });
499 quantifier_allowed = false;
500 }
501 '$' => {
503 self.consume('$');
504 result.push(ir::Node::Anchor {
505 anchor_type: ir::AnchorType::EndOfLine,
506 multiline: self.flags.multiline,
507 });
508 quantifier_allowed = false;
509 }
510
511 '\\' => {
512 self.consume('\\');
513 let Some(c) = self.peek() else {
514 return error("Incomplete escape");
515 };
516 match to_char_sat(c) {
517 'b' => {
519 self.consume('b');
520 result.push(ir::Node::WordBoundary { invert: false });
521 }
522 'B' => {
524 self.consume('B');
525 result.push(ir::Node::WordBoundary { invert: true });
526 }
527 'c' if !self.flags.unicode => {
530 self.consume('c');
531 if self
532 .peek()
533 .and_then(char::from_u32)
534 .map(|c| c.is_ascii_alphabetic())
535 == Some(true)
536 {
537 result.push(ir::Node::Char {
538 c: self.next().expect("char was not next") % 32,
539 icase: self.flags.icase,
540 });
541 } else {
542 start_offset += 1;
543 result.push(ir::Node::Char {
544 c: u32::from('\\'),
545 icase: self.flags.icase,
546 });
547 result.push(ir::Node::Char {
548 c: u32::from('c'),
549 icase: self.flags.icase,
550 });
551 }
552 }
553 _ => {
555 result.push(self.consume_atom_escape()?);
556 }
557 }
558 }
559
560 '.' => {
562 self.consume('.');
563 result.push(if self.flags.dot_all {
564 ir::Node::MatchAny
565 } else {
566 ir::Node::MatchAnyExceptLineTerminator
567 });
568 }
569
570 '(' => {
571 if self.try_consume_str("(?=") {
572 quantifier_allowed = !self.flags.unicode;
574 result.push(self.consume_lookaround_assertion(LookaroundParams {
575 negate: false,
576 backwards: false,
577 })?);
578 } else if self.try_consume_str("(?!") {
579 quantifier_allowed = !self.flags.unicode;
581 result.push(self.consume_lookaround_assertion(LookaroundParams {
582 negate: true,
583 backwards: false,
584 })?);
585 } else if self.try_consume_str("(?<=") {
586 quantifier_allowed = false;
588 self.has_lookbehind = true;
589 result.push(self.consume_lookaround_assertion(LookaroundParams {
590 negate: false,
591 backwards: true,
592 })?);
593 } else if self.try_consume_str("(?<!") {
594 quantifier_allowed = false;
596 self.has_lookbehind = true;
597 result.push(self.consume_lookaround_assertion(LookaroundParams {
598 negate: true,
599 backwards: true,
600 })?);
601 } else if self.try_consume_str("(?:") {
602 result.push(self.consume_disjunction()?);
604 } else if let Some(group) = self.try_consume_modifier_group()? {
605 result.push(group);
606 } else {
607 self.consume('(');
609 let group = self.group_count;
610 if self.group_count as usize >= MAX_CAPTURE_GROUPS {
611 return error("Capture group count limit exceeded");
612 }
613 self.group_count += 1;
614
615 if self.try_consume_str("?") {
617 let group_name = if let Some(group_name) =
618 self.try_consume_named_capture_group_name()
619 {
620 group_name
621 } else {
622 return error("Invalid token at named capture group identifier");
623 };
624 let contents = self.consume_disjunction()?;
625 result.push(ir::Node::NamedCaptureGroup(
626 Box::new(contents),
627 group,
628 group_name,
629 ))
630 } else {
631 let contents = self.consume_disjunction()?;
632 result.push(ir::Node::CaptureGroup(Box::new(contents), group))
633 }
634 }
635 if !self.try_consume(')') {
636 return error("Unbalanced parenthesis");
637 }
638 }
639
640 '[' if self.flags.unicode_sets => {
642 self.consume('[');
643 let negate_set = self.try_consume('^');
644 result.push(
645 self.consume_class_set_expression(negate_set)?
646 .node(self.flags.icase, negate_set),
647 );
648 }
649
650 '[' => {
651 result.push(self.consume_bracket()?);
652 }
653
654 '{' if !self.flags.unicode => {
656 if self.try_consume_braced_quantifier().is_some() {
657 return error("Invalid braced quantifier");
658 }
659
660 result.push(ir::Node::Char {
662 c: self.consume(c),
663 icase: self.flags.icase,
664 })
665 }
666
667 '*' | '+' | '?' | ']' | '{' | '}' if self.flags.unicode => {
669 return error("Invalid atom character");
670 }
671
672 '*' | '+' | '?' => {
674 return error("Invalid atom character");
675 }
676
677 _ => {
680 self.consume(c);
681 result.push(ir::Node::Char {
682 c: self.fold_if_icase(c),
683 icase: self.flags.icase,
684 })
685 }
686 }
687
688 if let Some(quant) = self.try_consume_quantifier()? {
690 if !quantifier_allowed {
691 return error("Quantifier not allowed here");
692 }
693 if matches!(quant.max, Some(max) if quant.min > max) {
698 return error("Invalid quantifier");
699 }
700 let quantifee = result.split_off(start_offset);
701 if self.loop_count as usize >= MAX_LOOPS {
702 return error("Loop count limit exceeded");
703 }
704 self.loop_count += 1;
705 result.push(ir::Node::Loop {
706 loopee: Box::new(make_cat(quantifee)),
707 quant,
708 enclosed_groups: start_group..self.group_count,
709 });
710 }
711 }
712 Ok(make_cat(result))
713 }
714
715 fn try_consume_modifier_group(&mut self) -> Result<Option<ir::Node>, Error> {
716 let mut cursor = self.input.clone();
717 if cursor.next() != Some('(' as u32) {
718 return Ok(None);
719 }
720 if cursor.next() != Some('?' as u32) {
721 return Ok(None);
722 }
723 let Some(mut current) = cursor.next() else {
724 return Ok(None);
725 };
726
727 if to_char_sat(current) == '<' {
728 return Ok(None);
729 }
730
731 let mut seen_hyphen = false;
732 let mut saw_flag = false;
733 let mut icase_override: Option<bool> = None;
734 let mut multiline_override: Option<bool> = None;
735 let mut dot_all_override: Option<bool> = None;
736
737 loop {
738 let ch = to_char_sat(current);
739 match ch {
740 'i' | 'm' | 's' => {
741 let value = !seen_hyphen;
742 let target = match ch {
743 'i' => &mut icase_override,
744 'm' => &mut multiline_override,
745 's' => &mut dot_all_override,
746 _ => unreachable!(),
747 };
748 if target.is_some() {
749 return error("Invalid group modifier");
750 }
751 *target = Some(value);
752 saw_flag = true;
753 }
754 '-' => {
755 if seen_hyphen {
756 return error("Invalid group modifier");
757 }
758 seen_hyphen = true;
759 }
760 ':' => {
761 if !saw_flag {
762 return error("Invalid group modifier");
763 }
764 self.input = cursor;
765 let mut new_flags = self.flags;
766 if let Some(value) = icase_override {
767 new_flags.icase = value;
768 }
769 if let Some(value) = multiline_override {
770 new_flags.multiline = value;
771 }
772 if let Some(value) = dot_all_override {
773 new_flags.dot_all = value;
774 }
775 let saved_flags = self.flags;
776 self.flags = new_flags;
777 let contents = match self.consume_disjunction() {
778 Ok(node) => node,
779 Err(err) => {
780 self.flags = saved_flags;
781 return Err(err);
782 }
783 };
784 self.flags = saved_flags;
785 return Ok(Some(contents));
786 }
787 _ => {
788 return error("Invalid group modifier");
789 }
790 }
791
792 current = match cursor.next() {
793 Some(next) => next,
794 None => return error("Invalid group modifier"),
795 };
796 }
797 }
798
799 fn consume_bracket(&mut self) -> Result<ir::Node, Error> {
801 self.consume('[');
802 let invert = self.try_consume('^');
803 let mut result = BracketContents {
804 invert,
805 cps: CodePointSet::default(),
806 };
807
808 loop {
809 match self.peek().map(to_char_sat) {
810 None => {
811 return error("Unbalanced bracket");
812 }
813 Some(']') => {
814 self.consume(']');
815 if self.flags.icase {
816 result.cps = unicode::add_icase_code_points(result.cps);
817 }
818 return Ok(ir::Node::Bracket(result));
819 }
820 _ => {}
821 }
822
823 let Some(first) = self.try_consume_bracket_class_atom()? else {
825 continue;
826 };
827
828 if !self.try_consume('-') {
830 add_class_atom(&mut result, first);
831 continue;
832 }
833
834 let Some(second) = self.try_consume_bracket_class_atom()? else {
835 add_class_atom(&mut result, first);
837 add_class_atom(&mut result, ClassAtom::CodePoint(u32::from('-')));
838 continue;
839 };
840
841 if let (ClassAtom::CodePoint(c1), ClassAtom::CodePoint(c2)) = (&first, &second) {
844 if c1 > c2 {
845 return error(
846 "Range values reversed, start char code is greater than end char code.",
847 );
848 }
849 result.cps.add(Interval {
850 first: *c1,
851 last: *c2,
852 });
853
854 continue;
855 }
856
857 if self.flags.unicode {
858 return error("Invalid character range");
859 }
860
861 add_class_atom(&mut result, first);
863 add_class_atom(&mut result, ClassAtom::CodePoint(u32::from('-')));
864 add_class_atom(&mut result, second);
865 }
866 }
867
868 fn try_consume_bracket_class_atom(&mut self) -> Result<Option<ClassAtom>, Error> {
869 let c = self.peek();
870 if c.is_none() {
871 return Ok(None);
872 }
873 let c = c.unwrap();
874 match to_char_sat(c) {
875 ']' => Ok(None),
877
878 '\\' => {
880 self.consume('\\');
881 let ec = if let Some(ec) = self.peek() {
882 ec
883 } else {
884 return error("Unterminated escape");
885 };
886 match to_char_sat(ec) {
887 'b' => {
889 self.consume('b');
890 Ok(Some(ClassAtom::CodePoint(u32::from('\x08'))))
891 }
892 '-' if self.flags.unicode => {
894 self.consume('-');
895 Ok(Some(ClassAtom::CodePoint(u32::from('-'))))
896 }
897 'c' if !self.flags.unicode => {
898 let input = self.input.clone();
899 self.consume('c');
900 match self.peek().map(to_char_sat) {
901 Some('0'..='9' | '_') => {
903 let next = self.next().expect("char was not next");
904 Ok(Some(ClassAtom::CodePoint(next & 0x1F)))
905 }
906 Some('a'..='z' | 'A'..='Z') => {
908 let next = self.next().expect("char was not next");
909 Ok(Some(ClassAtom::CodePoint(next % 32)))
910 }
911 _ => {
913 self.input = input;
914 Ok(Some(ClassAtom::CodePoint(u32::from('\\'))))
915 }
916 }
917 }
918 'd' => {
920 self.consume('d');
921 Ok(Some(ClassAtom::CharacterClass {
922 class_type: CharacterClassType::Digits,
923 positive: true,
924 }))
925 }
926 'D' => {
928 self.consume('D');
929 Ok(Some(ClassAtom::CharacterClass {
930 class_type: CharacterClassType::Digits,
931 positive: false,
932 }))
933 }
934 's' => {
936 self.consume('s');
937 Ok(Some(ClassAtom::CharacterClass {
938 class_type: CharacterClassType::Spaces,
939 positive: true,
940 }))
941 }
942 'S' => {
944 self.consume('S');
945 Ok(Some(ClassAtom::CharacterClass {
946 class_type: CharacterClassType::Spaces,
947 positive: false,
948 }))
949 }
950 'w' => {
952 self.consume('w');
953 Ok(Some(ClassAtom::CharacterClass {
954 class_type: CharacterClassType::Words,
955 positive: true,
956 }))
957 }
958 'W' => {
960 self.consume('W');
961 Ok(Some(ClassAtom::CharacterClass {
962 class_type: CharacterClassType::Words,
963 positive: false,
964 }))
965 }
966 'p' | 'P' if self.flags.unicode => {
969 self.consume(ec);
970 let negate = ec == 'P' as u32;
971 match self.try_consume_unicode_property_escape()? {
972 PropertyEscapeKind::CharacterClass(s) => Ok(Some(ClassAtom::Range {
973 iv: CodePointSet::from_sorted_disjoint_intervals(s.to_vec()),
974 negate,
975 })),
976 PropertyEscapeKind::StringSet(_) => error("Invalid property escape"),
977 }
978 }
979 _ => {
981 let cc = self.consume_character_escape()?;
982 Ok(Some(ClassAtom::CodePoint(cc)))
983 }
984 }
985 }
986
987 _ => Ok(Some(ClassAtom::CodePoint(self.consume(c)))),
988 }
989 }
990
991 fn consume_class_set_expression(&mut self, negate_set: bool) -> Result<ClassSet, Error> {
993 let mut result = ClassSet::new();
994
995 let first = match self.peek() {
996 Some(0x5D ) => {
997 self.consume(']');
998 return Ok(result);
999 }
1000 Some(_) => self.consume_class_set_operand(negate_set)?,
1001 None => {
1002 return error("Unbalanced class set bracket");
1003 }
1004 };
1005
1006 enum ClassSetOperator {
1007 Union,
1008 Intersection,
1009 Subtraction,
1010 }
1011
1012 let op = match self.peek() {
1013 Some(0x5D ) => {
1014 self.consume(']');
1015 result.union_operand(first);
1016 return Ok(result);
1017 }
1018 Some(0x26 ) => {
1019 self.consume('&');
1020 if self.peek() == Some(0x26 ) {
1021 self.consume('&');
1022 result.union_operand(first.clone());
1023 ClassSetOperator::Intersection
1024 } else {
1025 result.codepoints.add_one(0x26 );
1026 ClassSetOperator::Union
1027 }
1028 }
1029 Some(0x2D ) => {
1030 self.consume('-');
1031 if self.peek() == Some(0x2D ) {
1032 self.consume('-');
1033 result.union_operand(first.clone());
1034 ClassSetOperator::Subtraction
1035 } else {
1036 match first {
1037 ClassSetOperand::ClassSetCharacter(first) => {
1038 let ClassSetOperand::ClassSetCharacter(last) =
1039 self.consume_class_set_operand(negate_set)?
1040 else {
1041 return error("Invalid class set range");
1042 };
1043 if first > last {
1044 return error("Invalid class set range");
1045 }
1046 result.codepoints.add(Interval { first, last });
1047 }
1048 _ => {
1049 return error("Invalid class set range");
1050 }
1051 };
1052 ClassSetOperator::Union
1053 }
1054 }
1055 Some(_) => {
1056 result.union_operand(first.clone());
1057 ClassSetOperator::Union
1058 }
1059 None => {
1060 return error("Unbalanced class set bracket");
1061 }
1062 };
1063
1064 match op {
1065 ClassSetOperator::Union => {
1066 loop {
1067 let operand = match self.peek() {
1068 Some(0x5D ) => {
1069 self.consume(']');
1070 return Ok(result);
1071 }
1072 Some(_) => self.consume_class_set_operand(negate_set)?,
1073 None => return error("Unbalanced class set bracket"),
1074 };
1075 if self.peek() == Some(0x2D ) {
1076 self.consume('-');
1077 match operand {
1078 ClassSetOperand::ClassSetCharacter(first) => {
1079 let ClassSetOperand::ClassSetCharacter(last) =
1080 self.consume_class_set_operand(negate_set)?
1081 else {
1082 return error("Invalid class set range");
1083 };
1084 if first > last {
1085 return error("Invalid class set range");
1086 }
1087 result.codepoints.add(Interval { first, last });
1088 }
1089 _ => {
1090 return error("Invalid class set range");
1091 }
1092 };
1093 } else {
1094 result.union_operand(operand);
1095 }
1096 }
1097 }
1098 ClassSetOperator::Intersection => {
1100 loop {
1101 let operand = self.consume_class_set_operand(negate_set)?;
1102 result.intersect_operand(operand);
1103 match self.next() {
1104 Some(0x5D ) => return Ok(result),
1105 Some(0x26 ) => {}
1106 Some(_) => return error("Unexpected character in class set intersection"),
1107 _ => return error("Unbalanced class set bracket"),
1108 }
1109 if self.next() != Some(0x26 ) {
1110 return error("Unbalanced class set bracket");
1111 }
1112 }
1113 }
1114 ClassSetOperator::Subtraction => {
1116 loop {
1117 let operand = self.consume_class_set_operand(negate_set)?;
1118 result.subtract_operand(operand);
1119 match self.next() {
1120 Some(0x5D ) => return Ok(result),
1121 Some(0x2D ) => {}
1122 Some(_) => return error("Unexpected character in class set subtraction"),
1123 _ => return error("Unbalanced class set bracket"),
1124 }
1125 if self.next() != Some(0x2D ) {
1126 return error("Unbalanced class set bracket");
1127 }
1128 }
1129 }
1130 }
1131 }
1132
1133 fn consume_class_set_operand(&mut self, negate_set: bool) -> Result<ClassSetOperand, Error> {
1134 use ClassSetOperand::*;
1135 let Some(cp) = self.peek() else {
1136 return error("Empty class set operand");
1137 };
1138 match cp {
1139 0x5B => {
1142 self.consume('[');
1143 let negate_set = self.try_consume('^');
1144 let result = self.consume_class_set_expression(negate_set)?;
1145 if negate_set {
1146 result.codepoints.inverted();
1147 }
1148 Ok(Class(result))
1149 }
1150 0x5C => {
1155 self.consume('\\');
1156 let Some(cp) = self.peek() else {
1157 return error("Incomplete class set escape");
1158 };
1159 match cp {
1160 0x71 => {
1162 self.consume('q');
1163 if !self.try_consume('{') {
1164 return error("Invalid class set escape: expected {");
1165 }
1166 let mut alternatives = Vec::new();
1167 let mut alternative = Vec::new();
1168 loop {
1169 match self.peek() {
1170 Some(0x7D ) => {
1171 self.consume('}');
1172 if !alternative.is_empty() {
1173 alternatives.push(alternative.clone());
1174 alternative.clear();
1175 }
1176 break;
1177 }
1178 Some(0x7C ) => {
1179 self.consume('|');
1180 if !alternative.is_empty() {
1181 alternatives.push(alternative.clone());
1182 alternative.clear();
1183 }
1184 }
1185 Some(_) => {
1186 alternative.push(self.consume_class_set_character()?);
1187 }
1188 None => {
1189 return error("Unbalanced class set string disjunction");
1190 }
1191 }
1192 }
1193 Ok(ClassStringDisjunction(ClassSetAlternativeStrings(alternatives)))
1194 }
1195 0x64 => {
1197 self.consume('d');
1198 Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Digits, true)))
1199 }
1200 0x44 => {
1202 self.consume('D');
1203 Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Digits, false)))
1204 }
1205 0x73 => {
1207 self.consume('s');
1208 Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Spaces, true)))
1209 }
1210 0x53 => {
1212 self.consume('S');
1213 Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Spaces, false)))
1214 }
1215 0x77 => {
1217 self.consume('w');
1218 Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Words, true)))
1219 }
1220 0x57 => {
1222 self.consume('W');
1223 Ok(CharacterClassEscape(codepoints_from_class(CharacterClassType::Words, false)))
1224 }
1225 0x70 => {
1227 self.consume('p');
1228 match self.try_consume_unicode_property_escape()? {
1229 PropertyEscapeKind::CharacterClass(intervals) => {
1230 Ok(CharacterClassEscape(CodePointSet::from_sorted_disjoint_intervals(
1231 intervals.to_vec(),
1232 )))
1233 }
1234 PropertyEscapeKind::StringSet(_) if negate_set => error("Invalid character escape"),
1235 PropertyEscapeKind::StringSet(strings) => {
1236 Ok(ClassStringDisjunction(ClassSetAlternativeStrings(strings.iter().map(|s| s.to_vec()).collect())))
1237 }
1238 }
1239 }
1240 0x50 => {
1242 self.consume('P');
1243 match self.try_consume_unicode_property_escape()? {
1244 PropertyEscapeKind::CharacterClass(s) => {
1245 Ok(CharacterClassEscape(CodePointSet::from_sorted_disjoint_intervals(
1246 s.to_vec(),
1247 ).inverted()))
1248 }
1249 PropertyEscapeKind::StringSet(_) => error("Invalid character escape"),
1250 }
1251 }
1252 0x62 => {
1254 Ok(ClassSetCharacter(self.consume(cp)))
1255 }
1256 _ if Self::is_class_set_reserved_punctuator(cp) => Ok(ClassSetCharacter(self.consume(cp))),
1258 _ => Ok(ClassSetCharacter(self.consume_character_escape()?))
1260 }
1261 }
1262 _ => Ok(ClassSetCharacter(self.consume_class_set_character()?)),
1265 }
1266 }
1267
1268 fn consume_class_set_character(&mut self) -> Result<u32, Error> {
1270 let Some(cp) = self.next() else {
1271 return error("Incomplete class set character");
1272 };
1273 match cp {
1274 0x5C => {
1275 let Some(cp) = self.peek() else {
1276 return error("Incomplete class set escape");
1277 };
1278 match cp {
1279 0x62 => {
1281 Ok(self.consume(cp))
1282 }
1283 _ if Self::is_class_set_reserved_punctuator(cp) => Ok(self.consume(cp)),
1285 _ => Ok(self.consume_character_escape()?)
1287 }
1288 }
1289 0x28 | 0x29 | 0x7B | 0x7D | 0x2F | 0x2D | 0x7C => error("Invalid class set character"),
1292 _ => {
1293 if Self::is_class_set_reserved_double_punctuator(cp)
1294 && let Some(cp) = self.peek()
1295 && Self::is_class_set_reserved_double_punctuator(cp) {
1296 return error("Invalid class set character");
1297 }
1298 Ok(cp)
1299 }
1300 }
1301 }
1302
1303 fn is_class_set_reserved_punctuator(cp: u32) -> bool {
1305 match cp {
1306 0x26 | 0x2D | 0x21 | 0x23 | 0x25 | 0x2C | 0x3A | 0x3B | 0x3C | 0x3D | 0x3E | 0x40 | 0x60 | 0x7E => true,
1309 _ => false,
1310 }
1311 }
1312
1313 fn is_class_set_reserved_double_punctuator(cp: u32) -> bool {
1314 match cp {
1315 0x26 | 0x21 | 0x23 | 0x24 | 0x25 | 0x2A | 0x2B | 0x2C | 0x2E | 0x3A | 0x3B | 0x3C | 0x3D | 0x3E | 0x3F | 0x40 | 0x5E | 0x60 | 0x7E => true,
1319 _ => false,
1320 }
1321 }
1322
1323 fn try_consume_quantifier(&mut self) -> Result<Option<ir::Quantifier>, Error> {
1324 if let Some(mut quant) = self.try_consume_quantifier_prefix()? {
1325 quant.greedy = !self.try_consume('?');
1326 Ok(Some(quant))
1327 } else {
1328 Ok(None)
1329 }
1330 }
1331
1332 fn try_consume_quantifier_prefix(&mut self) -> Result<Option<ir::Quantifier>, Error> {
1333 let nc = self.peek();
1334 if nc.is_none() {
1335 return Ok(None);
1336 }
1337 let c = nc.unwrap();
1338 match char::from_u32(c) {
1339 Some('+') => {
1340 self.consume('+');
1341 Ok(Some(ir::Quantifier {
1342 min: 1,
1343 max: None,
1344 greedy: true,
1345 }))
1346 }
1347 Some('*') => {
1348 self.consume('*');
1349 Ok(Some(ir::Quantifier {
1350 min: 0,
1351 max: None,
1352 greedy: true,
1353 }))
1354 }
1355 Some('?') => {
1356 self.consume('?');
1357 Ok(Some(ir::Quantifier {
1358 min: 0,
1359 max: Some(1),
1360 greedy: true,
1361 }))
1362 }
1363 Some('{') => {
1364 if let Some(quantifier) = self.try_consume_braced_quantifier() {
1365 Ok(Some(quantifier))
1366 } else if self.flags.unicode {
1367 error("Invalid quantifier")
1370 } else {
1371 Ok(None)
1372 }
1373 }
1374 _ => Ok(None),
1375 }
1376 }
1377
1378 fn try_consume_braced_quantifier(&mut self) -> Option<ir::Quantifier> {
1379 let pre_input = self.input.clone();
1381 self.consume('{');
1382 let optmin = self.try_consume_decimal_integer_literal();
1383 let Some(optmin) = optmin else {
1384 self.input = pre_input;
1386 return None;
1387 };
1388 let mut quant = ir::Quantifier {
1389 min: optmin,
1390 max: Some(optmin),
1391 greedy: true,
1392 };
1393 if self.try_consume(',') {
1394 let max = self.try_consume_decimal_integer_literal();
1395 quant.max = max;
1398 } else {
1399 }
1401 if !self.try_consume('}') {
1402 self.input = pre_input;
1404 return None;
1405 }
1406 Some(quant)
1407 }
1408
1409 fn try_consume_decimal_integer_literal(&mut self) -> Option<usize> {
1413 let mut result: usize = 0;
1414 let mut char_count = 0;
1415 while let Some(c) = self.peek() {
1416 if let Some(digit) = char::from_u32(c).and_then(|c| char::to_digit(c, 10)) {
1417 self.consume(c);
1418 char_count += 1;
1419 result = result.saturating_mul(10);
1420 result = result.saturating_add(digit as usize);
1421 } else {
1422 break;
1423 }
1424 }
1425 if char_count > 0 { Some(result) } else { None }
1426 }
1427
1428 fn consume_lookaround_assertion(
1429 &mut self,
1430 params: LookaroundParams,
1431 ) -> Result<ir::Node, Error> {
1432 let start_group = self.group_count;
1433 let contents = self.consume_disjunction()?;
1434 let end_group = self.group_count;
1435 Ok(ir::Node::LookaroundAssertion {
1436 negate: params.negate,
1437 backwards: params.backwards,
1438 start_group,
1439 end_group,
1440 contents: Box::new(contents),
1441 })
1442 }
1443
1444 fn consume_character_escape(&mut self) -> Result<u32, Error> {
1445 let c = self.next().expect("Should have a character");
1446 let ch = to_char_sat(c);
1447 match ch {
1448 'f' => Ok(0xC),
1450 'n' => Ok(0xA),
1452 'r' => Ok(0xD),
1454 't' => Ok(0x9),
1456 'v' => Ok(0xB),
1458 'c' => {
1460 if let Some(nc) = self.next().and_then(char::from_u32)
1461 && (nc.is_ascii_lowercase() || nc.is_ascii_uppercase())
1462 {
1463 return Ok((nc as u32) % 32);
1464 }
1465 error("Invalid character escape")
1466 }
1467 '0' if self
1469 .peek()
1470 .and_then(char::from_u32)
1471 .map(|c: char| c.is_ascii_digit())
1472 != Some(true) =>
1473 {
1474 Ok(0x0)
1475 }
1476 'x' => {
1478 let hex_to_digit = |c: char| c.to_digit(16);
1479 let x1 = self.next().and_then(char::from_u32).and_then(hex_to_digit);
1480 let x2 = self.next().and_then(char::from_u32).and_then(hex_to_digit);
1481 match (x1, x2) {
1482 (Some(x1), Some(x2)) => Ok(x1 * 16 + x2),
1483 _ if !self.flags.unicode => Ok(c),
1485 _ => error("Invalid character escape"),
1486 }
1487 }
1488 'u' => {
1490 if let Some(c) = self.try_escape_unicode_sequence() {
1491 Ok(c)
1492 } else if !self.flags.unicode {
1493 Ok(c)
1495 } else {
1496 error("Invalid unicode escape")
1497 }
1498 }
1499 '0'..='7' if !self.flags.unicode => {
1501 let Some(c1) = self.peek() else {
1502 return Ok(c - '0' as u32);
1503 };
1504 let ch1 = to_char_sat(c1);
1505
1506 match ch {
1507 '0' if ('8'..='9').contains(&ch1) => Ok(0x0),
1509 _ if !('0'..='7').contains(&ch1) => Ok(c - '0' as u32),
1511 '4'..='7' => {
1513 self.consume(c1);
1514 Ok((c - '0' as u32) * 8 + c1 - '0' as u32)
1515 }
1516 '0'..='3' => {
1519 self.consume(c1);
1520 if self.peek().map(|c2| ('0'..='7').contains(&to_char_sat(c2)))
1521 == Some(true)
1522 {
1523 let c2 = self.next().expect("char was not next");
1524 Ok((c - '0' as u32) * 64 + (c1 - '0' as u32) * 8 + c2 - '0' as u32)
1525 } else {
1526 Ok((c - '0' as u32) * 8 + c1 - '0' as u32)
1527 }
1528 }
1529 _ => unreachable!(),
1530 }
1531 }
1532 '^' | '$' | '\\' | '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|'
1535 | '/' => Ok(c),
1536 _ if !self.flags.unicode => Ok(c),
1538 _ => error("Invalid character escape"),
1539 }
1540 }
1541
1542 fn consume_atom_escape(&mut self) -> Result<ir::Node, Error> {
1544 let Some(c) = self.peek() else {
1545 return error("Incomplete escape");
1546 };
1547 match to_char_sat(c) {
1548 'd' | 'D' => {
1549 self.consume(c);
1550 Ok(make_bracket_class(
1551 CharacterClassType::Digits,
1552 c == 'd' as u32,
1553 ))
1554 }
1555
1556 's' | 'S' => {
1557 self.consume(c);
1558 Ok(make_bracket_class(
1559 CharacterClassType::Spaces,
1560 c == 's' as u32,
1561 ))
1562 }
1563
1564 'w' | 'W' => {
1565 self.consume(c);
1566 Ok(make_bracket_class(
1567 CharacterClassType::Words,
1568 c == 'w' as u32,
1569 ))
1570 }
1571
1572 'p' | 'P' if self.flags.unicode || self.flags.unicode_sets => {
1575 self.consume(c);
1576 let negate = c == 'P' as u32;
1577 let property_escape = self.try_consume_unicode_property_escape()?;
1578 match property_escape {
1579 PropertyEscapeKind::CharacterClass(s) => {
1580 Ok(ir::Node::Bracket(BracketContents {
1581 invert: negate,
1582 cps: CodePointSet::from_sorted_disjoint_intervals(s.to_vec()),
1583 }))
1584 }
1585 PropertyEscapeKind::StringSet(_) if negate => error("Invalid character escape"),
1586 PropertyEscapeKind::StringSet(strings) => Ok(make_alt(
1587 strings
1588 .iter()
1589 .map(|s| {
1590 ir::Node::Cat(
1591 s.iter()
1592 .map(|c| ir::Node::Char {
1593 c: *c,
1594 icase: self.flags.icase,
1595 })
1596 .collect(),
1597 )
1598 })
1599 .collect(),
1600 )),
1601 }
1602 }
1603
1604 '1'..='9' if self.flags.unicode => {
1607 let group = self.try_consume_decimal_integer_literal().unwrap();
1608 if group <= self.group_count_max as usize {
1609 Ok(ir::Node::BackRef(group as u32))
1610 } else {
1611 error("Invalid character escape")
1612 }
1613 }
1614
1615 '1'..='9' => {
1619 let input = self.input.clone();
1620 let group = self.try_consume_decimal_integer_literal().unwrap();
1621
1622 if group <= self.group_count_max as usize {
1623 Ok(ir::Node::BackRef(group as u32))
1624 } else {
1625 self.input = input;
1626 let c = self.consume_character_escape()?;
1627 Ok(ir::Node::Char {
1628 c: self.fold_if_icase(c),
1629 icase: self.flags.icase,
1630 })
1631 }
1632 }
1633
1634 'k' if self.flags.unicode || !self.named_group_indices.is_empty() => {
1636 self.consume('k');
1637
1638 if let Some(group_name) = self.try_consume_named_capture_group_name() {
1640 if let Some(index) = self.named_group_indices.get(&group_name) {
1641 Ok(ir::Node::BackRef(*index + 1))
1642 } else {
1643 error(format!(
1644 "Backreference to invalid named capture group: {}",
1645 &group_name
1646 ))
1647 }
1648 } else {
1649 error("Unexpected end of named backreference")
1650 }
1651 }
1652
1653 'k' => {
1655 self.consume('k');
1656 Ok(ir::Node::Char {
1657 c: self.fold_if_icase(c),
1658 icase: self.flags.icase,
1659 })
1660 }
1661
1662 _ => {
1663 let c = self.consume_character_escape()?;
1664 Ok(ir::Node::Char {
1665 c: self.fold_if_icase(c),
1666 icase: self.flags.icase,
1667 })
1668 }
1669 }
1670 }
1671
1672 #[allow(clippy::branches_sharing_code)]
1673 fn try_escape_unicode_sequence(&mut self) -> Option<u32> {
1674 let mut orig_input = self.input.clone();
1675
1676 if self.try_consume('{') {
1678 let mut s = String::new();
1679 loop {
1680 match self.next().and_then(char::from_u32) {
1681 Some('}') => break,
1682 Some(c) => s.push(c),
1683 None => {
1684 self.input = orig_input;
1686 return None;
1687 }
1688 }
1689 }
1690
1691 match u32::from_str_radix(&s, 16) {
1692 Ok(u) => {
1693 if u > 0x10_FFFF {
1694 self.input = orig_input;
1695 None
1696 } else {
1697 Some(u)
1698 }
1699 }
1700 _ => {
1701 self.input = orig_input;
1702 None
1703 }
1704 }
1705 } else {
1706 let mut s = String::new();
1708 for _ in 0..4 {
1709 if let Some(c) = self.next().and_then(char::from_u32) {
1710 s.push(c);
1711 } else {
1712 self.input = orig_input;
1714 return None;
1715 }
1716 }
1717 match u16::from_str_radix(&s, 16) {
1718 Ok(u) => {
1719 if (0xD800..=0xDBFF).contains(&u) {
1720 if !self.try_consume_str("\\u") {
1724 return Some(u as u32);
1725 }
1726 orig_input = self.input.clone();
1727
1728 let result = (|| {
1733 let mut s = String::new();
1734 for _ in 0..4 {
1735 let c = self.next().and_then(char::from_u32)?;
1736 s.push(c);
1737 }
1738
1739 let uu = u16::from_str_radix(&s, 16).ok()?;
1740 let ch = char::decode_utf16([u, uu]).next()?.ok()?;
1741 Some(u32::from(ch))
1742 })();
1743
1744 result.or_else(|| {
1745 self.input = orig_input;
1746 Some(u as u32)
1747 })
1748 } else {
1749 Some(u as u32)
1752 }
1753 }
1754 _ => {
1755 self.input = orig_input;
1756 None
1757 }
1758 }
1759 }
1760 }
1761
1762 fn try_consume_named_capture_group_name(&mut self) -> Option<String> {
1763 if !self.try_consume('<') {
1764 return None;
1765 }
1766
1767 let orig_input = self.input.clone();
1768 let mut group_name = String::new();
1769
1770 if let Some(mut c) = self.next().and_then(char::from_u32) {
1771 if c == '\\' && self.try_consume('u') {
1772 if let Some(escaped) = self.try_escape_unicode_sequence().and_then(char::from_u32) {
1773 c = escaped;
1774 } else {
1775 self.input = orig_input;
1776 return None;
1777 }
1778 }
1779
1780 if interval_contains(id_start_ranges(), c.into()) || c == '$' || c == '_' {
1781 group_name.push(c);
1782 } else {
1783 self.input = orig_input;
1784 return None;
1785 }
1786 } else {
1787 self.input = orig_input;
1788 return None;
1789 }
1790
1791 loop {
1792 if let Some(mut c) = self.next().and_then(char::from_u32) {
1793 if c == '\\' && self.try_consume('u') {
1794 if let Some(escaped) =
1795 self.try_escape_unicode_sequence().and_then(char::from_u32)
1796 {
1797 c = escaped;
1798 } else {
1799 self.input = orig_input;
1800 return None;
1801 }
1802 }
1803
1804 if c == '>' {
1805 break;
1806 }
1807
1808 if interval_contains(id_continue_ranges(), c.into()) || c == '$' || c == '_' || c == '\u{200C}' || c == '\u{200D}'
1809 {
1811 group_name.push(c);
1812 } else {
1813 self.input = orig_input;
1814 return None;
1815 }
1816 } else {
1817 self.input = orig_input;
1818 return None;
1819 }
1820 }
1821
1822 Some(group_name)
1823 }
1824
1825 fn parse_capture_groups(&mut self) -> Result<(), Error> {
1827 let orig_input = self.input.clone();
1828
1829 loop {
1830 match self.next().map(to_char_sat) {
1831 Some('\\') => {
1832 self.next();
1833 continue;
1834 }
1835 Some('[') => loop {
1836 match self.next().map(to_char_sat) {
1837 Some('\\') => {
1838 self.next();
1839 continue;
1840 }
1841 Some(']') => break,
1842 Some(_) => continue,
1843 None => break,
1844 }
1845 },
1846 Some('(') => {
1847 if self.try_consume_str("?")
1848 && let Some(name) = self.try_consume_named_capture_group_name()
1849 && self
1850 .named_group_indices
1851 .insert(name, self.group_count_max)
1852 .is_some()
1853 {
1854 return error("Duplicate capture group name");
1855 }
1856 self.group_count_max = if self.group_count_max + 1 > MAX_CAPTURE_GROUPS as u32 {
1857 MAX_CAPTURE_GROUPS as u32
1858 } else {
1859 self.group_count_max + 1
1860 };
1861 }
1862 Some(_) => continue,
1863 None => break,
1864 }
1865 }
1866
1867 self.input = orig_input;
1868
1869 Ok(())
1870 }
1871
1872 fn try_consume_unicode_property_escape(&mut self) -> Result<PropertyEscapeKind, Error> {
1873 if !self.try_consume('{') {
1874 return error("Invalid character at property escape start");
1875 }
1876
1877 let mut buffer = String::new();
1878 let mut name = None;
1879
1880 while let Some(c) = self.peek().and_then(char::from_u32) {
1881 match c {
1882 '}' => {
1883 self.consume(c);
1884 let Some(value) =
1885 unicode_property_from_str(&buffer, name, self.flags.unicode_sets)
1886 else {
1887 break;
1888 };
1889 return Ok(value);
1890 }
1891 '=' if name.is_none() => {
1892 self.consume(c);
1893 let Some(n) = unicode_property_name_from_str(&buffer) else {
1894 break;
1895 };
1896 name = Some(n);
1897 buffer.clear();
1898 }
1899 c if c.is_ascii_alphanumeric() || c == '_' => {
1900 self.consume(c);
1901 buffer.push(c);
1902 }
1903 _ => break,
1904 }
1905 }
1906
1907 error("Invalid property name")
1908 }
1909
1910 fn finalize(&self, mut re: ir::Regex) -> Result<ir::Regex, Error> {
1911 debug_assert!(self.loop_count <= MAX_LOOPS as u32);
1912 debug_assert!(self.group_count as usize <= MAX_CAPTURE_GROUPS);
1913 if self.has_lookbehind {
1914 ir::walk_mut(
1915 false,
1916 re.flags.unicode,
1917 &mut re.node,
1918 &mut ir::Node::reverse_cats,
1919 );
1920 }
1921 Ok(re)
1922 }
1923}
1924
1925pub fn try_parse<I>(pattern: I, flags: api::Flags) -> Result<ir::Regex, Error>
1928where
1929 I: Iterator<Item = u32> + Clone,
1930{
1931 let mut p = Parser {
1932 input: pattern.peekable(),
1933 flags,
1934 loop_count: 0,
1935 group_count: 0,
1936 named_group_indices: HashMap::new(),
1937 group_count_max: 0,
1938 has_lookbehind: false,
1939 };
1940 p.try_parse()
1941}