swc_ecma_parser/lexer/state.rs

use std::mem::take;

use swc_common::BytePos;
use swc_ecma_ast::EsVersion;
use swc_ecma_lexer::{
    common::{
        lexer::{
            char::CharExt,
            comments_buffer::{BufferedCommentKind, CommentsBufferTrait},
            state::State as StateTrait,
            LexResult,
        },
        syntax::SyntaxFlags,
    },
    error::SyntaxError,
    TokenContexts,
};

use super::{Context, Input, Lexer, LexerTrait};
use crate::{
    error::Error,
    input::Tokens,
    lexer::{
        comments_buffer::CommentsBufferCheckpoint,
        token::{Token, TokenAndSpan, TokenValue},
    },
};

/// State of the lexer.
///
/// Ported from babylon.
#[derive(Clone)]
pub struct State {
    /// Whether a line break exists between the previous token and the
    /// current one.
    pub had_line_break: bool,
    /// Whether a line break existed before the last token.
    pub had_line_break_before_last: bool,
    /// TODO: Remove this field.
    is_first: bool,
    pub next_regexp: Option<BytePos>,
    pub start: BytePos,
    pub prev_hi: BytePos,

    pub(super) token_value: Option<TokenValue>,
    token_type: Option<Token>,
}

pub struct LexerCheckpoint {
    comments_buffer: CommentsBufferCheckpoint,
    state: State,
    ctx: Context,
    input_last_pos: BytePos,
}

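// Checkpointing lets the parser lex speculatively and then backtrack. A
// minimal usage sketch (hypothetical; `lexer` stands for any `Lexer` driven
// by the parser):
//
//     let cp = lexer.checkpoint_save();
//     // ... lex ahead to disambiguate, e.g. arrow parameters vs a
//     // parenthesized expression ...
//     lexer.checkpoint_load(cp); // rewind state, context, input, comments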
impl<'a> swc_ecma_lexer::common::input::Tokens<TokenAndSpan> for Lexer<'a> {
    type Checkpoint = LexerCheckpoint;

    fn checkpoint_save(&self) -> Self::Checkpoint {
        Self::Checkpoint {
            state: self.state.clone(),
            ctx: self.ctx,
            input_last_pos: self.input.last_pos(),
            comments_buffer: self
                .comments_buffer
                .as_ref()
                .map(|cb| cb.checkpoint_save())
                .unwrap_or_default(),
        }
    }

    fn checkpoint_load(&mut self, checkpoint: Self::Checkpoint) {
        self.state = checkpoint.state;
        self.ctx = checkpoint.ctx;
        unsafe { self.input.reset_to(checkpoint.input_last_pos) };
        if let Some(comments_buffer) = self.comments_buffer.as_mut() {
            comments_buffer.checkpoint_load(checkpoint.comments_buffer);
        }
    }

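    // Entering module context promotes any buffered script-vs-module errors
    // into hard errors; see `add_module_mode_error` below.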
    #[inline]
    fn set_ctx(&mut self, ctx: Context) {
        if ctx.contains(Context::Module) && !self.module_errors.is_empty() {
            self.errors.append(&mut self.module_errors);
        }
        self.ctx = ctx
    }

    #[inline]
    fn ctx(&self) -> Context {
        self.ctx
    }

    #[inline]
    fn ctx_mut(&mut self) -> &mut Context {
        &mut self.ctx
    }

    #[inline]
    fn syntax(&self) -> SyntaxFlags {
        self.syntax
    }

    #[inline]
    fn target(&self) -> EsVersion {
        self.target
    }

    #[inline]
    fn start_pos(&self) -> BytePos {
        self.start_pos
    }

    #[inline]
    fn set_expr_allowed(&mut self, _: bool) {}

    #[inline]
    fn set_next_regexp(&mut self, start: Option<BytePos>) {
        self.state.next_regexp = start;
    }

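    // This lexer does not maintain the babylon-style `TokenContexts` stack
    // (the legacy lexer does), so the accessors below are never called.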
    #[inline]
    fn token_context(&self) -> &TokenContexts {
        unreachable!();
    }

    #[inline]
    fn token_context_mut(&mut self) -> &mut TokenContexts {
        unreachable!();
    }

    #[inline]
    fn set_token_context(&mut self, _: TokenContexts) {
        unreachable!();
    }

    fn add_error(&mut self, error: Error) {
        self.errors.push(error);
    }

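    // Errors that only apply in module mode are buffered in `module_errors`
    // until the context is known to be `Module` (see `set_ctx` above).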
    fn add_module_mode_error(&mut self, error: Error) {
        if self.ctx.contains(Context::Module) {
            self.add_error(error);
            return;
        }
        self.module_errors.push(error);
    }

    #[inline]
    fn take_errors(&mut self) -> Vec<Error> {
        take(&mut self.errors)
    }

    #[inline]
    fn take_script_module_errors(&mut self) -> Vec<Error> {
        take(&mut self.module_errors)
    }

    #[inline]
    fn end_pos(&self) -> BytePos {
        self.input.end_pos()
    }

    #[inline]
    fn update_token_flags(&mut self, f: impl FnOnce(&mut swc_ecma_lexer::lexer::TokenFlags)) {
        f(&mut self.token_flags)
    }

    #[inline]
    fn token_flags(&self) -> swc_ecma_lexer::lexer::TokenFlags {
        self.token_flags
    }
}

impl crate::input::Tokens for Lexer<'_> {
    fn clone_token_value(&self) -> Option<TokenValue> {
        self.state.token_value.clone()
    }

    fn get_token_value(&self) -> Option<&TokenValue> {
        self.state.token_value.as_ref()
    }

    fn set_token_value(&mut self, token_value: Option<TokenValue>) {
        self.state.token_value = token_value;
    }

    fn take_token_value(&mut self) -> Option<TokenValue> {
        self.state.token_value.take()
    }

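    // Rescanning: the parser rewinds the input to a saved byte position and
    // re-lexes it under JSX rules when it discovers, after the fact, that a
    // token must be reinterpreted.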
    fn rescan_jsx_token(&mut self, allow_multiline_jsx_text: bool, reset: BytePos) -> TokenAndSpan {
        unsafe {
            self.input.reset_to(reset);
        }
        Tokens::scan_jsx_token(self, allow_multiline_jsx_text)
    }

    fn rescan_jsx_open_el_terminal_token(&mut self, reset: BytePos) -> TokenAndSpan {
        unsafe {
            self.input.reset_to(reset);
        }
        Tokens::scan_jsx_open_el_terminal_token(self)
    }

    fn scan_jsx_token(&mut self, allow_multiline_jsx_text: bool) -> TokenAndSpan {
        let start = self.cur_pos();
        let res = match self.scan_jsx_token(allow_multiline_jsx_text) {
            Ok(res) => Ok(res),
            Err(error) => {
                self.state.set_token_value(TokenValue::Error(error));
                Err(Token::Error)
            }
        };
        let token = match res {
            Ok(t) => t,
            Err(e) => e,
        };
        let span = self.span(start);
        if token != Token::Eof {
            if let Some(comments) = self.comments_buffer.as_mut() {
                comments.pending_to_comment(BufferedCommentKind::Leading, start);
            }

            self.state.set_token_type(token);
            self.state.prev_hi = self.last_pos();
            self.state.had_line_break_before_last = self.had_line_break_before_last();
        }
        // Attach span to token.
        TokenAndSpan {
            token,
            had_line_break: self.had_line_break_before_last(),
            span,
        }
    }

    fn scan_jsx_open_el_terminal_token(&mut self) -> TokenAndSpan {
        self.skip_space::<true>();
        let start = self.input.cur_pos();
        let res = match self.scan_jsx_attrs_terminal_token() {
            Ok(res) => Ok(res),
            Err(error) => {
                self.state.set_token_value(TokenValue::Error(error));
                Err(Token::Error)
            }
        };
        let token = match res {
            Ok(t) => t,
            Err(e) => e,
        };
        let span = self.span(start);
        if token != Token::Eof {
            if let Some(comments) = self.comments_buffer.as_mut() {
                comments.pending_to_comment(BufferedCommentKind::Leading, start);
            }

            self.state.set_token_type(token);
            self.state.prev_hi = self.last_pos();
            self.state.had_line_break_before_last = self.had_line_break_before_last();
        }
        // Attach span to token.
        TokenAndSpan {
            token,
            had_line_break: self.had_line_break_before_last(),
            span,
        }
    }

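    // JSX element and attribute names may contain `-` (e.g. `data-value` in
    // `<div data-value="1"/>`), so the current word token is extended with
    // dashes and further identifier parts into a single `JSXName` token.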
    fn scan_jsx_identifier(&mut self, start: BytePos) -> TokenAndSpan {
        let token = self.state.token_type.unwrap();
        debug_assert!(token.is_word());
        let mut v = String::with_capacity(16);
        while let Some(ch) = self.input().cur() {
            if ch == '-' {
                v.push(ch);
                self.bump();
            } else {
                let old_pos = self.cur_pos();
                v.push_str(&self.scan_identifier_parts());
                if self.cur_pos() == old_pos {
                    break;
                }
            }
        }
        let v = if !v.is_empty() {
            let v = if token.is_known_ident() {
                format!("{}{}", token.to_string(None), v)
            } else if let Some(TokenValue::Word(value)) = self.state.token_value.take() {
                format!("{value}{v}")
            } else {
                format!("{}{}", token.to_string(None), v)
            };
            self.atom(v)
        } else if token.is_known_ident() || token.is_keyword() {
            self.atom(token.to_string(None))
        } else if let Some(TokenValue::Word(value)) = self.state.token_value.take() {
            value
        } else {
            unreachable!(
                "`token_value` should be a word, but got: {:?}",
                self.state.token_value
            )
        };
        self.state.set_token_value(TokenValue::Word(v));
        TokenAndSpan {
            token: Token::JSXName,
            had_line_break: self.had_line_break_before_last(),
            span: self.span(start),
        }
    }

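    // An attribute value is either a quoted string (handled by
    // `read_jsx_str`) or anything else (e.g. an `{expr}` container), which
    // falls through to the regular tokenizer via `next()`.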
    fn scan_jsx_attribute_value(&mut self) -> TokenAndSpan {
        let Some(cur) = self.cur() else {
            let start = self.cur_pos();
            return TokenAndSpan {
                token: Token::Eof,
                had_line_break: self.had_line_break_before_last(),
                span: self.span(start),
            };
        };
        let start = self.cur_pos();

        match cur {
            '\'' | '"' => {
                let token = self.read_jsx_str(cur);
                let token = match token {
                    Ok(token) => token,
                    Err(e) => {
                        self.state.set_token_value(TokenValue::Error(e));
                        return TokenAndSpan {
                            token: Token::Error,
                            had_line_break: self.had_line_break_before_last(),
                            span: self.span(start),
                        };
                    }
                };
                debug_assert!(self
                    .get_token_value()
                    .is_some_and(|t| matches!(t, TokenValue::Str { .. })));
                debug_assert!(token == Token::Str);
                TokenAndSpan {
                    token,
                    had_line_break: self.had_line_break_before_last(),
                    span: self.span(start),
                }
            }
            _ => self.next().unwrap_or_else(|| TokenAndSpan {
                token: Token::Eof,
                had_line_break: self.had_line_break_before_last(),
                span: self.span(start),
            }),
        }
    }

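    // Template literals are re-scanned when the parser backtracks into one,
    // e.g. to reinterpret a `}` as the start of a template continuation
    // rather than a block terminator.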
    fn rescan_template_token(
        &mut self,
        start: BytePos,
        start_with_back_tick: bool,
    ) -> TokenAndSpan {
        unsafe { self.input.reset_to(start) };
        let res = self.scan_template_token(start, start_with_back_tick);
        let token = match res.map_err(|e| {
            self.state.set_token_value(TokenValue::Error(e));
            Token::Error
        }) {
            Ok(t) => t,
            Err(e) => e,
        };
        let span = if start_with_back_tick {
            self.span(start)
        } else {
            // `+ BytePos(1)` is used to skip `{`
            self.span(start + BytePos(1))
        };

        if token != Token::Eof {
            if let Some(comments) = self.comments_buffer.as_mut() {
                comments.pending_to_comment(BufferedCommentKind::Leading, start);
            }

            self.state.set_token_type(token);
            self.state.prev_hi = self.last_pos();
            self.state.had_line_break_before_last = self.had_line_break_before_last();
        }
        // Attach span to token.
        TokenAndSpan {
            token,
            had_line_break: self.had_line_break_before_last(),
            span,
        }
    }
}

impl Lexer<'_> {
    fn next_token(&mut self, start: &mut BytePos) -> Result<Token, Error> {
        if let Some(start) = self.state.next_regexp {
            return self.read_regexp(start);
        }

        if self.state.is_first {
            if let Some(shebang) = self.read_shebang()? {
                self.state.set_token_value(TokenValue::Word(shebang));
                return Ok(Token::Shebang);
            }
        }

        self.state.had_line_break = self.state.is_first;
        self.state.is_first = false;

        self.skip_space::<true>();
        *start = self.input.cur_pos();

        if self.input.last_pos() == self.input.end_pos() {
            // End of input.
            self.consume_pending_comments();
            return Ok(Token::Eof);
        }

        // println!(
        //     "\tContext: ({:?}) {:?}",
        //     self.input.cur().unwrap(),
        //     self.state.context.0
        // );

        self.state.start = *start;

        self.read_token()
    }

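    // Scans JSX text (the children of an element) up to the next `{` or `<`.
    // HTML entities such as `&gt;` are decoded into `value`, while `raw`
    // keeps the original source slice; a bare `>` or `}` in text is reported
    // with a suggested escape.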
    fn scan_jsx_token(&mut self, allow_multiline_jsx_text: bool) -> Result<Token, Error> {
        debug_assert!(self.syntax.jsx());

        if self.input_mut().as_str().is_empty() {
            return Ok(Token::Eof);
        };

        if self.input.eat_byte(b'<') {
            return Ok(if self.input.eat_byte(b'/') {
                Token::LessSlash
            } else {
                Token::Lt
            });
        } else if self.input.eat_byte(b'{') {
            return Ok(Token::LBrace);
        }

        let start = self.input.cur_pos();
        let mut first_non_whitespace = 0;
        let mut chunk_start = start;
        let mut value = String::new();

        while let Some(ch) = self.input_mut().cur() {
            if ch == '{' {
                break;
            } else if ch == '<' {
                // TODO: check for git conflict markers
                break;
            }

            if ch == '>' {
                self.emit_error(
                    self.input().cur_pos(),
                    SyntaxError::UnexpectedTokenWithSuggestions {
                        candidate_list: vec!["`{'>'}`", "`&gt;`"],
                    },
                );
            } else if ch == '}' {
                self.emit_error(
                    self.input().cur_pos(),
                    SyntaxError::UnexpectedTokenWithSuggestions {
                        candidate_list: vec!["`{'}'}`", "`&rbrace;`"],
                    },
                );
            }

            if first_non_whitespace == 0 && ch.is_line_terminator() {
                first_non_whitespace = -1;
            } else if !allow_multiline_jsx_text
                && ch.is_line_terminator()
                && first_non_whitespace > 0
            {
                break;
            } else if !ch.is_whitespace() {
                first_non_whitespace = self.cur_pos().0 as i32;
            }

            if ch == '&' {
                let cur_pos = self.input().cur_pos();

                let s = unsafe {
                    // Safety: We already checked for the range
                    self.input_slice(chunk_start, cur_pos)
                };
                value.push_str(s);

                if let Ok(jsx_entity) = self.read_jsx_entity() {
                    value.push(jsx_entity.0);

                    chunk_start = self.input.cur_pos();
                }
            } else {
                self.bump();
            }
        }

        let end = self.input().cur_pos();
        let raw = unsafe {
            // Safety: Both of `start` and `end` are generated from `cur_pos()`
            self.input_slice(start, end)
        };
        let value = if value.is_empty() {
            self.atom(raw)
        } else {
            let s = unsafe {
                // Safety: We already checked for the range
                self.input_slice(chunk_start, end)
            };
            value.push_str(s);
            self.atom(value)
        };

        let raw: swc_atoms::Atom = self.atom(raw);

        self.state.set_token_value(TokenValue::Str { raw, value });

        self.state.start = start;

        Ok(Token::JSXText)
    }

    fn scan_jsx_attrs_terminal_token(&mut self) -> LexResult<Token> {
        if self.input_mut().as_str().is_empty() {
            Ok(Token::Eof)
        } else if self.input.eat_byte(b'>') {
            Ok(Token::Gt)
        } else if self.input.eat_byte(b'/') {
            Ok(Token::Slash)
        } else {
            self.read_token()
        }
    }

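    // Consumes identifier-part characters, decoding `\uXXXX` and `\u{...}`
    // escapes (e.g. `fo\u006F` scans as `foo`) and flagging the token as
    // containing unicode escapes.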
    fn scan_identifier_parts(&mut self) -> String {
        let mut v = String::with_capacity(16);
        while let Some(ch) = self.input().cur() {
            if ch.is_ident_part() {
                v.push(ch);
                self.input_mut().bump_bytes(ch.len_utf8());
            } else if ch == '\\' {
                self.bump(); // bump '\'
                if !self.is(b'u') {
                    self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape);
                    continue;
                }
                self.bump(); // bump 'u'
                let Ok(chars) = self.read_unicode_escape() else {
                    self.emit_error(self.cur_pos(), SyntaxError::InvalidUnicodeEscape);
                    break;
                };
                for c in chars {
                    v.extend(c);
                }
                self.token_flags |= swc_ecma_lexer::lexer::TokenFlags::UNICODE;
            } else {
                break;
            }
        }
        v
    }
}

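// The lexer is driven as an iterator: each call to `next` yields the next
// `TokenAndSpan`, and `None` signals end of input (`Token::Eof` itself is
// never yielded).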
impl Iterator for Lexer<'_> {
    type Item = TokenAndSpan;

    fn next(&mut self) -> Option<Self::Item> {
        let mut start = self.cur_pos();

        let token = match self.next_token(&mut start) {
            Ok(res) => res,
            Err(error) => {
                self.state.set_token_value(TokenValue::Error(error));
                Token::Error
            }
        };

        let span = self.span(start);
        if token != Token::Eof {
            if let Some(comments) = self.comments_buffer.as_mut() {
                comments.pending_to_comment(BufferedCommentKind::Leading, start);
            }

            self.state.set_token_type(token);
            self.state.prev_hi = self.last_pos();
            self.state.had_line_break_before_last = self.had_line_break_before_last();
            // Attach span to token.
            Some(TokenAndSpan {
                token,
                had_line_break: self.had_line_break_before_last(),
                span,
            })
        } else {
            None
        }
    }
}

impl State {
    pub fn new(start_pos: BytePos) -> Self {
        State {
            had_line_break: false,
            had_line_break_before_last: false,
            is_first: true,
            next_regexp: None,
            start: BytePos(0),
            prev_hi: start_pos,
            token_value: None,
            token_type: None,
        }
    }

    pub(crate) fn set_token_value(&mut self, token_value: TokenValue) {
        self.token_value = Some(token_value);
    }
}

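// Implements the shared `State` trait from `swc_ecma_lexer`. Expression
// context and token-context tracking are legacy-lexer concepts, so those
// methods are either no-ops or unreachable here.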
impl swc_ecma_lexer::common::lexer::state::State for State {
    type TokenKind = Token;
    type TokenType = Token;

    #[inline(always)]
    fn is_expr_allowed(&self) -> bool {
        unreachable!("is_expr_allowed should not be called in Parser/State")
    }

    #[inline(always)]
    fn set_is_expr_allowed(&mut self, _: bool) {
        // noop
    }

    #[inline(always)]
    fn set_next_regexp(&mut self, start: Option<BytePos>) {
        self.next_regexp = start;
    }

    #[inline(always)]
    fn had_line_break(&self) -> bool {
        self.had_line_break
    }

    #[inline(always)]
    fn mark_had_line_break(&mut self) {
        self.had_line_break = true;
    }

    #[inline(always)]
    fn had_line_break_before_last(&self) -> bool {
        self.had_line_break_before_last
    }

    #[inline(always)]
    fn token_contexts(&self) -> &swc_ecma_lexer::TokenContexts {
        unreachable!();
    }

    #[inline(always)]
    fn mut_token_contexts(&mut self) -> &mut swc_ecma_lexer::TokenContexts {
        unreachable!();
    }

    #[inline(always)]
    fn set_token_type(&mut self, token_type: Self::TokenType) {
        self.token_type = Some(token_type);
    }

    #[inline(always)]
    fn token_type(&self) -> Option<Self::TokenType> {
        self.token_type
    }

    #[inline(always)]
    fn syntax(&self) -> SyntaxFlags {
        unreachable!("syntax is not stored in State, but in Lexer")
    }

    #[inline(always)]
    fn prev_hi(&self) -> BytePos {
        self.prev_hi
    }

    #[inline(always)]
    fn start(&self) -> BytePos {
        self.start
    }
}