swc_ecma_parser/lexer/
util.rs

1//! Ported from [babylon/util/identifier.js][]
2//!
3//!
4//! [babylon/util/identifier.js]:https://github.com/babel/babel/blob/master/packages/babylon/src/util/identifier.js
5use std::char;
6
7use swc_common::{
8    comments::{Comment, CommentKind},
9    input::Input,
10    BytePos, Span,
11};
12use swc_ecma_ast::Ident;
13use tracing::warn;
14
15use super::{comments_buffer::BufferedComment, whitespace::SkipWhitespace, Char, LexResult, Lexer};
16use crate::{
17    error::{Error, SyntaxError},
18    lexer::comments_buffer::BufferedCommentKind,
19    Tokens,
20};
21
22impl Lexer<'_> {
23    pub(super) fn span(&self, start: BytePos) -> Span {
24        let end = self.last_pos();
25        if cfg!(debug_assertions) && start > end {
26            unreachable!(
27                "assertion failed: (span.start <= span.end).
28 start = {}, end = {}",
29                start.0, end.0
30            )
31        }
32        Span { lo: start, hi: end }
33    }
34
35    #[inline(always)]
36    pub(super) fn bump(&mut self) {
37        unsafe {
38            // Safety: Actually this is not safe but this is an internal method.
39            self.input.bump()
40        }
41    }
42
43    #[inline(always)]
44    pub(super) fn is(&mut self, c: u8) -> bool {
45        self.input.is_byte(c)
46    }
47
48    #[inline(always)]
49    pub(super) fn is_str(&self, s: &str) -> bool {
50        self.input.is_str(s)
51    }
52
53    #[inline(always)]
54    pub(super) fn eat(&mut self, c: u8) -> bool {
55        self.input.eat_byte(c)
56    }
57
58    #[inline(always)]
59    pub(super) fn cur(&mut self) -> Option<char> {
60        self.input.cur()
61    }
62
63    #[inline(always)]
64    pub(super) fn peek(&mut self) -> Option<char> {
65        self.input.peek()
66    }
67
68    #[inline(always)]
69    pub(super) fn peek_ahead(&mut self) -> Option<char> {
70        self.input.peek_ahead()
71    }
72
73    #[inline(always)]
74    pub(super) fn cur_pos(&mut self) -> BytePos {
75        self.input.cur_pos()
76    }
77
78    #[inline(always)]
79    pub(super) fn last_pos(&self) -> BytePos {
80        self.input.last_pos()
81    }
82
83    /// Shorthand for `let span = self.span(start); self.error_span(span)`
84    #[cold]
85    #[inline(never)]
86    pub(super) fn error<T>(&mut self, start: BytePos, kind: SyntaxError) -> LexResult<T> {
87        let span = self.span(start);
88        self.error_span(span, kind)
89    }
90
91    #[cold]
92    #[inline(never)]
93    pub(super) fn error_span<T>(&mut self, span: Span, kind: SyntaxError) -> LexResult<T> {
94        Err(Error::new(span, kind))
95    }
96
97    #[cold]
98    #[inline(never)]
99    pub(super) fn emit_error(&mut self, start: BytePos, kind: SyntaxError) {
100        let span = self.span(start);
101        self.emit_error_span(span, kind)
102    }
103
104    #[cold]
105    #[inline(never)]
106    pub(super) fn emit_error_span(&mut self, span: Span, kind: SyntaxError) {
107        if self.ctx.ignore_error {
108            return;
109        }
110
111        warn!("Lexer error at {:?}", span);
112        let err = Error::new(span, kind);
113        self.errors.borrow_mut().push(err);
114    }
115
116    #[cold]
117    #[inline(never)]
118    pub(super) fn emit_strict_mode_error(&mut self, start: BytePos, kind: SyntaxError) {
119        let span = self.span(start);
120        self.emit_strict_mode_error_span(span, kind)
121    }
122
123    #[cold]
124    #[inline(never)]
125    pub(super) fn emit_strict_mode_error_span(&mut self, span: Span, kind: SyntaxError) {
126        if self.ctx.strict {
127            self.emit_error_span(span, kind);
128            return;
129        }
130
131        let err = Error::new(span, kind);
132
133        self.add_module_mode_error(err);
134    }
135
136    #[cold]
137    #[inline(never)]
138    pub(super) fn emit_module_mode_error(&mut self, start: BytePos, kind: SyntaxError) {
139        let span = self.span(start);
140        self.emit_module_mode_error_span(span, kind)
141    }
142
143    /// Some codes are valid in a strict mode script  but invalid in module
144    /// code.
145    #[cold]
146    #[inline(never)]
147    pub(super) fn emit_module_mode_error_span(&mut self, span: Span, kind: SyntaxError) {
148        let err = Error::new(span, kind);
149
150        self.add_module_mode_error(err);
151    }
152
153    /// Skip comments or whitespaces.
154    ///
155    /// See https://tc39.github.io/ecma262/#sec-white-space
156    #[inline(never)]
157    pub(super) fn skip_space<const LEX_COMMENTS: bool>(&mut self) {
158        loop {
159            let (offset, newline) = {
160                let mut skip = SkipWhitespace {
161                    input: self.input.as_str(),
162                    newline: false,
163                    offset: 0,
164                };
165
166                skip.scan();
167
168                (skip.offset, skip.newline)
169            };
170
171            self.input.bump_bytes(offset as usize);
172            if newline {
173                self.state.had_line_break = true;
174            }
175
176            if LEX_COMMENTS && self.input.is_byte(b'/') {
177                if self.peek() == Some('/') {
178                    self.skip_line_comment(2);
179                    continue;
180                } else if self.peek() == Some('*') {
181                    self.skip_block_comment();
182                    continue;
183                }
184            }
185
186            break;
187        }
188    }
189
190    #[inline(never)]
191    pub(super) fn skip_line_comment(&mut self, start_skip: usize) {
192        let start = self.cur_pos();
193        self.input.bump_bytes(start_skip);
194        let slice_start = self.cur_pos();
195
196        // foo // comment for foo
197        // bar
198        //
199        // foo
200        // // comment for bar
201        // bar
202        //
203        let is_for_next = self.state.had_line_break || !self.state.can_have_trailing_line_comment();
204
205        // Optimization: Performance improvement with byte-based termination character
206        // search
207        let input_str = self.input.as_str();
208        let bytes = input_str.as_bytes();
209        let mut idx = 0;
210        let len = bytes.len();
211
212        // Direct search for line termination characters (ASCII case optimization)
213        while idx < len {
214            let b = bytes[idx];
215            if b == b'\r' || b == b'\n' {
216                self.state.had_line_break = true;
217                break;
218            } else if b > 127 {
219                // non-ASCII case: Check for Unicode line termination characters
220                let s = &input_str[idx..];
221                if let Some(first_char) = s.chars().next() {
222                    if first_char == '\u{2028}' || first_char == '\u{2029}' {
223                        self.state.had_line_break = true;
224                        break;
225                    }
226                    idx += first_char.len_utf8() - 1; // -1은 아래 증가분 고려
227                }
228            }
229            idx += 1;
230        }
231
232        // Process until the end of string if no line termination character is found
233        if idx == len {
234            idx = len;
235        }
236
237        self.input.bump_bytes(idx);
238        let end = self.cur_pos();
239
240        // Create and process slice only if comments need to be stored
241        if let Some(comments) = self.comments_buffer.as_mut() {
242            let s = unsafe {
243                // Safety: We know that the start and the end are valid
244                self.input.slice(slice_start, end)
245            };
246            let cmt = Comment {
247                kind: CommentKind::Line,
248                span: Span::new(start, end),
249                text: self.atoms.atom(s),
250            };
251
252            if is_for_next {
253                comments.push_pending_leading(cmt);
254            } else {
255                comments.push(BufferedComment {
256                    kind: BufferedCommentKind::Trailing,
257                    pos: self.state.prev_hi,
258                    comment: cmt,
259                });
260            }
261        }
262
263        unsafe {
264            // Safety: We got end from self.input
265            self.input.reset_to(end);
266        }
267    }
268
269    /// Expects current char to be '/' and next char to be '*'.
270    #[inline(never)]
271    pub(super) fn skip_block_comment(&mut self) {
272        let start = self.cur_pos();
273
274        debug_assert_eq!(self.cur(), Some('/'));
275        debug_assert_eq!(self.peek(), Some('*'));
276
277        self.input.bump_bytes(2);
278
279        // jsdoc
280        let slice_start = self.cur_pos();
281
282        // Check if there's an asterisk at the beginning (JSDoc style)
283        let mut was_star = if self.input.is_byte(b'*') {
284            self.bump();
285            true
286        } else {
287            false
288        };
289
290        let mut is_for_next = self.state.had_line_break || !self.state.can_have_trailing_comment();
291
292        // Optimization for finding block comment end position
293        let input_str = self.input.as_str();
294        let bytes = input_str.as_bytes();
295        let mut pos = 0;
296        let len = bytes.len();
297
298        // Byte-based scanning for faster search
299        while pos < len {
300            let b = bytes[pos];
301
302            if was_star && b == b'/' {
303                // Found comment end: "*/"
304                self.input.bump_bytes(pos + 1); // 종료 '/' 포함해서 이동
305
306                let end = self.cur_pos();
307
308                self.skip_space::<false>();
309
310                // Check if this is a comment before semicolon
311                if !self.state.had_line_break && self.input.is_byte(b';') {
312                    is_for_next = false;
313                }
314
315                self.store_comment(is_for_next, start, end, slice_start);
316
317                return;
318            }
319
320            // Check for line break characters - ASCII case
321            if b == b'\r' || b == b'\n' {
322                self.state.had_line_break = true;
323            }
324            // Check for Unicode line breaks (rare case)
325            else if b > 127 {
326                let remaining = &input_str[pos..];
327                if let Some(c) = remaining.chars().next() {
328                    if c == '\u{2028}' || c == '\u{2029}' {
329                        self.state.had_line_break = true;
330                    }
331                    // Skip multibyte characters
332                    pos += c.len_utf8() - 1; // -1은 아래 증가분 고려
333                }
334            }
335
336            was_star = b == b'*';
337            pos += 1;
338        }
339
340        // If we reached here, it's an unterminated block comment
341        self.input.bump_bytes(len); // 남은 입력 건너뛰기
342        let end = self.input.end_pos();
343        let span = Span::new(end, end);
344        self.emit_error_span(span, SyntaxError::UnterminatedBlockComment)
345    }
346
347    #[inline(never)]
348    fn store_comment(
349        &mut self,
350        is_for_next: bool,
351        start: BytePos,
352        end: BytePos,
353        slice_start: BytePos,
354    ) {
355        if let Some(comments) = self.comments_buffer.as_mut() {
356            let src = unsafe {
357                // Safety: We got slice_start and end from self.input so those are valid.
358                self.input.slice(slice_start, end)
359            };
360            let s = &src[..src.len() - 2];
361            let cmt = Comment {
362                kind: CommentKind::Block,
363                span: Span::new(start, end),
364                text: self.atoms.atom(s),
365            };
366
367            let _ = self.input.peek();
368            if is_for_next {
369                comments.push_pending_leading(cmt);
370            } else {
371                comments.push(BufferedComment {
372                    kind: BufferedCommentKind::Trailing,
373                    pos: self.state.prev_hi,
374                    comment: cmt,
375                });
376            }
377        }
378    }
379}
380
381/// Implemented for `char`.
382pub trait CharExt: Copy {
383    fn to_char(self) -> Option<char>;
384
385    /// Test whether a given character code starts an identifier.
386    ///
387    /// https://tc39.github.io/ecma262/#prod-IdentifierStart
388    #[inline]
389    fn is_ident_start(self) -> bool {
390        let c = match self.to_char() {
391            Some(c) => c,
392            None => return false,
393        };
394        Ident::is_valid_start(c)
395    }
396
397    /// Test whether a given character is part of an identifier.
398    #[inline]
399    fn is_ident_part(self) -> bool {
400        let c = match self.to_char() {
401            Some(c) => c,
402            None => return false,
403        };
404        Ident::is_valid_continue(c)
405    }
406
407    /// See https://tc39.github.io/ecma262/#sec-line-terminators
408    #[inline]
409    fn is_line_terminator(self) -> bool {
410        let c = match self.to_char() {
411            Some(c) => c,
412            None => return false,
413        };
414        matches!(c, '\r' | '\n' | '\u{2028}' | '\u{2029}')
415    }
416
417    /// See https://tc39.github.io/ecma262/#sec-literals-string-literals
418    #[inline]
419    fn is_line_break(self) -> bool {
420        let c = match self.to_char() {
421            Some(c) => c,
422            None => return false,
423        };
424        matches!(c, '\r' | '\n')
425    }
426
427    /// See https://tc39.github.io/ecma262/#sec-white-space
428    #[inline]
429    fn is_ws(self) -> bool {
430        let c = match self.to_char() {
431            Some(c) => c,
432            None => return false,
433        };
434        match c {
435            '\u{0009}' | '\u{000b}' | '\u{000c}' | '\u{0020}' | '\u{00a0}' | '\u{feff}' => true,
436            _ => {
437                if self.is_line_terminator() {
438                    // NOTE: Line terminator is not whitespace.
439                    false
440                } else {
441                    c.is_whitespace()
442                }
443            }
444        }
445    }
446}
447
448impl CharExt for Char {
449    #[inline(always)]
450    fn to_char(self) -> Option<char> {
451        char::from_u32(self.0)
452    }
453}
454
455impl CharExt for char {
456    #[inline(always)]
457    fn to_char(self) -> Option<char> {
458        Some(self)
459    }
460}
swc_ecma_parser/lexer/util.rs

swc_ecma_parser/lexer/
util.rs