Skip to main content

bynk_syntax/
lexer.rs

1//! Lexer for Bynk v0.
2//!
3//! Token kinds correspond to the terminals defined in the grammar (spec §3
4//! and §4). Whitespace is skipped; line comments are emitted as `Comment`
5//! tokens so the formatter can preserve them through round-trips (v1.1 LSP
6//! spec §3.5). Doc blocks (`---`) are emitted as `DocBlock` tokens, lexed
7//! outside of logos (see [`tokenize`]).
8
9use logos::Logos;
10
11use crate::error::CompileError;
12use crate::span::Span;
13
14/// Token kinds. Discriminants without payload data; the lexeme is recovered
15/// from the source string via the token's [`Span`].
16///
17/// Note: `--` line comments and `---` doc block markers are handled outside
18/// logos (see [`tokenize`]), because doc blocks are delimited by `---` lines
19/// containing only the marker and may span multiple source lines.
20#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
21#[logos(skip r"[ \t\r\n]+")]
22pub enum TokenKind {
23    // Keywords
24    #[token("commons")]
25    Commons,
26    #[token("type")]
27    Type,
28    #[token("fn")]
29    Fn,
30    #[token("where")]
31    Where,
32    #[token("and")]
33    And,
34    #[token("true")]
35    True,
36    #[token("false")]
37    False,
38    #[token("Int")]
39    Int,
40    #[token("String")]
41    String,
42    #[token("Bool")]
43    Bool,
44    // v0.21 keyword
45    #[token("Float")]
46    Float,
47    // v0.86 keyword (ADR 0112): the `Duration` base type.
48    #[token("Duration")]
49    Duration,
50    // v0.90 keyword (ADR 0114): the `Instant` base type.
51    #[token("Instant")]
52    Instant,
53    // v0.110 keyword (ADR 0142): the `Bytes` base type.
54    #[token("Bytes")]
55    Bytes,
56    // v0.1 keywords
57    #[token("let")]
58    Let,
59    #[token("if")]
60    If,
61    #[token("else")]
62    Else,
63    #[token("Ok")]
64    Ok,
65    #[token("Err")]
66    Err,
67    #[token("Result")]
68    Result,
69    #[token("ValidationError")]
70    ValidationError,
71    // v0.22b keyword
72    #[token("JsonError")]
73    JsonError,
74    // v0.2 keywords
75    #[token("enum")]
76    Enum,
77    #[token("match")]
78    Match,
79    #[token("Option")]
80    Option,
81    #[token("record")]
82    Record,
83    #[token("self")]
84    Self_,
85    #[token("Some")]
86    Some,
87    #[token("None")]
88    None,
89    #[token("is")]
90    Is,
91    // v0.3 keywords
92    #[token("opaque")]
93    Opaque,
94    #[token("uses")]
95    Uses,
96    // v0.4 keywords
97    #[token("context")]
98    Context,
99    #[token("consumes")]
100    Consumes,
101    #[token("exports")]
102    Exports,
103    #[token("transparent")]
104    Transparent,
105    // v0.6 keywords
106    #[token("as")]
107    As,
108    // v0.7 keywords (v0.112: `assert`→`expect`, `test`→`suite`/`case`)
109    #[token("expect")]
110    Expect,
111    #[token("mocks")]
112    Mocks,
113    #[token("suite")]
114    Suite,
115    #[token("case")]
116    Case,
117    // v0.114 keyword — generative tests (testing track slice 2). `for` and `all`
118    // are deliberately *not* keywords: `all` is a list combinator (`all(xs, p)`)
119    // and must stay a usable identifier. The `for all` binder is parsed
120    // contextually (two identifiers) inside a `property` body instead.
121    #[token("property")]
122    Property,
123    // v0.16 keyword
124    #[token("wires")]
125    Wires,
126    // v0.17 keywords
127    #[token("adapter")]
128    Adapter,
129    #[token("binding")]
130    Binding,
131    // v0.5 keywords
132    #[token("agent")]
133    Agent,
134    #[token("capability")]
135    Capability,
136    #[token("Effect")]
137    Effect,
138    #[token("given")]
139    Given,
140    #[token("on")]
141    On,
142    // v0.9 keyword
143    #[token("http")]
144    Http,
145    // v0.10a keyword
146    #[token("cron")]
147    Cron,
148    // v0.10b keyword
149    #[token("queue")]
150    Queue,
151    // v0.44 keywords: `from` heads a service's protocol clause; `protocol` is
152    // reserved (protocols are a closed, compiler-known set — no declaration kind).
153    #[token("from")]
154    From,
155    #[token("protocol")]
156    Protocol,
157    #[token("provides")]
158    Provides,
159    #[token("service")]
160    Service,
161    // v0.45 keywords: `actor` heads a boundary-contract declaration; `by`
162    // heads a handler's actor clause.
163    #[token("actor")]
164    Actor,
165    #[token("by")]
166    By,
167    // v0.80 keywords: `invariant` heads an agent invariant declaration; `implies`
168    // is the directional logical-implication operator (`P implies Q` ≡ `!P || Q`).
169    #[token("invariant")]
170    Invariant,
171    #[token("implies")]
172    Implies,
173    /// `...` — used in record-spread expressions (v0.5).
174    #[token("...")]
175    DotDotDot,
176    /// `<-` — Effect bind operator (v0.5).
177    #[token("<-")]
178    LArrow,
179    /// `~>` — asynchronous fire-and-forget send marker (v0.79). A leading
180    /// statement marker, never on the RHS of a `let`; distinct from `<-` so the
181    /// call site shows whether the caller waits.
182    #[token("~>")]
183    TildeArrow,
184    /// `:=` — Cell write (v0.81, storage track). A handler statement
185    /// `cell := expr`; distinct from `=` (binding) and `:` (annotation). Longer
186    /// than `:`/`=` so logos matches it as one token.
187    #[token(":=")]
188    ColonEq,
189
190    /// A documentation block: `---` line ... `---` line. The token's span
191    /// covers the full block including both `---` markers. The body content
192    /// is recovered from the source via the span (see [`doc_block_content`]).
193    /// Inserted by [`tokenize`]; not lexed by logos directly.
194    DocBlock,
195
196    /// A line comment: `-- ...` running to end of line. The span starts at
197    /// the `--` marker and runs through the last character before the
198    /// terminating newline (exclusive). The trivia body (the text after the
199    /// `--` marker) is recovered from the source via the span. Inserted by
200    /// [`tokenize`]; not lexed by logos directly so it cannot be mistaken
201    /// for an `--` operator sequence.
202    Comment,
203
204    // Identifier
205    #[regex(r"[A-Za-z][A-Za-z0-9_]*")]
206    Ident,
207
208    // Literals
209    #[regex(r"[0-9]+")]
210    IntLit,
211    // A float literal: fraction with a digit on both sides of the `.`, an
212    // exponent, or both (v0.21 §3). `1.` and `.5` are NOT float literals —
213    // the digit-both-sides rule keeps `2.5.round()` / `1.toFloat()` lexing
214    // as method calls on numeric literals.
215    #[regex(r"[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?|[0-9]+[eE][+-]?[0-9]+")]
216    FloatLit,
217    // A double-quoted string with simple escapes. The body excludes the closing
218    // quote; we accept any non-quote/non-backslash/non-newline char, or a
219    // backslash followed by one of the four allowed escapes.
220    #[regex(r#""([^"\\\n]|\\[nt"\\])*""#)]
221    StrLit,
222    // An interpolated string `"… \(expr) …"` (v0.43). Hand-scanned in
223    // `tokenize` (logos cannot balance the holes' parens), never produced by
224    // the logos lexer — like [`TokenKind::DocBlock`]/[`TokenKind::Comment`].
225    // The span covers the whole `"…"`; the parser splits chunks from holes.
226    InterpStr,
227
228    // Multi-char operators
229    #[token("->")]
230    Arrow,
231    #[token("==")]
232    EqEq,
233    #[token("!=")]
234    BangEq,
235    #[token("<=")]
236    LtEq,
237    #[token(">=")]
238    GtEq,
239    #[token("&&")]
240    AmpAmp,
241    #[token("||")]
242    PipePipe,
243
244    // Single-char operators
245    #[token("+")]
246    Plus,
247    #[token("-")]
248    Minus,
249    #[token("*")]
250    Star,
251    #[token("/")]
252    Slash,
253    #[token("!")]
254    Bang,
255    #[token("=")]
256    Eq,
257    #[token("<")]
258    Lt,
259    #[token(">")]
260    Gt,
261    // v0.1 postfix operator
262    #[token("?")]
263    Question,
264    // v0.2 match-arm arrow
265    #[token("=>")]
266    FatArrow,
267    // v0.2 wildcard pattern (also valid as identifier start; the lexer
268    // prefers identifier for any longer match, so `_foo` is still Ident).
269    #[token("_")]
270    Underscore,
271    // v0.2 sum-type variant separator (also used as future bitwise OR);
272    // single `|` distinct from `||`.
273    #[token("|")]
274    Pipe,
275    /// `@` — storage-annotation marker (v0.85, storage track; ADR 0111). Leads a
276    /// `@name(args)` annotation on a `store` field (`@ttl(…)`/`@indexed(…)`); it
277    /// appears only in store-field-declaration position, never as an expression
278    /// operator.
279    #[token("@")]
280    At,
281
282    // Punctuation
283    #[token("(")]
284    LParen,
285    #[token(")")]
286    RParen,
287    #[token("{")]
288    LBrace,
289    #[token("}")]
290    RBrace,
291    #[token("[")]
292    LBracket,
293    #[token("]")]
294    RBracket,
295    #[token(",")]
296    Comma,
297    #[token(":")]
298    Colon,
299    #[token(".")]
300    Dot,
301}
302
303impl TokenKind {
304    /// Human-readable display name for diagnostics.
305    pub fn describe(self) -> &'static str {
306        use TokenKind::*;
307        match self {
308            Commons => "`commons`",
309            Type => "`type`",
310            Fn => "`fn`",
311            Where => "`where`",
312            And => "`and`",
313            True => "`true`",
314            False => "`false`",
315            Int => "`Int`",
316            String => "`String`",
317            Bool => "`Bool`",
318            Float => "`Float`",
319            Duration => "`Duration`",
320            Instant => "`Instant`",
321            Bytes => "`Bytes`",
322            Let => "`let`",
323            If => "`if`",
324            Else => "`else`",
325            Ok => "`Ok`",
326            Err => "`Err`",
327            Result => "`Result`",
328            ValidationError => "`ValidationError`",
329            JsonError => "`JsonError`",
330            Enum => "`enum`",
331            Match => "`match`",
332            Option => "`Option`",
333            Record => "`record`",
334            Self_ => "`self`",
335            Some => "`Some`",
336            None => "`None`",
337            Is => "`is`",
338            Opaque => "`opaque`",
339            Uses => "`uses`",
340            Context => "`context`",
341            Consumes => "`consumes`",
342            Exports => "`exports`",
343            Transparent => "`transparent`",
344            As => "`as`",
345            Expect => "`expect`",
346            Mocks => "`mocks`",
347            Suite => "`suite`",
348            Case => "`case`",
349            Property => "`property`",
350            Wires => "`wires`",
351            Adapter => "`adapter`",
352            Binding => "`binding`",
353            Agent => "`agent`",
354            Capability => "`capability`",
355            Effect => "`Effect`",
356            Given => "`given`",
357            On => "`on`",
358            Http => "`http`",
359            Cron => "`cron`",
360            Queue => "`queue`",
361            From => "`from`",
362            Protocol => "`protocol`",
363            Provides => "`provides`",
364            Service => "`service`",
365            Actor => "`actor`",
366            By => "`by`",
367            Invariant => "`invariant`",
368            Implies => "`implies`",
369            ColonEq => "`:=`",
370            DotDotDot => "`...`",
371            LArrow => "`<-`",
372            TildeArrow => "`~>`",
373            DocBlock => "documentation block",
374            Comment => "line comment",
375            Ident => "identifier",
376            IntLit => "integer literal",
377            FloatLit => "float literal",
378            StrLit => "string literal",
379            InterpStr => "interpolated string",
380            Arrow => "`->`",
381            EqEq => "`==`",
382            BangEq => "`!=`",
383            LtEq => "`<=`",
384            GtEq => "`>=`",
385            AmpAmp => "`&&`",
386            PipePipe => "`||`",
387            Plus => "`+`",
388            Minus => "`-`",
389            Star => "`*`",
390            Slash => "`/`",
391            Bang => "`!`",
392            Eq => "`=`",
393            Lt => "`<`",
394            Gt => "`>`",
395            Question => "`?`",
396            FatArrow => "`=>`",
397            Underscore => "`_`",
398            Pipe => "`|`",
399            At => "`@`",
400            LParen => "`(`",
401            RParen => "`)`",
402            LBrace => "`{`",
403            RBrace => "`}`",
404            LBracket => "`[`",
405            RBracket => "`]`",
406            Comma => "`,`",
407            Colon => "`:`",
408            Dot => "`.`",
409        }
410    }
411}
412
413/// A token plus its source span.
414#[derive(Debug, Clone, Copy)]
415pub struct Token {
416    pub kind: TokenKind,
417    pub span: Span,
418}
419
420/// Tokenise a source string. Returns the full token vector or the first
421/// lexical error.
422///
423/// Doc blocks (`---` ... `---`) and line comments (`-- ...`) are recognised
424/// outside the logos-generated lexer: we scan the source one segment at a
425/// time, dispatching to logos for ordinary tokens between non-token spans.
426pub fn tokenize(source: &str) -> Result<Vec<Token>, CompileError> {
427    let mut tokens = Vec::new();
428    let bytes = source.as_bytes();
429    let mut pos = 0;
430    while pos < bytes.len() {
431        // Detect a `---` doc-block marker at the start of a line (the line may
432        // begin with leading whitespace; the marker itself must be alone on
433        // its line).
434        if let Some(open_end) = doc_block_open_at(source, pos) {
435            // Find the matching closing `---` line.
436            match doc_block_close(source, open_end) {
437                Some((close_start, close_end)) => {
438                    let span = Span::new(pos, close_end);
439                    tokens.push(Token {
440                        kind: TokenKind::DocBlock,
441                        span,
442                    });
443                    let _ = close_start;
444                    pos = close_end;
445                    continue;
446                }
447                None => {
448                    return Err(CompileError::new(
449                        "bynk.lex.unclosed_doc_block",
450                        Span::new(pos, open_end),
451                        "documentation block opened but never closed",
452                    )
453                    .with_note(
454                        "a doc block must be terminated by another `---` on a line by itself",
455                    ));
456                }
457            }
458        }
459        // A `--` line comment: emit a `Comment` token covering everything
460        // up to (but not including) the terminating newline. Doc-block
461        // detection above already ruled out a `---` marker at line start
462        // — and once we've consumed past the leading `--`, any further
463        // dashes are part of the comment body. Preserving comments as
464        // trivia tokens lets the parser attach them to declarations so
465        // the formatter can emit them in place (v1.1 LSP spec §3.5).
466        if pos + 1 < bytes.len() && bytes[pos] == b'-' && bytes[pos + 1] == b'-' {
467            let start = pos;
468            while pos < bytes.len() && bytes[pos] != b'\n' {
469                pos += 1;
470            }
471            tokens.push(Token {
472                kind: TokenKind::Comment,
473                span: Span::new(start, pos),
474            });
475            continue;
476        }
477        // Skip ordinary whitespace inline (logos handles it too, but we may
478        // be in the middle of the source between specials).
479        if matches!(bytes[pos], b' ' | b'\t' | b'\r' | b'\n') {
480            pos += 1;
481            continue;
482        }
483        // An interpolated string `"… \(expr) …"` (v0.43): only strings that
484        // actually contain a `\(` hole are hand-scanned here; plain strings
485        // fall through to the logos `StrLit` path unchanged. `\(` is an
486        // invalid escape in the logos grammar, so this never re-routes a
487        // currently-valid literal.
488        if bytes[pos] == b'"' && has_interp_hole(bytes, pos) {
489            let end = scan_str(bytes, source, pos)?;
490            tokens.push(Token {
491                kind: TokenKind::InterpStr,
492                span: Span::new(pos, end),
493            });
494            pos = end;
495            continue;
496        }
497        // Otherwise dispatch a single logos token starting at `pos`.
498        let mut lex = TokenKind::lexer(&source[pos..]);
499        let Some(result) = lex.next() else {
500            // No token at this position; treat as unexpected character so
501            // the user sees something useful.
502            let ch = source[pos..].chars().next().unwrap_or('\0');
503            let span = Span::new(pos, pos + ch.len_utf8());
504            return Err(CompileError::new(
505                "bynk.lex.unexpected_character",
506                span,
507                format!("unexpected character `{ch}`"),
508            ));
509        };
510        let local = lex.span();
511        let span: Span = Span::new(pos + local.start, pos + local.end);
512        match result {
513            Ok(kind) => {
514                if kind == TokenKind::IntLit {
515                    let slice = &source[span.range()];
516                    if slice.parse::<i64>().is_err() {
517                        return Err(CompileError::new(
518                            "bynk.lex.integer_overflow",
519                            span,
520                            format!(
521                                "integer literal `{slice}` is out of range for a 64-bit signed integer"
522                            ),
523                        )
524                        .with_note("the range is -2^63 to 2^63 - 1"));
525                    }
526                }
527                if kind == TokenKind::FloatLit {
528                    let slice = &source[span.range()];
529                    match slice.parse::<f64>() {
530                        Ok(v) if v.is_finite() => {}
531                        _ => {
532                            return Err(CompileError::new(
533                                "bynk.lex.float_literal_overflow",
534                                span,
535                                format!(
536                                    "float literal `{slice}` is out of range for a 64-bit float"
537                                ),
538                            )
539                            .with_note(
540                                "the literal does not fit a finite IEEE 754 double; \
541                                 the largest finite value is ~1.8e308",
542                            ));
543                        }
544                    }
545                }
546                tokens.push(Token { kind, span });
547                pos = span.end;
548            }
549            Err(()) => {
550                let slice = &source[span.range()];
551                let ch = slice.chars().next().unwrap_or('\0');
552                let err = if ch == '"' {
553                    CompileError::new(
554                        "bynk.lex.unterminated_string",
555                        span,
556                        "unterminated string literal",
557                    )
558                    .with_note(
559                        "string literals must close with `\"` on the same line; \
560                         supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`",
561                    )
562                } else {
563                    CompileError::new(
564                        "bynk.lex.unexpected_character",
565                        span,
566                        format!("unexpected character `{ch}`"),
567                    )
568                };
569                return Err(err);
570            }
571        }
572    }
573    Ok(tokens)
574}
575
/// Cheap routing pre-scan (v0.43): does the string opening at `start` contain
/// a `\(` interpolation hole before it closes or its line ends?
///
/// `tokenize` uses the answer to decide between hand-scanning the string as an
/// `InterpStr` and deferring to logos for a plain `StrLit`. The scan is
/// deliberately tolerant: a malformed string that does contain a hole still
/// returns `true`, so the hole-aware scanner gets to report the precise error.
///
/// `bytes[start]` is expected to be the opening `"`. Escaped characters are
/// skipped as a pair so that `\"` does not end the scan. A backslash that is
/// the last byte of the line (or of the input) ends the scan with `false`:
/// the string cannot continue past the line, so a `\(` on the following line
/// must not influence how this one is lexed.
fn has_interp_hole(bytes: &[u8], start: usize) -> bool {
    let mut i = start + 1;
    while let Some(&byte) = bytes.get(i) {
        match byte {
            b'\n' | b'"' => return false,
            b'\\' => match bytes.get(i + 1) {
                Some(b'(') => return true,
                // Never step over the newline: the line, and with it the
                // string, ends here.
                Some(b'\n') | None => return false,
                Some(_) => i += 2,
            },
            _ => i += 1,
        }
    }
    false
}
597
598/// Scan a double-quoted string starting at `start` (the opening `"`), returning
599/// the byte offset just past the closing `"`. Recognises the four simple
600/// escapes plus `\(…)` interpolation holes, whose parens are balanced (and
601/// whose nested strings are skipped) by [`scan_hole`]. (v0.43.)
602fn scan_str(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
603    debug_assert_eq!(bytes[start], b'"');
604    let mut i = start + 1;
605    loop {
606        if i >= bytes.len() || bytes[i] == b'\n' {
607            return Err(CompileError::new(
608                "bynk.lex.unterminated_string",
609                Span::new(start, i.min(bytes.len())),
610                "unterminated string literal",
611            )
612            .with_note(
613                "string literals must close with `\"` on the same line; \
614                 supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`, and `\\(…)` interpolation",
615            ));
616        }
617        match bytes[i] {
618            b'"' => return Ok(i + 1),
619            b'\\' => match bytes.get(i + 1) {
620                Some(b'n' | b't' | b'"' | b'\\') => i += 2,
621                Some(b'(') => i = scan_hole(bytes, source, i + 2)?,
622                other => {
623                    let shown = other.map(|b| (*b as char).to_string()).unwrap_or_default();
624                    return Err(CompileError::new(
625                        "bynk.lex.bad_escape",
626                        Span::new(i, (i + 2).min(bytes.len())),
627                        format!("invalid escape sequence `\\{shown}` in string literal"),
628                    )
629                    .with_note("supported escapes: \\n \\t \\\" \\\\ \\(…)"));
630                }
631            },
632            // Any other byte advances one position. UTF-8 continuation bytes
633            // are all >= 0x80, so they never collide with the ASCII specials.
634            _ => i += 1,
635        }
636    }
637}
638
639/// Scan an interpolation hole body. `start` points just past the `\(`; returns
640/// the offset just past the matching `)`. Tracks paren depth and skips nested
641/// strings (whose own parens must not close the hole), recursing through
642/// [`scan_str`] so nested interpolation nests correctly. (v0.43.)
643fn scan_hole(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
644    let mut i = start;
645    let mut depth = 1usize;
646    loop {
647        if i >= bytes.len() || bytes[i] == b'\n' {
648            return Err(CompileError::new(
649                "bynk.lex.unterminated_interpolation",
650                Span::new(start.saturating_sub(2), i.min(bytes.len())),
651                "unterminated interpolation hole",
652            )
653            .with_note(
654                "an interpolation hole `\\(…)` must close with a matching `)` on the same line",
655            ));
656        }
657        match bytes[i] {
658            b'(' => {
659                depth += 1;
660                i += 1;
661            }
662            b')' => {
663                depth -= 1;
664                i += 1;
665                if depth == 0 {
666                    return Ok(i);
667                }
668            }
669            b'"' => i = scan_str(bytes, source, i)?,
670            _ => i += 1,
671        }
672    }
673}
674
675/// One segment of a split interpolated string (v0.43): literal text (escapes
676/// resolved) or the absolute source span of a hole's expression (the bytes
677/// between `\(` and its matching `)`). The parser turns the latter into a real
678/// `Expr`; the lexer owns only the scanning.
679pub(crate) enum InterpSegment {
680    Chunk(String),
681    Hole(Span),
682}
683
684/// Split an `InterpStr` token (its `span` covers the whole `"…"`) into chunks
685/// and hole spans. Escapes in the chunks are resolved here (mirroring
686/// [`parse_string_literal`]); holes are returned as spans for the parser to
687/// re-lex and parse as expressions. (v0.43.)
688pub(crate) fn split_interp(source: &str, span: Span) -> Result<Vec<InterpSegment>, CompileError> {
689    let bytes = source.as_bytes();
690    let inner_end = span.end - 1; // the closing `"`
691    let mut segments = Vec::new();
692    let mut chunk = String::new();
693    let mut i = span.start + 1; // past the opening `"`
694    while i < inner_end {
695        match bytes[i] {
696            b'\\' => match bytes[i + 1] {
697                b'n' => {
698                    chunk.push('\n');
699                    i += 2;
700                }
701                b't' => {
702                    chunk.push('\t');
703                    i += 2;
704                }
705                b'"' => {
706                    chunk.push('"');
707                    i += 2;
708                }
709                b'\\' => {
710                    chunk.push('\\');
711                    i += 2;
712                }
713                b'(' => {
714                    if !chunk.is_empty() {
715                        segments.push(InterpSegment::Chunk(std::mem::take(&mut chunk)));
716                    }
717                    let hole_start = i + 2;
718                    let after = scan_hole(bytes, source, hole_start)?;
719                    // `after` is one past the matching `)`; the hole body is
720                    // everything up to that `)`.
721                    segments.push(InterpSegment::Hole(Span::new(hole_start, after - 1)));
722                    i = after;
723                }
724                // The lexer already validated every escape, so nothing else
725                // can appear here.
726                other => unreachable!("unvalidated escape `\\{}` in InterpStr", other as char),
727            },
728            _ => {
729                let ch = source[i..].chars().next().unwrap();
730                chunk.push(ch);
731                i += ch.len_utf8();
732            }
733        }
734    }
735    if !chunk.is_empty() {
736        segments.push(InterpSegment::Chunk(chunk));
737    }
738    Ok(segments)
739}
740
741/// If a `---` doc-block marker line starts at or shortly after `pos` (which
742/// must be at a line boundary), return the byte offset just past the marker
743/// line (after the terminating newline, or at EOF). The doc-block grammar
744/// requires the marker to be alone on its line; leading horizontal whitespace
745/// is allowed and ignored.
746fn doc_block_open_at(source: &str, pos: usize) -> Option<usize> {
747    let bytes = source.as_bytes();
748    if !at_line_start(source, pos) {
749        return None;
750    }
751    // Skip leading horizontal whitespace.
752    let mut i = pos;
753    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
754        i += 1;
755    }
756    if i + 3 > bytes.len() {
757        return None;
758    }
759    if &bytes[i..i + 3] != b"---" {
760        return None;
761    }
762    i += 3;
763    // The marker may have additional trailing dashes (per spec "three or more
764    // consecutive hyphens"). Consume them.
765    while i < bytes.len() && bytes[i] == b'-' {
766        i += 1;
767    }
768    // After the dashes, allow only horizontal whitespace then newline/EOF.
769    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t' || bytes[i] == b'\r') {
770        i += 1;
771    }
772    if i == bytes.len() {
773        return Some(i);
774    }
775    if bytes[i] == b'\n' {
776        return Some(i + 1);
777    }
778    None
779}
780
781/// Find the next closing `---` line at or after `pos`. Returns
782/// `(start_of_line, end_of_line)` (`end_of_line` is just past the
783/// terminating newline, or at EOF).
784fn doc_block_close(source: &str, mut pos: usize) -> Option<(usize, usize)> {
785    let bytes = source.as_bytes();
786    while pos < bytes.len() {
787        // Advance pos to the start of a line.
788        let line_start = pos;
789        // Find the end of this line.
790        let mut line_end = line_start;
791        while line_end < bytes.len() && bytes[line_end] != b'\n' {
792            line_end += 1;
793        }
794        // Check this line.
795        if let Some(end) = doc_block_open_at(source, line_start) {
796            return Some((line_start, end));
797        }
798        // Move to the next line.
799        pos = if line_end < bytes.len() {
800            line_end + 1
801        } else {
802            line_end
803        };
804    }
805    None
806}
807
/// Reports whether byte offset `pos` is the first column of a line: either
/// the very start of `source` or the position right after a `\n`.
fn at_line_start(source: &str, pos: usize) -> bool {
    pos == 0 || source.as_bytes()[pos - 1] == b'\n'
}
816
817/// Extract the body content of a doc-block token from its source span.
818/// Strips the leading and trailing `---` marker lines and returns the body
819/// verbatim. If every non-empty content line begins with the same horizontal
820/// whitespace prefix (e.g., because the doc block sits inside a brace-form
821/// commons body), that common prefix is removed so the body reads naturally
822/// when emitted as JSDoc.
823pub fn doc_block_content(source: &str, span: Span) -> String {
824    let slice = &source[span.range()];
825    // Drop the first line (opening marker).
826    let after_open = match slice.find('\n') {
827        Some(i) => &slice[i + 1..],
828        None => return String::new(),
829    };
830    let bytes = after_open.as_bytes();
831    // Trim the trailing closing-marker line.
832    let mut i = bytes.len();
833    if i > 0 && bytes[i - 1] == b'\n' {
834        i -= 1;
835    }
836    while i > 0 && matches!(bytes[i - 1], b' ' | b'\t' | b'\r') {
837        i -= 1;
838    }
839    while i > 0 && bytes[i - 1] == b'-' {
840        i -= 1;
841    }
842    if i > 0 && bytes[i - 1] == b'\n' {
843        i -= 1;
844    }
845    let body = &after_open[..i];
846
847    // Compute the common leading-whitespace prefix across all non-empty lines
848    // and strip it. This lets writers indent the doc block alongside the
849    // declaration it documents without bleeding the indent into the JSDoc.
850    let common: Option<usize> = body
851        .lines()
852        .filter(|l| !l.trim().is_empty())
853        .map(|l| l.bytes().take_while(|&b| b == b' ' || b == b'\t').count())
854        .min();
855    let strip = common.unwrap_or(0);
856    if strip == 0 {
857        return body.to_string();
858    }
859    let mut out = String::with_capacity(body.len());
860    let mut first = true;
861    for line in body.lines() {
862        if !first {
863            out.push('\n');
864        }
865        first = false;
866        if line.trim().is_empty() {
867            // Preserve blank lines.
868            continue;
869        }
870        let leading: usize = line
871            .bytes()
872            .take_while(|&b| b == b' ' || b == b'\t')
873            .count();
874        let drop = strip.min(leading);
875        out.push_str(&line[drop..]);
876    }
877    out
878}
879
880/// Extract the body of a `Comment` trivia token: everything after the
881/// leading `--` marker, preserving its inline whitespace verbatim. Used by
882/// the parser when attaching comments to declarations.
883pub fn comment_body(source: &str, span: Span) -> &str {
884    let slice = &source[span.range()];
885    // Strip leading "--" if present (defensive — the lexer always emits
886    // Comment tokens whose span begins with `--`).
887    slice.strip_prefix("--").unwrap_or(slice)
888}
889
/// Returns true if a blank line (a line containing only whitespace) separates
/// byte offset `from` from byte offset `to` in `source`. The half-open byte
/// range `from..to` is scanned. Used by the parser to detect orphan doc
/// blocks.
///
/// A doc-block token's span ends just past the closing-marker line's
/// terminating newline. So if the next declaration begins on the immediately
/// following line, the substring between contains no newline (only optional
/// indentation). Any newline in the substring therefore implies at least one
/// entirely-blank line separating the doc from the declaration.
///
/// The scan stops at the first byte that is not a space, tab, or carriage
/// return: a newline there means `true`, anything else means `false`. An
/// empty or inverted range yields `false`, and offsets past the end of
/// `source` are clamped to its length instead of panicking.
pub fn has_blank_line_between(source: &str, from: usize, to: usize) -> bool {
    let bytes = source.as_bytes();
    let end = to.min(bytes.len());
    if end <= from {
        return false;
    }
    // The first byte that is not horizontal whitespace decides the answer.
    bytes[from..end]
        .iter()
        .find(|&&b| !matches!(b, b' ' | b'\t' | b'\r'))
        == Some(&b'\n')
}
916
#[cfg(test)]
mod tests {
    use super::*;

    /// Lex `source` and keep only the kind of each token. Panics if lexing
    /// fails.
    fn kinds(source: &str) -> Vec<TokenKind> {
        let tokens = tokenize(source).unwrap();
        tokens.iter().map(|token| token.kind).collect()
    }

    /// Assert that `source` lexes to exactly the `expected` token kinds.
    #[track_caller]
    fn assert_kinds(source: &str, expected: &[TokenKind]) {
        assert_eq!(kinds(source).as_slice(), expected);
    }

    /// Assert that lexing `source` fails with the given error category.
    #[track_caller]
    fn assert_lex_error(source: &str, category: &str) {
        let err = tokenize(source).unwrap_err();
        assert_eq!(err.category, category);
    }

    #[test]
    fn keywords_and_idents() {
        use TokenKind::*;
        assert_kinds(
            "commons type fn where and true false Int String Bool foo bar",
            &[
                Commons, Type, Fn, Where, And, True, False, Int, String, Bool, Ident, Ident,
            ],
        );
    }

    #[test]
    fn integer_and_string_literals() {
        use TokenKind::*;
        assert_kinds(
            r#"0 42 "hello" "with\nescape""#,
            &[IntLit, IntLit, StrLit, StrLit],
        );
    }

    #[test]
    fn operators() {
        use TokenKind::*;
        assert_kinds(
            "-> == != <= >= && || + - * / ! = < > ( ) { } [ ] , : . @",
            &[
                Arrow, EqEq, BangEq, LtEq, GtEq, AmpAmp, PipePipe, Plus, Minus, Star, Slash, Bang,
                Eq, Lt, Gt, LParen, RParen, LBrace, RBrace, LBracket, RBracket, Comma, Colon, Dot,
                At,
            ],
        );
    }

    #[test]
    fn line_comments_emitted_as_trivia() {
        // v1.1: a line comment survives lexing as a Comment token, which is
        // what lets the formatter attach and re-emit it.
        use TokenKind::*;
        assert_kinds(
            "-- a comment\ntype X = Int -- trailing\n",
            &[Comment, Type, Ident, Eq, Int, Comment],
        );
    }

    #[test]
    fn comment_body_extracts_text_after_marker() {
        let src = "-- hello world\n";
        let tokens = tokenize(src).unwrap();
        assert_eq!(tokens.len(), 1);
        let only = &tokens[0];
        assert_eq!(only.kind, TokenKind::Comment);
        assert_eq!(comment_body(src, only.span), " hello world");
    }

    #[test]
    fn comment_does_not_consume_newline() {
        // Two comment lines in a row must come out as two separate tokens:
        // the newline between them belongs to neither comment's span.
        let tokens = tokenize("-- one\n-- two\n").unwrap();
        assert_eq!(tokens.len(), 2);
        for token in tokens.iter() {
            assert!(token.kind == TokenKind::Comment);
        }
    }

    #[test]
    fn unterminated_string_is_error() {
        assert_lex_error("\"oops\n", "bynk.lex.unterminated_string");
    }

    #[test]
    fn integer_overflow_is_error() {
        assert_lex_error("99999999999999999999", "bynk.lex.integer_overflow");
    }

    #[test]
    fn unexpected_character_is_error() {
        assert_lex_error("type X = Int $", "bynk.lex.unexpected_character");
    }

    #[test]
    fn v0_1_keywords() {
        use TokenKind::*;
        assert_kinds(
            "let if else Ok Err Result ValidationError",
            &[Let, If, Else, Ok, Err, Result, ValidationError],
        );
    }

    #[test]
    fn question_token() {
        use TokenKind::*;
        assert_kinds("x?", &[Ident, Question]);
    }

    #[test]
    fn v0_2_keywords() {
        use TokenKind::*;
        assert_kinds(
            "enum match Option record self Some None is",
            &[Enum, Match, Option, Record, Self_, Some, None, Is],
        );
    }

    #[test]
    fn pipe_and_pipe_pipe_disambiguated() {
        use TokenKind::*;
        assert_kinds("| || |", &[Pipe, PipePipe, Pipe]);
    }

    #[test]
    fn v0_7_keywords() {
        use TokenKind::*;
        assert_kinds("expect mocks suite case", &[Expect, Mocks, Suite, Case]);
    }

    #[test]
    fn fat_arrow_and_underscore() {
        use TokenKind::*;
        assert_kinds("_ =>", &[Underscore, FatArrow]);
    }

    // -- v0.43 string interpolation --

    #[test]
    fn interp_string_is_one_token() {
        use TokenKind::*;
        assert_kinds(r#""Hello, \(name)!""#, &[InterpStr]);
        // Without a hole the string is still a plain `StrLit` (logos path).
        assert_kinds(r#""Hello, world""#, &[StrLit]);
    }

    #[test]
    fn interp_balances_nested_parens_and_strings() {
        use TokenKind::*;
        // The `)` that closes `f(x)` must not end the hole early.
        assert_kinds(r#""= \(f(x))""#, &[InterpStr]);
        // Neither must a `)` that sits inside a string nested in the hole.
        assert_kinds(r#""= \(label(")"))""#, &[InterpStr]);
        // An interpolated string nested inside a hole.
        assert_kinds(r#""out \("in \(x)")""#, &[InterpStr]);
    }

    #[test]
    fn escaped_open_paren_is_not_a_hole() {
        use TokenKind::*;
        // `\\(` is an escaped backslash followed by a literal `(`, so there
        // is no hole and the string lexes as a plain `StrLit` (logos path).
        assert_kinds(r#""a \\(b) c""#, &[StrLit]);
    }

    #[test]
    fn unterminated_hole_is_an_error() {
        // The hole reaches end of line without its closing `)`.
        assert_lex_error(
            "\"value \\(x + 1\n\"",
            "bynk.lex.unterminated_interpolation",
        );
    }

    #[test]
    fn unterminated_interp_string_is_an_error() {
        // The hole is closed but the string is not (newline before the `"`).
        assert_lex_error("\"value \\(x) more\n", "bynk.lex.unterminated_string");
    }

    #[test]
    fn bad_escape_in_interp_string_is_an_error() {
        assert_lex_error(r#""a \q \(x)""#, "bynk.lex.bad_escape");
    }
}