// damn_simple_architecture/compiler/src/frontend/dsc/lexer.rs

use std::iter::Peekable;
use std::str::Chars;

/// A lexical token produced by the lexer in this file.
///
/// Keywords and punctuation are unit variants; identifiers and literals
/// carry their decoded payload.
#[derive(Debug, PartialEq, Clone)]
pub enum Token {
    // Keywords
    Fn,
    Let,
    If,
    Else,
    Loop,
    While,
    Break,
    Return,
    Continue,
    Include,
    Static,
    Const,
    As,
    SizeOf,

    // Identifiers and literals
    /// Identifier, optionally qualified by a single `namespace::` prefix.
    Identifier(Name),
    /// String literal with escape sequences already decoded.
    String(String),
    /// Integer literal (decimal, `0x` hexadecimal or `0b` binary).
    Integer(u64),
    /// Character literal with any escape sequence already decoded.
    Char(char),

    // Delimiters
    LeftParen,    // (
    RightParen,   // )
    LeftBrace,    // {
    RightBrace,   // }
    LeftBracket,  // [
    RightBracket, // ]
    Semicolon,    // ;
    Colon,        // :
    Comma,        // ,
    Dot,          // .
    RightArrow,   // ->

    // Arithmetic operators
    Plus,       // +
    Minus,      // -
    Star,       // *
    Slash,      // /
    Percent,    // %
    PlusPlus,   // ++
    MinusMinus, // --

    // Bitwise operators
    Ampersand,  // &
    Pipe,       // |
    Caret,      // ^
    Tilde,      // ~
    LeftShift,  // <<
    RightShift, // >>

    // Logical operators
    Bang,       // !
    LogicalAnd, // &&
    LogicalOr,  // ||

    // Comparison operators
    EqualEqual,   // ==
    BangEqual,    // !=
    Less,         // <
    LessEqual,    // <=
    Greater,      // >
    GreaterEqual, // >=

    // Assignment operators
    Assign,       // =
    PlusEqual,    // +=
    MinusEqual,   // -=
    StarEqual,    // *=
    SlashEqual,   // /=
    PercentEqual, // %=
    AndEqual,     // &=
    OrEqual,      // |=
    XorEqual,     // ^=
    ShlEqual,     // <<=
    ShrEqual,     // >>=

    // Special
    /// End of input; `next_token` returns this once the source is exhausted.
    Eof,
}

use crate::model::Name;
use std::fmt;

/// Renders a [`Name`] as `namespace::name`, or as the bare `name` when it
/// carries no namespace.
impl fmt::Display for Name {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match &self.namespace {
            Some(ns) => write!(f, "{}::{}", ns, self.name),
            None => write!(f, "{}", self.name),
        }
    }
}

impl Token {
    pub fn tt(&self) -> &str {
        match self {
            Token::Const => "Const",
            Token::Static => "Static",
            Token::Include => "Include",
            Token::Fn => "Fn",
            Token::If => "If",
            Token::Let => "Let",
            Token::Else => "Else",
            Token::Loop => "Loop",
            Token::While => "While",
            Token::Break => "Break",
            Token::Return => "Return",
            Token::Continue => "Continue",
            Token::As => "As",
            Token::Identifier(_) => "Identifier",
            Token::String(_) => "String",
            Token::Integer(_) => "UnsignedInt",
            Token::Char(_) => "Char",
            Token::LeftParen => "LeftParen",
            Token::RightParen => "RightParen",
            Token::LeftBrace => "LeftBrace",
            Token::RightBrace => "RightBrace",
            Token::LeftBracket => "LeftBracket",
            Token::RightBracket => "RightBracket",
            Token::Semicolon => "Semicolon",
            Token::Colon => "Colon",
            Token::Comma => "Comma",
            Token::Dot => "Dot",
            Token::RightArrow => "RightArrow",
            Token::Plus => "Plus",
            Token::Minus => "Minus",
            Token::Star => "Star",
            Token::Slash => "Slash",
            Token::Percent => "Percent",
            Token::PlusPlus => "PlusPlus",
            Token::MinusMinus => "MinusMinus",
            Token::Ampersand => "Ampersand",
            Token::Pipe => "Pipe",
            Token::Caret => "Caret",
            Token::Tilde => "Tilde",
            Token::LeftShift => "LeftShift",
            Token::RightShift => "RightShift",
            Token::Bang => "Bang",
            Token::LogicalAnd => "LogicalAnd",
            Token::LogicalOr => "LogicalOr",
            Token::EqualEqual => "EqualEqual",
            Token::BangEqual => "BangEqual",
            Token::Less => "Less",
            Token::LessEqual => "LessEqual",
            Token::Greater => "Greater",
            Token::GreaterEqual => "GreaterEqual",
            Token::Assign => "Assign",
            Token::PlusEqual => "PlusEqual",
            Token::MinusEqual => "MinusEqual",
            Token::StarEqual => "StarEqual",
            Token::SlashEqual => "SlashEqual",
            Token::PercentEqual => "PercentEqual",
            Token::AndEqual => "AndEqual",
            Token::OrEqual => "OrEqual",
            Token::XorEqual => "XorEqual",
            Token::ShlEqual => "ShlEqual",
            Token::ShrEqual => "ShrEqual",
            Token::SizeOf => "SizeOf",
            Token::Eof => "Eof",
        }
    }
}

/// Hand-written lexer that turns source text into [`Token`]s.
///
/// The cursor is split in two: `current` is the character being examined and
/// `chars` yields the characters that follow it.
pub struct Lexer<'a> {
    /// Input remaining after `current`.
    chars: Peekable<Chars<'a>>,
    /// Character under the cursor, or `None` once the input is exhausted.
    current: Option<char>,
    /// Line number (starting at 1) reported in error messages.
    line: usize,
}

impl<'a> Lexer<'a> {
    pub fn new(input: &'a str) -> Self {
        let mut chars = input.chars().peekable();
        let current = chars.next();

        Lexer {
            chars,
            current,
            line: 1,
        }
    }

    // ========================================================================
    // Character Navigation
    // ========================================================================

    /// Moves the cursor one character forward and returns the new current
    /// character (`None` at end of input).
    fn advance(&mut self) -> Option<char> {
        let next = self.chars.next();
        self.current = next;
        next
    }

    /// Returns the character right after `current` without consuming it.
    fn peek(&mut self) -> Option<char> {
        let upcoming: Option<&char> = self.chars.peek();
        upcoming.copied()
    }

    /// Returns the character two positions after `current` without
    /// consuming anything.
    fn peek_second(&mut self) -> Option<char> {
        // Work on a clone so the real cursor is untouched; index 1 is the
        // character after the one `peek` would return.
        self.chars.clone().nth(1)
    }

    /// Consumes the next character if it equals `expected`.
    ///
    /// Returns `true` when the character was consumed, `false` (leaving the
    /// cursor untouched) otherwise.
    fn match_next(&mut self, expected: char) -> bool {
        let matched = self.peek() == Some(expected);
        if matched {
            self.advance();
        }
        matched
    }

    // ========================================================================
    // Whitespace and Comments
    // ========================================================================

    /// Consumes consecutive whitespace starting at the current character,
    /// incrementing the line counter once per `'\n'`.
    fn skip_whitespace(&mut self) {
        while let Some(ws) = self.current.filter(|c| c.is_whitespace()) {
            if ws == '\n' {
                self.line += 1;
            }
            self.advance();
        }
    }

    /// Skips a `//` comment, including the newline that ends it.
    ///
    /// Expects the cursor to be on the first `/`. Stops just past the
    /// terminating `'\n'` (counting it as a new line) or at end of input.
    fn skip_line_comment(&mut self) {
        // Step over both slashes of the `//` introducer.
        self.advance();
        self.advance();

        loop {
            match self.current {
                None => return,
                Some('\n') => {
                    self.line += 1;
                    self.advance();
                    return;
                }
                Some(_) => {
                    self.advance();
                }
            }
        }
    }

    /// Skips a `/* ... */` comment, counting the newlines inside it.
    ///
    /// Expects the cursor to be on the opening `/`. On success the cursor is
    /// just past the closing `*/`.
    ///
    /// # Errors
    ///
    /// Returns a message naming the line the comment started on when the
    /// input ends before `*/` is found.
    fn skip_block_comment(&mut self) -> Result<(), String> {
        let opened_on = self.line;

        // Step over the `/*` introducer.
        self.advance();
        self.advance();

        loop {
            match self.current {
                None => {
                    return Err(format!(
                        "Unterminated block comment starting at line {}",
                        opened_on
                    ));
                }
                Some('*') if self.peek() == Some('/') => {
                    self.advance(); // onto '/'
                    self.advance(); // past the comment
                    return Ok(());
                }
                Some('\n') => self.line += 1,
                Some(_) => {}
            }
            self.advance();
        }
    }

    /// Skips any mix of whitespace, `//` comments and `/* */` comments until
    /// the cursor rests on the first character of a token (or end of input).
    ///
    /// An unterminated block comment is reported through `error` and
    /// skipping then carries on.
    fn skip_whitespace_and_comments(&mut self) {
        loop {
            self.skip_whitespace();

            // Only a '/' can start a comment; anything else is a token.
            if self.current != Some('/') {
                return;
            }

            match self.peek() {
                Some('/') => self.skip_line_comment(),
                Some('*') => {
                    if let Err(msg) = self.skip_block_comment() {
                        self.error(&msg);
                    }
                }
                // A lone '/' is the division operator, not a comment.
                _ => return,
            }
        }
    }

    // ========================================================================
    // Identifiers and Keywords
    // ========================================================================

    /// Collects an identifier beginning at the current character.
    ///
    /// The current character is taken as-is (the caller has already decided
    /// it starts an identifier); following characters are appended while
    /// they are alphanumeric or `_`. The cursor ends on the last character
    /// collected.
    fn read_identifier(&mut self) -> String {
        let mut text: String = self.current.into_iter().collect();

        while let Some(next) = self.peek().filter(|c| c.is_alphanumeric() || *c == '_') {
            self.advance();
            text.push(next);
        }

        text
    }

    /// Scans a keyword, a plain identifier, or a namespaced identifier
    /// (`namespace::name`).
    ///
    /// On entry the cursor is on the first character of the word; on return
    /// it is on the last character of the token.
    ///
    /// `::` is treated as a namespace separator only when it is directly
    /// followed by an identifier start (ASCII letter or `_`). Otherwise the
    /// word is returned as a plain identifier and the colons stay in the
    /// input, so input such as `foo::(` no longer produces a name of `"("`
    /// and `foo::` at end of input no longer produces an empty name.
    ///
    /// Only a single namespace level is consumed: for `a::b::c` the token is
    /// `a::b` and the remaining `::c` is left in the input.
    fn scan_identifier_or_keyword(&mut self) -> Token {
        let first_part = self.read_identifier();

        // Keywords are never namespace-qualified.
        if let Some(keyword) = self.match_keyword(&first_part) {
            return keyword;
        }

        // Look three characters ahead on a clone so nothing is consumed
        // unless the whole `::<ident-start>` shape is present.
        let mut lookahead = self.chars.clone();
        let has_qualified_name = lookahead.next() == Some(':')
            && lookahead.next() == Some(':')
            && matches!(lookahead.next(), Some(c) if c.is_ascii_alphabetic() || c == '_');

        if !has_qualified_name {
            return Token::Identifier(Name {
                namespace: None,
                name: first_part,
            });
        }

        self.advance(); // onto the first ':'
        self.advance(); // onto the second ':'
        self.advance(); // onto the first character of the name

        let second_part = self.read_identifier();

        Token::Identifier(Name {
            namespace: Some(first_part),
            name: second_part,
        })
    }

    /// Maps `word` to its keyword token, or `None` when it is not a keyword.
    /// Matching is case-sensitive.
    fn match_keyword(&self, word: &str) -> Option<Token> {
        let keyword = match word {
            "fn" => Token::Fn,
            "let" => Token::Let,
            "if" => Token::If,
            "else" => Token::Else,
            "loop" => Token::Loop,
            "while" => Token::While,
            "break" => Token::Break,
            "return" => Token::Return,
            "continue" => Token::Continue,
            "include" => Token::Include,
            "const" => Token::Const,
            "static" => Token::Static,
            "as" => Token::As,
            "sizeof" => Token::SizeOf,
            _ => return None,
        };
        Some(keyword)
    }

    // ========================================================================
    // Numbers
    // ========================================================================

    /// Scans an integer literal starting at the current character.
    ///
    /// When the literal is malformed the error is reported, the rest of the
    /// alphanumeric/underscore run is skipped, and `Token::Integer(0)` is
    /// returned as a placeholder.
    fn scan_number(&mut self) -> Token {
        let message = match self.read_number() {
            Ok(value) => return Token::Integer(value),
            Err(message) => message,
        };

        self.error(&message);

        // Swallow whatever is left of the malformed literal.
        while matches!(self.peek(), Some(c) if c.is_alphanumeric() || c == '_') {
            self.advance();
        }

        Token::Integer(0)
    }

    /// Reads an integer literal, dispatching on its radix prefix.
    ///
    /// `0x`/`0X` selects hexadecimal, `0b`/`0B` binary; anything else is read
    /// as decimal.
    ///
    /// The character after the prefix is consumed only when it can belong to
    /// the literal (a digit of the radix, or a `_` separator). Previously it
    /// was consumed unconditionally, so `0x;` lost the `;` and `0b21` was
    /// silently read as `1`.
    ///
    /// # Errors
    ///
    /// Returns a message when no digit follows a radix prefix, or when the
    /// radix-specific reader rejects the digits. In the "no digit" case the
    /// cursor is left on the prefix letter.
    fn read_number(&mut self) -> Result<u64, String> {
        if self.current == Some('0') {
            match self.peek() {
                Some('x') | Some('X') => {
                    self.advance(); // cursor is now on the 'x'
                    if !matches!(self.peek(), Some(c) if c.is_ascii_hexdigit() || c == '_') {
                        return Err("Invalid hexadecimal number: no digits after 0x".to_string());
                    }
                    self.advance(); // cursor is now on the first digit or separator
                    return self.read_hex_number();
                }
                Some('b') | Some('B') => {
                    self.advance(); // cursor is now on the 'b'
                    if !matches!(self.peek(), Some('0' | '1' | '_')) {
                        return Err("Invalid binary number: no digits after 0b".to_string());
                    }
                    self.advance(); // cursor is now on the first digit or separator
                    return self.read_binary_number();
                }
                _ => {}
            }
        }

        self.read_decimal_number()
    }

    /// Reads a decimal literal that starts at the current character.
    ///
    /// `_` is accepted as a digit separator and dropped. The cursor ends on
    /// the last character consumed.
    ///
    /// # Errors
    ///
    /// Returns a message when the collected digits do not parse as `u64`.
    fn read_decimal_number(&mut self) -> Result<u64, String> {
        let mut digits = String::new();
        digits.extend(self.current);

        loop {
            match self.peek() {
                Some(d) if d.is_ascii_digit() => {
                    self.advance();
                    digits.push(d);
                }
                Some('_') => {
                    self.advance();
                }
                _ => break,
            }
        }

        match digits.parse::<u64>() {
            Ok(value) => Ok(value),
            Err(_) => Err(format!("Invalid decimal number: {}", digits)),
        }
    }

    /// Reads the digits of a hexadecimal literal; the `0x` prefix has already
    /// been consumed by the caller.
    ///
    /// The current character is included when it is a hex digit. `_` is
    /// accepted as a separator and dropped.
    ///
    /// # Errors
    ///
    /// Returns a message when no digits were found or they do not parse as
    /// `u64`.
    fn read_hex_number(&mut self) -> Result<u64, String> {
        let mut digits: String = self
            .current
            .filter(char::is_ascii_hexdigit)
            .into_iter()
            .collect();

        loop {
            match self.peek() {
                Some(d) if d.is_ascii_hexdigit() => {
                    self.advance();
                    digits.push(d);
                }
                Some('_') => {
                    self.advance();
                }
                _ => break,
            }
        }

        if digits.is_empty() {
            return Err("Invalid hexadecimal number: no digits after 0x".to_string());
        }

        match u64::from_str_radix(&digits, 16) {
            Ok(value) => Ok(value),
            Err(_) => Err(format!("Invalid hexadecimal number: {}", digits)),
        }
    }

    /// Reads the digits of a binary literal; the `0b` prefix has already been
    /// consumed by the caller.
    ///
    /// The current character is included when it is `0` or `1`. `_` is
    /// accepted as a separator and dropped.
    ///
    /// # Errors
    ///
    /// Returns a message when no digits were found or they do not parse as
    /// `u64`.
    fn read_binary_number(&mut self) -> Result<u64, String> {
        let mut bits: String = self
            .current
            .filter(|c| matches!(c, '0' | '1'))
            .into_iter()
            .collect();

        loop {
            match self.peek() {
                Some(b @ ('0' | '1')) => {
                    self.advance();
                    bits.push(b);
                }
                Some('_') => {
                    self.advance();
                }
                _ => break,
            }
        }

        if bits.is_empty() {
            return Err("Invalid binary number: no digits after 0b".to_string());
        }

        match u64::from_str_radix(&bits, 2) {
            Ok(value) => Ok(value),
            Err(_) => Err(format!("Invalid binary number: {}", bits)),
        }
    }

    // ========================================================================
    // String and Character Literals
    // ========================================================================

    /// Scans a string literal starting at its opening `"`.
    ///
    /// On success the cursor is on the closing quote. When the literal is
    /// malformed the error is reported, the cursor is moved to the next `"`
    /// or newline (or end of input), and an empty `Token::String` is
    /// returned as a placeholder.
    fn scan_string(&mut self) -> Token {
        match self.read_string() {
            Ok(s) => Token::String(s),
            Err(e) => {
                self.error(&e);

                // Resynchronise at the end of the literal or of the line.
                while let Some(c) = self.current {
                    if c == '"' || c == '\n' {
                        break;
                    }
                    self.advance();
                }

                // `next_token` steps over the character the cursor rests on,
                // so a newline here never reaches `skip_whitespace`; count it
                // now to keep later line numbers correct.
                if self.current == Some('\n') {
                    self.line += 1;
                }

                Token::String(String::new())
            }
        }
    }

    /// Reads the body of a string literal and decodes its escape sequences.
    ///
    /// Expects the cursor on the opening `"`; on success it is left on the
    /// closing `"`. Supported escapes: `\n`, `\t`, `\r`, `\\`, `\"`, `\'`
    /// and `\0`.
    ///
    /// # Errors
    ///
    /// Returns a message when the literal hits a newline or end of input
    /// before closing, or contains an unknown escape sequence.
    fn read_string(&mut self) -> Result<String, String> {
        self.advance(); // step over the opening quote
        let mut text = String::new();

        loop {
            let c = match self.current {
                None => return Err("Unterminated string literal".to_string()),
                Some('"') => return Ok(text),
                Some('\n') => {
                    return Err("Unterminated string literal (newline)".to_string());
                }
                Some(c) => c,
            };

            // Ordinary character: copy it through.
            if c != '\\' {
                text.push(c);
                self.advance();
                continue;
            }

            // Backslash: decode the character that follows it.
            self.advance();
            let decoded = match self.current {
                None => return Err("Unexpected end of string after escape".to_string()),
                Some('n') => '\n',
                Some('t') => '\t',
                Some('r') => '\r',
                Some('\\') => '\\',
                Some('"') => '"',
                Some('\'') => '\'',
                Some('0') => '\0',
                Some(other) => {
                    return Err(format!("Invalid escape sequence: \\{}", other));
                }
            };
            text.push(decoded);
            self.advance();
        }
    }

    /// Scans a character literal starting at its opening `'`.
    ///
    /// On success the cursor is on the closing quote. When the literal is
    /// malformed the error is reported, the cursor is moved to the next `'`
    /// or newline (or end of input), and `Token::Char('\0')` is returned as
    /// a placeholder.
    fn scan_char(&mut self) -> Token {
        match self.read_char() {
            Ok(ch) => Token::Char(ch),
            Err(e) => {
                self.error(&e);

                // Resynchronise at the end of the literal or of the line.
                while let Some(c) = self.current {
                    if c == '\'' || c == '\n' {
                        break;
                    }
                    self.advance();
                }

                // `next_token` steps over the character the cursor rests on,
                // so a newline here never reaches `skip_whitespace`; count it
                // now to keep later line numbers correct.
                if self.current == Some('\n') {
                    self.line += 1;
                }

                Token::Char('\0')
            }
        }
    }

    /// Reads the body of a character literal and decodes any escape.
    ///
    /// Expects the cursor on the opening `'`; on success it is left on the
    /// closing `'`. Supported escapes: `\n`, `\t`, `\r`, `\\`, `\'`, `\"`
    /// and `\0`.
    ///
    /// # Errors
    ///
    /// Returns a message when the literal is empty, unterminated, uses an
    /// unknown escape, or is not closed right after its single character.
    fn read_char(&mut self) -> Result<char, String> {
        self.advance(); // step over the opening quote

        let value = match self.current {
            None | Some('\n') => {
                return Err("Unterminated character literal".to_string());
            }
            Some('\'') => return Err("Empty character literal".to_string()),
            Some('\\') => {
                self.advance();
                let Some(escaped) = self.current else {
                    return Err("Unexpected end after escape in char literal".to_string());
                };
                match escaped {
                    'n' => '\n',
                    't' => '\t',
                    'r' => '\r',
                    '\\' => '\\',
                    '\'' => '\'',
                    '"' => '"',
                    '0' => '\0',
                    other => {
                        return Err(format!("Invalid escape sequence: \\{}", other));
                    }
                }
            }
            Some(plain) => plain,
        };

        self.advance(); // should now be on the closing quote

        if self.current == Some('\'') {
            Ok(value)
        } else {
            Err("Character literal must contain exactly one character".to_string())
        }
    }

    // ========================================================================
    // Operators and Punctuation
    // ========================================================================

    fn scan_operator(&mut self, c: char) -> Token {
        match c {
            // Single-character tokens that can't be extended
            '(' => Token::LeftParen,
            ')' => Token::RightParen,
            '{' => Token::LeftBrace,
            '}' => Token::RightBrace,
            '[' => Token::LeftBracket,
            ']' => Token::RightBracket,
            ';' => Token::Semicolon,
            ',' => Token::Comma,
            '.' => Token::Dot,
            '~' => Token::Tilde,
            ':' => Token::Colon, // '::' is handled in identifier scanning

            // Operators that may have compound forms
            '+' => {
                if self.match_next('+') {
                    Token::PlusPlus
                } else if self.match_next('=') {
                    Token::PlusEqual
                } else {
                    Token::Plus
                }
            }

            '-' => {
                if self.match_next('-') {
                    Token::MinusMinus
                } else if self.match_next('>') {
                    Token::RightArrow
                } else if self.match_next('=') {
                    Token::MinusEqual
                } else {
                    Token::Minus
                }
            }

            '*' => {
                if self.match_next('=') {
                    Token::StarEqual
                } else {
                    Token::Star
                }
            }

            '/' => {
                // Comments are handled in skip_whitespace_and_comments
                if self.match_next('=') {
                    Token::SlashEqual
                } else {
                    Token::Slash
                }
            }

            '%' => {
                if self.match_next('=') {
                    Token::PercentEqual
                } else {
                    Token::Percent
                }
            }

            '&' => {
                if self.match_next('&') {
                    Token::LogicalAnd
                } else if self.match_next('=') {
                    Token::AndEqual
                } else {
                    Token::Ampersand
                }
            }

            '|' => {
                if self.match_next('|') {
                    Token::LogicalOr
                } else if self.match_next('=') {
                    Token::OrEqual
                } else {
                    Token::Pipe
                }
            }

            '^' => {
                if self.match_next('=') {
                    Token::XorEqual
                } else {
                    Token::Caret
                }
            }

            '!' => {
                if self.match_next('=') {
                    Token::BangEqual
                } else {
                    Token::Bang
                }
            }

            '=' => {
                if self.match_next('=') {
                    Token::EqualEqual
                } else {
                    Token::Assign
                }
            }

            '<' => {
                if self.match_next('<') {
                    if self.match_next('=') {
                        Token::ShlEqual
                    } else {
                        Token::LeftShift
                    }
                } else if self.match_next('=') {
                    Token::LessEqual
                } else {
                    Token::Less
                }
            }

            '>' => {
                if self.match_next('>') {
                    if self.match_next('=') {
                        Token::ShrEqual
                    } else {
                        Token::RightShift
                    }
                } else if self.match_next('=') {
                    Token::GreaterEqual
                } else {
                    Token::Greater
                }
            }

            _ => {
                self.error(&format!("Unexpected character: '{}'", c));
                Token::Eof // This shouldn't happen
            }
        }
    }

    // ========================================================================
    // Main Token Scanning
    // ========================================================================

    /// Returns the next token, or [`Token::Eof`] once the input is exhausted.
    ///
    /// Whitespace and comments before the token are skipped. Malformed
    /// literals are reported on stderr and replaced by placeholder tokens.
    ///
    /// A character that cannot start any token is reported and skipped, and
    /// lexing continues with the character after it. Previously such a
    /// character produced `Token::Eof`, which cut the token stream (and the
    /// `Iterator` implementation) short in the middle of the input.
    pub fn next_token(&mut self) -> Token {
        loop {
            self.skip_whitespace_and_comments();

            let Some(c) = self.current else {
                return Token::Eof;
            };

            let token = match c {
                // Identifiers and keywords
                'a'..='z' | 'A'..='Z' | '_' => self.scan_identifier_or_keyword(),

                // Numbers
                '0'..='9' => self.scan_number(),

                // String literals
                '"' => self.scan_string(),

                // Character literals
                '\'' => self.scan_char(),

                // Operators and punctuation
                _ => self.scan_operator(c),
            };

            // Every scanner leaves the cursor on the token's last character;
            // step past it so the next call starts on fresh input.
            self.advance();

            // `scan_operator` is the only scanner that can yield `Eof`, and
            // it does so only for a character it does not recognise (which it
            // has already reported). That character is now consumed, so keep
            // scanning instead of signalling end of input.
            if matches!(token, Token::Eof) {
                continue;
            }

            return token;
        }
    }

    // ========================================================================
    // Error Handling
    // ========================================================================

    /// Prints `message` to stderr, prefixed with the current line number.
    fn error(&self, message: &str) {
        let line = self.line;
        eprintln!("Lexer error on line {}: {}", line, message);
    }
}

// ========================================================================
// Iterator Implementation
// ========================================================================

/// Iterates over the tokens of the input; iteration ends when `next_token`
/// returns [`Token::Eof`], which is never yielded itself.
impl<'a> Iterator for Lexer<'a> {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();
        if matches!(token, Token::Eof) {
            None
        } else {
            Some(token)
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Every simple and compound operator lexes to its own token, in order.
    #[test]
    fn test_operators() {
        let input = "+ ++ += - -- -= * *= / /= % %= & &= && | |= || ^ ^= ! != = == < <= << <<= > >= >> >>=";
        let expected = [
            Token::Plus,
            Token::PlusPlus,
            Token::PlusEqual,
            Token::Minus,
            Token::MinusMinus,
            Token::MinusEqual,
            Token::Star,
            Token::StarEqual,
            Token::Slash,
            Token::SlashEqual,
            Token::Percent,
            Token::PercentEqual,
            Token::Ampersand,
            Token::AndEqual,
            Token::LogicalAnd,
            Token::Pipe,
            Token::OrEqual,
            Token::LogicalOr,
            Token::Caret,
            Token::XorEqual,
            Token::Bang,
            Token::BangEqual,
            Token::Assign,
            Token::EqualEqual,
            Token::Less,
            Token::LessEqual,
            Token::LeftShift,
            Token::ShlEqual,
            Token::Greater,
            Token::GreaterEqual,
            Token::RightShift,
            Token::ShrEqual,
        ];

        let mut lexer = Lexer::new(input);
        for want in expected {
            let got = lexer.next_token();
            assert_eq!(got, want);
        }
    }

    /// Decimal, hexadecimal, binary and underscore-separated literals.
    #[test]
    fn test_numbers() {
        let mut lexer = Lexer::new("42 0x2A 0b101010 123_456");

        for want in [42, 42, 42, 123456] {
            assert_eq!(lexer.next_token(), Token::Integer(want));
        }
    }

    /// The first `namespace::name` pair lexes as one qualified identifier.
    #[test]
    fn test_namespaced_identifier() {
        let mut lexer = Lexer::new("print::println std::io::read");

        let Token::Identifier(name) = lexer.next_token() else {
            panic!("Expected namespaced identifier");
        };
        assert_eq!(name.namespace, Some("print".to_string()));
        assert_eq!(name.name, "println");
    }
}