use std::iter::Peekable; use std::str::Chars; #[derive(Debug, PartialEq, Clone)] pub enum Token { // Keywords Fn, Let, If, Else, Loop, While, Break, Return, Continue, Include, Static, Const, As, SizeOf, // Identifiers and literals Identifier(Name), String(String), Integer(u64), Char(char), // Delimiters LeftParen, // ( RightParen, // ) LeftBrace, // { RightBrace, // } LeftBracket, // [ RightBracket, // ] Semicolon, // ; Colon, // : Comma, // , Dot, // . RightArrow, // -> // Arithmetic operators Plus, // + Minus, // - Star, // * Slash, // / Percent, // % PlusPlus, // ++ MinusMinus, // -- // Bitwise operators Ampersand, // & Pipe, // | Caret, // ^ Tilde, // ~ LeftShift, // << RightShift, // >> // Logical operators Bang, // ! LogicalAnd, // && LogicalOr, // || // Comparison operators EqualEqual, // == BangEqual, // != Less, // < LessEqual, // <= Greater, // > GreaterEqual, // >= // Assignment operators Assign, // = PlusEqual, // += MinusEqual, // -= StarEqual, // *= SlashEqual, // /= PercentEqual, // %= AndEqual, // &= OrEqual, // |= XorEqual, // ^= ShlEqual, // <<= ShrEqual, // >>= // Special Eof, } use crate::model::Name; use std::fmt; impl fmt::Display for Name { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if let Some(ref ns) = self.namespace { write!(f, "{}::{}", ns, self.name) } else { write!(f, "{}", self.name) } } } impl Token { pub fn tt(&self) -> &str { match self { Token::Const => "Const", Token::Static => "Static", Token::Include => "Include", Token::Fn => "Fn", Token::If => "If", Token::Let => "Let", Token::Else => "Else", Token::Loop => "Loop", Token::While => "While", Token::Break => "Break", Token::Return => "Return", Token::Continue => "Continue", Token::As => "As", Token::Identifier(_) => "Identifier", Token::String(_) => "String", Token::Integer(_) => "UnsignedInt", Token::Char(_) => "Char", Token::LeftParen => "LeftParen", Token::RightParen => "RightParen", Token::LeftBrace => "LeftBrace", Token::RightBrace => "RightBrace", Token::LeftBracket => "LeftBracket", Token::RightBracket => "RightBracket", Token::Semicolon => "Semicolon", Token::Colon => "Colon", Token::Comma => "Comma", Token::Dot => "Dot", Token::RightArrow => "RightArrow", Token::Plus => "Plus", Token::Minus => "Minus", Token::Star => "Star", Token::Slash => "Slash", Token::Percent => "Percent", Token::PlusPlus => "PlusPlus", Token::MinusMinus => "MinusMinus", Token::Ampersand => "Ampersand", Token::Pipe => "Pipe", Token::Caret => "Caret", Token::Tilde => "Tilde", Token::LeftShift => "LeftShift", Token::RightShift => "RightShift", Token::Bang => "Bang", Token::LogicalAnd => "LogicalAnd", Token::LogicalOr => "LogicalOr", Token::EqualEqual => "EqualEqual", Token::BangEqual => "BangEqual", Token::Less => "Less", Token::LessEqual => "LessEqual", Token::Greater => "Greater", Token::GreaterEqual => "GreaterEqual", Token::Assign => "Assign", Token::PlusEqual => "PlusEqual", Token::MinusEqual => "MinusEqual", Token::StarEqual => "StarEqual", Token::SlashEqual => "SlashEqual", Token::PercentEqual => "PercentEqual", Token::AndEqual => "AndEqual", Token::OrEqual => "OrEqual", Token::XorEqual => "XorEqual", Token::ShlEqual => "ShlEqual", Token::ShrEqual => "ShrEqual", Token::SizeOf => "SizeOf", Token::Eof => "Eof", } } } pub struct Lexer<'a> { chars: Peekable>, current: Option, line: usize, } impl<'a> Lexer<'a> { pub fn new(input: &'a str) -> Self { let mut chars = input.chars().peekable(); let current = chars.next(); Lexer { chars, current, line: 1, } } // ======================================================================== // Character Navigation // ======================================================================== /// Advance to the next character and return it fn advance(&mut self) -> Option { self.current = self.chars.next(); self.current } /// Peek at the next character without consuming it fn peek(&mut self) -> Option { self.chars.peek().copied() } /// Peek two characters ahead fn peek_second(&mut self) -> Option { let mut temp = self.chars.clone(); temp.next(); // Skip the first peek temp.next() } /// Check if the next character matches expected, and consume it if so fn match_next(&mut self, expected: char) -> bool { if self.peek() == Some(expected) { self.advance(); true } else { false } } // ======================================================================== // Whitespace and Comments // ======================================================================== fn skip_whitespace(&mut self) { while let Some(c) = self.current { if c.is_whitespace() { if c == '\n' { self.line += 1; } self.advance(); } else { break; } } } fn skip_line_comment(&mut self) { // We're at the first '/', advance past '//' self.advance(); // consume first '/' self.advance(); // consume second '/' // Skip until newline or EOF while let Some(c) = self.current { if c == '\n' { self.line += 1; self.advance(); break; } self.advance(); } } fn skip_block_comment(&mut self) -> Result<(), String> { let start_line = self.line; // We're at '/', advance past '/*' self.advance(); // consume '/' self.advance(); // consume '*' // Look for closing '*/' while let Some(c) = self.current { if c == '\n' { self.line += 1; } if c == '*' && self.peek() == Some('/') { self.advance(); // consume '*' self.advance(); // consume '/' return Ok(()); } self.advance(); } Err(format!( "Unterminated block comment starting at line {}", start_line )) } fn skip_whitespace_and_comments(&mut self) { loop { self.skip_whitespace(); // Check for comments if self.current == Some('/') { match self.peek() { Some('/') => { self.skip_line_comment(); continue; } Some('*') => { if let Err(e) = self.skip_block_comment() { self.error(&e); } continue; } _ => break, } } break; } } // ======================================================================== // Identifiers and Keywords // ======================================================================== fn read_identifier(&mut self) -> String { let mut ident = String::new(); // Include the current character (already validated as alphabetic or '_') if let Some(c) = self.current { ident.push(c); } // Read remaining alphanumeric or underscore characters while let Some(c) = self.peek() { if c.is_alphanumeric() || c == '_' { self.advance(); ident.push(c); } else { break; } } ident } fn scan_identifier_or_keyword(&mut self) -> Token { let first_part = self.read_identifier(); // Check if it's a keyword (keywords cannot have namespaces) if let Some(keyword) = self.match_keyword(&first_part) { return keyword; } // Check for namespace separator '::' if self.peek() == Some(':') && self.peek_second() == Some(':') { // Consume '::' self.advance(); // consume first ':' self.advance(); // consume second ':' self.advance(); // move to the first character of the next identifier // Read the second part (the actual name) let second_part = self.read_identifier(); return Token::Identifier(Name { namespace: Some(first_part), name: second_part, }); } // Plain identifier without namespace Token::Identifier(Name { namespace: None, name: first_part, }) } fn match_keyword(&self, word: &str) -> Option { match word { "fn" => Some(Token::Fn), "let" => Some(Token::Let), "if" => Some(Token::If), "else" => Some(Token::Else), "loop" => Some(Token::Loop), "while" => Some(Token::While), "break" => Some(Token::Break), "return" => Some(Token::Return), "continue" => Some(Token::Continue), "include" => Some(Token::Include), "const" => Some(Token::Const), "static" => Some(Token::Static), "as" => Some(Token::As), "sizeof" => Some(Token::SizeOf), _ => None, } } // ======================================================================== // Numbers // ======================================================================== fn scan_number(&mut self) -> Token { match self.read_number() { Ok(num) => Token::Integer(num), Err(e) => { self.error(&e); // Skip the invalid number while let Some(c) = self.peek() { if !c.is_alphanumeric() && c != '_' { break; } self.advance(); } Token::Integer(0) } } } fn read_number(&mut self) -> Result { // Check for hex (0x) or binary (0b) prefix if self.current == Some('0') { match self.peek() { Some('x') | Some('X') => { self.advance(); // consume '0' self.advance(); // consume 'x' return self.read_hex_number(); } Some('b') | Some('B') => { self.advance(); // consume '0' self.advance(); // consume 'b' return self.read_binary_number(); } _ => {} } } // Read decimal number self.read_decimal_number() } fn read_decimal_number(&mut self) -> Result { let mut num_str = String::new(); if let Some(c) = self.current { num_str.push(c); } while let Some(c) = self.peek() { if c.is_ascii_digit() { self.advance(); num_str.push(c); } else if c == '_' { // Allow underscores as separators (like Rust) self.advance(); } else { break; } } num_str .parse::() .map_err(|_| format!("Invalid decimal number: {}", num_str)) } fn read_hex_number(&mut self) -> Result { let mut num_str = String::new(); // Read the first hex digit (current character) if let Some(c) = self.current { if c.is_ascii_hexdigit() { num_str.push(c); } } while let Some(c) = self.peek() { if c.is_ascii_hexdigit() { self.advance(); num_str.push(c); } else if c == '_' { self.advance(); // Allow underscores as separators } else { break; } } if num_str.is_empty() { return Err("Invalid hexadecimal number: no digits after 0x".to_string()); } u64::from_str_radix(&num_str, 16) .map_err(|_| format!("Invalid hexadecimal number: {}", num_str)) } fn read_binary_number(&mut self) -> Result { let mut num_str = String::new(); // Read the first binary digit (current character) if let Some(c) = self.current { if c == '0' || c == '1' { num_str.push(c); } } while let Some(c) = self.peek() { if c == '0' || c == '1' { self.advance(); num_str.push(c); } else if c == '_' { self.advance(); // Allow underscores as separators } else { break; } } if num_str.is_empty() { return Err("Invalid binary number: no digits after 0b".to_string()); } u64::from_str_radix(&num_str, 2) .map_err(|_| format!("Invalid binary number: {}", num_str)) } // ======================================================================== // String and Character Literals // ======================================================================== fn scan_string(&mut self) -> Token { match self.read_string() { Ok(s) => Token::String(s), Err(e) => { self.error(&e); // Skip to the end of the string or newline while let Some(c) = self.current { if c == '"' || c == '\n' { break; } self.advance(); } Token::String(String::new()) } } } fn read_string(&mut self) -> Result { self.advance(); // Skip the opening quote let mut s = String::new(); while let Some(c) = self.current { if c == '"' { return Ok(s); } if c == '\n' { return Err("Unterminated string literal (newline)".to_string()); } // Handle escape sequences if c == '\\' { self.advance(); if let Some(escaped) = self.current { let escaped_char = match escaped { 'n' => '\n', 't' => '\t', 'r' => '\r', '\\' => '\\', '"' => '"', '\'' => '\'', '0' => '\0', _ => { return Err(format!( "Invalid escape sequence: \\{}", escaped )); } }; s.push(escaped_char); } else { return Err("Unexpected end of string after escape".to_string()); } } else { s.push(c); } self.advance(); } Err("Unterminated string literal".to_string()) } fn scan_char(&mut self) -> Token { match self.read_char() { Ok(ch) => Token::Char(ch), Err(e) => { self.error(&e); // Skip to the end of the char literal while let Some(c) = self.current { if c == '\'' || c == '\n' { break; } self.advance(); } Token::Char('\0') } } } fn read_char(&mut self) -> Result { self.advance(); // Skip opening quote let ch = match self.current { Some('\\') => { // Handle escape sequences self.advance(); match self.current { Some('n') => '\n', Some('t') => '\t', Some('r') => '\r', Some('\\') => '\\', Some('\'') => '\'', Some('"') => '"', Some('0') => '\0', Some(c) => return Err(format!("Invalid escape sequence: \\{}", c)), None => { return Err( "Unexpected end after escape in char literal".to_string() ); } } } Some('\'') => return Err("Empty character literal".to_string()), Some('\n') => return Err("Unterminated character literal".to_string()), Some(c) => c, None => return Err("Unterminated character literal".to_string()), }; self.advance(); // Move to closing quote if self.current != Some('\'') { return Err( "Character literal must contain exactly one character".to_string() ); } Ok(ch) } // ======================================================================== // Operators and Punctuation // ======================================================================== fn scan_operator(&mut self, c: char) -> Token { match c { // Single-character tokens that can't be extended '(' => Token::LeftParen, ')' => Token::RightParen, '{' => Token::LeftBrace, '}' => Token::RightBrace, '[' => Token::LeftBracket, ']' => Token::RightBracket, ';' => Token::Semicolon, ',' => Token::Comma, '.' => Token::Dot, '~' => Token::Tilde, ':' => Token::Colon, // '::' is handled in identifier scanning // Operators that may have compound forms '+' => { if self.match_next('+') { Token::PlusPlus } else if self.match_next('=') { Token::PlusEqual } else { Token::Plus } } '-' => { if self.match_next('-') { Token::MinusMinus } else if self.match_next('>') { Token::RightArrow } else if self.match_next('=') { Token::MinusEqual } else { Token::Minus } } '*' => { if self.match_next('=') { Token::StarEqual } else { Token::Star } } '/' => { // Comments are handled in skip_whitespace_and_comments if self.match_next('=') { Token::SlashEqual } else { Token::Slash } } '%' => { if self.match_next('=') { Token::PercentEqual } else { Token::Percent } } '&' => { if self.match_next('&') { Token::LogicalAnd } else if self.match_next('=') { Token::AndEqual } else { Token::Ampersand } } '|' => { if self.match_next('|') { Token::LogicalOr } else if self.match_next('=') { Token::OrEqual } else { Token::Pipe } } '^' => { if self.match_next('=') { Token::XorEqual } else { Token::Caret } } '!' => { if self.match_next('=') { Token::BangEqual } else { Token::Bang } } '=' => { if self.match_next('=') { Token::EqualEqual } else { Token::Assign } } '<' => { if self.match_next('<') { if self.match_next('=') { Token::ShlEqual } else { Token::LeftShift } } else if self.match_next('=') { Token::LessEqual } else { Token::Less } } '>' => { if self.match_next('>') { if self.match_next('=') { Token::ShrEqual } else { Token::RightShift } } else if self.match_next('=') { Token::GreaterEqual } else { Token::Greater } } _ => { self.error(&format!("Unexpected character: '{}'", c)); Token::Eof // This shouldn't happen } } } // ======================================================================== // Main Token Scanning // ======================================================================== pub fn next_token(&mut self) -> Token { self.skip_whitespace_and_comments(); let Some(c) = self.current else { return Token::Eof; }; let token = match c { // Identifiers and keywords 'a'..='z' | 'A'..='Z' | '_' => self.scan_identifier_or_keyword(), // Numbers '0'..='9' => self.scan_number(), // String literals '"' => self.scan_string(), // Character literals '\'' => self.scan_char(), // Operators and punctuation _ => self.scan_operator(c), }; self.advance(); token } // ======================================================================== // Error Handling // ======================================================================== fn error(&self, message: &str) { eprintln!("Lexer error on line {}: {}", self.line, message); } } // ======================================================================== // Iterator Implementation // ======================================================================== impl<'a> Iterator for Lexer<'a> { type Item = Token; fn next(&mut self) -> Option { match self.next_token() { Token::Eof => None, token => Some(token), } } } #[cfg(test)] mod tests { use super::*; #[test] fn test_operators() { let input = "+ ++ += - -- -= * *= / /= % %= & &= && | |= || ^ ^= ! != = == < <= << <<= > >= >> >>="; let mut lexer = Lexer::new(input); let expected = vec![ Token::Plus, Token::PlusPlus, Token::PlusEqual, Token::Minus, Token::MinusMinus, Token::MinusEqual, Token::Star, Token::StarEqual, Token::Slash, Token::SlashEqual, Token::Percent, Token::PercentEqual, Token::Ampersand, Token::AndEqual, Token::LogicalAnd, Token::Pipe, Token::OrEqual, Token::LogicalOr, Token::Caret, Token::XorEqual, Token::Bang, Token::BangEqual, Token::Assign, Token::EqualEqual, Token::Less, Token::LessEqual, Token::LeftShift, Token::ShlEqual, Token::Greater, Token::GreaterEqual, Token::RightShift, Token::ShrEqual, ]; for expected_token in expected { assert_eq!(lexer.next_token(), expected_token); } } #[test] fn test_numbers() { let input = "42 0x2A 0b101010 123_456"; let mut lexer = Lexer::new(input); assert_eq!(lexer.next_token(), Token::Integer(42)); assert_eq!(lexer.next_token(), Token::Integer(42)); assert_eq!(lexer.next_token(), Token::Integer(42)); assert_eq!(lexer.next_token(), Token::Integer(123456)); } #[test] fn test_namespaced_identifier() { let input = "print::println std::io::read"; let mut lexer = Lexer::new(input); let first = lexer.next_token(); if let Token::Identifier(name) = first { assert_eq!(name.namespace, Some("print".to_string())); assert_eq!(name.name, "println"); } else { panic!("Expected namespaced identifier"); } } }