Files
damn_simple_architecture/compiler/src/frontend/dsc/lexer.rs
T
zxq5 328741eb51 updated compiler with support for more operators.
(only the unary operators from this are implemented for now)
2026-02-08 20:03:31 +00:00

908 lines
26 KiB
Rust

use std::iter::Peekable;
use std::str::Chars;
/// A single lexical token produced by the lexer.
///
/// Keywords, delimiters and operators are unit variants; identifiers and
/// literals carry their value.
#[derive(Debug, PartialEq, Clone)]
pub enum Token {
// Keywords
Fn,
Let,
If,
Else,
Loop,
While,
Break,
Return,
Continue,
Include,
Static,
Const,
As,
SizeOf,
// Identifiers and literals
/// Identifier, optionally qualified as `namespace::name`.
Identifier(Name),
/// String literal contents with escape sequences already decoded.
String(String),
/// Integer literal (decimal, `0x` hexadecimal or `0b` binary).
Integer(u64),
/// Character literal with any escape sequence already decoded.
Char(char),
// Delimiters
LeftParen, // (
RightParen, // )
LeftBrace, // {
RightBrace, // }
LeftBracket, // [
RightBracket, // ]
Semicolon, // ;
Colon, // :
Comma, // ,
Dot, // .
RightArrow, // ->
// Arithmetic operators
Plus, // +
Minus, // -
Star, // *
Slash, // /
Percent, // %
PlusPlus, // ++
MinusMinus, // --
// Bitwise operators
Ampersand, // &
Pipe, // |
Caret, // ^
Tilde, // ~
LeftShift, // <<
RightShift, // >>
// Logical operators
Bang, // !
LogicalAnd, // &&
LogicalOr, // ||
// Comparison operators
EqualEqual, // ==
BangEqual, // !=
Less, // <
LessEqual, // <=
Greater, // >
GreaterEqual, // >=
// Assignment operators
Assign, // =
PlusEqual, // +=
MinusEqual, // -=
StarEqual, // *=
SlashEqual, // /=
PercentEqual, // %=
AndEqual, // &=
OrEqual, // |=
XorEqual, // ^=
ShlEqual, // <<=
ShrEqual, // >>=
// Special
/// End of input; also what the `Iterator` impl translates into `None`.
Eof,
}
use crate::model::Name;
use std::fmt;
/// Renders a name as `namespace::name` when it is qualified, or as the bare
/// name otherwise.
impl fmt::Display for Name {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match &self.namespace {
            Some(ns) => write!(f, "{}::{}", ns, self.name),
            None => write!(f, "{}", self.name),
        }
    }
}
impl Token {
pub fn tt(&self) -> &str {
match self {
Token::Const => "Const",
Token::Static => "Static",
Token::Include => "Include",
Token::Fn => "Fn",
Token::If => "If",
Token::Let => "Let",
Token::Else => "Else",
Token::Loop => "Loop",
Token::While => "While",
Token::Break => "Break",
Token::Return => "Return",
Token::Continue => "Continue",
Token::As => "As",
Token::Identifier(_) => "Identifier",
Token::String(_) => "String",
Token::Integer(_) => "UnsignedInt",
Token::Char(_) => "Char",
Token::LeftParen => "LeftParen",
Token::RightParen => "RightParen",
Token::LeftBrace => "LeftBrace",
Token::RightBrace => "RightBrace",
Token::LeftBracket => "LeftBracket",
Token::RightBracket => "RightBracket",
Token::Semicolon => "Semicolon",
Token::Colon => "Colon",
Token::Comma => "Comma",
Token::Dot => "Dot",
Token::RightArrow => "RightArrow",
Token::Plus => "Plus",
Token::Minus => "Minus",
Token::Star => "Star",
Token::Slash => "Slash",
Token::Percent => "Percent",
Token::PlusPlus => "PlusPlus",
Token::MinusMinus => "MinusMinus",
Token::Ampersand => "Ampersand",
Token::Pipe => "Pipe",
Token::Caret => "Caret",
Token::Tilde => "Tilde",
Token::LeftShift => "LeftShift",
Token::RightShift => "RightShift",
Token::Bang => "Bang",
Token::LogicalAnd => "LogicalAnd",
Token::LogicalOr => "LogicalOr",
Token::EqualEqual => "EqualEqual",
Token::BangEqual => "BangEqual",
Token::Less => "Less",
Token::LessEqual => "LessEqual",
Token::Greater => "Greater",
Token::GreaterEqual => "GreaterEqual",
Token::Assign => "Assign",
Token::PlusEqual => "PlusEqual",
Token::MinusEqual => "MinusEqual",
Token::StarEqual => "StarEqual",
Token::SlashEqual => "SlashEqual",
Token::PercentEqual => "PercentEqual",
Token::AndEqual => "AndEqual",
Token::OrEqual => "OrEqual",
Token::XorEqual => "XorEqual",
Token::ShlEqual => "ShlEqual",
Token::ShrEqual => "ShrEqual",
Token::SizeOf => "SizeOf",
Token::Eof => "Eof",
}
}
}
/// Character-by-character lexer over a borrowed source string.
///
/// The lexer always sits *on* one character (`current`); `chars` holds
/// everything that comes after it.
pub struct Lexer<'a> {
// Remaining input after `current`, with one-character peeking support.
chars: Peekable<Chars<'a>>,
// Character the lexer is positioned on; `None` once the input is exhausted.
current: Option<char>,
// Line counter, starting at 1 and bumped as newlines are consumed; used in error messages.
line: usize,
}
impl<'a> Lexer<'a> {
pub fn new(input: &'a str) -> Self {
let mut chars = input.chars().peekable();
let current = chars.next();
Lexer {
chars,
current,
line: 1,
}
}
// ========================================================================
// Character Navigation
// ========================================================================
/// Steps onto the next character and returns it (`None` at end of input).
fn advance(&mut self) -> Option<char> {
    let next = self.chars.next();
    self.current = next;
    next
}
/// Returns the character right after `current` without consuming it.
fn peek(&mut self) -> Option<char> {
    let upcoming = self.chars.peek();
    upcoming.copied()
}
/// Returns the character two positions after `current` without consuming
/// anything.
fn peek_second(&mut self) -> Option<char> {
    // Work on a copy of the iterator so the real position is untouched;
    // index 0 is what `peek` would return, index 1 is the one after it.
    self.chars.clone().nth(1)
}
/// Consumes the next character only when it equals `expected`.
///
/// Returns `true` if the character was consumed, `false` if nothing moved.
fn match_next(&mut self, expected: char) -> bool {
    let hit = self.peek() == Some(expected);
    if hit {
        self.advance();
    }
    hit
}
// ========================================================================
// Whitespace and Comments
// ========================================================================
/// Moves past any run of whitespace starting at `current`, bumping the line
/// counter for every newline crossed.
fn skip_whitespace(&mut self) {
    while let Some(space) = self.current.filter(|c| c.is_whitespace()) {
        if space == '\n' {
            self.line += 1;
        }
        self.advance();
    }
}
/// Skips a `//` comment. Expects `current` to be the first `/`.
///
/// Stops just after the terminating newline (which is counted), or at end
/// of input.
fn skip_line_comment(&mut self) {
    // Step over the two slashes.
    self.advance();
    self.advance();
    loop {
        match self.current {
            None => return,
            Some('\n') => {
                self.line += 1;
                self.advance();
                return;
            }
            Some(_) => {
                self.advance();
            }
        }
    }
}
/// Skips a `/* ... */` comment. Expects `current` to be the opening `/`.
///
/// Newlines inside the comment are counted. Returns `Err` with a message
/// naming the opening line when the input ends before `*/` is found.
fn skip_block_comment(&mut self) -> Result<(), String> {
    let opened_at = self.line;
    // Step over the opening "/*".
    self.advance();
    self.advance();
    loop {
        match (self.current, self.peek()) {
            (None, _) => break,
            (Some('*'), Some('/')) => {
                // Step over the closing "*/".
                self.advance();
                self.advance();
                return Ok(());
            }
            (Some('\n'), _) => self.line += 1,
            _ => {}
        }
        self.advance();
    }
    Err(format!(
        "Unterminated block comment starting at line {}",
        opened_at
    ))
}
/// Skips any interleaving of whitespace, `//` comments and `/* */` comments
/// until `current` is the start of a real token or the input ends.
///
/// An unterminated block comment is reported through `error`.
fn skip_whitespace_and_comments(&mut self) {
    loop {
        self.skip_whitespace();
        if self.current != Some('/') {
            return;
        }
        match self.peek() {
            Some('/') => self.skip_line_comment(),
            Some('*') => {
                if let Err(e) = self.skip_block_comment() {
                    self.error(&e);
                }
            }
            // A lone '/' is an operator, not a comment.
            _ => return,
        }
    }
}
// ========================================================================
// Identifiers and Keywords
// ========================================================================
/// Collects an identifier starting at `current`.
///
/// `current` is taken as-is (the caller is expected to have checked it);
/// following characters are taken while they are alphanumeric or `_`. On
/// return `current` is the identifier's last character.
fn read_identifier(&mut self) -> String {
    let mut text: String = self.current.into_iter().collect();
    while let Some(c) = self
        .peek()
        .filter(|c| c.is_alphanumeric() || *c == '_')
    {
        self.advance();
        text.push(c);
    }
    text
}
/// Scans a keyword, a plain identifier, or a `namespace::name` identifier.
///
/// Keywords are matched on the first word only and never take a namespace.
/// A `::` is treated as a namespace separator only when it is immediately
/// followed by an identifier start (alphabetic or `_`); otherwise the first
/// word is returned as a plain identifier and the colons are left in the
/// input for the following tokens.
fn scan_identifier_or_keyword(&mut self) -> Token {
    let first_part = self.read_identifier();

    if let Some(keyword) = self.match_keyword(&first_part) {
        return keyword;
    }

    // `self.chars` begins right after `current`: positions 0 and 1 must be
    // the two colons and position 2 the first character of the name.
    // Checking position 2 up front keeps `read_identifier` from blindly
    // swallowing a non-identifier character (e.g. in `foo::+` or `foo:: x`).
    let mut lookahead = self.chars.clone();
    let is_qualified = lookahead.next() == Some(':')
        && lookahead.next() == Some(':')
        && matches!(lookahead.next(), Some(c) if c.is_alphabetic() || c == '_');

    if is_qualified {
        self.advance(); // onto the first ':'
        self.advance(); // onto the second ':'
        self.advance(); // onto the first character of the name
        let second_part = self.read_identifier();
        return Token::Identifier(Name {
            namespace: Some(first_part),
            name: second_part,
        });
    }

    Token::Identifier(Name {
        namespace: None,
        name: first_part,
    })
}
/// Maps a word to its keyword token, or `None` when it is not a keyword.
fn match_keyword(&self, word: &str) -> Option<Token> {
    let keyword = match word {
        "fn" => Token::Fn,
        "let" => Token::Let,
        "if" => Token::If,
        "else" => Token::Else,
        "loop" => Token::Loop,
        "while" => Token::While,
        "break" => Token::Break,
        "return" => Token::Return,
        "continue" => Token::Continue,
        "include" => Token::Include,
        "const" => Token::Const,
        "static" => Token::Static,
        "as" => Token::As,
        "sizeof" => Token::SizeOf,
        _ => return None,
    };
    Some(keyword)
}
// ========================================================================
// Numbers
// ========================================================================
/// Scans an integer literal.
///
/// On a malformed literal the error is reported, any directly following
/// alphanumeric/`_` characters are skipped, and `Token::Integer(0)` is
/// returned as a placeholder.
fn scan_number(&mut self) -> Token {
    let message = match self.read_number() {
        Ok(value) => return Token::Integer(value),
        Err(message) => message,
    };
    self.error(&message);
    // Discard the rest of the bad literal.
    while matches!(self.peek(), Some(c) if c.is_alphanumeric() || c == '_') {
        self.advance();
    }
    Token::Integer(0)
}
/// Reads an integer literal starting at `current`, dispatching on an
/// optional `0x`/`0X` (hexadecimal) or `0b`/`0B` (binary) prefix.
///
/// # Errors
/// Returns a message when a prefix is not followed by at least one digit
/// (or `_` separator), or when the digits do not form a valid `u64`.
fn read_number(&mut self) -> Result<u64, String> {
    if self.current == Some('0') {
        let marker = self.peek();
        // Look past the marker BEFORE consuming it: stepping onto a
        // character that is not part of the literal would silently drop it
        // (`0x;` used to lose the `;`, and `0x 5` lexed as the integer 5).
        let first_digit = self.peek_second();
        match marker {
            Some('x') | Some('X') => {
                self.advance(); // onto the 'x'
                if !matches!(first_digit, Some(d) if d.is_ascii_hexdigit() || d == '_') {
                    return Err("Invalid hexadecimal number: no digits after 0x".to_string());
                }
                self.advance(); // onto the first digit / separator
                return self.read_hex_number();
            }
            Some('b') | Some('B') => {
                self.advance(); // onto the 'b'
                if !matches!(first_digit, Some('0') | Some('1') | Some('_')) {
                    return Err("Invalid binary number: no digits after 0b".to_string());
                }
                self.advance(); // onto the first digit / separator
                return self.read_binary_number();
            }
            _ => {}
        }
    }
    self.read_decimal_number()
}
/// Reads a decimal literal whose first digit is `current`.
///
/// `_` separators are consumed and ignored. Returns an error message when
/// the collected digits do not parse as a `u64`.
fn read_decimal_number(&mut self) -> Result<u64, String> {
    let mut digits = String::new();
    digits.extend(self.current);
    loop {
        match self.peek() {
            Some(d) if d.is_ascii_digit() => {
                self.advance();
                digits.push(d);
            }
            Some('_') => {
                // Separator: consume, but keep it out of the digit string.
                self.advance();
            }
            _ => break,
        }
    }
    digits
        .parse::<u64>()
        .map_err(|_| format!("Invalid decimal number: {}", digits))
}
/// Reads the digits of a hexadecimal literal; `current` is the first
/// character after the `0x` prefix.
///
/// `_` separators are consumed and ignored. Returns an error message when no
/// digit was found or the digits do not fit a `u64`.
fn read_hex_number(&mut self) -> Result<u64, String> {
    // `current` only counts when it is itself a hex digit.
    let mut digits: String = self
        .current
        .filter(char::is_ascii_hexdigit)
        .into_iter()
        .collect();
    loop {
        match self.peek() {
            Some(d) if d.is_ascii_hexdigit() => {
                self.advance();
                digits.push(d);
            }
            Some('_') => {
                self.advance();
            }
            _ => break,
        }
    }
    if digits.is_empty() {
        return Err("Invalid hexadecimal number: no digits after 0x".to_string());
    }
    u64::from_str_radix(&digits, 16)
        .map_err(|_| format!("Invalid hexadecimal number: {}", digits))
}
/// Reads the digits of a binary literal; `current` is the first character
/// after the `0b` prefix.
///
/// `_` separators are consumed and ignored. Returns an error message when no
/// digit was found or the digits do not fit a `u64`.
fn read_binary_number(&mut self) -> Result<u64, String> {
    // `current` only counts when it is itself a binary digit.
    let mut bits: String = self
        .current
        .filter(|c| matches!(*c, '0' | '1'))
        .into_iter()
        .collect();
    loop {
        match self.peek() {
            Some(b @ ('0' | '1')) => {
                self.advance();
                bits.push(b);
            }
            Some('_') => {
                self.advance();
            }
            _ => break,
        }
    }
    if bits.is_empty() {
        return Err("Invalid binary number: no digits after 0b".to_string());
    }
    u64::from_str_radix(&bits, 2).map_err(|_| format!("Invalid binary number: {}", bits))
}
// ========================================================================
// String and Character Literals
// ========================================================================
/// Scans a string literal whose opening `"` is `current`.
///
/// On success returns `Token::String` with escape sequences decoded. On a
/// malformed literal the error is reported, input is skipped up to the next
/// `"` or newline, and an empty `Token::String` is returned as a placeholder.
fn scan_string(&mut self) -> Token {
    let message = match self.read_string() {
        Ok(text) => return Token::String(text),
        Err(message) => message,
    };
    self.error(&message);

    // Resynchronise at the end of the literal or the end of the line.
    while !matches!(self.current, None | Some('"') | Some('\n')) {
        self.advance();
    }
    // `next_token` steps past the character we stop on without going
    // through `skip_whitespace`, so a newline has to be counted here or
    // every later line number would be one too low.
    if self.current == Some('\n') {
        self.line += 1;
    }
    Token::String(String::new())
}
/// Reads the body of a string literal; `current` is the opening `"`.
///
/// Returns the decoded contents with `current` left on the closing quote.
/// Recognised escapes: `\n`, `\t`, `\r`, `\\`, `\"`, `\'`, `\0`.
///
/// # Errors
/// Returns a message for a raw newline inside the literal, end of input
/// before the closing quote, or an unknown escape sequence.
fn read_string(&mut self) -> Result<String, String> {
    self.advance(); // step off the opening quote
    let mut text = String::new();
    loop {
        let c = match self.current {
            None => return Err("Unterminated string literal".to_string()),
            Some('"') => return Ok(text),
            Some('\n') => return Err("Unterminated string literal (newline)".to_string()),
            Some(c) => c,
        };

        if c != '\\' {
            text.push(c);
            self.advance();
            continue;
        }

        // Escape sequence: decode the character following the backslash.
        self.advance();
        let decoded = match self.current {
            Some('n') => '\n',
            Some('t') => '\t',
            Some('r') => '\r',
            Some('\\') => '\\',
            Some('"') => '"',
            Some('\'') => '\'',
            Some('0') => '\0',
            Some(other) => {
                return Err(format!("Invalid escape sequence: \\{}", other));
            }
            None => return Err("Unexpected end of string after escape".to_string()),
        };
        text.push(decoded);
        self.advance();
    }
}
/// Scans a character literal whose opening `'` is `current`.
///
/// On success returns `Token::Char` with any escape decoded. On a malformed
/// literal the error is reported, input is skipped up to the next `'` or
/// newline, and `Token::Char('\0')` is returned as a placeholder.
fn scan_char(&mut self) -> Token {
    let message = match self.read_char() {
        Ok(ch) => return Token::Char(ch),
        Err(message) => message,
    };
    self.error(&message);

    // Resynchronise at the end of the literal or the end of the line.
    while !matches!(self.current, None | Some('\'') | Some('\n')) {
        self.advance();
    }
    // `next_token` steps past the character we stop on without going
    // through `skip_whitespace`, so a newline has to be counted here or
    // every later line number would be one too low.
    if self.current == Some('\n') {
        self.line += 1;
    }
    Token::Char('\0')
}
/// Reads the body of a character literal; `current` is the opening `'`.
///
/// Returns the (decoded) character with `current` left on the closing quote.
/// Recognised escapes: `\n`, `\t`, `\r`, `\\`, `\'`, `\"`, `\0`.
///
/// # Errors
/// Returns a message for an empty literal, a newline or end of input in
/// place of the character, an unknown escape, or a missing closing quote.
fn read_char(&mut self) -> Result<char, String> {
    self.advance(); // step off the opening quote

    let first = match self.current {
        None | Some('\n') => return Err("Unterminated character literal".to_string()),
        Some('\'') => return Err("Empty character literal".to_string()),
        Some(c) => c,
    };

    let value = if first == '\\' {
        self.advance();
        match self.current {
            Some('n') => '\n',
            Some('t') => '\t',
            Some('r') => '\r',
            Some('\\') => '\\',
            Some('\'') => '\'',
            Some('"') => '"',
            Some('0') => '\0',
            Some(other) => return Err(format!("Invalid escape sequence: \\{}", other)),
            None => {
                return Err("Unexpected end after escape in char literal".to_string());
            }
        }
    } else {
        first
    };

    self.advance(); // should land on the closing quote
    if self.current == Some('\'') {
        Ok(value)
    } else {
        Err("Character literal must contain exactly one character".to_string())
    }
}
// ========================================================================
// Operators and Punctuation
// ========================================================================
fn scan_operator(&mut self, c: char) -> Token {
match c {
// Single-character tokens that can't be extended
'(' => Token::LeftParen,
')' => Token::RightParen,
'{' => Token::LeftBrace,
'}' => Token::RightBrace,
'[' => Token::LeftBracket,
']' => Token::RightBracket,
';' => Token::Semicolon,
',' => Token::Comma,
'.' => Token::Dot,
'~' => Token::Tilde,
':' => Token::Colon, // '::' is handled in identifier scanning
// Operators that may have compound forms
'+' => {
if self.match_next('+') {
Token::PlusPlus
} else if self.match_next('=') {
Token::PlusEqual
} else {
Token::Plus
}
}
'-' => {
if self.match_next('-') {
Token::MinusMinus
} else if self.match_next('>') {
Token::RightArrow
} else if self.match_next('=') {
Token::MinusEqual
} else {
Token::Minus
}
}
'*' => {
if self.match_next('=') {
Token::StarEqual
} else {
Token::Star
}
}
'/' => {
// Comments are handled in skip_whitespace_and_comments
if self.match_next('=') {
Token::SlashEqual
} else {
Token::Slash
}
}
'%' => {
if self.match_next('=') {
Token::PercentEqual
} else {
Token::Percent
}
}
'&' => {
if self.match_next('&') {
Token::LogicalAnd
} else if self.match_next('=') {
Token::AndEqual
} else {
Token::Ampersand
}
}
'|' => {
if self.match_next('|') {
Token::LogicalOr
} else if self.match_next('=') {
Token::OrEqual
} else {
Token::Pipe
}
}
'^' => {
if self.match_next('=') {
Token::XorEqual
} else {
Token::Caret
}
}
'!' => {
if self.match_next('=') {
Token::BangEqual
} else {
Token::Bang
}
}
'=' => {
if self.match_next('=') {
Token::EqualEqual
} else {
Token::Assign
}
}
'<' => {
if self.match_next('<') {
if self.match_next('=') {
Token::ShlEqual
} else {
Token::LeftShift
}
} else if self.match_next('=') {
Token::LessEqual
} else {
Token::Less
}
}
'>' => {
if self.match_next('>') {
if self.match_next('=') {
Token::ShrEqual
} else {
Token::RightShift
}
} else if self.match_next('=') {
Token::GreaterEqual
} else {
Token::Greater
}
}
_ => {
self.error(&format!("Unexpected character: '{}'", c));
Token::Eof // This shouldn't happen
}
}
}
// ========================================================================
// Main Token Scanning
// ========================================================================
/// Returns the next token from the input, or `Token::Eof` once the input is
/// exhausted.
///
/// Whitespace and comments are skipped first. Every scanner leaves
/// `current` on the last character of its token, so this method steps past
/// it before returning. Characters that cannot start any token are reported
/// and skipped rather than ending the token stream.
pub fn next_token(&mut self) -> Token {
    loop {
        self.skip_whitespace_and_comments();

        let Some(c) = self.current else {
            return Token::Eof;
        };

        let token = match c {
            // Identifiers and keywords
            'a'..='z' | 'A'..='Z' | '_' => self.scan_identifier_or_keyword(),
            // Numbers
            '0'..='9' => self.scan_number(),
            // String literals
            '"' => self.scan_string(),
            // Character literals
            '\'' => self.scan_char(),
            // Operators and punctuation
            _ => self.scan_operator(c),
        };
        self.advance();

        // Real end of input returned above, so `Eof` here can only be
        // `scan_operator`'s placeholder for an unexpected character (already
        // reported). Returning it would make callers, including the
        // `Iterator` impl, stop in the middle of the file; skip the
        // offending character and keep lexing instead.
        if !matches!(token, Token::Eof) {
            return token;
        }
    }
}
// ========================================================================
// Error Handling
// ========================================================================
/// Reports a lexing problem on stderr, prefixed with the current line number.
fn error(&self, message: &str) {
    let line = self.line;
    eprintln!("Lexer error on line {}: {}", line, message);
}
}
// ========================================================================
// Iterator Implementation
// ========================================================================
/// Lets a `Lexer` be driven as an iterator of tokens; iteration ends when
/// `next_token` yields `Token::Eof`.
impl<'a> Iterator for Lexer<'a> {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();
        if matches!(token, Token::Eof) {
            None
        } else {
            Some(token)
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every operator, in single, doubled and assignment forms, lexes to its
    /// own token.
    #[test]
    fn test_operators() {
        let mut lexer = Lexer::new(
            "+ ++ += - -- -= * *= / /= % %= & &= && | |= || ^ ^= ! != = == < <= << <<= > >= >> >>=",
        );
        let expected = vec![
            // + family
            Token::Plus,
            Token::PlusPlus,
            Token::PlusEqual,
            // - family
            Token::Minus,
            Token::MinusMinus,
            Token::MinusEqual,
            // * / %
            Token::Star,
            Token::StarEqual,
            Token::Slash,
            Token::SlashEqual,
            Token::Percent,
            Token::PercentEqual,
            // & | ^
            Token::Ampersand,
            Token::AndEqual,
            Token::LogicalAnd,
            Token::Pipe,
            Token::OrEqual,
            Token::LogicalOr,
            Token::Caret,
            Token::XorEqual,
            // ! =
            Token::Bang,
            Token::BangEqual,
            Token::Assign,
            Token::EqualEqual,
            // < family
            Token::Less,
            Token::LessEqual,
            Token::LeftShift,
            Token::ShlEqual,
            // > family
            Token::Greater,
            Token::GreaterEqual,
            Token::RightShift,
            Token::ShrEqual,
        ];
        expected
            .into_iter()
            .for_each(|want| assert_eq!(lexer.next_token(), want));
    }

    /// Decimal, hexadecimal, binary and underscore-separated literals.
    #[test]
    fn test_numbers() {
        let mut lexer = Lexer::new("42 0x2A 0b101010 123_456");
        for want in [42u64, 42, 42, 123456] {
            assert_eq!(lexer.next_token(), Token::Integer(want));
        }
    }

    /// `a::b` lexes as one identifier with `a` as its namespace.
    #[test]
    fn test_namespaced_identifier() {
        let mut lexer = Lexer::new("print::println std::io::read");
        let Token::Identifier(name) = lexer.next_token() else {
            panic!("Expected namespaced identifier");
        };
        assert_eq!(name.namespace, Some("print".to_string()));
        assert_eq!(name.name, "println");
    }
}