Files
damn_simple_architecture/compiler/src/lexer.rs
T

359 lines
10 KiB
Rust

use std::iter::Peekable;
use std::str::Chars;
/// A single lexical token produced by the lexer.
///
/// `Eq` is derived alongside `PartialEq` because every payload (`String`,
/// `i64`) has total equality; this lets tokens be used wherever an `Eq`
/// bound is required.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum Token {
    // Keywords
    /// `fn`
    Fn,
    /// `let`
    Let,
    /// `if`
    If,
    /// `else`
    Else,
    /// `loop`
    Loop,
    /// `break`
    Break,
    /// `return`
    Return,
    /// `continue`
    Continue,
    /// `include`
    Include,

    // Identifiers and literals
    /// A name that is not a keyword; carries the name's text.
    Identifier(String),
    /// A double-quoted string literal; carries the text between the quotes.
    String(String),
    /// An integer literal.
    Number(i64),

    // Symbols
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `;`
    Semicolon,
    /// `:`
    Colon,
    /// `,`
    Comma,
    // NOTE: a `Pipe` (`|`) variant is currently disabled.

    // Operators
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `=`
    Assign,
    /// `==`
    EqualEqual,
    /// `!`
    Bang,
    /// `!=`
    BangEqual,
    /// `<`
    Less,
    /// `<=`
    LessEqual,
    /// `>`
    Greater,
    /// `>=`
    GreaterEqual,
    /// `->`
    RightArrow,

    // Special
    /// End of input.
    Eof,
}
impl Token {
    /// Returns the name of this token's variant (its "token type") as text,
    /// ignoring any payload: `Token::Number(7).tt()` is `"Number"`.
    ///
    /// Every name is a string literal, so the result is `&'static str` and
    /// may outlive the token it was obtained from.
    pub fn tt(&self) -> &'static str {
        match self {
            // Keywords
            Token::Fn => "Fn",
            Token::Let => "Let",
            Token::If => "If",
            Token::Else => "Else",
            Token::Loop => "Loop",
            Token::Break => "Break",
            Token::Return => "Return",
            Token::Continue => "Continue",
            Token::Include => "Include",
            // Identifiers and literals
            Token::Identifier(_) => "Identifier",
            Token::String(_) => "String",
            Token::Number(_) => "Number",
            // Symbols
            Token::LeftParen => "LeftParen",
            Token::RightParen => "RightParen",
            Token::LeftBrace => "LeftBrace",
            Token::RightBrace => "RightBrace",
            Token::Semicolon => "Semicolon",
            Token::Colon => "Colon",
            Token::Comma => "Comma",
            // Operators
            Token::Plus => "Plus",
            Token::Minus => "Minus",
            Token::Star => "Star",
            Token::Slash => "Slash",
            Token::Assign => "Assign",
            Token::EqualEqual => "EqualEqual",
            Token::Bang => "Bang",
            Token::BangEqual => "BangEqual",
            Token::Less => "Less",
            Token::LessEqual => "LessEqual",
            Token::Greater => "Greater",
            Token::GreaterEqual => "GreaterEqual",
            Token::RightArrow => "RightArrow",
            // Special
            Token::Eof => "Eof",
        }
    }
}
/// Hand-written lexer that turns source text into [`Token`]s, one per
/// `next_token` call.
#[derive(Debug)]
pub struct Lexer<'a> {
/// Characters of the input that come after `current`.
chars: Peekable<Chars<'a>>,
/// Character the lexer is positioned on; `None` once the input is exhausted.
current: Option<char>,
/// Line counter: starts at 1 and is incremented for `\n` characters the
/// lexer consumes while skipping whitespace.
line: usize,
}
impl<'a> Lexer<'a> {
    /// Creates a lexer over `input`, positioned on its first character and
    /// with the line counter at 1.
    pub fn new(input: &'a str) -> Self {
        let mut chars = input.chars().peekable();
        let current = chars.next();
        Lexer {
            chars,
            current,
            line: 1,
        }
    }

    /// Steps to the next character, stores it in `current` and returns it.
    /// Returns `None` once the input is exhausted.
    fn advance(&mut self) -> Option<char> {
        self.current = self.chars.next();
        self.current
    }

    /// Looks at the character after `current` without consuming it.
    fn peek(&mut self) -> Option<&char> {
        self.chars.peek()
    }

    /// Skips whitespace starting at `current`, counting each `\n` in `line`.
    fn skip_whitespace(&mut self) {
        while let Some(c) = self.current {
            if !c.is_whitespace() {
                break;
            }
            if c == '\n' {
                self.line += 1;
            }
            self.advance();
        }
    }

    /// Collects `current` plus every directly following character accepted
    /// by `keep`. On return `current` is the last character of the run, so
    /// the caller's usual single `advance` steps past it.
    fn read_while(&mut self, keep: impl Fn(char) -> bool) -> String {
        let mut text = String::new();
        if let Some(first) = self.current {
            text.push(first);
        }
        while let Some(c) = self.peek().copied() {
            if !keep(c) {
                break;
            }
            text.push(c);
            self.advance();
        }
        text
    }

    /// Reads the identifier (or keyword) that starts at `current`.
    fn read_identifier(&mut self) -> String {
        self.read_while(|c| c.is_alphanumeric() || c == '_')
    }

    /// Reads the decimal integer that starts at `current`.
    ///
    /// The leading digit in `current` is part of the number. A literal too
    /// large for `i64` still falls back to `0`, because `Token` has no way
    /// to report an error.
    fn read_number(&mut self) -> i64 {
        self.read_while(|c| c.is_ascii_digit()).parse().unwrap_or(0)
    }

    /// Consumes the next character if it equals `expected`; reports whether
    /// it did.
    fn match_next(&mut self, expected: char) -> bool {
        match self.peek() {
            Some(&c) if c == expected => {
                self.advance();
                true
            }
            _ => false,
        }
    }

    /// Returns `two_char` if `current` is followed by `second` (consuming
    /// it), otherwise `one_char`.
    fn one_or_two(&mut self, second: char, two_char: Token, one_char: Token) -> Token {
        if self.match_next(second) {
            two_char
        } else {
            one_char
        }
    }

    /// Reads a string literal whose opening quote is in `current`.
    ///
    /// Stops on the closing quote (left in `current` for the caller to step
    /// past) or at end of input if the literal is unterminated. There are no
    /// escape sequences. Newlines inside the literal are counted in `line`.
    fn read_string(&mut self) -> Token {
        let mut text = String::new();
        // The first `advance` steps past the opening quote.
        while let Some(c) = self.advance() {
            if c == '"' {
                break;
            }
            if c == '\n' {
                self.line += 1;
            }
            text.push(c);
        }
        Token::String(text)
    }

    /// Maps a scanned word to its keyword token, or wraps it as an identifier.
    fn keyword_or_identifier(word: String) -> Token {
        match word.as_str() {
            "fn" => Token::Fn,
            "let" => Token::Let,
            "if" => Token::If,
            "else" => Token::Else,
            "loop" => Token::Loop,
            "break" => Token::Break,
            "return" => Token::Return,
            "continue" => Token::Continue,
            "include" => Token::Include,
            _ => Token::Identifier(word),
        }
    }

    /// Returns the next token, or `Token::Eof` once the input is exhausted
    /// (and on every call after that).
    ///
    /// Whitespace is skipped. Characters that start no token (including `|`,
    /// which has no token at the moment) are silently skipped as well.
    pub fn next_token(&mut self) -> Token {
        // A loop, not recursion, so a long run of unknown characters cannot
        // overflow the stack.
        loop {
            self.skip_whitespace();
            let c = match self.current {
                Some(c) => c,
                None => return Token::Eof,
            };
            let token = match c {
                '(' => Token::LeftParen,
                ')' => Token::RightParen,
                '{' => Token::LeftBrace,
                '}' => Token::RightBrace,
                ';' => Token::Semicolon,
                ':' => Token::Colon,
                ',' => Token::Comma,
                '+' => Token::Plus,
                '*' => Token::Star,
                '/' => Token::Slash,
                '-' => self.one_or_two('>', Token::RightArrow, Token::Minus),
                '!' => self.one_or_two('=', Token::BangEqual, Token::Bang),
                '=' => self.one_or_two('=', Token::EqualEqual, Token::Assign),
                '<' => self.one_or_two('=', Token::LessEqual, Token::Less),
                '>' => self.one_or_two('=', Token::GreaterEqual, Token::Greater),
                '"' => self.read_string(),
                _ if c.is_alphabetic() || c == '_' => {
                    Self::keyword_or_identifier(self.read_identifier())
                }
                _ if c.is_ascii_digit() => Token::Number(self.read_number()),
                _ => {
                    // Skip unknown characters for now.
                    self.advance();
                    continue;
                }
            };
            // Every branch above leaves `current` on the token's last
            // character; step past it.
            self.advance();
            return token;
        }
    }
}
/// Iterating a `Lexer` yields its tokens in order and ends at end of input;
/// `Token::Eof` itself is never yielded.
impl<'a> Iterator for Lexer<'a> {
    type Item = Token;

    /// Pulls one token from `next_token`, turning `Token::Eof` into `None`.
    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();
        if token == Token::Eof {
            None
        } else {
            Some(token)
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for an identifier token.
    fn ident(name: &str) -> Token {
        Token::Identifier(name.to_string())
    }

    /// Calls `next_token` once per expected token and checks each in order.
    fn assert_tokens(lexer: &mut Lexer<'_>, expected: &[Token]) {
        for want in expected {
            assert_eq!(lexer.next_token(), *want);
        }
    }

    /// Keywords are recognised and the stream ends with `Eof`.
    #[test]
    fn test_keywords() {
        let mut lexer = Lexer::new("if else loop break return continue");
        assert_tokens(
            &mut lexer,
            &[
                Token::If,
                Token::Else,
                Token::Loop,
                Token::Break,
                Token::Return,
                Token::Continue,
                Token::Eof,
            ],
        );
    }

    /// Identifiers (including a leading underscore and trailing digits) and
    /// integer literals.
    #[test]
    fn test_identifiers_and_numbers() {
        let mut lexer = Lexer::new("x y42 _test 123 45");
        assert_tokens(
            &mut lexer,
            &[
                ident("x"),
                ident("y42"),
                ident("_test"),
                Token::Number(123),
                Token::Number(45),
                Token::Eof,
            ],
        );
    }

    /// One- and two-character comparison and assignment operators.
    #[test]
    fn test_operators() {
        let mut lexer = Lexer::new("= == ! != < <= > >=");
        assert_tokens(
            &mut lexer,
            &[
                Token::Assign,
                Token::EqualEqual,
                Token::Bang,
                Token::BangEqual,
                Token::Less,
                Token::LessEqual,
                Token::Greater,
                Token::GreaterEqual,
                Token::Eof,
            ],
        );
    }

    /// Checks the first tokens of a small example program.
    #[test]
    fn test_example_syntax() {
        let input = r#"
main: Func = | x: U32, y: U32 | {
res = add(x, y);
print(res);
if res > 10 {
print("res is greater than 10");
}
}
"#;
        let mut lexer = Lexer::new(input);
        // Skip whitespace and newlines
        while lexer.current.map_or(false, char::is_whitespace) {
            lexer.advance();
        }
        // Test the first few tokens. `|` has no token at the moment, so
        // nothing is expected between `=` and `x`.
        assert_tokens(
            &mut lexer,
            &[
                ident("main"),
                Token::Colon,
                ident("Func"),
                Token::Assign,
                ident("x"),
                Token::Colon,
                ident("U32"),
                Token::Comma,
            ],
        );
        // The rest of the tokens would be tested similarly
    }
}