359 lines
10 KiB
Rust
359 lines
10 KiB
Rust
use std::iter::Peekable;
|
|
use std::str::Chars;
|
|
|
|
/// A single lexical token.
///
/// Keyword and symbol variants carry no data; identifiers, string literals and
/// numbers carry the text or value that was read from the input.
#[derive(Debug, PartialEq, Clone)]
pub enum Token {
    // Keywords
    /// `fn`
    Fn,
    /// `let`
    Let,
    /// `if`
    If,
    /// `else`
    Else,
    /// `loop`
    Loop,
    /// `break`
    Break,
    /// `return`
    Return,
    /// `continue`
    Continue,
    /// `include`
    Include,

    // Identifiers and literals
    /// A name that is not one of the keywords.
    Identifier(String),
    /// The contents of a double-quoted string literal, without the quotes.
    String(String),
    /// An integer literal.
    Number(i64),

    // Symbols
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `;`
    Semicolon,
    /// `:`
    Colon,
    /// `,`
    Comma,
    // NOTE: a `Pipe` (`|`) variant is intentionally disabled for now; the
    // lexer currently treats `|` like any other unknown character.

    // Operators
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `=`
    Assign,
    /// `==`
    EqualEqual,
    /// `!`
    Bang,
    /// `!=`
    BangEqual,
    /// `<`
    Less,
    /// `<=`
    LessEqual,
    /// `>`
    Greater,
    /// `>=`
    GreaterEqual,
    /// `->`
    RightArrow,

    // Special
    /// End of input.
    Eof,
}
|
|
|
|
impl Token {
|
|
pub fn tt(&self) -> &str {
|
|
match self {
|
|
Token::Include => "Include",
|
|
Token::Fn => "Fn",
|
|
Token::If => "If",
|
|
Token::Let => "Let",
|
|
Token::Else => "Else",
|
|
Token::Loop => "Loop",
|
|
Token::Break => "Break",
|
|
Token::Return => "Return",
|
|
Token::Continue => "Continue",
|
|
Token::Identifier(_) => "Identifier",
|
|
Token::String(_) => "String",
|
|
Token::Number(_) => "Number",
|
|
Token::LeftParen => "LeftParen",
|
|
Token::RightParen => "RightParen",
|
|
Token::LeftBrace => "LeftBrace",
|
|
Token::RightBrace => "RightBrace",
|
|
Token::Semicolon => "Semicolon",
|
|
Token::Colon => "Colon",
|
|
Token::Comma => "Comma",
|
|
Token::RightArrow => "RightArrow",
|
|
// Token::Pipe => "Pipe",
|
|
Token::Plus => "Plus",
|
|
Token::Minus => "Minus",
|
|
Token::Star => "Star",
|
|
Token::Slash => "Slash",
|
|
Token::Assign => "Assign",
|
|
Token::EqualEqual => "EqualEqual",
|
|
Token::Bang => "Bang",
|
|
Token::BangEqual => "BangEqual",
|
|
Token::Less => "Less",
|
|
Token::LessEqual => "LessEqual",
|
|
Token::Greater => "Greater",
|
|
Token::GreaterEqual => "GreaterEqual",
|
|
Token::Eof => "Eof",
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Character-level scanner that turns source text into [`Token`]s.
#[derive(Debug)]
pub struct Lexer<'a> {
    /// Characters that come *after* `current`; peeking here gives one
    /// character of lookahead.
    chars: Peekable<Chars<'a>>,
    /// The character the lexer is positioned on, or `None` at end of input.
    current: Option<char>,
    /// Current line number; starts at 1 and advances as newlines are consumed.
    line: usize,
}
|
|
|
|
impl<'a> Lexer<'a> {
|
|
pub fn new(input: &'a str) -> Self {
|
|
let mut chars = input.chars().peekable();
|
|
let current = chars.next();
|
|
|
|
Lexer {
|
|
chars,
|
|
current,
|
|
line: 1,
|
|
}
|
|
}
|
|
|
|
fn advance(&mut self) -> Option<char> {
|
|
self.current = self.chars.next();
|
|
self.current
|
|
}
|
|
|
|
fn peek(&mut self) -> Option<&char> {
|
|
self.chars.peek()
|
|
}
|
|
|
|
fn skip_whitespace(&mut self) {
|
|
while let Some(c) = self.current {
|
|
if !c.is_whitespace() {
|
|
break;
|
|
}
|
|
if c == '\n' {
|
|
self.line += 1;
|
|
}
|
|
self.advance();
|
|
}
|
|
}
|
|
|
|
fn read_identifier(&mut self) -> String {
|
|
let mut ident = String::new();
|
|
while let Some(&c) = self.peek() {
|
|
if c.is_alphanumeric() || c == '_' {
|
|
ident.push(c);
|
|
self.advance();
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
ident
|
|
}
|
|
|
|
fn read_number(&mut self) -> i64 {
|
|
let mut num_str = String::new();
|
|
while let Some(&c) = self.peek() {
|
|
if c.is_ascii_digit() {
|
|
num_str.push(c);
|
|
self.advance();
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
num_str.parse().unwrap_or(0)
|
|
}
|
|
|
|
fn match_next(&mut self, expected: char) -> bool {
|
|
match self.peek() {
|
|
Some(&c) if c == expected => {
|
|
self.advance();
|
|
true
|
|
}
|
|
_ => false,
|
|
}
|
|
}
|
|
|
|
pub fn next_token(&mut self) -> Token {
|
|
self.skip_whitespace();
|
|
|
|
let token = match self.current {
|
|
Some('(') => Token::LeftParen,
|
|
Some(')') => Token::RightParen,
|
|
Some('{') => Token::LeftBrace,
|
|
Some('}') => Token::RightBrace,
|
|
Some(';') => Token::Semicolon,
|
|
Some(':') => Token::Colon,
|
|
Some(',') => Token::Comma,
|
|
// Some('|') => Token::Pipe,
|
|
Some('+') => Token::Plus,
|
|
Some('*') => Token::Star,
|
|
Some('/') => Token::Slash,
|
|
Some('-') => {
|
|
if self.match_next('>') {
|
|
Token::RightArrow
|
|
} else {
|
|
Token::Minus
|
|
}
|
|
}
|
|
Some('!') => {
|
|
if self.match_next('=') {
|
|
Token::BangEqual
|
|
} else {
|
|
Token::Bang
|
|
}
|
|
}
|
|
Some('=') => {
|
|
if self.match_next('=') {
|
|
Token::EqualEqual
|
|
} else {
|
|
Token::Assign
|
|
}
|
|
}
|
|
Some('<') => {
|
|
if self.match_next('=') {
|
|
Token::LessEqual
|
|
} else {
|
|
Token::Less
|
|
}
|
|
}
|
|
Some('>') => {
|
|
if self.match_next('=') {
|
|
Token::GreaterEqual
|
|
} else {
|
|
Token::Greater
|
|
}
|
|
}
|
|
Some('"') => {
|
|
self.advance(); // Skip the opening quote
|
|
let mut s = String::new();
|
|
while let Some(c) = self.current {
|
|
if c == '"' {
|
|
break;
|
|
}
|
|
s.push(c);
|
|
self.advance();
|
|
}
|
|
Token::String(s)
|
|
}
|
|
Some(c) => {
|
|
if c.is_alphabetic() || c == '_' {
|
|
let mut ident = c.to_string();
|
|
ident.push_str(&self.read_identifier());
|
|
match ident.as_str() {
|
|
"fn" => Token::Fn,
|
|
"if" => Token::If,
|
|
"else" => Token::Else,
|
|
"loop" => Token::Loop,
|
|
"break" => Token::Break,
|
|
"return" => Token::Return,
|
|
"continue" => Token::Continue,
|
|
"include" => Token::Include,
|
|
_ => Token::Identifier(ident),
|
|
}
|
|
} else if c.is_ascii_digit() {
|
|
Token::Number(self.read_number())
|
|
} else {
|
|
// Skip unknown characters for now
|
|
self.advance();
|
|
return self.next_token();
|
|
}
|
|
}
|
|
None => Token::Eof,
|
|
};
|
|
|
|
if token != Token::Eof {
|
|
self.advance();
|
|
}
|
|
|
|
token
|
|
}
|
|
}
|
|
|
|
impl<'a> Iterator for Lexer<'a> {
|
|
type Item = Token;
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
match self.next_token() {
|
|
Token::Eof => None,
|
|
token => Some(token),
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Keywords are recognised and separated by whitespace.
    #[test]
    fn test_keywords() {
        let input = "if else loop break return continue";
        let mut lexer = Lexer::new(input);

        assert_eq!(lexer.next_token(), Token::If);
        assert_eq!(lexer.next_token(), Token::Else);
        assert_eq!(lexer.next_token(), Token::Loop);
        assert_eq!(lexer.next_token(), Token::Break);
        assert_eq!(lexer.next_token(), Token::Return);
        assert_eq!(lexer.next_token(), Token::Continue);
        assert_eq!(lexer.next_token(), Token::Eof);
    }

    /// Identifiers may contain digits and underscores; numbers are read whole.
    #[test]
    fn test_identifiers_and_numbers() {
        let input = "x y42 _test 123 45";
        let mut lexer = Lexer::new(input);

        assert_eq!(lexer.next_token(), Token::Identifier("x".to_string()));
        assert_eq!(lexer.next_token(), Token::Identifier("y42".to_string()));
        assert_eq!(lexer.next_token(), Token::Identifier("_test".to_string()));
        assert_eq!(lexer.next_token(), Token::Number(123));
        assert_eq!(lexer.next_token(), Token::Number(45));
        assert_eq!(lexer.next_token(), Token::Eof);
    }

    /// One- and two-character operators are told apart by lookahead.
    #[test]
    fn test_operators() {
        let input = "= == ! != < <= > >=";
        let mut lexer = Lexer::new(input);

        assert_eq!(lexer.next_token(), Token::Assign);
        assert_eq!(lexer.next_token(), Token::EqualEqual);
        assert_eq!(lexer.next_token(), Token::Bang);
        assert_eq!(lexer.next_token(), Token::BangEqual);
        assert_eq!(lexer.next_token(), Token::Less);
        assert_eq!(lexer.next_token(), Token::LessEqual);
        assert_eq!(lexer.next_token(), Token::Greater);
        assert_eq!(lexer.next_token(), Token::GreaterEqual);
        assert_eq!(lexer.next_token(), Token::Eof);
    }

    /// Lexes the opening of a small sample program.
    #[test]
    fn test_example_syntax() {
        let input = r#"
            main: Func = | x: U32, y: U32 | {
                res = add(x, y);
                print(res);

                if res > 10 {
                    print("res is greater than 10");
                }
            }
        "#;

        let mut lexer = Lexer::new(input);

        // No manual whitespace skipping is needed here: `next_token` skips
        // leading whitespace (including the initial newline) itself.

        // Test the first few tokens
        assert_eq!(lexer.next_token(), Token::Identifier("main".to_string()));
        assert_eq!(lexer.next_token(), Token::Colon);
        assert_eq!(lexer.next_token(), Token::Identifier("Func".to_string()));
        assert_eq!(lexer.next_token(), Token::Assign);
        // `|` is not a token yet; the lexer skips it as an unknown character.
        assert_eq!(lexer.next_token(), Token::Identifier("x".to_string()));
        assert_eq!(lexer.next_token(), Token::Colon);
        assert_eq!(lexer.next_token(), Token::Identifier("U32".to_string()));
        assert_eq!(lexer.next_token(), Token::Comma);
        assert_eq!(lexer.next_token(), Token::Identifier("y".to_string()));
        assert_eq!(lexer.next_token(), Token::Colon);
        assert_eq!(lexer.next_token(), Token::Identifier("U32".to_string()));
        // The closing `|` is skipped as well.
        assert_eq!(lexer.next_token(), Token::LeftBrace);

        // The rest of the tokens would be tested similarly
    }
}
|