From d9807b5b361d8fe8d673439a0a240f81d2c5531b Mon Sep 17 00:00:00 2001 From: "J. Hinchliffe" Date: Wed, 25 Jun 2025 19:15:51 +0100 Subject: [PATCH] assembler: update tokeniser to allow extra prefixes and separators (0xDEAD_BEEF) --- assembler/src/source/source_info.rs | 12 +- assembler/src/source/tokeniser.rs | 21 ++- assembler/src/source/tokeniser/tests.rs | 193 ++++++++++++++++++++++++ 3 files changed, 218 insertions(+), 8 deletions(-) create mode 100644 assembler/src/source/tokeniser/tests.rs diff --git a/assembler/src/source/source_info.rs b/assembler/src/source/source_info.rs index 54f6955..30079a7 100644 --- a/assembler/src/source/source_info.rs +++ b/assembler/src/source/source_info.rs @@ -21,7 +21,7 @@ use crate::{ #[derive(Debug, Clone)] pub struct SourceInfo { /// The line number within the source file underpinned by `module_id`. - pub line_no: usize, + pub line_number: usize, pub module: Arc, /// The indexes where this token may be found (line-local). pub span: std::ops::Range, @@ -33,7 +33,7 @@ impl Display for SourceInfo { f, "{}:{}, column {}", self.module.path.display(), - self.line_no, + self.line_number, self.span.start ) } @@ -47,7 +47,7 @@ impl SourceInfo { span: std::ops::Range, ) -> Self { Self { - line_no, + line_number: line_no, module, span, } @@ -61,7 +61,7 @@ impl SourceInfo { let mut lines = LinesWithSpans::new(rdr); - let Some(line_result) = lines.nth(self.line_no - 1) else { + let Some(line_result) = lines.nth(self.line_number - 1) else { // Handle a line not existing. return Err(AssembleError::new_source_error( self.clone(), @@ -69,7 +69,7 @@ impl SourceInfo { IoErrorKind::Other, Some(format!( "the line {} does not exist in input file `{}` but source info suggested otherwise!.", - self.line_no, + self.line_number, self.module.path.display() )), )), @@ -79,7 +79,7 @@ impl SourceInfo { let line_span = line_result?; // Print the line number and line content. 
- println!("{:>4} | {}", self.line_no, line_span.content); + println!("{:>4} | {}", self.line_number, line_span.content); let mut underline = String::new(); write!(underline, "{:>4} | ", "")?; diff --git a/assembler/src/source/tokeniser.rs b/assembler/src/source/tokeniser.rs index a7e34d8..6834c73 100644 --- a/assembler/src/source/tokeniser.rs +++ b/assembler/src/source/tokeniser.rs @@ -24,6 +24,8 @@ use crate::{ }; pub mod error; +#[cfg(test)] +mod tests; /// Consumes a [`Vec`] and outputs a [`Vec`] of [Token]'s. pub struct Tokeniser { @@ -54,8 +56,10 @@ impl Tokeniser { .expect("Failed to compile label regex pattern"), register_regex: Regex::new(r"^(r[0-9]+|sp|fp|pc)") .expect("Failed to compile register regex pattern"), - immediate_regex: Regex::new(r"^(0x[0-9a-fA-F]+|[0-9]+)") - .expect("Failed to compile immediate regex pattern"), + immediate_regex: Regex::new( + r"^(0x[0-9a-fA-F_]+|0b[0-1_]+|0o[0-7_]+|[0-9][0-9_]*)", + ) + .expect("Failed to compile immediate regex pattern"), directive_regex: Regex::new(r"^\.([a-zA-Z]+)") .expect("Failed to compile directive regex pattern"), instruction_regex: Regex::new( @@ -142,6 +146,14 @@ impl Tokeniser { // Try to match a token. let (token_type, consumed) = self.match_token(remaining)?; + // Filter out comments. + if token_type == TokenType::Comment { + // Advance position. + remaining = remaining[consumed..].trim_start(); + + continue; + } + tokens.push(Token::new( token_type, SourceInfo::new( @@ -186,6 +198,11 @@ impl Tokeniser { let value_str = caps.get(1)?.as_str(); let len = caps.get(0)?.len(); + // Remove any underscores that were inserted for readability. Only `_` + // is dropped, so a leading `0x`/`0b` prefix is still there for the + // `strip_prefix` checks below. + let value_str = value_str.replace('_', ""); + + let value = if let Some(hex_part) = value_str.strip_prefix("0x") { u32::from_str_radix(hex_part, 16).ok()? 
} else if let Some(bin_part) = value_str.strip_prefix("0b") { diff --git a/assembler/src/source/tokeniser/tests.rs b/assembler/src/source/tokeniser/tests.rs new file mode 100644 index 0000000..f36f003 --- /dev/null +++ b/assembler/src/source/tokeniser/tests.rs @@ -0,0 +1,193 @@ +//! Unit tests for the tokenizer + +use crate::{ + context::AssemblerContext, + source::{ + token::{Token, TokenType}, + tokeniser::Tokeniser, + }, +}; +use std::path::PathBuf; + +/// Helper function to create a tokenizer from source text +fn create_tokenizer_from_source(source: &str) -> Tokeniser { + let data = source.as_bytes().to_vec(); + let path = PathBuf::from("test.dsa"); + Tokeniser::from_data(data, path) +} + +/// Helper function to tokenize source and return tokens +fn tokenize_source(source: &str) -> Result<Vec<Token>, crate::error::AssembleError> { + let tokenizer = create_tokenizer_from_source(source); + let context = AssemblerContext::new(); + tokenizer.tokenise(&context) +} + +/// Helper function to extract token types from a token vector +fn extract_token_types(tokens: &[Token]) -> Vec<&TokenType> { + tokens.iter().map(|t| &t.token_type).collect() +} + +#[test] +fn test_empty_source() { + let tokens = tokenize_source("").expect("Failed to tokenize empty source"); + + // Should have at least EOF token + assert!(!tokens.is_empty()); + assert!(matches!( + tokens + .last() + .expect("Expected at least one token") + .token_type, + TokenType::Eof + )); +} + +#[test] +fn test_whitespace_only() { + let tokens = tokenize_source(" \n \n ").expect("Failed to tokenize whitespace"); + + // Should have newlines and EOF + let token_types = extract_token_types(&tokens); + assert!(token_types.iter().any(|t| matches!(t, TokenType::Newline))); + assert!(token_types.iter().any(|t| matches!(t, TokenType::Eof))); +} + +#[test] +fn test_single_instruction() { + let tokens = tokenize_source("add").expect("Failed to tokenize instruction"); + let token_types = extract_token_types(&tokens); + + // Should have 
instruction, newline, and EOF + assert!( + token_types + .iter() + .any(|t| matches!(t, TokenType::Instruction(_))) + ); + if let TokenType::Instruction(instr) = &tokens[0].token_type { + assert_eq!(instr.mnemonic, "add"); + } else { + panic!("Expected instruction token"); + } +} + +#[test] +fn test_all_instructions() { + let instructions = [ + "add", "sub", "mul", "div", "jmp", "call", "ret", "lli", "nop", "halt", + ]; + + for instr in &instructions { + let tokens = tokenize_source(instr).expect("Failed to tokenize instruction"); + + if let TokenType::Instruction(parsed_instr) = &tokens[0].token_type { + assert_eq!(parsed_instr.mnemonic, *instr); + } else { + panic!("Expected instruction token for {instr}"); + } + } +} + +#[test] +fn test_registers() { + let test_cases = [ + ("r0", "r0"), + ("r15", "r15"), + ("sp", "sp"), + ("fp", "fp"), + ("pc", "pc"), + ]; + + for (input, expected) in &test_cases { + let tokens = tokenize_source(input).expect("Failed to tokenize register"); + + if let TokenType::Register(reg) = &tokens[0].token_type { + assert_eq!(reg.name, *expected); + } else { + panic!("Expected register token for {input}"); + } + } +} + +#[test] +fn test_immediates() { + let test_cases = [ + ("42", 42), + ("0", 0), + ("0xFF", 255), + ("0x1234", 0x1234), + ("0xDEAD_BEEF", 0xDEAD_BEEF), // `_` separators are stripped before parsing + ("0o12", 0o12), + ("0b101", 0b101), + ]; + + for (input, expected) in &test_cases { + let tokens = tokenize_source(input).expect("Failed to tokenize immediate"); + + if let TokenType::Immediate(value) = &tokens[0].token_type { + assert_eq!(*value, *expected); + } else { + panic!("Expected immediate token for {input}"); + } + } +} + +#[test] +fn test_labels() { + let test_cases = [ + ("loop_start:", "loop_start"), + ("main:", "main"), + ("_private_label:", "_private_label"), + ("Label123:", "Label123"), + ]; + + for (input, expected) in &test_cases { + let tokens = tokenize_source(input).expect("Failed to tokenize label"); + + if let TokenType::Label(label) = 
&tokens[0].token_type { + assert_eq!(label.name, *expected); + } else { + panic!("Expected label token for {input}"); + } + } +} + +#[test] +fn test_directives() { + let test_cases = [ + (".global", "global"), + (".section", "section"), + (".data", "data"), + (".text", "text"), + ]; + + for (input, expected) in &test_cases { + let tokens = tokenize_source(input).expect("Failed to tokenize directive"); + + if let TokenType::Directive(directive) = &tokens[0].token_type { + assert_eq!(directive.directive, *expected); + } else { + panic!("Expected directive token for {input}"); + } + } +} + +#[test] +fn test_symbols() { + let test_cases = [ + ("my_symbol", "my_symbol"), + ("_private", "_private"), + ("Symbol123", "Symbol123"), + ("camelCase", "camelCase"), + ]; + + for (input, expected) in &test_cases { + let tokens = tokenize_source(input).expect("Failed to tokenize symbol"); + + if let TokenType::Symbol(symbol) = &tokens[0].token_type { + assert_eq!(symbol.name, *expected); + } else { + panic!("Expected symbol token for {input}"); + } + } +}