assembler: update tokeniser to allow extra prefixes and separators (0xDEAD_BEEF)

2025-06-25 19:15:51 +01:00
parent 7cb7525484
commit d9807b5b36
3 changed files with 218 additions and 8 deletions
@@ -21,7 +21,7 @@ use crate::{
 #[derive(Debug, Clone)]
 pub struct SourceInfo {
    /// The line number within the source file underpinned by `module_id`.
-    pub line_no: usize,
+    pub line_number: usize,
    pub module: Arc<Module>,
    /// The indexes where this token may be found (line-local).
    pub span: std::ops::Range<usize>,
@@ -33,7 +33,7 @@ impl Display for SourceInfo {
            f,
            "{}:{}, column {}",
            self.module.path.display(),
-            self.line_no,
+            self.line_number,
            self.span.start
        )
    }
@@ -47,7 +47,7 @@ impl SourceInfo {
        span: std::ops::Range<usize>,
    ) -> Self {
        Self {
-            line_no,
+            line_number: line_no,
            module,
            span,
        }
@@ -61,7 +61,7 @@ impl SourceInfo {
        let mut lines = LinesWithSpans::new(rdr);
-        let Some(line_result) = lines.nth(self.line_no - 1) else {
+        let Some(line_result) = lines.nth(self.line_number - 1) else {
            // Handle a line not existing.
            return Err(AssembleError::new_source_error(
                self.clone(),
@@ -69,7 +69,7 @@ impl SourceInfo {
                    IoErrorKind::Other,
                    Some(format!(
                        "the line {} does not exist in input file `{}` but source info suggested otherwise!.",
-                        self.line_no,
+                        self.line_number,
                        self.module.path.display()
                    )),
                )),
@@ -79,7 +79,7 @@ impl SourceInfo {
        let line_span = line_result?;
        // Print the line number and line content.
-        println!("{:>4} | {}", self.line_no, line_span.content);
+        println!("{:>4} | {}", self.line_number, line_span.content);
        let mut underline = String::new();
        write!(underline, "{:>4} | ", "")?;
@@ -24,6 +24,8 @@ use crate::{
 };
 pub mod error;
 #[cfg(test)]
 mod tests;
 /// Consumes a [`Vec<u8>`] and outputs a [`Vec`] of [Token]'s.
 pub struct Tokeniser {
@@ -54,7 +56,9 @@ impl Tokeniser {
                .expect("Failed to compile label regex pattern"),
            register_regex: Regex::new(r"^(r[0-9]+|sp|fp|pc)")
                .expect("Failed to compile register regex pattern"),
-            immediate_regex: Regex::new(r"^(0x[0-9a-fA-F]+|[0-9]+)")
+            immediate_regex: Regex::new(
                r"^(0x[0-9a-fA-F_]+|0b[0-1_]+|0o[0-7_]+|[0-9_]+)",
            )
            .expect("Failed to compile immediate regex pattern"),
            directive_regex: Regex::new(r"^\.([a-zA-Z]+)")
                .expect("Failed to compile directive regex pattern"),
@@ -142,6 +146,14 @@ impl Tokeniser {
            // Try to match a token.
            let (token_type, consumed) = self.match_token(remaining)?;
            // Filter out comments.
            if token_type == TokenType::Comment {
                // Advance position.
                remaining = remaining[consumed..].trim_start();
                continue;
            }
            tokens.push(Token::new(
                token_type,
                SourceInfo::new(
@@ -186,6 +198,11 @@ impl Tokeniser {
        let value_str = caps.get(1)?.as_str();
        let len = caps.get(0)?.len();
        // Remove any underscores that were inserted for readability.
        let value_str = value_str.replace('_', "");
        dbg!(&value_str);
        let value = if let Some(hex_part) = value_str.strip_prefix("0x") {
            u32::from_str_radix(hex_part, 16).ok()?
        } else if let Some(bin_part) = value_str.strip_prefix("0b") {
@@ -0,0 +1,193 @@
 //! Unit tests for the tokenizer
 use crate::{
    context::AssemblerContext,
    source::{
        token::{Token, TokenType},
        tokeniser::Tokeniser,
    },
 };
 use std::path::PathBuf;
 /// Builds a [`Tokeniser`] over an owned byte copy of `source`.
 ///
 /// The data is associated with the path `test.dsa`.
 fn create_tokenizer_from_source(source: &str) -> Tokeniser {
    Tokeniser::from_data(Vec::from(source.as_bytes()), PathBuf::from("test.dsa"))
 }
 /// Tokenises `source` against a freshly constructed `AssemblerContext`.
 ///
 /// Returns whatever `Tokeniser::tokenise` yields: the tokens on success, or the
 /// `AssembleError` it reported.
 fn tokenize_source(source: &str) -> Result<Vec<Token>, crate::error::AssembleError> {
    // The receiver is evaluated before the argument, so the tokeniser is still
    // created ahead of the context.
    create_tokenizer_from_source(source).tokenise(&AssemblerContext::new())
 }
 /// Borrows the `token_type` of every token, in the same order as `tokens`.
 fn extract_token_types(tokens: &[Token]) -> Vec<&TokenType> {
    let mut kinds = Vec::with_capacity(tokens.len());
    for token in tokens {
        kinds.push(&token.token_type);
    }
    kinds
 }
 /// Empty input must still yield a non-empty token stream whose last token is `Eof`.
 #[test]
 fn test_empty_source() {
    let tokens = tokenize_source("").expect("Failed to tokenize empty source");

    // Even with nothing to read, the stream must not be empty...
    assert!(!tokens.is_empty());

    // ...and its final entry must be the end-of-file marker.
    let final_token = tokens.last().expect("Expected at least one token");
    assert!(matches!(final_token.token_type, TokenType::Eof));
 }
 /// Input made only of spaces and line breaks must produce at least one `Newline`
 /// token and an `Eof` token.
 #[test]
 fn test_whitespace_only() {
    let tokens = tokenize_source("   \n  \n  ").expect("Failed to tokenize whitespace");
    let kinds = extract_token_types(&tokens);

    let has_newline = kinds.iter().any(|kind| matches!(kind, TokenType::Newline));
    let has_eof = kinds.iter().any(|kind| matches!(kind, TokenType::Eof));

    assert!(has_newline);
    assert!(has_eof);
 }
 /// A lone mnemonic must tokenise to an `Instruction` token in first position.
 #[test]
 fn test_single_instruction() {
    let tokens = tokenize_source("add").expect("Failed to tokenize instruction");

    // An instruction token must appear somewhere in the stream.
    let kinds = extract_token_types(&tokens);
    let has_instruction = kinds
        .iter()
        .any(|kind| matches!(kind, TokenType::Instruction(_)));
    assert!(has_instruction);

    // The very first token must be that instruction, carrying the mnemonic as written.
    let TokenType::Instruction(instr) = &tokens[0].token_type else {
        panic!("Expected instruction token");
    };
    assert_eq!(instr.mnemonic, "add");
 }
 /// Each listed mnemonic, tokenised on its own, must come back as an `Instruction`
 /// token whose mnemonic equals the input text.
 #[test]
 fn test_all_instructions() {
    let mnemonics = [
        "add", "sub", "mul", "div", "jmp", "call", "ret", "lli", "nop", "halt",
    ];

    for &mnemonic in &mnemonics {
        let tokens = tokenize_source(mnemonic).expect("Failed to tokenize instruction");
        match &tokens[0].token_type {
            TokenType::Instruction(parsed) => assert_eq!(parsed.mnemonic, mnemonic),
            _ => panic!("Expected instruction token for {mnemonic}"),
        }
    }
 }
 /// Numbered (`rN`) and named (`sp`, `fp`, `pc`) registers must tokenise to a
 /// `Register` token whose name equals the expected text.
 #[test]
 fn test_registers() {
    // (source text, expected register name)
    let test_cases = [
        ("r0", "r0"),
        ("r15", "r15"),
        ("sp", "sp"),
        ("fp", "fp"),
        ("pc", "pc"),
    ];

    for &(input, expected) in &test_cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize register");
        let TokenType::Register(reg) = &tokens[0].token_type else {
            panic!("Expected register token for {input}");
        };
        assert_eq!(reg.name, expected);
    }
 }
 /// Checks that decimal, hexadecimal, octal and binary immediates tokenise to the
 /// expected numeric value, both with and without `_` digit separators.
 #[test]
 fn test_immediates() {
    // (source text, expected value)
    let test_cases = [
        // Plain literals, one or more per radix.
        ("42", 42),
        ("0", 0),
        ("0xFF", 255),
        ("0x1234", 0x1234),
        ("0xDEADBEEF", 0xDEAD_BEEF),
        ("0o12", 0o12),
        ("0b101", 0b101),
        // `_` separators in the *input* text. An underscore in the Rust literal on
        // the right-hand side alone does not exercise the tokeniser's separator
        // handling, so each radix gets an explicit separated input.
        ("1_000", 1000),
        ("0xDEAD_BEEF", 0xDEAD_BEEF),
        ("0o1_7", 0o17),
        ("0b1010_0101", 0b1010_0101),
    ];
    for (input, expected) in &test_cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize immediate");
        if let TokenType::Immediate(value) = &tokens[0].token_type {
            // Name the offending input so a failure inside the loop is traceable.
            assert_eq!(*value, *expected, "Wrong value for immediate {input}");
        } else {
            panic!("Expected immediate token for {input}");
        }
    }
 }
 /// A `name:` definition must tokenise to a `Label` token whose name is the text
 /// without the trailing colon.
 #[test]
 fn test_labels() {
    // (source text, expected label name)
    let test_cases = [
        ("loop_start:", "loop_start"),
        ("main:", "main"),
        ("_private_label:", "_private_label"),
        ("Label123:", "Label123"),
    ];

    for &(input, expected) in &test_cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize label");
        match &tokens[0].token_type {
            TokenType::Label(label) => assert_eq!(label.name, expected),
            _ => panic!("Expected label token for {input}"),
        }
    }
 }
 /// A `.name` directive must tokenise to a `Directive` token holding the name
 /// without its leading dot.
 #[test]
 fn test_directives() {
    // (source text, expected directive name)
    let test_cases = [
        (".global", "global"),
        (".section", "section"),
        (".data", "data"),
        (".text", "text"),
    ];

    for &(input, expected) in &test_cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize directive");
        let TokenType::Directive(directive) = &tokens[0].token_type else {
            panic!("Expected directive token for {input}");
        };
        assert_eq!(directive.directive, expected);
    }
 }
 /// Bare identifiers must tokenise to a `Symbol` token that keeps the name exactly
 /// as written.
 #[test]
 fn test_symbols() {
    // (source text, expected symbol name)
    let test_cases = [
        ("my_symbol", "my_symbol"),
        ("_private", "_private"),
        ("Symbol123", "Symbol123"),
        ("camelCase", "camelCase"),
    ];

    for &(input, expected) in &test_cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize symbol");
        match &tokens[0].token_type {
            TokenType::Symbol(symbol) => assert_eq!(symbol.name, expected),
            _ => panic!("Expected symbol token for {input}"),
        }
    }
 }