assembler: start tokenising multiline strings (WIP)

2025-06-26 17:42:48 +01:00
parent ed4fcc8495
commit eebea82c4a
3 changed files with 153 additions and 42 deletions
@@ -10,6 +10,7 @@ use crate::source::{
    token_info::{DirectiveToken, LabelToken, RegisterToken, SymbolToken},
 };
 /// Represents the different types of tokens that can be produced by the tokeniser.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub enum TokenType {
    /// Symbol reference (e.g., `loop_start`, `my_data`).
@@ -20,20 +21,22 @@ pub enum TokenType {
    Immediate(u32),
    /// String literal (e.g., `"hello world"`).
    String(String),
    /// Intermediate token for multiline strings (filtered out in final output)
    StringContinuation,
    /// Assembly instruction (e.g., `add`, `jmp`, `nop`).
    Instruction(Opcode),
    /// Label definition (e.g., `loop_start:`).
    Label(LabelToken),
-    /// Assembler directive (e.g., `.global`, `.section`, `.dw`, `.resb`).
+    /// Assembler directive (e.g., `.global`, `.section`, `.dw`).
    Directive(DirectiveToken),
    /// Comment (e.g., `// this is a comment`).
    Comment,
    /// Comma separator.
    Comma,
    /// End of line.
    Newline,
    /// End of file.
    Eof,
    /// A line comment. This is to be filtered out of the token stream.
    Comment,
 }
 #[derive(Debug)]
@@ -43,8 +43,13 @@ pub struct Tokeniser {
    directive_regex: Regex,
    instruction_regex: Regex,
    symbol_regex: Regex,
    string_regex: Regex,
    comment_regex: Regex,
    // String parsing state
    in_string: bool,
    string_buffer: String,
    string_start_line: usize,
    string_start_column: usize,
 }
 impl Tokeniser {
@@ -70,12 +75,16 @@ impl Tokeniser {
                r"^(nop|movs?|ld[bhw]s?|st[bhw]|l[lu]i|j(mp|[egl][qte])|cmp|[id]nc|sh[lr]|add[i]?|sub[i]?|x?n?or|and|not|i[rd]t|hlt|lhwmm|lidt|push[a]?|pop[a]?|lwi|return|call)\b",
            )
            .expect("Failed to compile instruction regex pattern"),
-            symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)")
+            symbol_regex: Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*:{2}[a-zA-Z0-9_]*|[a-zA-Z_][a-zA-Z_0-9]*")
                .expect("Failed to compile symbol regex pattern"),
            string_regex: Regex::new(r#"^"([^"]*)"#)
                .expect("Failed to compile string regex pattern"),
            comment_regex: Regex::new("^//.*")
                .expect("Failed to compile comment regex pattern"),
            // Initialize string parsing state
            in_string: false,
            string_buffer: String::new(),
            string_start_line: 0,
            string_start_column: 0,
        }
    }
@@ -88,7 +97,10 @@ impl Tokeniser {
    }
    // Note that modules are tokenised in their own threads, possibly in parallel.
-    pub fn tokenise(self, ctx: &AssemblerContext) -> Result<Vec<Token>, AssembleError> {
+    pub fn tokenise(
        mut self,
        ctx: &AssemblerContext,
    ) -> Result<Vec<Token>, AssembleError> {
        let module_name = self.extract_module_name()?;
        // Create a module for the source file being processed.
@@ -116,8 +128,8 @@ impl Tokeniser {
                continue;
            }
-            // Actually tokenize the line content
+            // Actually tokenise the line content
-            let line_tokens = self.tokenize_line(&line_span, &module)?;
+            let line_tokens = self.tokenise_line(&line_span, &module)?;
            token_stream.extend(line_tokens);
            // Add newline token at end of line
@@ -137,27 +149,37 @@ impl Tokeniser {
        Ok(token_stream)
    }
-    fn tokenize_line(
+    fn tokenise_line(
-        &self,
+        &mut self, // Changed to &mut self
        line_span: &LineSpan,
        module: &Arc<Module>,
    ) -> Result<Vec<Token>, AssembleError> {
        let mut tokens = Vec::new();
-        let mut remaining = line_span.content.trim();
+        let mut remaining = line_span.content.as_str();
-        let start_column = line_span.start_char;
+        let mut column = 0;
        // Skip leading whitespace
        let trimmed_start = remaining.trim_start();
        column += remaining.len() - trimmed_start.len();
        remaining = trimmed_start;
        while !remaining.is_empty() {
-            // Try to match a token.
+            let start_column = column;
            let (token_type, consumed) = self.match_token(remaining)?;
-            // Filter out comments.
+            // Try to match a token
-            if token_type == TokenType::Comment {
+            let (token_type, consumed) =
-                // Advance position.
+                self.match_token(&remaining, line_span.line_number, column)?;
                remaining = remaining[consumed..].trim_start();
-                continue;
+            // Filter out string continuation tokens and comments
            match token_type {
                TokenType::StringContinuation => {
                    // Don't add to token stream, just consume input
                }
-
+                TokenType::Comment => {
                    // Don't add to token stream, consume rest of line
                    break;
                }
                _ => {
                    tokens.push(Token::new(
                        token_type,
                        SourceInfo::new(
@@ -166,9 +188,17 @@ impl Tokeniser {
                            start_column..start_column + consumed,
                        ),
                    ));
                }
            }
-            // Advance position.
+            // Advance position
-            remaining = remaining[consumed..].trim_start();
+            remaining = &remaining[consumed..];
            column += consumed;
            // Skip whitespace
            let before_trim = remaining.len();
            remaining = remaining.trim_start();
            column += before_trim - remaining.len();
        }
        Ok(tokens)
@@ -254,15 +284,87 @@ impl Tokeniser {
        Some((TokenType::Symbol(SymbolToken { name }), len))
    }
-    fn try_match_string(&self, input: &str) -> Option<(TokenType, usize)> {
+    fn try_match_string(
-        let caps = self.string_regex.captures(input)?;
+        &mut self,
-        let content = caps.get(1)?.as_str().to_string();
+        input: &str,
-        let len = caps.get(0)?.len();
+        line_number: usize,
-
+        column: usize,
-        Some((TokenType::String(content), len))
+    ) -> Option<(TokenType, usize)> {
        if self.in_string {
            // We're continuing a multiline string
            self.handle_string_continuation(input, line_number, column)
        } else {
            // Look for the start of a new string
            self.handle_string_start(input, line_number, column)
        }
    }
    /// Attempts to begin a string literal at the front of `input`.
    ///
    /// Returns `None` when `input` does not open with a double quote.
    /// Otherwise returns the token type together with the number of bytes
    /// consumed from `input`:
    ///
    /// * If a second `"` is found later in `input`, the text between the two
    ///   quotes becomes a `TokenType::String` and the consumed length spans
    ///   both quotes. The first `"` after the opener always terminates the
    ///   literal; no escape sequences are interpreted here.
    /// * If there is no closing quote, the tokeniser enters multiline-string
    ///   state: the start position is recorded, the remainder of `input`
    ///   (plus a trailing `'\n'`) seeds `string_buffer`, and a
    ///   `TokenType::StringContinuation` consuming all of `input` is returned.
    fn handle_string_start(
        &mut self,
        input: &str,
        line_number: usize,
        column: usize,
    ) -> Option<(TokenType, usize)> {
        // Bail out unless the input opens with a quote; `body` is what follows it.
        let body = input.strip_prefix('"')?;

        match body.find('"') {
            Some(close) => {
                // The whole literal sits on this line.
                let literal = body[..close].to_string();
                // Consumed bytes: opening quote + contents + closing quote.
                Some((TokenType::String(literal), close + 2))
            }
            None => {
                // Unterminated on this line: remember where the string began
                // and stash what we have so far.
                self.in_string = true;
                self.string_start_line = line_number;
                self.string_start_column = column;

                let mut pending = body.to_string();
                pending.push('\n'); // line break separating this line from the next
                self.string_buffer = pending;

                // The remainder of the line belongs to the string.
                Some((TokenType::StringContinuation, input.len()))
            }
        }
    }
    /// Consumes `input` as part of a multiline string that is already open.
    ///
    /// Always returns `Some`:
    ///
    /// * With no `"` in `input`, the whole of `input` plus a `'\n'` is
    ///   appended to `string_buffer` and a `TokenType::StringContinuation`
    ///   consuming all of `input` is returned.
    /// * Otherwise the text before the first `"` is appended, `in_string` is
    ///   cleared, and the accumulated buffer is moved out (leaving it empty)
    ///   into a `TokenType::String`. The consumed length includes the closing
    ///   quote.
    ///
    /// `_line_number` and `_column` are accepted but not used.
    ///
    /// NOTE(review): the caller appears to trim leading whitespace before
    /// handing over `input`, so indentation on continuation lines may not
    /// reach the buffer — confirm whether that is intended.
    fn handle_string_continuation(
        &mut self,
        input: &str,
        _line_number: usize,
        _column: usize,
    ) -> Option<(TokenType, usize)> {
        match input.find('"') {
            None => {
                // Still inside the string: swallow the line and keep going.
                self.string_buffer.push_str(input);
                self.string_buffer.push('\n');
                Some((TokenType::StringContinuation, input.len()))
            }
            Some(close) => {
                // Closing quote reached: finish the buffer and leave string state.
                self.string_buffer.push_str(&input[..close]);
                self.in_string = false;
                let finished = std::mem::take(&mut self.string_buffer);
                // Consumed bytes: the trailing contents plus the closing quote.
                Some((TokenType::String(finished), close + 1))
            }
        }
    }
    fn match_token(
        &mut self,
        input: &str,
        line_number: usize,
        column: usize,
    ) -> Result<(TokenType, usize), AssembleError> {
        // Check for string first (including multiline continuations).
        if let Some(m) = self.try_match_string(input, line_number, column) {
            return Ok(m);
        }
    fn match_token(&self, input: &str) -> Result<(TokenType, usize), AssembleError> {
        if let Some(m) = self.try_match_directive(input) {
            return Ok(m);
        }
@@ -287,10 +389,6 @@ impl Tokeniser {
            return Ok(m);
        }
        if let Some(m) = self.try_match_string(input) {
            return Ok(m);
        }
        if let Some(m) = self.try_match_symbol(input) {
            return Ok(m);
        }
@@ -235,3 +235,13 @@ fn test_multiline_with_comments() {
        assert!(!(expected != *got), "Expected {expected:?}, got {got:?}");
    }
 }
 // Smoke test: runs the tokeniser over the bundled `bf.dsa` program and only
 // requires that `tokenize_source` succeeds; the token contents are not checked.
 #[test]
 fn test_tokenise_brainf_interpreter() {
    // The program text is embedded at compile time; a tokeniser failure
    // panics through `expect`, which fails the test.
    let tokens = tokenize_source(include_str!("../../../../resources/dsa/bf.dsa"))
        .expect("Failed to tokenise the brainfuck compiler!");
    // Print the resulting tokens for manual inspection.
    dbg!(tokens);
 }