assembler: start tokenising multiline strings (WIP)

2025-06-26 17:42:48 +01:00
parent ed4fcc8495
commit eebea82c4a
3 changed files with 153 additions and 42 deletions
@@ -10,6 +10,7 @@ use crate::source::{
    token_info::{DirectiveToken, LabelToken, RegisterToken, SymbolToken},
 };

+/// Represents the different types of tokens that can be produced by the tokeniser.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub enum TokenType {
    /// Symbol reference (e.g., `loop_start`, `my_data`).
@@ -20,20 +21,22 @@ pub enum TokenType {
    Immediate(u32),
    /// String literal (e.g., `"hello world"`).
    String(String),
+    /// Intermediate token for multiline strings (filtered out in final output)
+    StringContinuation,
    /// Assembly instruction (e.g., `add`, `jmp`, `nop`).
    Instruction(Opcode),
    /// Label definition (e.g., `loop_start:`).
    Label(LabelToken),
-    /// Assembler directive (e.g., `.global`, `.section`, `.dw`, `.resb`).
+    /// Assembler directive (e.g., `.global`, `.section`, `.dw`).
    Directive(DirectiveToken),
+    /// Comment (e.g., `// this is a comment`).
+    Comment,
    /// Comma separator.
    Comma,
    /// End of line.
    Newline,
    /// End of file.
    Eof,
-    /// A line comment. This is to be filtered out of the token stream.
-    Comment,
 }

 #[derive(Debug)]
@@ -43,8 +43,13 @@ pub struct Tokeniser {
    directive_regex: Regex,
    instruction_regex: Regex,
    symbol_regex: Regex,
-    string_regex: Regex,
    comment_regex: Regex,
+
+    // String parsing state
+    in_string: bool,
+    string_buffer: String,
+    string_start_line: usize,
+    string_start_column: usize,
 }

 impl Tokeniser {
@@ -70,12 +75,16 @@ impl Tokeniser {
                r"^(nop|movs?|ld[bhw]s?|st[bhw]|l[lu]i|j(mp|[egl][qte])|cmp|[id]nc|sh[lr]|add[i]?|sub[i]?|x?n?or|and|not|i[rd]t|hlt|lhwmm|lidt|push[a]?|pop[a]?|lwi|return|call)\b",
            )
            .expect("Failed to compile instruction regex pattern"),
-            symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)")
+            symbol_regex: Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*:{2}[a-zA-Z0-9_]*|[a-zA-Z_][a-zA-Z_0-9]*")
                .expect("Failed to compile symbol regex pattern"),
-            string_regex: Regex::new(r#"^"([^"]*)"#)
-                .expect("Failed to compile string regex pattern"),
            comment_regex: Regex::new("^//.*")
                .expect("Failed to compile comment regex pattern"),
+
+            // Initialize string parsing state
+            in_string: false,
+            string_buffer: String::new(),
+            string_start_line: 0,
+            string_start_column: 0,
        }
    }

@@ -88,7 +97,10 @@ impl Tokeniser {
    }

    // Note that modules are tokenised in their own threads, possibly in parallel.
-    pub fn tokenise(self, ctx: &AssemblerContext) -> Result<Vec<Token>, AssembleError> {
+    pub fn tokenise(
+        mut self,
+        ctx: &AssemblerContext,
+    ) -> Result<Vec<Token>, AssembleError> {
        let module_name = self.extract_module_name()?;

        // Create a module for the source file being processed.
@@ -116,8 +128,8 @@ impl Tokeniser {
                continue;
            }

-            // Actually tokenize the line content
-            let line_tokens = self.tokenize_line(&line_span, &module)?;
+            // Actually tokenise the line content
+            let line_tokens = self.tokenise_line(&line_span, &module)?;
            token_stream.extend(line_tokens);

            // Add newline token at end of line
@@ -137,38 +149,56 @@ impl Tokeniser {
        Ok(token_stream)
    }

-    fn tokenize_line(
-        &self,
+    fn tokenise_line(
+        &mut self, // Changed to &mut self
        line_span: &LineSpan,
        module: &Arc<Module>,
    ) -> Result<Vec<Token>, AssembleError> {
        let mut tokens = Vec::new();
-        let mut remaining = line_span.content.trim();
-        let start_column = line_span.start_char;
+        let mut remaining = line_span.content.as_str();
+        let mut column = 0;
+
+        // Skip leading whitespace
+        let trimmed_start = remaining.trim_start();
+        column += remaining.len() - trimmed_start.len();
+        remaining = trimmed_start;

        while !remaining.is_empty() {
-            // Try to match a token.
-            let (token_type, consumed) = self.match_token(remaining)?;
+            let start_column = column;

-            // Filter out comments.
-            if token_type == TokenType::Comment {
-                // Advance position.
-                remaining = remaining[consumed..].trim_start();
+            // Try to match a token
+            let (token_type, consumed) =
+                self.match_token(&remaining, line_span.line_number, column)?;

-                continue;
+            // Filter out string continuation tokens and comments
+            match token_type {
+                TokenType::StringContinuation => {
+                    // Don't add to token stream, just consume input
+                }
+                TokenType::Comment => {
+                    // Don't add to token stream, consume rest of line
+                    break;
+                }
+                _ => {
+                    tokens.push(Token::new(
+                        token_type,
+                        SourceInfo::new(
+                            line_span.line_number,
+                            module.clone(),
+                            start_column..start_column + consumed,
+                        ),
+                    ));
+                }
            }

-            tokens.push(Token::new(
-                token_type,
-                SourceInfo::new(
-                    line_span.line_number,
-                    module.clone(),
-                    start_column..start_column + consumed,
-                ),
-            ));
+            // Advance position
+            remaining = &remaining[consumed..];
+            column += consumed;

-            // Advance position.
-            remaining = remaining[consumed..].trim_start();
+            // Skip whitespace
+            let before_trim = remaining.len();
+            remaining = remaining.trim_start();
+            column += before_trim - remaining.len();
        }

        Ok(tokens)
@@ -254,15 +284,87 @@ impl Tokeniser {
        Some((TokenType::Symbol(SymbolToken { name }), len))
    }

-    fn try_match_string(&self, input: &str) -> Option<(TokenType, usize)> {
-        let caps = self.string_regex.captures(input)?;
-        let content = caps.get(1)?.as_str().to_string();
-        let len = caps.get(0)?.len();
-
-        Some((TokenType::String(content), len))
+    /// Routes string matching according to the tokeniser's current string state.
+    ///
+    /// While a multiline string is open (`in_string` is set) the input is handed to
+    /// `handle_string_continuation`; otherwise it is handed to `handle_string_start`.
+    /// Whatever the chosen handler returns is passed back unchanged.
+    fn try_match_string(
+        &mut self,
+        input: &str,
+        line_number: usize,
+        column: usize,
+    ) -> Option<(TokenType, usize)> {
+        if !self.in_string {
+            // No string is currently open: check whether one begins here.
+            return self.handle_string_start(input, line_number, column);
+        }
+
+        // A string opened earlier has not been closed yet, so keep collecting it.
+        self.handle_string_continuation(input, line_number, column)
    }

-    fn match_token(&self, input: &str) -> Result<(TokenType, usize), AssembleError> {
+    /// Tries to match a string literal that begins at the very start of `input`.
+    ///
+    /// Returns `None` when `input` does not start with `"`. If a second `"` is found
+    /// in `input`, yields `TokenType::String` holding the text between the quotes and
+    /// a consumed length that covers both quotes. Otherwise the string is treated as
+    /// the start of a multiline string: `in_string` is set, `line_number` and `column`
+    /// are recorded, `string_buffer` is seeded with everything after the opening quote
+    /// plus a `'\n'`, and `TokenType::StringContinuation` is returned with all of
+    /// `input` consumed.
+    ///
+    /// The first `"` after the opening quote always ends the string; escape sequences
+    /// are not interpreted.
+    fn handle_string_start(
+        &mut self,
+        input: &str,
+        line_number: usize,
+        column: usize,
+    ) -> Option<(TokenType, usize)> {
+        // Bail out unless the input opens with a quote; `after_quote` is the rest.
+        let after_quote = input.strip_prefix('"')?;
+
+        match after_quote.find('"') {
+            Some(close) => {
+                // Opening and closing quote are both in this input.
+                let literal = after_quote[..close].to_string();
+                // `close` bytes of content plus the two quote characters.
+                Some((TokenType::String(literal), close + 2))
+            }
+            None => {
+                // No terminator here: remember where the string began and start
+                // accumulating its content.
+                self.in_string = true;
+                self.string_start_line = line_number;
+                self.string_start_column = column;
+
+                let mut buffer = String::with_capacity(after_quote.len() + 1);
+                buffer.push_str(after_quote);
+                buffer.push('\n'); // line break between this line and the next
+                self.string_buffer = buffer;
+
+                // The remainder of the input belongs to the string.
+                Some((TokenType::StringContinuation, input.len()))
+            }
+        }
+    }
+
+    /// Consumes input while a multiline string is open.
+    ///
+    /// If `input` contains a `"`, the text before it is appended to `string_buffer`,
+    /// `in_string` is cleared, and the accumulated buffer is returned as
+    /// `TokenType::String`; the consumed length runs up to and including that quote.
+    /// Otherwise all of `input` plus a `'\n'` is appended to the buffer and
+    /// `TokenType::StringContinuation` is returned with all of `input` consumed.
+    ///
+    /// `_line_number` and `_column` are accepted but not used.
+    fn handle_string_continuation(
+        &mut self,
+        input: &str,
+        _line_number: usize,
+        _column: usize,
+    ) -> Option<(TokenType, usize)> {
+        match input.find('"') {
+            Some(close) => {
+                // Terminator found: finish the string with the text preceding it.
+                self.string_buffer.push_str(&input[..close]);
+                self.in_string = false;
+
+                // Move the buffer out, leaving an empty one behind for the next string.
+                let finished = std::mem::take(&mut self.string_buffer);
+
+                // Content before the quote, plus the quote itself.
+                Some((TokenType::String(finished), close + 1))
+            }
+            None => {
+                // Still inside the string: keep the input and a line break.
+                self.string_buffer.push_str(input);
+                self.string_buffer.push('\n');
+
+                Some((TokenType::StringContinuation, input.len()))
+            }
+        }
+    }
+
+    fn match_token(
+        &mut self,
+        input: &str,
+        line_number: usize,
+        column: usize,
+    ) -> Result<(TokenType, usize), AssembleError> {
+        // Check for string first (including multiline continuations).
+        if let Some(m) = self.try_match_string(input, line_number, column) {
+            return Ok(m);
+        }
+
        if let Some(m) = self.try_match_directive(input) {
            return Ok(m);
        }
@@ -287,10 +389,6 @@ impl Tokeniser {
            return Ok(m);
        }

-        if let Some(m) = self.try_match_string(input) {
-            return Ok(m);
-        }
-
        if let Some(m) = self.try_match_symbol(input) {
            return Ok(m);
        }
@@ -235,3 +235,13 @@ fn test_multiline_with_comments() {
        assert!(!(expected != *got), "Expected {expected:?}, got {got:?}");
    }
 }
+
+/// Smoke test: tokenises the bundled `bf.dsa` program end to end.
+///
+/// Fails if the tokeniser returns an error for the file or produces no tokens.
+#[test]
+fn test_tokenise_brainf_interpreter() {
+    const SOURCE: &str = include_str!("../../../../resources/dsa/bf.dsa");
+
+    // NOTE(review): the test name says "interpreter" while this message says
+    // "compiler" - confirm which one `bf.dsa` actually is and align the two.
+    let tokens =
+        tokenize_source(SOURCE).expect("Failed to tokenise the brainfuck compiler!");
+
+    // Assert on the result rather than dumping the whole stream with `dbg!`, so the
+    // test checks something and keeps the test output quiet.
+    // Assumes `bf.dsa` is non-empty and `tokens` is a `Vec` - TODO confirm.
+    assert!(
+        !tokens.is_empty(),
+        "Tokenising a non-empty source file should yield at least one token"
+    );
+}