diff --git a/assembler/src/source/token.rs b/assembler/src/source/token.rs
index 899de4d..993d388 100644
--- a/assembler/src/source/token.rs
+++ b/assembler/src/source/token.rs
@@ -10,6 +10,7 @@ use crate::source::{
     token_info::{DirectiveToken, LabelToken, RegisterToken, SymbolToken},
 };
 
+/// Represents the different types of tokens that can be produced by the tokeniser.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub enum TokenType {
     /// Symbol reference (e.g., `loop_start`, `my_data`).
@@ -20,20 +21,22 @@ pub enum TokenType {
     Immediate(u32),
     /// String literal (e.g., `"hello world"`).
     String(String),
+    /// Intermediate token for an unfinished multi-line string; filtered out of the token stream.
+    StringContinuation,
     /// Assembly instruction (e.g., `add`, `jmp`, `nop`).
     Instruction(Opcode),
     /// Label definition (e.g., `loop_start:`).
     Label(LabelToken),
-    /// Assembler directive (e.g., `.global`, `.section`, `.dw`, `.resb`).
+    /// Assembler directive (e.g., `.global`, `.section`, `.dw`).
     Directive(DirectiveToken),
+    /// Line comment (e.g., `// this is a comment`); filtered out of the token stream.
+    Comment,
     /// Comma separator.
     Comma,
     /// End of line.
     Newline,
     /// End of file.
     Eof,
-    /// A line comment. This is to be filtered out of the token stream.
-    Comment,
 }
 
 #[derive(Debug)]
diff --git a/assembler/src/source/tokeniser.rs b/assembler/src/source/tokeniser.rs
index 08c0ba0..c1ecd5a 100644
--- a/assembler/src/source/tokeniser.rs
+++ b/assembler/src/source/tokeniser.rs
@@ -43,8 +43,13 @@ pub struct Tokeniser {
     directive_regex: Regex,
     instruction_regex: Regex,
     symbol_regex: Regex,
-    string_regex: Regex,
     comment_regex: Regex,
+
+    // String parsing state
+    in_string: bool,
+    string_buffer: String,
+    string_start_line: usize,
+    string_start_column: usize,
 }
 
 impl Tokeniser {
@@ -70,12 +75,16 @@ impl Tokeniser {
                 r"^(nop|movs?|ld[bhw]s?|st[bhw]|l[lu]i|j(mp|[egl][qte])|cmp|[id]nc|sh[lr]|add[i]?|sub[i]?|x?n?or|and|not|i[rd]t|hlt|lhwmm|lidt|push[a]?|pop[a]?|lwi|return|call)\b",
             )
             .expect("Failed to compile instruction regex pattern"),
-            symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)")
+            symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*(?:::[a-zA-Z0-9_]*)?)")
                 .expect("Failed to compile symbol regex pattern"),
-            string_regex: Regex::new(r#"^"([^"]*)"#)
-                .expect("Failed to compile string regex pattern"),
             comment_regex: Regex::new("^//.*")
                 .expect("Failed to compile comment regex pattern"),
+
+            // Initialize string parsing state
+            in_string: false,
+            string_buffer: String::new(),
+            string_start_line: 0,
+            string_start_column: 0,
         }
     }
 
@@ -88,7 +97,10 @@ impl Tokeniser {
     }
 
     // Note that modules are tokenised in their own threads, possibly in parallel.
-    pub fn tokenise(self, ctx: &AssemblerContext) -> Result, AssembleError> {
+    pub fn tokenise(
+        mut self,
+        ctx: &AssemblerContext,
+    ) -> Result, AssembleError> {
         let module_name = self.extract_module_name()?;
 
         // Create a module for the source file being processed.
@@ -116,8 +128,8 @@ impl Tokeniser {
                 continue;
             }
 
-            // Actually tokenize the line content
-            let line_tokens = self.tokenize_line(&line_span, &module)?;
+            // Actually tokenise the line content
+            let line_tokens = self.tokenise_line(&line_span, &module)?;
             token_stream.extend(line_tokens);
 
             // Add newline token at end of line
@@ -137,38 +149,56 @@ impl Tokeniser {
         Ok(token_stream)
     }
 
-    fn tokenize_line(
-        &self,
+    fn tokenise_line(
+        &mut self,
         line_span: &LineSpan,
         module: &Arc,
     ) -> Result, AssembleError> {
         let mut tokens = Vec::new();
-        let mut remaining = line_span.content.trim();
-        let start_column = line_span.start_char;
+        let mut remaining = line_span.content.as_str();
+        let mut column = 0;
+
+        // Skip leading whitespace
+        let trimmed_start = remaining.trim_start();
+        column += remaining.len() - trimmed_start.len();
+        remaining = trimmed_start;
 
         while !remaining.is_empty() {
-            // Try to match a token.
-            let (token_type, consumed) = self.match_token(remaining)?;
+            let start_column = column;
 
-            // Filter out comments.
-            if token_type == TokenType::Comment {
-                // Advance position.
-                remaining = remaining[consumed..].trim_start();
+            // Try to match a token
+            let (token_type, consumed) =
+                self.match_token(remaining, line_span.line_number, column)?;
 
-                continue;
+            // Filter out string continuation tokens and comments
+            match token_type {
+                TokenType::StringContinuation => {
+                    // Don't add to token stream, just consume input
+                }
+                TokenType::Comment => {
+                    // Don't add to token stream, consume rest of line
+                    break;
+                }
+                _ => {
+                    tokens.push(Token::new(
+                        token_type,
+                        SourceInfo::new(
+                            line_span.line_number,
+                            module.clone(),
+                            start_column..start_column + consumed,
+                        ),
+                    ));
+                }
             }
 
-            tokens.push(Token::new(
-                token_type,
-                SourceInfo::new(
-                    line_span.line_number,
-                    module.clone(),
-                    start_column..start_column + consumed,
-                ),
-            ));
+            // Advance position
+            remaining = &remaining[consumed..];
+            column += consumed;
 
-            // Advance position.
-            remaining = remaining[consumed..].trim_start();
+            // Skip whitespace
+            let before_trim = remaining.len();
+            remaining = remaining.trim_start();
+            column += before_trim - remaining.len();
         }
 
         Ok(tokens)
@@ -254,15 +284,87 @@ impl Tokeniser {
         Some((TokenType::Symbol(SymbolToken { name }), len))
     }
 
-    fn try_match_string(&self, input: &str) -> Option<(TokenType, usize)> {
-        let caps = self.string_regex.captures(input)?;
-        let content = caps.get(1)?.as_str().to_string();
-        let len = caps.get(0)?.len();
-
-        Some((TokenType::String(content), len))
+    fn try_match_string(
+        &mut self,
+        input: &str,
+        line_number: usize,
+        column: usize,
+    ) -> Option<(TokenType, usize)> {
+        if self.in_string {
+            // We're continuing a multiline string
+            self.handle_string_continuation(input, line_number, column)
+        } else {
+            // Look for the start of a new string
+            self.handle_string_start(input, line_number, column)
+        }
     }
 
-    fn match_token(&self, input: &str) -> Result<(TokenType, usize), AssembleError> {
+    fn handle_string_start(
+        &mut self,
+        input: &str,
+        line_number: usize,
+        column: usize,
+    ) -> Option<(TokenType, usize)> {
+        if !input.starts_with('"') {
+            return None;
+        }
+
+        // Find the closing quote on the same line
+        if let Some(end_pos) = input[1..].find('"') {
+            // Complete string on one line
+            let content = input[1..=end_pos].to_string();
+            let len = end_pos + 2; // +2 for both quotes
+            Some((TokenType::String(content), len))
+        } else {
+            // Start of multiline string
+            self.in_string = true;
+            self.string_start_line = line_number;
+            self.string_start_column = column;
+            self.string_buffer = input[1..].to_string(); // Everything after opening quote
+            self.string_buffer.push('\n'); // Add newline for multiline
+
+            // Consume the entire rest of the line
+            Some((TokenType::StringContinuation, input.len()))
+        }
+    }
+
+    fn handle_string_continuation(
+        &mut self,
+        input: &str,
+        _line_number: usize,
+        _column: usize,
+    ) -> Option<(TokenType, usize)> {
+        // Look for closing quote
+        if let Some(end_pos) = input.find('"') {
+            // End of multiline string found
+            self.string_buffer.push_str(&input[..end_pos]);
+            self.in_string = false;
+
+            let content = std::mem::take(&mut self.string_buffer);
+            let len = end_pos + 1; // +1 for the closing quote
+
+            Some((TokenType::String(content), len))
+        } else {
+            // Continue multiline string
+            self.string_buffer.push_str(input);
+            self.string_buffer.push('\n'); // Add newline
+
+            // Consume the entire line
+            Some((TokenType::StringContinuation, input.len()))
+        }
+    }
+
+    fn match_token(
+        &mut self,
+        input: &str,
+        line_number: usize,
+        column: usize,
+    ) -> Result<(TokenType, usize), AssembleError> {
+        // Check for string first (including multiline continuations).
+        if let Some(m) = self.try_match_string(input, line_number, column) {
+            return Ok(m);
+        }
+
         if let Some(m) = self.try_match_directive(input) {
             return Ok(m);
         }
@@ -287,10 +389,6 @@ impl Tokeniser {
             return Ok(m);
         }
 
-        if let Some(m) = self.try_match_string(input) {
-            return Ok(m);
-        }
-
         if let Some(m) = self.try_match_symbol(input) {
             return Ok(m);
         }
diff --git a/assembler/src/source/tokeniser/tests.rs b/assembler/src/source/tokeniser/tests.rs
index 6aab304..f9fe225 100644
--- a/assembler/src/source/tokeniser/tests.rs
+++ b/assembler/src/source/tokeniser/tests.rs
@@ -235,3 +235,13 @@ fn test_multiline_with_comments() {
         assert!(!(expected != *got), "Expected {expected:?}, got {got:?}");
     }
 }
+
+#[test]
+fn test_tokenise_brainf_interpreter() {
+    const SOURCE: &str = include_str!("../../../../resources/dsa/bf.dsa");
+
+    // Smoke test: the whole source must tokenise without error. The resulting
+    // tokens are deliberately not inspected (and not dumped with `dbg!`).
+    let _tokens =
+        tokenize_source(SOURCE).expect("Failed to tokenise the brainfuck compiler!");
+}