assembler: start tokenising multiline strings (WIP)

This commit is contained in:
2025-06-26 17:42:48 +01:00
parent ed4fcc8495
commit eebea82c4a
3 changed files with 153 additions and 42 deletions
+6 -3
View File
@@ -10,6 +10,7 @@ use crate::source::{
token_info::{DirectiveToken, LabelToken, RegisterToken, SymbolToken},
};
/// Represents the different types of tokens that can be produced by the tokeniser.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TokenType {
/// Symbol reference (e.g., `loop_start`, `my_data`).
@@ -20,20 +21,22 @@ pub enum TokenType {
Immediate(u32),
/// String literal (e.g., `"hello world"`).
String(String),
/// Intermediate token for multiline strings (filtered out in final output)
StringContinuation,
/// Assembly instruction (e.g., `add`, `jmp`, `nop`).
Instruction(Opcode),
/// Label definition (e.g., `loop_start:`).
Label(LabelToken),
/// Assembler directive (e.g., `.global`, `.section`, `.dw`, `.resb`).
/// Assembler directive (e.g., `.global`, `.section`, `.dw`).
Directive(DirectiveToken),
/// Comment (e.g., `// this is a comment`).
Comment,
/// Comma separator.
Comma,
/// End of line.
Newline,
/// End of file.
Eof,
/// A line comment. This is to be filtered out of the token stream.
Comment,
}
#[derive(Debug)]
+137 -39
View File
@@ -43,8 +43,13 @@ pub struct Tokeniser {
directive_regex: Regex,
instruction_regex: Regex,
symbol_regex: Regex,
string_regex: Regex,
comment_regex: Regex,
// String parsing state
in_string: bool,
string_buffer: String,
string_start_line: usize,
string_start_column: usize,
}
impl Tokeniser {
@@ -70,12 +75,16 @@ impl Tokeniser {
r"^(nop|movs?|ld[bhw]s?|st[bhw]|l[lu]i|j(mp|[egl][qte])|cmp|[id]nc|sh[lr]|add[i]?|sub[i]?|x?n?or|and|not|i[rd]t|hlt|lhwmm|lidt|push[a]?|pop[a]?|lwi|return|call)\b",
)
.expect("Failed to compile instruction regex pattern"),
symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)")
symbol_regex: Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*:{2}[a-zA-Z0-9_]*|[a-zA-Z_][a-zA-Z_0-9]*")
.expect("Failed to compile symbol regex pattern"),
string_regex: Regex::new(r#"^"([^"]*)"#)
.expect("Failed to compile string regex pattern"),
comment_regex: Regex::new("^//.*")
.expect("Failed to compile comment regex pattern"),
// Initialize string parsing state
in_string: false,
string_buffer: String::new(),
string_start_line: 0,
string_start_column: 0,
}
}
@@ -88,7 +97,10 @@ impl Tokeniser {
}
// Note that modules are tokenised in their own threads, possibly in parallel.
pub fn tokenise(self, ctx: &AssemblerContext) -> Result<Vec<Token>, AssembleError> {
pub fn tokenise(
mut self,
ctx: &AssemblerContext,
) -> Result<Vec<Token>, AssembleError> {
let module_name = self.extract_module_name()?;
// Create a module for the source file being processed.
@@ -116,8 +128,8 @@ impl Tokeniser {
continue;
}
// Actually tokenize the line content
let line_tokens = self.tokenize_line(&line_span, &module)?;
// Actually tokenise the line content
let line_tokens = self.tokenise_line(&line_span, &module)?;
token_stream.extend(line_tokens);
// Add newline token at end of line
@@ -137,38 +149,56 @@ impl Tokeniser {
Ok(token_stream)
}
fn tokenize_line(
&self,
fn tokenise_line(
&mut self, // Changed to &mut self
line_span: &LineSpan,
module: &Arc<Module>,
) -> Result<Vec<Token>, AssembleError> {
let mut tokens = Vec::new();
let mut remaining = line_span.content.trim();
let start_column = line_span.start_char;
let mut remaining = line_span.content.as_str();
let mut column = 0;
// Skip leading whitespace
let trimmed_start = remaining.trim_start();
column += remaining.len() - trimmed_start.len();
remaining = trimmed_start;
while !remaining.is_empty() {
// Try to match a token.
let (token_type, consumed) = self.match_token(remaining)?;
let start_column = column;
// Filter out comments.
if token_type == TokenType::Comment {
// Advance position.
remaining = remaining[consumed..].trim_start();
// Try to match a token
let (token_type, consumed) =
self.match_token(&remaining, line_span.line_number, column)?;
continue;
// Filter out string continuation tokens and comments
match token_type {
TokenType::StringContinuation => {
// Don't add to token stream, just consume input
}
TokenType::Comment => {
// Don't add to token stream, consume rest of line
break;
}
_ => {
tokens.push(Token::new(
token_type,
SourceInfo::new(
line_span.line_number,
module.clone(),
start_column..start_column + consumed,
),
));
}
}
tokens.push(Token::new(
token_type,
SourceInfo::new(
line_span.line_number,
module.clone(),
start_column..start_column + consumed,
),
));
// Advance position
remaining = &remaining[consumed..];
column += consumed;
// Advance position.
remaining = remaining[consumed..].trim_start();
// Skip whitespace
let before_trim = remaining.len();
remaining = remaining.trim_start();
column += before_trim - remaining.len();
}
Ok(tokens)
@@ -254,15 +284,87 @@ impl Tokeniser {
Some((TokenType::Symbol(SymbolToken { name }), len))
}
fn try_match_string(&self, input: &str) -> Option<(TokenType, usize)> {
let caps = self.string_regex.captures(input)?;
let content = caps.get(1)?.as_str().to_string();
let len = caps.get(0)?.len();
Some((TokenType::String(content), len))
/// Attempts to match a string token at the start of `input`.
///
/// Dispatches on the tokeniser's `in_string` flag: when no multiline string
/// is pending, the input is checked for a new opening quote; otherwise the
/// input is treated as a further piece of the pending string.
///
/// `line_number` and `column` are forwarded unchanged to the handler.
/// Returns the handler's result: `Some((token, consumed))` on a match, or
/// `None` when the input does not begin a string.
fn try_match_string(
    &mut self,
    input: &str,
    line_number: usize,
    column: usize,
) -> Option<(TokenType, usize)> {
    // Not inside a string yet: only an opening quote can start one.
    if !self.in_string {
        return self.handle_string_start(input, line_number, column);
    }

    // A multiline string is still open, so keep feeding it.
    self.handle_string_continuation(input, line_number, column)
}
fn match_token(&self, input: &str) -> Result<(TokenType, usize), AssembleError> {
/// Handles input that may open a new string literal.
///
/// Returns `None` when `input` does not begin with a double quote.
///
/// If a second double quote appears later in `input`, the text between the
/// two quotes becomes a `TokenType::String` and the consumed length covers
/// both quotes. No escape sequences are interpreted: the first quote after
/// the opening one always closes the string.
///
/// If there is no closing quote, the tokeniser enters multiline-string mode:
/// the start position is recorded, everything after the opening quote plus a
/// trailing `'\n'` is stored in `string_buffer`, and a
/// `TokenType::StringContinuation` consuming all of `input` is returned.
fn handle_string_start(
    &mut self,
    input: &str,
    line_number: usize,
    column: usize,
) -> Option<(TokenType, usize)> {
    // Everything after the opening quote; bail out if there is no such quote.
    let after_quote = input.strip_prefix('"')?;

    match after_quote.find('"') {
        // The string opens and closes within this input.
        Some(close) => {
            let literal = after_quote[..close].to_string();
            // Literal bytes plus the opening and closing quotes.
            Some((TokenType::String(literal), close + 2))
        }
        // No closing quote: remember where the string began and start buffering.
        None => {
            self.in_string = true;
            self.string_start_line = line_number;
            self.string_start_column = column;
            // The trailing newline separates this piece from the next one.
            self.string_buffer = format!("{after_quote}\n");

            // The whole remaining input belongs to the string.
            Some((TokenType::StringContinuation, input.len()))
        }
    }
}
/// Handles input while a multiline string is pending.
///
/// If `input` contains a double quote, the text before it is appended to
/// `string_buffer`, `in_string` is cleared, and the accumulated buffer is
/// returned as a `TokenType::String`; the consumed length runs up to and
/// including that quote.
///
/// Otherwise the whole of `input` plus a `'\n'` is appended to the buffer and
/// a `TokenType::StringContinuation` consuming all of `input` is returned.
///
/// This function always returns `Some`. The position parameters are
/// currently unused.
fn handle_string_continuation(
    &mut self,
    input: &str,
    _line_number: usize,
    _column: usize,
) -> Option<(TokenType, usize)> {
    match input.split_once('"') {
        // Still no closing quote: absorb the whole input and keep going.
        None => {
            self.string_buffer.push_str(input);
            self.string_buffer.push('\n');
            Some((TokenType::StringContinuation, input.len()))
        }
        // Closing quote found: finish the string with the text before it.
        Some((tail, _rest)) => {
            self.string_buffer.push_str(tail);
            self.in_string = false;
            // Move the buffer out, leaving it empty for the next string.
            let literal = std::mem::take(&mut self.string_buffer);
            // Text before the quote plus the quote itself.
            Some((TokenType::String(literal), tail.len() + 1))
        }
    }
}
fn match_token(
&mut self,
input: &str,
line_number: usize,
column: usize,
) -> Result<(TokenType, usize), AssembleError> {
// Check for string first (including multiline continuations).
if let Some(m) = self.try_match_string(input, line_number, column) {
return Ok(m);
}
if let Some(m) = self.try_match_directive(input) {
return Ok(m);
}
@@ -287,10 +389,6 @@ impl Tokeniser {
return Ok(m);
}
if let Some(m) = self.try_match_string(input) {
return Ok(m);
}
if let Some(m) = self.try_match_symbol(input) {
return Ok(m);
}
+10
View File
@@ -235,3 +235,13 @@ fn test_multiline_with_comments() {
assert!(!(expected != *got), "Expected {expected:?}, got {got:?}");
}
}
// Smoke test: tokenises the bundled `bf.dsa` source file and checks only that
// tokenisation succeeds (the `expect` panics on an `Err`).
// NOTE(review): the test name says "interpreter" but the panic message says
// "compiler" — confirm which one `bf.dsa` actually is.
#[test]
fn test_tokenise_brainf_interpreter() {
// The source is embedded at compile time, so a missing file is a build error.
const SOURCE: &str = include_str!("../../../../resources/dsa/bf.dsa");
let tokens =
tokenize_source(SOURCE).expect("Failed to tokenise the brainfuck compiler!");
// Prints the tokens to stderr for manual inspection; nothing is asserted
// about their contents.
dbg!(tokens);
}