assembler: start tokenising multiline strings (WIP)
This commit is contained in:
@@ -10,6 +10,7 @@ use crate::source::{
|
||||
token_info::{DirectiveToken, LabelToken, RegisterToken, SymbolToken},
|
||||
};
|
||||
|
||||
/// Represents the different types of tokens that can be produced by the tokeniser.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub enum TokenType {
|
||||
/// Symbol reference (e.g., `loop_start`, `my_data`).
|
||||
@@ -20,20 +21,22 @@ pub enum TokenType {
|
||||
Immediate(u32),
|
||||
/// String literal (e.g., `"hello world"`).
|
||||
String(String),
|
||||
/// Intermediate token for multiline strings (filtered out in final output)
|
||||
StringContinuation,
|
||||
/// Assembly instruction (e.g., `add`, `jmp`, `nop`).
|
||||
Instruction(Opcode),
|
||||
/// Label definition (e.g., `loop_start:`).
|
||||
Label(LabelToken),
|
||||
/// Assembler directive (e.g., `.global`, `.section`, `.dw`, `.resb`).
|
||||
/// Assembler directive (e.g., `.global`, `.section`, `.dw`).
|
||||
Directive(DirectiveToken),
|
||||
/// Comment (e.g., `// this is a comment`).
|
||||
Comment,
|
||||
/// Comma separator.
|
||||
Comma,
|
||||
/// End of line.
|
||||
Newline,
|
||||
/// End of file.
|
||||
Eof,
|
||||
/// A line comment. This is to be filtered out of the token stream.
|
||||
Comment,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
|
||||
@@ -43,8 +43,13 @@ pub struct Tokeniser {
|
||||
directive_regex: Regex,
|
||||
instruction_regex: Regex,
|
||||
symbol_regex: Regex,
|
||||
string_regex: Regex,
|
||||
comment_regex: Regex,
|
||||
|
||||
// String parsing state
|
||||
in_string: bool,
|
||||
string_buffer: String,
|
||||
string_start_line: usize,
|
||||
string_start_column: usize,
|
||||
}
|
||||
|
||||
impl Tokeniser {
|
||||
@@ -70,12 +75,16 @@ impl Tokeniser {
|
||||
r"^(nop|movs?|ld[bhw]s?|st[bhw]|l[lu]i|j(mp|[egl][qte])|cmp|[id]nc|sh[lr]|add[i]?|sub[i]?|x?n?or|and|not|i[rd]t|hlt|lhwmm|lidt|push[a]?|pop[a]?|lwi|return|call)\b",
|
||||
)
|
||||
.expect("Failed to compile instruction regex pattern"),
|
||||
symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)")
|
||||
symbol_regex: Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*:{2}[a-zA-Z0-9_]*|[a-zA-Z_][a-zA-Z_0-9]*")
|
||||
.expect("Failed to compile symbol regex pattern"),
|
||||
string_regex: Regex::new(r#"^"([^"]*)"#)
|
||||
.expect("Failed to compile string regex pattern"),
|
||||
comment_regex: Regex::new("^//.*")
|
||||
.expect("Failed to compile comment regex pattern"),
|
||||
|
||||
// Initialize string parsing state
|
||||
in_string: false,
|
||||
string_buffer: String::new(),
|
||||
string_start_line: 0,
|
||||
string_start_column: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -88,7 +97,10 @@ impl Tokeniser {
|
||||
}
|
||||
|
||||
// Note that modules are tokenised in their own threads, possibly in parallel.
|
||||
pub fn tokenise(self, ctx: &AssemblerContext) -> Result<Vec<Token>, AssembleError> {
|
||||
pub fn tokenise(
|
||||
mut self,
|
||||
ctx: &AssemblerContext,
|
||||
) -> Result<Vec<Token>, AssembleError> {
|
||||
let module_name = self.extract_module_name()?;
|
||||
|
||||
// Create a module for the source file being processed.
|
||||
@@ -116,8 +128,8 @@ impl Tokeniser {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Actually tokenize the line content
|
||||
let line_tokens = self.tokenize_line(&line_span, &module)?;
|
||||
// Actually tokenise the line content
|
||||
let line_tokens = self.tokenise_line(&line_span, &module)?;
|
||||
token_stream.extend(line_tokens);
|
||||
|
||||
// Add newline token at end of line
|
||||
@@ -137,38 +149,56 @@ impl Tokeniser {
|
||||
Ok(token_stream)
|
||||
}
|
||||
|
||||
fn tokenize_line(
|
||||
&self,
|
||||
fn tokenise_line(
|
||||
&mut self, // Changed to &mut self
|
||||
line_span: &LineSpan,
|
||||
module: &Arc<Module>,
|
||||
) -> Result<Vec<Token>, AssembleError> {
|
||||
let mut tokens = Vec::new();
|
||||
let mut remaining = line_span.content.trim();
|
||||
let start_column = line_span.start_char;
|
||||
let mut remaining = line_span.content.as_str();
|
||||
let mut column = 0;
|
||||
|
||||
// Skip leading whitespace
|
||||
let trimmed_start = remaining.trim_start();
|
||||
column += remaining.len() - trimmed_start.len();
|
||||
remaining = trimmed_start;
|
||||
|
||||
while !remaining.is_empty() {
|
||||
// Try to match a token.
|
||||
let (token_type, consumed) = self.match_token(remaining)?;
|
||||
let start_column = column;
|
||||
|
||||
// Filter out comments.
|
||||
if token_type == TokenType::Comment {
|
||||
// Advance position.
|
||||
remaining = remaining[consumed..].trim_start();
|
||||
// Try to match a token
|
||||
let (token_type, consumed) =
|
||||
self.match_token(&remaining, line_span.line_number, column)?;
|
||||
|
||||
continue;
|
||||
// Filter out string continuation tokens and comments
|
||||
match token_type {
|
||||
TokenType::StringContinuation => {
|
||||
// Don't add to token stream, just consume input
|
||||
}
|
||||
TokenType::Comment => {
|
||||
// Don't add to token stream, consume rest of line
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
tokens.push(Token::new(
|
||||
token_type,
|
||||
SourceInfo::new(
|
||||
line_span.line_number,
|
||||
module.clone(),
|
||||
start_column..start_column + consumed,
|
||||
),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
tokens.push(Token::new(
|
||||
token_type,
|
||||
SourceInfo::new(
|
||||
line_span.line_number,
|
||||
module.clone(),
|
||||
start_column..start_column + consumed,
|
||||
),
|
||||
));
|
||||
// Advance position
|
||||
remaining = &remaining[consumed..];
|
||||
column += consumed;
|
||||
|
||||
// Advance position.
|
||||
remaining = remaining[consumed..].trim_start();
|
||||
// Skip whitespace
|
||||
let before_trim = remaining.len();
|
||||
remaining = remaining.trim_start();
|
||||
column += before_trim - remaining.len();
|
||||
}
|
||||
|
||||
Ok(tokens)
|
||||
@@ -254,15 +284,87 @@ impl Tokeniser {
|
||||
Some((TokenType::Symbol(SymbolToken { name }), len))
|
||||
}
|
||||
|
||||
fn try_match_string(&self, input: &str) -> Option<(TokenType, usize)> {
|
||||
let caps = self.string_regex.captures(input)?;
|
||||
let content = caps.get(1)?.as_str().to_string();
|
||||
let len = caps.get(0)?.len();
|
||||
|
||||
Some((TokenType::String(content), len))
|
||||
fn try_match_string(
|
||||
&mut self,
|
||||
input: &str,
|
||||
line_number: usize,
|
||||
column: usize,
|
||||
) -> Option<(TokenType, usize)> {
|
||||
if self.in_string {
|
||||
// We're continuing a multiline string
|
||||
self.handle_string_continuation(input, line_number, column)
|
||||
} else {
|
||||
// Look for the start of a new string
|
||||
self.handle_string_start(input, line_number, column)
|
||||
}
|
||||
}
|
||||
|
||||
fn match_token(&self, input: &str) -> Result<(TokenType, usize), AssembleError> {
|
||||
fn handle_string_start(
|
||||
&mut self,
|
||||
input: &str,
|
||||
line_number: usize,
|
||||
column: usize,
|
||||
) -> Option<(TokenType, usize)> {
|
||||
if !input.starts_with('"') {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Find the closing quote on the same line
|
||||
if let Some(end_pos) = input[1..].find('"') {
|
||||
// Complete string on one line
|
||||
let content = input[1..=end_pos].to_string();
|
||||
let len = end_pos + 2; // +2 for both quotes
|
||||
Some((TokenType::String(content), len))
|
||||
} else {
|
||||
// Start of multiline string
|
||||
self.in_string = true;
|
||||
self.string_start_line = line_number;
|
||||
self.string_start_column = column;
|
||||
self.string_buffer = input[1..].to_string(); // Everything after opening quote
|
||||
self.string_buffer.push('\n'); // Add newline for multiline
|
||||
|
||||
// Consume the entire rest of the line
|
||||
Some((TokenType::StringContinuation, input.len()))
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_string_continuation(
|
||||
&mut self,
|
||||
input: &str,
|
||||
_line_number: usize,
|
||||
_column: usize,
|
||||
) -> Option<(TokenType, usize)> {
|
||||
// Look for closing quote
|
||||
if let Some(end_pos) = input.find('"') {
|
||||
// End of multiline string found
|
||||
self.string_buffer.push_str(&input[..end_pos]);
|
||||
self.in_string = false;
|
||||
|
||||
let content = std::mem::take(&mut self.string_buffer);
|
||||
let len = end_pos + 1; // +1 for the closing quote
|
||||
|
||||
Some((TokenType::String(content), len))
|
||||
} else {
|
||||
// Continue multiline string
|
||||
self.string_buffer.push_str(input);
|
||||
self.string_buffer.push('\n'); // Add newline
|
||||
|
||||
// Consume the entire line
|
||||
Some((TokenType::StringContinuation, input.len()))
|
||||
}
|
||||
}
|
||||
|
||||
fn match_token(
|
||||
&mut self,
|
||||
input: &str,
|
||||
line_number: usize,
|
||||
column: usize,
|
||||
) -> Result<(TokenType, usize), AssembleError> {
|
||||
// Check for string first (including multiline continuations).
|
||||
if let Some(m) = self.try_match_string(input, line_number, column) {
|
||||
return Ok(m);
|
||||
}
|
||||
|
||||
if let Some(m) = self.try_match_directive(input) {
|
||||
return Ok(m);
|
||||
}
|
||||
@@ -287,10 +389,6 @@ impl Tokeniser {
|
||||
return Ok(m);
|
||||
}
|
||||
|
||||
if let Some(m) = self.try_match_string(input) {
|
||||
return Ok(m);
|
||||
}
|
||||
|
||||
if let Some(m) = self.try_match_symbol(input) {
|
||||
return Ok(m);
|
||||
}
|
||||
|
||||
@@ -235,3 +235,13 @@ fn test_multiline_with_comments() {
|
||||
assert!(!(expected != *got), "Expected {expected:?}, got {got:?}");
|
||||
}
|
||||
}
|
||||
|
||||
// Smoke test: runs `tokenize_source` over the bundled `bf.dsa` program and
// dumps the resulting tokens with `dbg!`. Nothing is asserted about the
// tokens themselves; the test only fails if `expect` panics.
#[test]
|
||||
fn test_tokenise_brainf_interpreter() {
|
||||
// The program text is embedded at compile time from the resources directory.
const SOURCE: &str = include_str!("../../../../resources/dsa/bf.dsa");
|
||||
|
||||
// NOTE(review): the panic message says "compiler" while the test name says
// "interpreter" - confirm which one `bf.dsa` actually is.
let tokens =
|
||||
tokenize_source(SOURCE).expect("Failed to tokenise the brainfuck compiler!");
|
||||
|
||||
// Debug dump only (WIP); no expectations are checked against the output.
dbg!(tokens);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user