assembler: start tokenising multiline strings (WIP)
This commit is contained in:
@@ -10,6 +10,7 @@ use crate::source::{
|
|||||||
token_info::{DirectiveToken, LabelToken, RegisterToken, SymbolToken},
|
token_info::{DirectiveToken, LabelToken, RegisterToken, SymbolToken},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Represents the different types of tokens that can be produced by the tokeniser.
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
pub enum TokenType {
|
pub enum TokenType {
|
||||||
/// Symbol reference (e.g., `loop_start`, `my_data`).
|
/// Symbol reference (e.g., `loop_start`, `my_data`).
|
||||||
@@ -20,20 +21,22 @@ pub enum TokenType {
|
|||||||
Immediate(u32),
|
Immediate(u32),
|
||||||
/// String literal (e.g., `"hello world"`).
|
/// String literal (e.g., `"hello world"`).
|
||||||
String(String),
|
String(String),
|
||||||
|
/// Intermediate token for multiline strings (filtered out in final output)
|
||||||
|
StringContinuation,
|
||||||
/// Assembly instruction (e.g., `add`, `jmp`, `nop`).
|
/// Assembly instruction (e.g., `add`, `jmp`, `nop`).
|
||||||
Instruction(Opcode),
|
Instruction(Opcode),
|
||||||
/// Label definition (e.g., `loop_start:`).
|
/// Label definition (e.g., `loop_start:`).
|
||||||
Label(LabelToken),
|
Label(LabelToken),
|
||||||
/// Assembler directive (e.g., `.global`, `.section`, `.dw`, `.resb`).
|
/// Assembler directive (e.g., `.global`, `.section`, `.dw`).
|
||||||
Directive(DirectiveToken),
|
Directive(DirectiveToken),
|
||||||
|
/// Comment (e.g., `// this is a comment`).
|
||||||
|
Comment,
|
||||||
/// Comma separator.
|
/// Comma separator.
|
||||||
Comma,
|
Comma,
|
||||||
/// End of line.
|
/// End of line.
|
||||||
Newline,
|
Newline,
|
||||||
/// End of file.
|
/// End of file.
|
||||||
Eof,
|
Eof,
|
||||||
/// A line comment. This is to be filtered out of the token stream.
|
|
||||||
Comment,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
|
|||||||
@@ -43,8 +43,13 @@ pub struct Tokeniser {
|
|||||||
directive_regex: Regex,
|
directive_regex: Regex,
|
||||||
instruction_regex: Regex,
|
instruction_regex: Regex,
|
||||||
symbol_regex: Regex,
|
symbol_regex: Regex,
|
||||||
string_regex: Regex,
|
|
||||||
comment_regex: Regex,
|
comment_regex: Regex,
|
||||||
|
|
||||||
|
// String parsing state
|
||||||
|
in_string: bool,
|
||||||
|
string_buffer: String,
|
||||||
|
string_start_line: usize,
|
||||||
|
string_start_column: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Tokeniser {
|
impl Tokeniser {
|
||||||
@@ -70,12 +75,16 @@ impl Tokeniser {
|
|||||||
r"^(nop|movs?|ld[bhw]s?|st[bhw]|l[lu]i|j(mp|[egl][qte])|cmp|[id]nc|sh[lr]|add[i]?|sub[i]?|x?n?or|and|not|i[rd]t|hlt|lhwmm|lidt|push[a]?|pop[a]?|lwi|return|call)\b",
|
r"^(nop|movs?|ld[bhw]s?|st[bhw]|l[lu]i|j(mp|[egl][qte])|cmp|[id]nc|sh[lr]|add[i]?|sub[i]?|x?n?or|and|not|i[rd]t|hlt|lhwmm|lidt|push[a]?|pop[a]?|lwi|return|call)\b",
|
||||||
)
|
)
|
||||||
.expect("Failed to compile instruction regex pattern"),
|
.expect("Failed to compile instruction regex pattern"),
|
||||||
symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)")
|
symbol_regex: Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*:{2}[a-zA-Z0-9_]*|[a-zA-Z_][a-zA-Z_0-9]*")
|
||||||
.expect("Failed to compile symbol regex pattern"),
|
.expect("Failed to compile symbol regex pattern"),
|
||||||
string_regex: Regex::new(r#"^"([^"]*)"#)
|
|
||||||
.expect("Failed to compile string regex pattern"),
|
|
||||||
comment_regex: Regex::new("^//.*")
|
comment_regex: Regex::new("^//.*")
|
||||||
.expect("Failed to compile comment regex pattern"),
|
.expect("Failed to compile comment regex pattern"),
|
||||||
|
|
||||||
|
// Initialize string parsing state
|
||||||
|
in_string: false,
|
||||||
|
string_buffer: String::new(),
|
||||||
|
string_start_line: 0,
|
||||||
|
string_start_column: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -88,7 +97,10 @@ impl Tokeniser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Note that modules are tokenised in their own threads, possibly in parallel.
|
// Note that modules are tokenised in their own threads, possibly in parallel.
|
||||||
pub fn tokenise(self, ctx: &AssemblerContext) -> Result<Vec<Token>, AssembleError> {
|
pub fn tokenise(
|
||||||
|
mut self,
|
||||||
|
ctx: &AssemblerContext,
|
||||||
|
) -> Result<Vec<Token>, AssembleError> {
|
||||||
let module_name = self.extract_module_name()?;
|
let module_name = self.extract_module_name()?;
|
||||||
|
|
||||||
// Create a module for the source file being processed.
|
// Create a module for the source file being processed.
|
||||||
@@ -116,8 +128,8 @@ impl Tokeniser {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Actually tokenize the line content
|
// Actually tokenise the line content
|
||||||
let line_tokens = self.tokenize_line(&line_span, &module)?;
|
let line_tokens = self.tokenise_line(&line_span, &module)?;
|
||||||
token_stream.extend(line_tokens);
|
token_stream.extend(line_tokens);
|
||||||
|
|
||||||
// Add newline token at end of line
|
// Add newline token at end of line
|
||||||
@@ -137,27 +149,37 @@ impl Tokeniser {
|
|||||||
Ok(token_stream)
|
Ok(token_stream)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn tokenize_line(
|
fn tokenise_line(
|
||||||
&self,
|
&mut self, // Changed to &mut self
|
||||||
line_span: &LineSpan,
|
line_span: &LineSpan,
|
||||||
module: &Arc<Module>,
|
module: &Arc<Module>,
|
||||||
) -> Result<Vec<Token>, AssembleError> {
|
) -> Result<Vec<Token>, AssembleError> {
|
||||||
let mut tokens = Vec::new();
|
let mut tokens = Vec::new();
|
||||||
let mut remaining = line_span.content.trim();
|
let mut remaining = line_span.content.as_str();
|
||||||
let start_column = line_span.start_char;
|
let mut column = 0;
|
||||||
|
|
||||||
|
// Skip leading whitespace
|
||||||
|
let trimmed_start = remaining.trim_start();
|
||||||
|
column += remaining.len() - trimmed_start.len();
|
||||||
|
remaining = trimmed_start;
|
||||||
|
|
||||||
while !remaining.is_empty() {
|
while !remaining.is_empty() {
|
||||||
// Try to match a token.
|
let start_column = column;
|
||||||
let (token_type, consumed) = self.match_token(remaining)?;
|
|
||||||
|
|
||||||
// Filter out comments.
|
// Try to match a token
|
||||||
if token_type == TokenType::Comment {
|
let (token_type, consumed) =
|
||||||
// Advance position.
|
self.match_token(&remaining, line_span.line_number, column)?;
|
||||||
remaining = remaining[consumed..].trim_start();
|
|
||||||
|
|
||||||
continue;
|
// Filter out string continuation tokens and comments
|
||||||
|
match token_type {
|
||||||
|
TokenType::StringContinuation => {
|
||||||
|
// Don't add to token stream, just consume input
|
||||||
}
|
}
|
||||||
|
TokenType::Comment => {
|
||||||
|
// Don't add to token stream, consume rest of line
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
tokens.push(Token::new(
|
tokens.push(Token::new(
|
||||||
token_type,
|
token_type,
|
||||||
SourceInfo::new(
|
SourceInfo::new(
|
||||||
@@ -166,9 +188,17 @@ impl Tokeniser {
|
|||||||
start_column..start_column + consumed,
|
start_column..start_column + consumed,
|
||||||
),
|
),
|
||||||
));
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Advance position.
|
// Advance position
|
||||||
remaining = remaining[consumed..].trim_start();
|
remaining = &remaining[consumed..];
|
||||||
|
column += consumed;
|
||||||
|
|
||||||
|
// Skip whitespace
|
||||||
|
let before_trim = remaining.len();
|
||||||
|
remaining = remaining.trim_start();
|
||||||
|
column += before_trim - remaining.len();
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(tokens)
|
Ok(tokens)
|
||||||
@@ -254,15 +284,87 @@ impl Tokeniser {
|
|||||||
Some((TokenType::Symbol(SymbolToken { name }), len))
|
Some((TokenType::Symbol(SymbolToken { name }), len))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn try_match_string(&self, input: &str) -> Option<(TokenType, usize)> {
|
fn try_match_string(
|
||||||
let caps = self.string_regex.captures(input)?;
|
&mut self,
|
||||||
let content = caps.get(1)?.as_str().to_string();
|
input: &str,
|
||||||
let len = caps.get(0)?.len();
|
line_number: usize,
|
||||||
|
column: usize,
|
||||||
Some((TokenType::String(content), len))
|
) -> Option<(TokenType, usize)> {
|
||||||
|
if self.in_string {
|
||||||
|
// We're continuing a multiline string
|
||||||
|
self.handle_string_continuation(input, line_number, column)
|
||||||
|
} else {
|
||||||
|
// Look for the start of a new string
|
||||||
|
self.handle_string_start(input, line_number, column)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_string_start(
|
||||||
|
&mut self,
|
||||||
|
input: &str,
|
||||||
|
line_number: usize,
|
||||||
|
column: usize,
|
||||||
|
) -> Option<(TokenType, usize)> {
|
||||||
|
if !input.starts_with('"') {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the closing quote on the same line
|
||||||
|
if let Some(end_pos) = input[1..].find('"') {
|
||||||
|
// Complete string on one line
|
||||||
|
let content = input[1..=end_pos].to_string();
|
||||||
|
let len = end_pos + 2; // +2 for both quotes
|
||||||
|
Some((TokenType::String(content), len))
|
||||||
|
} else {
|
||||||
|
// Start of multiline string
|
||||||
|
self.in_string = true;
|
||||||
|
self.string_start_line = line_number;
|
||||||
|
self.string_start_column = column;
|
||||||
|
self.string_buffer = input[1..].to_string(); // Everything after opening quote
|
||||||
|
self.string_buffer.push('\n'); // Add newline for multiline
|
||||||
|
|
||||||
|
// Consume the entire rest of the line
|
||||||
|
Some((TokenType::StringContinuation, input.len()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn handle_string_continuation(
|
||||||
|
&mut self,
|
||||||
|
input: &str,
|
||||||
|
_line_number: usize,
|
||||||
|
_column: usize,
|
||||||
|
) -> Option<(TokenType, usize)> {
|
||||||
|
// Look for closing quote
|
||||||
|
if let Some(end_pos) = input.find('"') {
|
||||||
|
// End of multiline string found
|
||||||
|
self.string_buffer.push_str(&input[..end_pos]);
|
||||||
|
self.in_string = false;
|
||||||
|
|
||||||
|
let content = std::mem::take(&mut self.string_buffer);
|
||||||
|
let len = end_pos + 1; // +1 for the closing quote
|
||||||
|
|
||||||
|
Some((TokenType::String(content), len))
|
||||||
|
} else {
|
||||||
|
// Continue multiline string
|
||||||
|
self.string_buffer.push_str(input);
|
||||||
|
self.string_buffer.push('\n'); // Add newline
|
||||||
|
|
||||||
|
// Consume the entire line
|
||||||
|
Some((TokenType::StringContinuation, input.len()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn match_token(
|
||||||
|
&mut self,
|
||||||
|
input: &str,
|
||||||
|
line_number: usize,
|
||||||
|
column: usize,
|
||||||
|
) -> Result<(TokenType, usize), AssembleError> {
|
||||||
|
// Check for string first (including multiline continuations).
|
||||||
|
if let Some(m) = self.try_match_string(input, line_number, column) {
|
||||||
|
return Ok(m);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn match_token(&self, input: &str) -> Result<(TokenType, usize), AssembleError> {
|
|
||||||
if let Some(m) = self.try_match_directive(input) {
|
if let Some(m) = self.try_match_directive(input) {
|
||||||
return Ok(m);
|
return Ok(m);
|
||||||
}
|
}
|
||||||
@@ -287,10 +389,6 @@ impl Tokeniser {
|
|||||||
return Ok(m);
|
return Ok(m);
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(m) = self.try_match_string(input) {
|
|
||||||
return Ok(m);
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(m) = self.try_match_symbol(input) {
|
if let Some(m) = self.try_match_symbol(input) {
|
||||||
return Ok(m);
|
return Ok(m);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -235,3 +235,13 @@ fn test_multiline_with_comments() {
|
|||||||
assert!(!(expected != *got), "Expected {expected:?}, got {got:?}");
|
assert!(!(expected != *got), "Expected {expected:?}, got {got:?}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Smoke test: feeds the bundled `bf.dsa` source through `tokenize_source`
/// and fails if tokenising does not succeed.
///
/// The resulting tokens are only dumped for inspection; nothing is compared
/// against an expected stream.
#[test]
fn test_tokenise_brainf_interpreter() {
    // Embedded at compile time, so no file I/O happens when the test runs.
    const BF_SOURCE: &str = include_str!("../../../../resources/dsa/bf.dsa");

    let tokens = tokenize_source(BF_SOURCE)
        .expect("Failed to tokenise the brainfuck compiler!");

    // No content assertions; print the tokens for manual review.
    dbg!(tokens);
}
|
||||||
|
|||||||
Reference in New Issue
Block a user