tokeniser: bugfixes to comma handling, regexes

TODO: Verify that the output is as expected; perhaps I can dump the token stream to a file and compare it with a known-valid one.

I will add some extra tests, of course!
This commit is contained in:
2025-06-29 00:11:36 +01:00
parent 8bb252e941
commit 6ceb35d439
+39 -46
View File
@@ -58,7 +58,7 @@ impl Tokeniser {
label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):") label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):")
.expect("Failed to compile label regex pattern"), .expect("Failed to compile label regex pattern"),
register_regex: Regex::new( register_regex: Regex::new(
r"^(rg([0-9]|[a-f])|acc|spr|bpr|ret|idr|mmr|zero|noreg|pcx)\b", r"^(rg[0-9a-f]+|acc|spr|bpr|ret|idr|mmr|zero|noreg|pcx)\b",
) )
.expect("Failed to compile register regex pattern"), .expect("Failed to compile register regex pattern"),
immediate_regex: Regex::new( immediate_regex: Regex::new(
@@ -71,7 +71,7 @@ impl Tokeniser {
r"^(nop|movs?|ld[bhw]s?|st[bhw]|l[lu]i|j(mp|[egl][qte])|cmp|[id]nc|sh[lr]|add[i]?|sub[i]?|x?n?or|and|not|i[rd]t|hlt|lhwmm|lidt|push[a]?|pop[a]?|lwi|return|call)\b", r"^(nop|movs?|ld[bhw]s?|st[bhw]|l[lu]i|j(mp|[egl][qte])|cmp|[id]nc|sh[lr]|add[i]?|sub[i]?|x?n?or|and|not|i[rd]t|hlt|lhwmm|lidt|push[a]?|pop[a]?|lwi|return|call)\b",
) )
.expect("Failed to compile instruction regex pattern"), .expect("Failed to compile instruction regex pattern"),
symbol_regex: Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*:{2}[a-zA-Z0-9_]*|[a-zA-Z_][a-zA-Z_0-9]*") symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)::{2}([a-zA-Z0-9_]*)|([a-zA-Z_][a-zA-Z0-9_]*)")
.expect("Failed to compile symbol regex pattern"), .expect("Failed to compile symbol regex pattern"),
comment_regex: Regex::new("^//.*") comment_regex: Regex::new("^//.*")
.expect("Failed to compile comment regex pattern"), .expect("Failed to compile comment regex pattern"),
@@ -217,16 +217,11 @@ impl Tokeniser {
fn try_match_register(&self, input: &str) -> Option<(TokenType, usize)> { fn try_match_register(&self, input: &str) -> Option<(TokenType, usize)> {
let caps = self.register_regex.captures(input)?; let caps = self.register_regex.captures(input)?;
let reg = caps.get(1)?.as_str();
let captured_group = caps.get(1)?.as_str();
let len = caps.get(0)?.len(); let len = caps.get(0)?.len();
let reg = match Register::try_from(reg) { let reg = Register::try_from(captured_group).ok()?;
Ok(reg) => reg,
Err(_why) => {
// Probably ignore the error.
return None;
}
};
Some((TokenType::Register(RegisterToken { reg }), len)) Some((TokenType::Register(RegisterToken { reg }), len))
} }
@@ -274,9 +269,18 @@ impl Tokeniser {
fn try_match_symbol(&self, input: &str) -> Option<(TokenType, usize)> { fn try_match_symbol(&self, input: &str) -> Option<(TokenType, usize)> {
let caps = self.symbol_regex.captures(input)?; let caps = self.symbol_regex.captures(input)?;
let name = caps.get(1)?.as_str().to_string();
let len = caps.get(0)?.len(); let len = caps.get(0)?.len();
// Check which capture group matched.
let name = if let Some(scoped_name) = caps.get(1) {
// Matched the scoped symbol pattern (name::scope).
format!("{}::{}", scoped_name.as_str(), caps.get(2)?.as_str())
} else if let Some(simple_name) = caps.get(3) {
simple_name.as_str().to_string()
} else {
return None;
};
Some((TokenType::Symbol(SymbolToken { name }), len)) Some((TokenType::Symbol(SymbolToken { name }), len))
} }
@@ -357,6 +361,12 @@ impl Tokeniser {
line_number: usize, line_number: usize,
column: usize, column: usize,
) -> Result<(TokenType, usize), AssembleError> { ) -> Result<(TokenType, usize), AssembleError> {
dbg!(input);
if input.starts_with(',') {
return Ok((TokenType::Comma, 1));
}
// Check for string first (including multiline continuations). // Check for string first (including multiline continuations).
if let Some(m) = self.try_match_string(input, line_number, column) { if let Some(m) = self.try_match_string(input, line_number, column) {
return Ok(m); return Ok(m);
@@ -390,43 +400,26 @@ impl Tokeniser {
return Ok(m); return Ok(m);
} }
let mut idx_iter = 0..; let mut idx_iter = (column + 1)..;
let Some(idx) = idx_iter.next() else {
unreachable!()
};
let source = SourceInfo::new(line_number, self.module.clone(), idx..idx + 1);
// Handle miscellaneous characters. // Handle miscellaneous characters.
match input.chars().next() { if let Some(c) = input.chars().next() {
Some(',') => { Err(AssembleError::new_source_error(
_ = idx_iter.next(); source,
AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedChar(c)),
Ok((TokenType::Comma, 1)) ))
} } else {
Some(c) => { Err(AssembleError::new_source_error(
let Some(idx) = idx_iter.next() else { source,
unreachable!() AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedEndOfInput(
}; input.len(),
)),
let source = ))
SourceInfo::new(line_number, self.module.clone(), idx..idx + 1);
Err(AssembleError::new_source_error(
source,
AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedChar(c)),
))
}
None => {
let Some(idx) = idx_iter.next() else {
unreachable!()
};
let source =
SourceInfo::new(line_number, self.module.clone(), idx..idx + 1);
Err(AssembleError::new_source_error(
source,
AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedEndOfInput(
input.len(),
)),
))
}
} }
} }
} }