tokeniser: bugfixes to comma handling, regexes

TODO: Verify that the output is as expected. Perhaps I can dump the token stream to a file and compare it with a known-valid one?

I will add some extra tests, of course!
This commit is contained in:
2025-06-29 00:11:36 +01:00
parent 8bb252e941
commit 6ceb35d439
+39 -46
View File
@@ -58,7 +58,7 @@ impl Tokeniser {
label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):")
.expect("Failed to compile label regex pattern"),
register_regex: Regex::new(
r"^(rg([0-9]|[a-f])|acc|spr|bpr|ret|idr|mmr|zero|noreg|pcx)\b",
r"^(rg[0-9a-f]+|acc|spr|bpr|ret|idr|mmr|zero|noreg|pcx)\b",
)
.expect("Failed to compile register regex pattern"),
immediate_regex: Regex::new(
@@ -71,7 +71,7 @@ impl Tokeniser {
r"^(nop|movs?|ld[bhw]s?|st[bhw]|l[lu]i|j(mp|[egl][qte])|cmp|[id]nc|sh[lr]|add[i]?|sub[i]?|x?n?or|and|not|i[rd]t|hlt|lhwmm|lidt|push[a]?|pop[a]?|lwi|return|call)\b",
)
.expect("Failed to compile instruction regex pattern"),
symbol_regex: Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*:{2}[a-zA-Z0-9_]*|[a-zA-Z_][a-zA-Z_0-9]*")
symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)::{2}([a-zA-Z0-9_]*)|([a-zA-Z_][a-zA-Z0-9_]*)")
.expect("Failed to compile symbol regex pattern"),
comment_regex: Regex::new("^//.*")
.expect("Failed to compile comment regex pattern"),
@@ -217,16 +217,11 @@ impl Tokeniser {
/// Tries to match a register name at the start of `input`.
///
/// Returns the register token together with the length of the whole regex
/// match, or `None` when `register_regex` does not match or when the
/// captured name cannot be converted into a `Register`.
fn try_match_register(&self, input: &str) -> Option<(TokenType, usize)> {
    let caps = self.register_regex.captures(input)?;
    // Group 1 is the register name; group 0 is the whole match, whose
    // length tells the caller how much input was consumed.
    let captured_group = caps.get(1)?.as_str();
    let len = caps.get(0)?.len();

    // The conversion is performed exactly once. A failed conversion is
    // deliberately treated as "not a register" rather than as an error, so
    // the error value is discarded.
    // NOTE(review): presumably the regex can accept names that
    // `Register::try_from` rejects — confirm against the `Register` impl.
    let reg = Register::try_from(captured_group).ok()?;

    Some((TokenType::Register(RegisterToken { reg }), len))
}
@@ -274,9 +269,18 @@ impl Tokeniser {
/// Tries to match a symbol at the start of `input`.
///
/// Two shapes are recognised, distinguished by which capture groups of
/// `symbol_regex` participated in the match:
///
/// * scoped: groups 1 and 2 matched; the name is rebuilt as
///   `"{group1}::{group2}"`.
/// * simple: only group 3 matched; the name is that group verbatim.
///
/// Returns the symbol token together with the length of the whole regex
/// match, or `None` when the regex does not match.
fn try_match_symbol(&self, input: &str) -> Option<(TokenType, usize)> {
    let caps = self.symbol_regex.captures(input)?;
    let len = caps.get(0)?.len();

    // Do not read group 1 unconditionally with `?` here: for a simple
    // symbol that group is absent, and bailing out early would make the
    // group-3 fallback below unreachable.
    let name = if let Some(scoped_name) = caps.get(1) {
        // Scoped form: join the two halves with a canonical `::`.
        // NOTE(review): the `symbol_regex` pattern shown in this change
        // uses `::{2}` as the separator, which matches three colons —
        // confirm that is intended.
        format!("{}::{}", scoped_name.as_str(), caps.get(2)?.as_str())
    } else {
        // Simple form: a lone identifier captured by group 3.
        caps.get(3)?.as_str().to_string()
    };

    Some((TokenType::Symbol(SymbolToken { name }), len))
}
@@ -357,6 +361,12 @@ impl Tokeniser {
line_number: usize,
column: usize,
) -> Result<(TokenType, usize), AssembleError> {
dbg!(input);
if input.starts_with(',') {
return Ok((TokenType::Comma, 1));
}
// Check for string first (including multiline continuations).
if let Some(m) = self.try_match_string(input, line_number, column) {
return Ok(m);
@@ -390,43 +400,26 @@ impl Tokeniser {
return Ok(m);
}
let mut idx_iter = 0..;
let mut idx_iter = (column + 1)..;
let Some(idx) = idx_iter.next() else {
unreachable!()
};
let source = SourceInfo::new(line_number, self.module.clone(), idx..idx + 1);
// Handle miscellaneous characters.
match input.chars().next() {
Some(',') => {
_ = idx_iter.next();
Ok((TokenType::Comma, 1))
}
Some(c) => {
let Some(idx) = idx_iter.next() else {
unreachable!()
};
let source =
SourceInfo::new(line_number, self.module.clone(), idx..idx + 1);
Err(AssembleError::new_source_error(
source,
AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedChar(c)),
))
}
None => {
let Some(idx) = idx_iter.next() else {
unreachable!()
};
let source =
SourceInfo::new(line_number, self.module.clone(), idx..idx + 1);
Err(AssembleError::new_source_error(
source,
AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedEndOfInput(
input.len(),
)),
))
}
if let Some(c) = input.chars().next() {
Err(AssembleError::new_source_error(
source,
AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedChar(c)),
))
} else {
Err(AssembleError::new_source_error(
source,
AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedEndOfInput(
input.len(),
)),
))
}
}
}