From 6ceb35d4390768c5a9fee22b51b0d2af97fc0922 Mon Sep 17 00:00:00 2001 From: "J. Hinchliffe" Date: Sun, 29 Jun 2025 00:11:36 +0100 Subject: [PATCH] tokeniser: bugfixes to comma handling, regexes TODO: Verify output is as expected, perhaps I can dump to file and compare token stream with known valid one? Will add some extra tests of course! --- assembler/src/source/tokeniser.rs | 85 ++++++++++++++----------------- 1 file changed, 39 insertions(+), 46 deletions(-) diff --git a/assembler/src/source/tokeniser.rs b/assembler/src/source/tokeniser.rs index 261562d..d180657 100644 --- a/assembler/src/source/tokeniser.rs +++ b/assembler/src/source/tokeniser.rs @@ -58,7 +58,7 @@ impl Tokeniser { label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):") .expect("Failed to compile label regex pattern"), register_regex: Regex::new( - r"^(rg([0-9]|[a-f])|acc|spr|bpr|ret|idr|mmr|zero|noreg|pcx)\b", + r"^(rg[0-9a-f]+|acc|spr|bpr|ret|idr|mmr|zero|noreg|pcx)\b", ) .expect("Failed to compile register regex pattern"), immediate_regex: Regex::new( @@ -71,7 +71,7 @@ impl Tokeniser { r"^(nop|movs?|ld[bhw]s?|st[bhw]|l[lu]i|j(mp|[egl][qte])|cmp|[id]nc|sh[lr]|add[i]?|sub[i]?|x?n?or|and|not|i[rd]t|hlt|lhwmm|lidt|push[a]?|pop[a]?|lwi|return|call)\b", ) .expect("Failed to compile instruction regex pattern"), - symbol_regex: Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*:{2}[a-zA-Z0-9_]*|[a-zA-Z_][a-zA-Z_0-9]*") + symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)::{2}([a-zA-Z0-9_]*)|([a-zA-Z_][a-zA-Z0-9_]*)") .expect("Failed to compile symbol regex pattern"), comment_regex: Regex::new("^//.*") .expect("Failed to compile comment regex pattern"), @@ -217,16 +217,11 @@ impl Tokeniser { fn try_match_register(&self, input: &str) -> Option<(TokenType, usize)> { let caps = self.register_regex.captures(input)?; - let reg = caps.get(1)?.as_str(); + + let captured_group = caps.get(1)?.as_str(); let len = caps.get(0)?.len(); - let reg = match Register::try_from(reg) { - Ok(reg) => reg, - Err(_why) => { - // Probably ignore the error. - return None; - } - }; + let reg = Register::try_from(captured_group).ok()?; Some((TokenType::Register(RegisterToken { reg }), len)) } @@ -274,9 +269,18 @@ impl Tokeniser { fn try_match_symbol(&self, input: &str) -> Option<(TokenType, usize)> { let caps = self.symbol_regex.captures(input)?; - let name = caps.get(1)?.as_str().to_string(); let len = caps.get(0)?.len(); + // Check which capture group matched. + let name = if let Some(scoped_name) = caps.get(1) { + // Matched the scoped symbol pattern (name::scope). + format!("{}::{}", scoped_name.as_str(), caps.get(2)?.as_str()) + } else if let Some(simple_name) = caps.get(3) { + simple_name.as_str().to_string() + } else { + return None; + }; + Some((TokenType::Symbol(SymbolToken { name }), len)) } @@ -357,6 +361,12 @@ impl Tokeniser { line_number: usize, column: usize, ) -> Result<(TokenType, usize), AssembleError> { + dbg!(input); + + if input.starts_with(',') { + return Ok((TokenType::Comma, 1)); + } + // Check for string first (including multiline continuations). if let Some(m) = self.try_match_string(input, line_number, column) { return Ok(m); @@ -390,43 +400,26 @@ impl Tokeniser { return Ok(m); } - let mut idx_iter = 0..; + let mut idx_iter = (column + 1)..; + let Some(idx) = idx_iter.next() else { + unreachable!() + }; + + let source = SourceInfo::new(line_number, self.module.clone(), idx..idx + 1); // Handle miscellaneous characters. - match input.chars().next() { - Some(',') => { - _ = idx_iter.next(); - - Ok((TokenType::Comma, 1)) - } - Some(c) => { - let Some(idx) = idx_iter.next() else { - unreachable!() - }; - - let source = - SourceInfo::new(line_number, self.module.clone(), idx..idx + 1); - - Err(AssembleError::new_source_error( - source, - AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedChar(c)), - )) - } - None => { - let Some(idx) = idx_iter.next() else { - unreachable!() - }; - - let source = - SourceInfo::new(line_number, self.module.clone(), idx..idx + 1); - - Err(AssembleError::new_source_error( - source, - AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedEndOfInput( - input.len(), - )), - )) - } + if let Some(c) = input.chars().next() { + Err(AssembleError::new_source_error( + source, + AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedChar(c)), + )) + } else { + Err(AssembleError::new_source_error( + source, + AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedEndOfInput( + input.len(), + )), + )) } } }