Compare commits
3 Commits
7cb7525484
...
40f8b1d57b
| Author | SHA1 | Date | |
|---|---|---|---|
| 40f8b1d57b | |||
| 68e459f32b | |||
| d9807b5b36 |
@@ -21,7 +21,7 @@ use crate::{
|
|||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct SourceInfo {
|
pub struct SourceInfo {
|
||||||
/// The line number within the source file underpinned by `module_id`.
|
/// The line number within the source file underpinned by `module_id`.
|
||||||
pub line_no: usize,
|
pub line_number: usize,
|
||||||
pub module: Arc<Module>,
|
pub module: Arc<Module>,
|
||||||
/// The indexes where this token may be found (line-local).
|
/// The indexes where this token may be found (line-local).
|
||||||
pub span: std::ops::Range<usize>,
|
pub span: std::ops::Range<usize>,
|
||||||
@@ -33,7 +33,7 @@ impl Display for SourceInfo {
|
|||||||
f,
|
f,
|
||||||
"{}:{}, column {}",
|
"{}:{}, column {}",
|
||||||
self.module.path.display(),
|
self.module.path.display(),
|
||||||
self.line_no,
|
self.line_number,
|
||||||
self.span.start
|
self.span.start
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -47,7 +47,7 @@ impl SourceInfo {
|
|||||||
span: std::ops::Range<usize>,
|
span: std::ops::Range<usize>,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
Self {
|
Self {
|
||||||
line_no,
|
line_number: line_no,
|
||||||
module,
|
module,
|
||||||
span,
|
span,
|
||||||
}
|
}
|
||||||
@@ -61,7 +61,7 @@ impl SourceInfo {
|
|||||||
|
|
||||||
let mut lines = LinesWithSpans::new(rdr);
|
let mut lines = LinesWithSpans::new(rdr);
|
||||||
|
|
||||||
let Some(line_result) = lines.nth(self.line_no - 1) else {
|
let Some(line_result) = lines.nth(self.line_number - 1) else {
|
||||||
// Handle a line not existing.
|
// Handle a line not existing.
|
||||||
return Err(AssembleError::new_source_error(
|
return Err(AssembleError::new_source_error(
|
||||||
self.clone(),
|
self.clone(),
|
||||||
@@ -69,7 +69,7 @@ impl SourceInfo {
|
|||||||
IoErrorKind::Other,
|
IoErrorKind::Other,
|
||||||
Some(format!(
|
Some(format!(
|
||||||
"the line {} does not exist in input file `{}` but source info suggested otherwise!.",
|
"the line {} does not exist in input file `{}` but source info suggested otherwise!.",
|
||||||
self.line_no,
|
self.line_number,
|
||||||
self.module.path.display()
|
self.module.path.display()
|
||||||
)),
|
)),
|
||||||
)),
|
)),
|
||||||
@@ -79,7 +79,7 @@ impl SourceInfo {
|
|||||||
let line_span = line_result?;
|
let line_span = line_result?;
|
||||||
|
|
||||||
// Print the line number and line content.
|
// Print the line number and line content.
|
||||||
println!("{:>4} | {}", self.line_no, line_span.content);
|
println!("{:>4} | {}", self.line_number, line_span.content);
|
||||||
|
|
||||||
let mut underline = String::new();
|
let mut underline = String::new();
|
||||||
write!(underline, "{:>4} | ", "")?;
|
write!(underline, "{:>4} | ", "")?;
|
||||||
|
|||||||
@@ -2,6 +2,8 @@
|
|||||||
//! easier to build from scratch and edit his code than it would be to try and wrangle it
|
//! easier to build from scratch and edit his code than it would be to try and wrangle it
|
||||||
//! into shape.
|
//! into shape.
|
||||||
|
|
||||||
|
use common::prelude::*;
|
||||||
|
|
||||||
use crate::source::{
|
use crate::source::{
|
||||||
source_info::SourceInfo,
|
source_info::SourceInfo,
|
||||||
token_info::{
|
token_info::{
|
||||||
@@ -71,8 +73,8 @@ impl Token {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[must_use]
|
#[must_use]
|
||||||
pub const fn register(name: String, source_info: SourceInfo) -> Self {
|
pub const fn register(reg: Register, source_info: SourceInfo) -> Self {
|
||||||
Self::new(TokenType::Register(RegisterToken { name }), source_info)
|
Self::new(TokenType::Register(RegisterToken { reg }), source_info)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[must_use]
|
#[must_use]
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
use common::prelude::Register;
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
pub struct SymbolToken {
|
pub struct SymbolToken {
|
||||||
pub name: String,
|
pub name: String,
|
||||||
@@ -15,7 +17,15 @@ pub struct DirectiveToken {
|
|||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
pub struct RegisterToken {
|
pub struct RegisterToken {
|
||||||
pub name: String,
|
pub reg: Register,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RegisterToken {
|
||||||
|
/// Returns the name of a valid [`Register`]
|
||||||
|
#[must_use]
|
||||||
|
pub fn name(&self) -> String {
|
||||||
|
self.reg.to_string()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
|
|||||||
@@ -8,6 +8,8 @@ use std::{
|
|||||||
|
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
|
|
||||||
|
use common::prelude::*;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
context::AssemblerContext,
|
context::AssemblerContext,
|
||||||
error::{AssembleError, AssembleErrorKind, IoError, IoErrorKind},
|
error::{AssembleError, AssembleErrorKind, IoError, IoErrorKind},
|
||||||
@@ -18,12 +20,14 @@ use crate::{
|
|||||||
source_info::SourceInfo,
|
source_info::SourceInfo,
|
||||||
token::{Token, TokenType},
|
token::{Token, TokenType},
|
||||||
token_info::{
|
token_info::{
|
||||||
DirectiveToken, InstructionToken, LabelToken, RegisterToken, SymbolToken,
|
DirectiveToken, LabelToken, RegisterToken, SymbolToken,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub mod error;
|
pub mod error;
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests;
|
||||||
|
|
||||||
/// Consumes a [`Vec<u8>`] and outputs a [`Vec`] of [Token]'s.
|
/// Consumes a [`Vec<u8>`] and outputs a [`Vec`] of [Token]'s.
|
||||||
pub struct Tokeniser {
|
pub struct Tokeniser {
|
||||||
@@ -34,10 +38,10 @@ pub struct Tokeniser {
|
|||||||
|
|
||||||
// Pre-compiled regex patterns
|
// Pre-compiled regex patterns
|
||||||
label_regex: Regex,
|
label_regex: Regex,
|
||||||
register_regex: Regex,
|
// register_regex: Regex,
|
||||||
immediate_regex: Regex,
|
immediate_regex: Regex,
|
||||||
directive_regex: Regex,
|
directive_regex: Regex,
|
||||||
instruction_regex: Regex,
|
// instruction_regex: Regex,
|
||||||
symbol_regex: Regex,
|
symbol_regex: Regex,
|
||||||
string_regex: Regex,
|
string_regex: Regex,
|
||||||
comment_regex: Regex,
|
comment_regex: Regex,
|
||||||
@@ -52,16 +56,18 @@ impl Tokeniser {
|
|||||||
|
|
||||||
label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):")
|
label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):")
|
||||||
.expect("Failed to compile label regex pattern"),
|
.expect("Failed to compile label regex pattern"),
|
||||||
register_regex: Regex::new(r"^(r[0-9]+|sp|fp|pc)")
|
// register_regex: Regex::new(r"^(r[0-9]+|sp|fp|pc)")
|
||||||
.expect("Failed to compile register regex pattern"),
|
// .expect("Failed to compile register regex pattern"),
|
||||||
immediate_regex: Regex::new(r"^(0x[0-9a-fA-F]+|[0-9]+)")
|
immediate_regex: Regex::new(
|
||||||
.expect("Failed to compile immediate regex pattern"),
|
r"^(0x[0-9a-fA-F_]+|0b[0-1_]+|0o[0-7_]+|[0-9_]+)",
|
||||||
|
)
|
||||||
|
.expect("Failed to compile immediate regex pattern"),
|
||||||
directive_regex: Regex::new(r"^\.([a-zA-Z]+)")
|
directive_regex: Regex::new(r"^\.([a-zA-Z]+)")
|
||||||
.expect("Failed to compile directive regex pattern"),
|
.expect("Failed to compile directive regex pattern"),
|
||||||
instruction_regex: Regex::new(
|
// instruction_regex: Regex::new(
|
||||||
r"^(add|sub|mul|div|jmp|call|ret|lli|nop|halt)",
|
// r"^(add|sub|mul|div|jmp|call|ret|lli|nop|halt)",
|
||||||
)
|
// )
|
||||||
.expect("Failed to compile instruction regex pattern"),
|
// .expect("Failed to compile instruction regex pattern"),
|
||||||
symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)")
|
symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)")
|
||||||
.expect("Failed to compile symbol regex pattern"),
|
.expect("Failed to compile symbol regex pattern"),
|
||||||
string_regex: Regex::new(r#"^"([^"]*)"#)
|
string_regex: Regex::new(r#"^"([^"]*)"#)
|
||||||
@@ -142,6 +148,14 @@ impl Tokeniser {
|
|||||||
// Try to match a token.
|
// Try to match a token.
|
||||||
let (token_type, consumed) = self.match_token(remaining)?;
|
let (token_type, consumed) = self.match_token(remaining)?;
|
||||||
|
|
||||||
|
// Filter out comments.
|
||||||
|
if token_type == TokenType::Comment {
|
||||||
|
// Advance position.
|
||||||
|
remaining = remaining[consumed..].trim_start();
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
tokens.push(Token::new(
|
tokens.push(Token::new(
|
||||||
token_type,
|
token_type,
|
||||||
SourceInfo::new(
|
SourceInfo::new(
|
||||||
@@ -174,11 +188,19 @@ impl Tokeniser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn try_match_register(&self, input: &str) -> Option<(TokenType, usize)> {
|
fn try_match_register(&self, input: &str) -> Option<(TokenType, usize)> {
|
||||||
let caps = self.register_regex.captures(input)?;
|
_ = self;
|
||||||
let name = caps.get(1)?.as_str().to_string();
|
|
||||||
let len = caps.get(0)?.len();
|
|
||||||
|
|
||||||
Some((TokenType::Register(RegisterToken { name }), len))
|
let reg = match Register::try_from(input) {
|
||||||
|
Ok(reg) => reg,
|
||||||
|
Err(_why) => {
|
||||||
|
// Probably ignore the error.
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let len = input.len();
|
||||||
|
|
||||||
|
Some((TokenType::Register(RegisterToken { reg }), len))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn try_match_immediate(&self, input: &str) -> Option<(TokenType, usize)> {
|
fn try_match_immediate(&self, input: &str) -> Option<(TokenType, usize)> {
|
||||||
@@ -186,6 +208,11 @@ impl Tokeniser {
|
|||||||
let value_str = caps.get(1)?.as_str();
|
let value_str = caps.get(1)?.as_str();
|
||||||
let len = caps.get(0)?.len();
|
let len = caps.get(0)?.len();
|
||||||
|
|
||||||
|
// Remove any underscores that were inserted for readability.
|
||||||
|
let value_str = value_str.replace('_', "");
|
||||||
|
|
||||||
|
dbg!(&value_str);
|
||||||
|
|
||||||
let value = if let Some(hex_part) = value_str.strip_prefix("0x") {
|
let value = if let Some(hex_part) = value_str.strip_prefix("0x") {
|
||||||
u32::from_str_radix(hex_part, 16).ok()?
|
u32::from_str_radix(hex_part, 16).ok()?
|
||||||
} else if let Some(bin_part) = value_str.strip_prefix("0b") {
|
} else if let Some(bin_part) = value_str.strip_prefix("0b") {
|
||||||
@@ -207,12 +234,15 @@ impl Tokeniser {
|
|||||||
Some((TokenType::Directive(DirectiveToken { directive }), len))
|
Some((TokenType::Directive(DirectiveToken { directive }), len))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn try_match_instruction(&self, input: &str) -> Option<(TokenType, usize)> {
|
const fn try_match_instruction(&self, _input: &str) -> Option<(TokenType, usize)> {
|
||||||
let caps = self.instruction_regex.captures(input)?;
|
_ = self;
|
||||||
let mnemonic = caps.get(1)?.as_str().to_string();
|
|
||||||
let len = caps.get(0)?.len();
|
|
||||||
|
|
||||||
Some((TokenType::Instruction(InstructionToken { mnemonic }), len))
|
// let instruction =
|
||||||
|
// Some((TokenType::Instruction(InstructionToken { mnemonic }), len))
|
||||||
|
|
||||||
|
// TODO: fix me.
|
||||||
|
|
||||||
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
fn try_match_symbol(&self, input: &str) -> Option<(TokenType, usize)> {
|
fn try_match_symbol(&self, input: &str) -> Option<(TokenType, usize)> {
|
||||||
|
|||||||
@@ -0,0 +1,187 @@
|
|||||||
|
//! Unit tests for the tokenizer
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
context::AssemblerContext,
|
||||||
|
source::{
|
||||||
|
token::{Token, TokenType},
|
||||||
|
tokeniser::Tokeniser,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
/// Helper function to create a tokenizer from source text
|
||||||
|
fn create_tokenizer_from_source(source: &str) -> Tokeniser {
|
||||||
|
let data = source.as_bytes().to_vec();
|
||||||
|
let path = PathBuf::from("test.dsa");
|
||||||
|
Tokeniser::from_data(data, path)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Helper function to tokenize source and return tokens
|
||||||
|
fn tokenize_source(source: &str) -> Result<Vec<Token>, crate::error::AssembleError> {
|
||||||
|
let tokenizer = create_tokenizer_from_source(source);
|
||||||
|
let context = AssemblerContext::new();
|
||||||
|
tokenizer.tokenise(&context)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Helper function to extract token types from a token vector
|
||||||
|
fn extract_token_types(tokens: &[Token]) -> Vec<&TokenType> {
|
||||||
|
tokens.iter().map(|t| &t.token_type).collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_empty_source() {
|
||||||
|
let tokens = tokenize_source("").expect("Failed to tokenize empty source");
|
||||||
|
|
||||||
|
// Should have at least EOF token
|
||||||
|
assert!(!tokens.is_empty());
|
||||||
|
assert!(matches!(
|
||||||
|
tokens
|
||||||
|
.last()
|
||||||
|
.expect("Expected at least one token")
|
||||||
|
.token_type,
|
||||||
|
TokenType::Eof
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_whitespace_only() {
|
||||||
|
let tokens = tokenize_source(" \n \n ").expect("Failed to tokenize whitespace");
|
||||||
|
|
||||||
|
// Should have newlines and EOF
|
||||||
|
let token_types = extract_token_types(&tokens);
|
||||||
|
assert!(token_types.iter().any(|t| matches!(t, TokenType::Newline)));
|
||||||
|
assert!(token_types.iter().any(|t| matches!(t, TokenType::Eof)));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_single_instruction() {
|
||||||
|
let tokens = tokenize_source("add").expect("Failed to tokenize instruction");
|
||||||
|
let token_types = extract_token_types(&tokens);
|
||||||
|
|
||||||
|
// Should have instruction, newline, and EOF
|
||||||
|
assert!(
|
||||||
|
token_types
|
||||||
|
.iter()
|
||||||
|
.any(|t| matches!(t, TokenType::Instruction(_)))
|
||||||
|
);
|
||||||
|
if let TokenType::Instruction(instr) = &tokens[0].token_type {
|
||||||
|
assert_eq!(instr.mnemonic, "add");
|
||||||
|
} else {
|
||||||
|
panic!("Expected instruction token");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_all_instructions() {
|
||||||
|
let instructions = [
|
||||||
|
"add", "sub", "mul", "div", "jmp", "call", "ret", "lli", "nop", "halt",
|
||||||
|
];
|
||||||
|
|
||||||
|
for instr in &instructions {
|
||||||
|
let tokens = tokenize_source(instr).expect("Failed to tokenize instruction");
|
||||||
|
|
||||||
|
if let TokenType::Instruction(parsed_instr) = &tokens[0].token_type {
|
||||||
|
assert_eq!(parsed_instr.mnemonic, *instr);
|
||||||
|
} else {
|
||||||
|
panic!("Expected instruction token for {instr}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_registers() {
|
||||||
|
let test_cases = [("rg0", "r0"), ("rgf", "rgf"), ("pcx", "pcx")];
|
||||||
|
|
||||||
|
for (input, expected) in &test_cases {
|
||||||
|
let tokens = tokenize_source(input).expect("Failed to tokenize register");
|
||||||
|
|
||||||
|
if let TokenType::Register(reg) = &tokens[0].token_type {
|
||||||
|
assert_eq!(reg.reg.to_string(), *expected);
|
||||||
|
} else {
|
||||||
|
panic!("Expected register token for {input}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_immediates() {
|
||||||
|
let test_cases = [
|
||||||
|
("42", 42),
|
||||||
|
("0", 0),
|
||||||
|
("0xFF", 255),
|
||||||
|
("0x1234", 0x1234),
|
||||||
|
("0xDEADBEEF", 0xDEAD_BEEF),
|
||||||
|
("0o12", 0o12),
|
||||||
|
("0b101", 0b101),
|
||||||
|
];
|
||||||
|
|
||||||
|
for (input, expected) in &test_cases {
|
||||||
|
let tokens = tokenize_source(input).expect("Failed to tokenize immediate");
|
||||||
|
|
||||||
|
if let TokenType::Immediate(value) = &tokens[0].token_type {
|
||||||
|
assert_eq!(*value, *expected);
|
||||||
|
} else {
|
||||||
|
panic!("Expected immediate token for {input}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_labels() {
|
||||||
|
let test_cases = [
|
||||||
|
("loop_start:", "loop_start"),
|
||||||
|
("main:", "main"),
|
||||||
|
("_private_label:", "_private_label"),
|
||||||
|
("Label123:", "Label123"),
|
||||||
|
];
|
||||||
|
|
||||||
|
for (input, expected) in &test_cases {
|
||||||
|
let tokens = tokenize_source(input).expect("Failed to tokenize label");
|
||||||
|
|
||||||
|
if let TokenType::Label(label) = &tokens[0].token_type {
|
||||||
|
assert_eq!(label.name, *expected);
|
||||||
|
} else {
|
||||||
|
panic!("Expected label token for {input}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_directives() {
|
||||||
|
let test_cases = [
|
||||||
|
(".global", "global"),
|
||||||
|
(".section", "section"),
|
||||||
|
(".data", "data"),
|
||||||
|
(".text", "text"),
|
||||||
|
];
|
||||||
|
|
||||||
|
for (input, expected) in &test_cases {
|
||||||
|
let tokens = tokenize_source(input).expect("Failed to tokenize directive");
|
||||||
|
|
||||||
|
if let TokenType::Directive(directive) = &tokens[0].token_type {
|
||||||
|
assert_eq!(directive.directive, *expected);
|
||||||
|
} else {
|
||||||
|
panic!("Expected directive token for {input}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_symbols() {
|
||||||
|
let test_cases = [
|
||||||
|
("my_symbol", "my_symbol"),
|
||||||
|
("_private", "_private"),
|
||||||
|
("Symbol123", "Symbol123"),
|
||||||
|
("camelCase", "camelCase"),
|
||||||
|
];
|
||||||
|
|
||||||
|
for (input, expected) in &test_cases {
|
||||||
|
let tokens = tokenize_source(input).expect("Failed to tokenize symbol");
|
||||||
|
|
||||||
|
if let TokenType::Symbol(symbol) = &tokens[0].token_type {
|
||||||
|
assert_eq!(symbol.name, *expected);
|
||||||
|
} else {
|
||||||
|
panic!("Expected symbol token for {input}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user