23 Commits

Author SHA1 Message Date
nullndvoid b91207bfde misc: update release profile for optimised builds 2025-06-29 04:33:24 +01:00
nullndvoid 4ac630ba02 misc: add 'profiling' profile. 2025-06-29 04:10:54 +01:00
nullndvoid 85e3d443cc assembler: small misc updates, I am tired 2025-06-29 03:52:53 +01:00
nullndvoid 0528768947 fmt: ran 'cargo fmt'. 2025-06-29 01:43:31 +01:00
nullndvoid 21582f1297 tokeniser/syntax: (db varname: -> db varname) dropped colon, updated tests. 2025-06-29 00:22:10 +01:00
nullndvoid 6ceb35d439 tokeniser: bugfixes to comma handling, regexes
TODO: Verify output is as expected, perhaps I can dump to file and compare token stream with known valid one?

Will add some extra tests of course!
2025-06-29 00:11:36 +01:00
nullndvoid 8bb252e941 tokeniser: return TokeniserErrors where relevant.
The UnexpectedEndOfInput case is a little vague.
2025-06-28 23:35:55 +01:00
nullndvoid 5317988fdd assembler: SourceInfo doc comment added to self.module. 2025-06-28 23:14:30 +01:00
nullndvoid d15e00c272 tokeniser: refactor to store Module directly in Tokeniser
We hereby avoid making extra copies of the PathBuf.

- Also updated tests to match the new API
2025-06-28 23:13:44 +01:00
nullndvoid a65dca6c5c tokeniser: errors now print with SourceInfo if added 2025-06-28 23:11:24 +01:00
nullndvoid b8be1bd95f tokeniser: add some actual tokeniser errors
TODO: Return these lol
2025-06-28 23:05:07 +01:00
nullndvoid f42c6d4095 assembler: refactor error handling and use ModuleId::new constructor 2025-06-28 23:03:13 +01:00
nullndvoid eebea82c4a assembler: start tokenising multiline strings (WIP) 2025-06-26 17:42:48 +01:00
nullndvoid ed4fcc8495 assembler: enhance error handling and tokenization logic 2025-06-26 17:00:14 +01:00
nullndvoid 40f8b1d57b assembler: fix clippy warnings 2025-06-25 19:49:20 +01:00
nullndvoid 68e459f32b assembler: use common to match registers 2025-06-25 19:29:56 +01:00
nullndvoid d9807b5b36 assembler: update tokeniser to allow extra prefixes and separators (0xDEAD_BEEF) 2025-06-25 19:15:51 +01:00
nullndvoid 7cb7525484 assembler: remove some current dead code 2025-06-25 17:56:45 +01:00
nullndvoid 7565374d5b assembler: Tokeniser updates, Compiler Engine is back finally 2025-06-25 17:55:34 +01:00
nullndvoid 9b9e153500 assembler: wrap Module's with Arc and update Tokeniser (still WIP)
Implements complete tokenizer with Arc-wrapped modules

Enhances module handling by wrapping Module instances in Arc for thread-safe sharing across the assembler pipeline.

Implements full tokenization logic with pattern matching for all token types including labels, registers, immediates, directives, instructions, symbols, and strings.

Adds comma token support and proper EOF handling to complete the lexical analysis phase.

Generated AI slop commit message, may not be super accurate or it may be a bit too serious lol.
2025-06-25 17:35:03 +01:00
nullndvoid 27267e3daa assembler: use smart pointer for modules since sourceinfo gets copy 2025-06-25 17:03:48 +01:00
nullndvoid fb84a6d3c3 assembler: clippy lints, better error formatting
Adds regex dependency and enhances error handling system

Introduces comprehensive error type hierarchy with specific variants for parser, symbol, codegen, threading, and IO errors to improve error reporting and debugging capabilities.

Adds regex crate for pattern matching in tokenizer implementation with pre-compiled patterns for labels, registers, immediates, directives, instructions, and symbols.

Enhances source info functionality with context printing and error underlining similar to compiler diagnostics.

Implements better error conversions and threading error handling for lock failures and panics.
2025-06-25 16:50:17 +01:00
nullndvoid 4e5db58a84 assembler: start refactoring/rewriting tokeniser 2025-06-25 14:48:45 +01:00
32 changed files with 2285 additions and 127 deletions
+4
View File
@@ -5,3 +5,7 @@ rustc-wrapper = "sccache"
[future-incompat-report] [future-incompat-report]
frequency = "always" frequency = "always"
[profile.profiling]
inherits = "release"
debug = true
Generated
+39
View File
@@ -129,6 +129,15 @@ dependencies = [
"zerocopy", "zerocopy",
] ]
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "android-activity" name = "android-activity"
version = "0.6.0" version = "0.6.0"
@@ -269,6 +278,7 @@ dependencies = [
"clap", "clap",
"common", "common",
"num_cpus", "num_cpus",
"regex",
"threadpool", "threadpool",
"uuid", "uuid",
] ]
@@ -2691,6 +2701,35 @@ dependencies = [
"thiserror 2.0.12", "thiserror 2.0.12",
] ]
[[package]]
name = "regex"
version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]] [[package]]
name = "renderdoc-sys" name = "renderdoc-sys"
version = "1.1.0" version = "1.1.0"
+6 -2
View File
@@ -11,7 +11,11 @@ authors = ["zxq5", "nullndvoid"]
[profile.dev] [profile.dev]
codegen-backend = "cranelift" codegen-backend = "cranelift"
panic = "abort" # Cranelift does not support stack unwinds. panic = "abort" # Cranelift does not support stack unwinds.
lto = false lto = false
debug = true debug = true
incremental = false # sccache does not support caching incremental crates. incremental = false # sccache does not support caching incremental crates.
[profile.release]
incremental = true
lto = "fat"
+1
View File
@@ -16,5 +16,6 @@ path = "src/lib.rs"
clap = { version = "4.5.40", features = ["derive"] } clap = { version = "4.5.40", features = ["derive"] }
common = { path = "../common" } common = { path = "../common" }
num_cpus = "1.17.0" num_cpus = "1.17.0"
regex = "1.11.1"
threadpool = "1.8.1" threadpool = "1.8.1"
uuid = { version = "1.17.0", features = ["v4"] } uuid = { version = "1.17.0", features = ["v4"] }
+2 -1
View File
@@ -5,7 +5,8 @@ pub struct Args {
/// The output format to assemble to. Currently just ELF or a flat binary. /// The output format to assemble to. Currently just ELF or a flat binary.
#[arg(value_enum)] #[arg(value_enum)]
output_format: Option<OutputFormat>, output_format: Option<OutputFormat>,
/// Whether the relocatable object files should be statically linked into a single executable or library. /// Whether the relocatable object files should be statically linked into a single
/// executable or library.
link: bool, link: bool,
} }
+374
View File
@@ -0,0 +1,374 @@
//! Simple compiler engine that orchestrates the entire compilation process.
use std::collections::{HashMap, HashSet};
use std::fmt;
use std::path::Path;
use std::sync::mpsc;
use std::thread;
use crate::error::{AssembleErrorKind, IoErrorKind};
use crate::{
context::AssemblerContext,
error::AssembleError,
model::module::ModuleId,
source::{token::Token, tokeniser::Tokeniser},
};
use common::instructions::Instruction;
/// Failures that the [`CompilerEngine`] can hand back to its caller.
#[derive(Debug)]
pub enum EngineError {
    /// The assembler itself failed; wraps the underlying [`AssembleError`].
    Assembly(AssembleError),
    /// A channel operation failed; the payload describes what went wrong.
    Channel(String),
    /// Any other failure, described by a free-form message.
    Other(String),
}
impl fmt::Display for EngineError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Assembly(e) => write!(f, "Assembly error: {e}"),
Self::Channel(msg) => write!(f, "Channel error: {msg}"),
Self::Other(msg) => write!(f, "Engine error: {msg}"),
}
}
}
impl std::error::Error for EngineError {
    /// Only the `Assembly` variant wraps another error; the string-carrying
    /// variants have no underlying cause to expose.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::Channel(_) | Self::Other(_) => None,
            Self::Assembly(inner) => {
                let cause: &(dyn std::error::Error + 'static) = inner;
                Some(cause)
            }
        }
    }
}
// Convert from AssembleError
impl From<AssembleError> for EngineError {
fn from(error: AssembleError) -> Self {
Self::Assembly(error)
}
}
// Convert from mpsc::SendError
impl<T> From<mpsc::SendError<T>> for EngineError {
fn from(error: mpsc::SendError<T>) -> Self {
Self::Channel(format!("Send error: {error}"))
}
}
// Convert from mpsc::RecvError
impl From<mpsc::RecvError> for EngineError {
fn from(error: mpsc::RecvError) -> Self {
Self::Channel(format!("Receive error: {error}"))
}
}
// Convert from mpsc::TryRecvError
impl From<mpsc::TryRecvError> for EngineError {
fn from(error: mpsc::TryRecvError) -> Self {
Self::Channel(format!("Try receive error: {error}"))
}
}
// Convert from String for generic errors
impl From<String> for EngineError {
fn from(error: String) -> Self {
Self::Other(error)
}
}
// Convert from &str for convenience
impl From<&str> for EngineError {
fn from(error: &str) -> Self {
Self::Other(error.to_string())
}
}
/// Drives assembly on a background thread and delivers the outcome over a channel.
pub struct CompilerEngine {
    /// Sending half; cloned into each worker thread that is spawned.
    result_tx: mpsc::Sender<Result<Vec<Instruction>, EngineError>>,
    /// Receiving half on which finished results arrive.
    result_rx: Option<mpsc::Receiver<Result<Vec<Instruction>, EngineError>>>,
    /// `true` between starting a compilation and collecting its result.
    is_running: bool,
}
impl CompilerEngine {
    /// Creates an idle engine with a fresh result channel.
    #[must_use]
    pub fn new() -> Self {
        let (tx, rx) = mpsc::channel();
        Self {
            result_tx: tx,
            result_rx: Some(rx),
            is_running: false,
        }
    }

    /// Starts assembling `src` on a newly spawned thread.
    ///
    /// Does nothing if a compilation is already marked as running.
    pub fn start_compilation<P: AsRef<Path>>(&mut self, src: P) {
        if self.is_running {
            return;
        }

        let src = src.as_ref().to_path_buf();
        let tx = self.result_tx.clone();
        thread::spawn(move || {
            let result = assemble(&src).map_err(EngineError::from);
            let _ = tx.send(result); // Ignore send errors if receiver is dropped
        });
        self.is_running = true;
    }

    /// Polls for a finished compilation without blocking.
    ///
    /// Returns `None` when nothing is running or the result is not ready yet,
    /// and `Some(result)` once the worker has reported back.
    pub fn try_get_result(&mut self) -> Option<Result<Vec<Instruction>, EngineError>> {
        if !self.is_running {
            return None;
        }

        let rx = self
            .result_rx
            .as_ref()
            .expect("result_rx should be Some while compilation is running");
        match rx.try_recv() {
            Ok(result) => {
                self.is_running = false;
                Some(result)
            }
            Err(mpsc::TryRecvError::Empty) => None,
            Err(mpsc::TryRecvError::Disconnected) => {
                self.is_running = false;
                Some(Err(EngineError::Channel(
                    "Compilation thread disconnected".to_string(),
                )))
            }
        }
    }

    /// Blocks until the running compilation reports its result.
    ///
    /// # Errors
    ///
    /// Returns [`EngineError::Other`] if no compilation is in progress,
    /// [`EngineError::Channel`] if receiving fails, or whatever error the
    /// compilation itself produced.
    pub fn wait_for_result(&mut self) -> Result<Vec<Instruction>, EngineError> {
        if !self.is_running {
            return Err(EngineError::Other("No compilation in progress".to_string()));
        }

        // Borrow the receiver rather than `take()`-ing it: consuming it here would
        // leave `result_rx` as `None`, so any later `try_get_result` or
        // `wait_for_result` after a new `start_compilation` would panic.
        let received = self
            .result_rx
            .as_ref()
            .expect("result_rx should be Some while waiting for compilation result")
            .recv();

        // Clear the flag before propagating, so a receive failure cannot leave the
        // engine permanently marked as running.
        self.is_running = false;

        // NOTE(review): `self.result_tx` keeps the channel open, so `recv` will not
        // observe a disconnect if the worker dies without sending — confirm whether
        // a `JoinHandle` should be tracked for that case.
        received.map_err(EngineError::from)?
    }

    /// Add a source file to be compiled (for compatibility with old interface).
    ///
    /// At present this only checks that the path exists; nothing is stored.
    ///
    /// # Errors
    ///
    /// Returns an I/O `NotFound` assembly error when `path` does not exist.
    pub fn add_source_file<P: AsRef<Path>>(
        &mut self,
        path: P,
    ) -> Result<(), EngineError> {
        let path = path.as_ref();

        if !path.exists() {
            return Err(EngineError::Assembly(AssembleError::new_other_error(
                AssembleErrorKind::Io(crate::error::IoError::new(
                    IoErrorKind::NotFound,
                    Some(format!("Source file not found: {}", path.display())),
                )),
            )));
        }

        // For now, just validate the file exists
        // TODO: Could store multiple files for batch compilation
        Ok(())
    }

    /// Compile all added source files (synchronous version).
    ///
    /// Placeholder kept for the old interface: always returns an empty
    /// [`CompileResult`].
    pub fn compile(&mut self) -> Result<CompileResult, EngineError> {
        Ok(CompileResult {
            modules: Vec::new(),
            tokens: HashMap::new(),
        })
    }

    /// Get access to the assembler context (placeholder).
    ///
    /// # Errors
    ///
    /// Currently always returns [`EngineError::Other`].
    pub fn context(&self) -> Result<&AssemblerContext, EngineError> {
        // TODO: Integrate context properly when we have more compilation phases
        Err(EngineError::Other(
            "Context not available in threaded mode".to_string(),
        ))
    }
}
impl Default for CompilerEngine {
fn default() -> Self {
Self::new()
}
}
/// Main assembly function that orchestrates the compilation process
fn assemble(src: &Path) -> Result<Vec<Instruction>, AssembleError> {
// Verify the file exists
if !src.exists() {
return Err(AssembleError::new_other_error(AssembleErrorKind::Io(
crate::error::IoError::new(
IoErrorKind::NotFound,
Some(format!("Source file not found: {}", src.display())),
),
)));
}
let mut modules = HashSet::new();
let mut all_tokens = HashMap::new();
let mut module_ids = Vec::new();
// Create a new assembler context for this compilation
let context = AssemblerContext::new();
// Process the main file and its dependencies
prepare_dependency(
src,
&mut modules,
&mut all_tokens,
&mut module_ids,
&context,
)?;
// Phase 2: Parse tokens into AST (placeholder for now)
// TODO: Add parser here when implemented
println!("Phase 2: Parsing {} modules...", module_ids.len());
// Phase 3: Symbol resolution (placeholder for now)
// TODO: Add symbol resolution here when implemented
println!("Phase 3: Resolving symbols...");
// Phase 4: Code generation (placeholder for now)
// TODO: Add code generation here when implemented
println!("Phase 4: Generating code...");
// For now, return empty instructions since we don't have the full pipeline yet
Ok(Vec::new())
}
/// Prepare a dependency (file) for compilation
fn prepare_dependency(
path: &Path,
modules: &mut HashSet<u64>,
all_tokens: &mut HashMap<ModuleId, Vec<Token>>,
module_ids: &mut Vec<ModuleId>,
context: &AssemblerContext,
) -> Result<(), AssembleError> {
let filename = path.file_name().and_then(|n| n.to_str()).ok_or_else(|| {
AssembleError::new_other_error(AssembleErrorKind::Io(crate::error::IoError::new(
IoErrorKind::InvalidData,
Some("Failed to get file name from path".to_string()),
)))
})?;
// Calculate a simple hash for the file (similar to quick_hash)
let file_hash = calculate_file_hash(path);
// Skip if we've already processed this module
if modules.contains(&file_hash) {
return Ok(());
}
modules.insert(file_hash);
if let Ok(canonical_path) = path.canonicalize() {
println!("Building {} [{}]", filename, canonical_path.display());
}
// Phase 1: Tokenize the file
println!("Tokenising {filename}");
let tokeniser = Tokeniser::new(path, context)?;
let tokens = tokeniser.tokenise()?;
// Get the module ID that was registered during tokenization
let module_id = get_module_id_for_file(path, context)?;
all_tokens.insert(module_id, tokens);
module_ids.push(module_id);
// TODO: Parse tokens to find dependencies (.include directives, etc.)
// For now, we'll just process the single file
println!("Resolving dependencies for {filename}");
Ok(())
}
/// Hashes a file path into a `u64` used to detect already-processed files.
///
/// The canonicalised path is hashed when it can be resolved; otherwise the path
/// is hashed exactly as given.
fn calculate_file_hash(path: &Path) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut state = DefaultHasher::new();
    match path.canonicalize() {
        Ok(resolved) => resolved.hash(&mut state),
        Err(_) => path.hash(&mut state),
    }
    state.finish()
}
/// Get the module ID for a given source file
fn get_module_id_for_file(
file_path: &Path,
context: &AssemblerContext,
) -> Result<ModuleId, AssembleError> {
{
let registry = context.module_registry.read()?;
// Find module by path.
for module in registry.modules() {
if module.path == file_path {
return Ok(module.id);
}
}
}
Err(AssembleError::new_other_error(AssembleErrorKind::Io(
crate::error::IoError::new(
IoErrorKind::NotFound,
Some(format!(
"Module not found for file: {}",
file_path.display()
)),
),
)))
}
/// Outcome of a compilation run. Not yet populated with anything useful, but it
/// keeps the old interface compiling.
#[derive(Debug)]
pub struct CompileResult {
    /// IDs of the modules that took part in the compilation.
    pub modules: Vec<ModuleId>,
    /// Token stream of each module, keyed by module ID.
    pub tokens: HashMap<ModuleId, Vec<Token>>,
}
impl CompileResult {
    /// Returns the token stream stored for `module_id`, if there is one.
    #[must_use]
    pub fn get_tokens(&self, module_id: &ModuleId) -> Option<&Vec<Token>> {
        let Self { tokens, .. } = self;
        tokens.get(module_id)
    }

    /// Returns the module IDs as a slice.
    #[must_use]
    pub fn module_ids(&self) -> &[ModuleId] {
        self.modules.as_slice()
    }

    /// Returns the number of tokens summed over every module.
    #[must_use]
    pub fn total_tokens(&self) -> usize {
        self.tokens.values().map(|stream| stream.len()).sum()
    }
}
+2 -1
View File
@@ -18,7 +18,8 @@ impl Default for AssemblerContext {
} }
impl AssemblerContext { impl AssemblerContext {
#[must_use] pub fn new() -> Self { #[must_use]
pub fn new() -> Self {
Self { Self {
symbol_table: RwLock::new(SymbolTable::new()), symbol_table: RwLock::new(SymbolTable::new()),
module_registry: RwLock::new(ModuleRegistry::new()), module_registry: RwLock::new(ModuleRegistry::new()),
+205 -12
View File
@@ -13,6 +13,9 @@ pub struct AssembleError {
source_info: Option<SourceInfo>, source_info: Option<SourceInfo>,
/// The type of assembly error that occurred. /// The type of assembly error that occurred.
kind: AssembleErrorKind, kind: AssembleErrorKind,
/// Whether context should be added to errors being printed. This might get changed
/// to Verbosity in the future.
display_quietly: bool,
} }
impl AssembleError { impl AssembleError {
@@ -24,6 +27,7 @@ impl AssembleError {
Self { Self {
source_info: Some(source_info), source_info: Some(source_info),
kind, kind,
display_quietly: false,
} }
} }
@@ -32,16 +36,86 @@ impl AssembleError {
Self { Self {
source_info: None, source_info: None,
kind, kind,
display_quietly: true,
} }
} }
/// Writes a parser error to `f`.
///
/// Without source information only a one-line message is written. Otherwise the
/// error type is written and, unless `display_quietly` is set, the source
/// context is printed through `print_context_with_underline`.
fn print_parser_error(
    &self,
    f: &mut std::fmt::Formatter<'_>,
    parse_error: &ParserError,
) -> std::fmt::Result {
    match &self.source_info {
        None => write!(
            f,
            "parser error thrown with no source information. Error: {parse_error}"
        ),
        Some(source_info) => {
            writeln!(f, "parser error of type `{parse_error}`.\n")?;

            // Quiet errors stop after the headline.
            if self.display_quietly {
                return Ok(());
            }

            // If the context cannot be printed, say why and fail the format.
            if let Err(e) = source_info.print_context_with_underline() {
                _ = writeln!(f, "print context error: {e}");
                return Err(std::fmt::Error);
            }
            Ok(())
        }
    }
}
/// Prints a tokeniser error to the screen.
///
/// Without source information only a one-line message is written. Otherwise the
/// error type is written and, unless `display_quietly` is set, the source
/// context is printed through `print_context_with_underline`.
fn print_tokeniser_error(
    &self,
    f: &mut std::fmt::Formatter<'_>,
    err: &TokeniserError,
) -> std::fmt::Result {
    let Some(source_info) = &self.source_info else {
        write!(
            f,
            "Tokeniser error thrown with no source information. Error: {err}"
        )?;
        return Ok(());
    };

    writeln!(f, "tokeniser error of type `{err}`.\n")?;

    // Honour the quiet flag the same way `print_parser_error` does, so both
    // error kinds treat `display_quietly` consistently.
    if !self.display_quietly {
        source_info.print_context_with_underline().map_err(|e| {
            _ = writeln!(f, "Print context error: {e}");
            std::fmt::Error {}
        })?;
    }

    Ok(())
}
} }
impl Display for AssembleError { impl Display for AssembleError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if let Some(info) = &self.source_info { if let Some(info) = &self.source_info {
write!(f, "at {info}")?; write!(f, "At {info}, got ")?;
match &self.kind {
AssembleErrorKind::Parser(err) => self.print_parser_error(f, err)?,
AssembleErrorKind::Tokeniser(err) => {
self.print_tokeniser_error(f, err)?;
}
_ => write!(f, "{}", self.kind)?,
}
writeln!(f)?;
return Ok(());
} }
// Handle errors without SourceInfo.
write!(f, "{}", self.kind)?; write!(f, "{}", self.kind)?;
Ok(()) Ok(())
@@ -51,20 +125,145 @@ impl Display for AssembleError {
/// Marker trait. /// Marker trait.
impl std::error::Error for AssembleError {} impl std::error::Error for AssembleError {}
/// Different types of errors that may occur when assembling a set of input source files. #[derive(Debug, Clone)]
#[non_exhaustive] #[non_exhaustive]
#[derive(Debug)]
pub enum AssembleErrorKind { pub enum AssembleErrorKind {
/// Usually unexpected I/O errors. Not normally recoverable. /// Usually unexpected I/O errors. Not normally recoverable.
IO(std::io::Error), Io(IoError),
/// Errors emitted from the [`Tokeniser`]. /// Errors emitted from the [`Tokeniser`].
Tokenise(TokeniserError), Tokeniser(TokeniserError),
Parser(ParserError),
Symbol(SymbolError),
Codegen(CodegenError),
Threading(ThreadingError),
/// Returned for code where the functionality has not yet been implemented but we
/// don't want the program to panic.
Unimplemented(&'static str),
}
/// Categories of failure the parser can report.
#[derive(Debug, Clone)]
pub enum ParserError {
    UnexpectedToken,
    MissingOperand,
    InvalidInstruction,
    MissingLabel,
    DuplicateLabel,
}

/// Each variant renders as a short lowercase description.
impl Display for ParserError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text = match self {
            Self::UnexpectedToken => "unexpected token",
            Self::MissingOperand => "missing operand",
            Self::InvalidInstruction => "invalid instruction",
            Self::MissingLabel => "missing label",
            Self::DuplicateLabel => "duplicate label",
        };
        f.write_str(text)
    }
}
/// Categories of failure related to symbols.
#[derive(Debug, Clone)]
pub enum SymbolError {
    Undefined,
    Duplicate,
    CircularDependency,
    InvalidReference,
}

/// Each variant renders as a short lowercase description.
impl Display for SymbolError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text = match self {
            Self::Undefined => "undefined symbol",
            Self::Duplicate => "duplicate symbol",
            Self::CircularDependency => "circular dependency",
            Self::InvalidReference => "invalid reference",
        };
        f.write_str(text)
    }
}
/// Categories of failure related to code generation.
#[derive(Debug, Clone)]
pub enum CodegenError {
    InvalidOperand,
    OutOfRange,
    UnsupportedInstruction,
}

/// Each variant renders as a short lowercase description.
impl Display for CodegenError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text = match self {
            Self::InvalidOperand => "invalid operand",
            Self::OutOfRange => "out of range",
            Self::UnsupportedInstruction => "unsupported instruction",
        };
        f.write_str(text)
    }
}
/// Categories of failure related to threading.
#[derive(Debug, Clone)]
pub enum ThreadingError {
    LockFailed,
    ThreadPanic,
}

/// Each variant renders as a short lowercase description.
impl Display for ThreadingError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text = match self {
            Self::LockFailed => "lock failed",
            Self::ThreadPanic => "thread panic",
        };
        f.write_str(text)
    }
}
/// A cloneable I/O error: a coarse [`IoErrorKind`] plus an optional message.
#[derive(Debug, Clone)]
pub struct IoError {
    /// Optional human-readable detail.
    msg: Option<String>,
    /// Coarse classification of the failure.
    kind: IoErrorKind,
}

impl IoError {
    /// Builds an error from its classification and optional detail message.
    #[must_use]
    pub const fn new(kind: IoErrorKind, msg: Option<String>) -> Self {
        Self { kind, msg }
    }
}
/// Coarse classification of an I/O failure.
#[derive(Debug, Clone)]
pub enum IoErrorKind {
    NotFound,
    PermissionDenied,
    InvalidData,
    Other,
}

/// Each variant renders as a short lowercase description.
impl std::fmt::Display for IoErrorKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text = match self {
            Self::NotFound => "file not found",
            Self::PermissionDenied => "permission denied",
            Self::InvalidData => "invalid data",
            Self::Other => "other I/O error",
        };
        f.write_str(text)
    }
}
impl std::fmt::Display for IoError {
/// Writes the kind, followed by the quoted message when one is present.
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    match &self.msg {
        Some(msg) => write!(f, "{}, \"{msg}\"", self.kind),
        None => write!(f, "{}", self.kind),
    }
}
} }
impl Display for AssembleErrorKind { impl Display for AssembleErrorKind {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self { match self {
Self::Tokenise(why) => write!(f, "tokeniser error: {why}"), Self::Tokeniser(why) => write!(f, "tokeniser error: {why}"),
Self::Unimplemented(why) => write!(f, "used unimplemented feature: {why}"),
Self::Io(why) => write!(f, "problem occurred with I/O: {why}"),
#[allow(unreachable_patterns)]
_ => write!( _ => write!(
f, f,
"unhandled error type in Display implementation! See error.rs!" "unhandled error type in Display implementation! See error.rs!"
@@ -73,10 +272,4 @@ impl Display for AssembleErrorKind {
} }
} }
impl From<std::io::Error> for AssembleErrorKind {
fn from(err: std::io::Error) -> Self {
Self::IO(err)
}
}
pub mod conversions; pub mod conversions;
+62 -2
View File
@@ -1,7 +1,67 @@
use crate::error::AssembleError; use std::{
io::ErrorKind,
sync::{PoisonError, RwLockReadGuard, RwLockWriteGuard},
};
use crate::error::{AssembleError, IoError, IoErrorKind};
use super::{AssembleErrorKind, ThreadingError};
impl From<std::io::Error> for IoError {
fn from(err: std::io::Error) -> Self {
let kind = match err.kind() {
ErrorKind::NotFound => IoErrorKind::NotFound,
ErrorKind::PermissionDenied => IoErrorKind::PermissionDenied,
ErrorKind::InvalidData => IoErrorKind::InvalidData,
_ => IoErrorKind::Other,
};
let msg = err.to_string();
Self::new(kind, Some(msg))
}
}
impl From<std::io::Error> for AssembleError { impl From<std::io::Error> for AssembleError {
fn from(err: std::io::Error) -> Self { fn from(err: std::io::Error) -> Self {
Self::new_other_error(err.into()) Self::new_other_error(AssembleErrorKind::Io(err.into()))
}
}
// TODO: Maybe attempt recovery? To be honest we don't want any threads to panic at all,
// or we want them all to panic spectacularly.
impl<T> From<PoisonError<RwLockReadGuard<'_, T>>> for AssembleError {
fn from(err: PoisonError<RwLockReadGuard<'_, T>>) -> Self {
Self::new_other_error(AssembleErrorKind::Threading(err.into()))
}
}
impl<T> From<PoisonError<RwLockReadGuard<'_, T>>> for ThreadingError {
fn from(_err: PoisonError<RwLockReadGuard<'_, T>>) -> Self {
Self::LockFailed
}
}
impl<T> From<PoisonError<RwLockWriteGuard<'_, T>>> for AssembleError {
fn from(err: PoisonError<RwLockWriteGuard<'_, T>>) -> Self {
Self::new_other_error(AssembleErrorKind::Threading(err.into()))
}
}
impl<T> From<PoisonError<RwLockWriteGuard<'_, T>>> for ThreadingError {
fn from(_err: PoisonError<RwLockWriteGuard<'_, T>>) -> Self {
Self::LockFailed
}
}
/// A formatting failure is reported as an I/O error of kind `Other`, carrying
/// the formatter's error text.
impl From<std::fmt::Error> for AssembleError {
    fn from(err: std::fmt::Error) -> Self {
        let io_error = IoError::new(IoErrorKind::Other, Some(err.to_string()));
        Self::from(io_error)
    }
}
impl From<IoError> for AssembleError {
fn from(err: IoError) -> Self {
Self::new_other_error(AssembleErrorKind::Io(err))
} }
} }
View File
+2 -6
View File
@@ -13,8 +13,8 @@
)] )]
pub mod args; pub mod args;
pub mod image_builder;
// pub mod tooling; // pub mod tooling;
pub mod compiler_engine;
pub mod context; pub mod context;
pub mod error; pub mod error;
pub mod model; pub mod model;
@@ -23,11 +23,7 @@ pub mod symtab;
mod util; mod util;
pub mod prelude { // pub mod prelude {}
pub use crate::image_builder;
// pub use crate::tooling::brainf;
// pub use crate::tooling::project;
}
use num_cpus as _; use num_cpus as _;
use threadpool as _; use threadpool as _;
+30 -1
View File
@@ -1,3 +1,10 @@
use std::sync::Arc;
use assembler::{
error::{AssembleError, AssembleErrorKind, ParserError},
model::module::Module,
source::{source_info::SourceInfo, token::TokenType, tokeniser::Tokeniser},
};
use common as _; use common as _;
use num_cpus as _; use num_cpus as _;
use threadpool as _; use threadpool as _;
@@ -5,9 +12,31 @@ use threadpool as _;
// use clap::Parser; // use clap::Parser;
// use std::{fs, io::Write, path::PathBuf}; // use std::{fs, io::Write, path::PathBuf};
fn main() { fn main() -> Result<(), AssembleError> {
// // Parse command line arguments // // Parse command line arguments
// let args: Vec<String> = std::env::args().collect(); // let args: Vec<String> = std::env::args().collect();
let contents = include_bytes!("../../resources/dsa/bf.dsa").to_vec();
let module = Arc::new(Module::new("resources/dsa/bf.dsa")?);
let tok = Tokeniser::from_data(contents, module.clone());
let ts = tok
.tokenise()?
.into_iter()
.filter(|t| !matches!(t.token_type, TokenType::Eof | TokenType::Newline));
for t in ts {
t.source_info.print_context_with_underline()?;
}
let test_error: AssembleError = AssembleError::new_source_error(
SourceInfo::new(45, module.clone(), 4..7),
AssembleErrorKind::Parser(ParserError::InvalidInstruction),
);
eprintln!("\n\n{test_error}");
Ok(())
// let _clap_args = assembler::args::Args::parse(); // let _clap_args = assembler::args::Args::parse();
+52 -10
View File
@@ -4,17 +4,29 @@
//! //!
//! They have unique identifiers in the form of UUIDs. //! They have unique identifiers in the form of UUIDs.
use std::path::{Path, PathBuf}; use std::{
path::{Path, PathBuf},
sync::Arc,
};
use regex::Regex;
use uuid::Uuid; use uuid::Uuid;
use crate::model::module_registry::ModuleRegistry; use crate::{
error::{AssembleError, AssembleErrorKind, IoError, IoErrorKind},
model::module_registry::ModuleRegistry,
};
/// The ID for a module. A tuple struct for type safety. /// The ID for a module. A tuple struct for type safety.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
pub struct ModuleId(Uuid); pub struct ModuleId(Uuid);
impl ModuleId { impl ModuleId {
/// Creates a module ID backed by a freshly generated v4 UUID.
#[must_use]
pub fn new() -> Self {
    let uuid = Uuid::new_v4();
    Self(uuid)
}
#[must_use] #[must_use]
pub const fn from_module(module: &Module) -> Self { pub const fn from_module(module: &Module) -> Self {
module.id module.id
@@ -22,7 +34,7 @@ impl ModuleId {
/// Convenience method to get the [`Module`] from a [`ModuleId`]. /// Convenience method to get the [`Module`] from a [`ModuleId`].
#[must_use] #[must_use]
pub fn to_module<'m>(&self, registry: &'m ModuleRegistry) -> Option<&'m Module> { pub fn to_module<'m>(&self, registry: &'m ModuleRegistry) -> Option<&'m Arc<Module>> {
registry.get(self) registry.get(self)
} }
@@ -33,6 +45,12 @@ impl ModuleId {
} }
} }
impl Default for ModuleId {
fn default() -> Self {
Self::new()
}
}
impl std::fmt::Display for ModuleId { impl std::fmt::Display for ModuleId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0) write!(f, "{}", self.0)
@@ -40,7 +58,7 @@ impl std::fmt::Display for ModuleId {
} }
/// A single source file or compilation unit. Stores its own symbol table. /// A single source file or compilation unit. Stores its own symbol table.
#[derive(Debug)] #[derive(Debug, Clone)]
pub struct Module { pub struct Module {
/// The name of the module. This is typically the name of the file, less the `.dsa` /// The name of the module. This is typically the name of the file, less the `.dsa`
/// extension. /// extension.
@@ -58,11 +76,35 @@ impl std::hash::Hash for Module {
} }
impl Module { impl Module {
pub fn new<P: AsRef<Path>>(name: String, path: P) -> Self { pub fn new<P: AsRef<Path>>(p: P) -> Result<Self, AssembleError> {
Self { let path = p.as_ref().to_path_buf();
name, let name = Self::extract_module_name(&path)?;
path: path.as_ref().to_path_buf(), let id = ModuleId::new();
id: ModuleId(Uuid::new_v4()),
} Ok(Self { name, path, id })
}
/// Gets the name for a module from the path.
fn extract_module_name<P: AsRef<Path>>(path: P) -> Result<String, AssembleError> {
let extensions_regex = Regex::new(".(dsa|S|asm)$")
.expect("For some reason the regular expression failed to compile!");
let module_name = path
.as_ref()
.file_name()
.map(|f| f.to_string_lossy())
.ok_or_else(|| {
AssembleError::new_other_error(AssembleErrorKind::Io(IoError::new(
IoErrorKind::InvalidData,
Some(
"the filename couldn't be extracted, is it valid UTF-8?"
.to_string(),
),
)))
})?;
// Strip any file extensions given. We don't care for now.
let out = extensions_regex.replace(&module_name, "");
Ok(out.to_string())
} }
} }
+8 -6
View File
@@ -1,13 +1,13 @@
//! This module contains the code for the module registry. This is a singleton storing all //! This module contains the code for the module registry. This is a singleton storing all
//! the modules being assembled. //! the modules being assembled.
use std::collections::HashMap; use std::{collections::HashMap, sync::Arc};
use super::module::{Module, ModuleId}; use super::module::{Module, ModuleId};
/// Stores all the [`Module`]'s to be assembled. /// Stores all the [`Module`]'s to be assembled.
pub struct ModuleRegistry { pub struct ModuleRegistry {
modules: HashMap<ModuleId, Module>, modules: HashMap<ModuleId, Arc<Module>>,
} }
impl Default for ModuleRegistry { impl Default for ModuleRegistry {
@@ -17,26 +17,28 @@ impl Default for ModuleRegistry {
} }
impl ModuleRegistry { impl ModuleRegistry {
#[must_use] pub fn new() -> Self { #[must_use]
pub fn new() -> Self {
Self { Self {
modules: HashMap::new(), modules: HashMap::new(),
} }
} }
/// Gets a [`Module`] by ID. /// Gets a [`Module`] by ID.
#[must_use] pub fn get(&self, module_id: &ModuleId) -> Option<&Module> { #[must_use]
pub fn get(&self, module_id: &ModuleId) -> Option<&Arc<Module>> {
self.modules.get(module_id) self.modules.get(module_id)
} }
/// Adds a [`Module`] and returns its [`ModuleId`]. /// Adds a [`Module`] and returns its [`ModuleId`].
pub fn add(&mut self, module: Module) -> ModuleId { pub fn add(&mut self, module: Arc<Module>) -> ModuleId {
let id = module.id; let id = module.id;
self.modules.insert(id, module); self.modules.insert(id, module);
id id
} }
/// Returns an iterator of modules. /// Returns an iterator of modules.
pub fn modules(&self) -> impl Iterator<Item = &Module> { pub fn modules(&self) -> impl Iterator<Item = &Arc<Module>> {
self.modules.values() self.modules.values()
} }
} }
+12 -1
View File
@@ -1,12 +1,18 @@
//! This module contains anything within the first stage of assembly, i.e. the //! This module contains anything within the first stage of assembly, i.e. the
//! tokenisation stage, or utility functions for reading input files. //! tokenisation stage, or utility functions for reading input files.
use std::path::Path; use std::{
io::{BufRead, Lines},
path::Path,
};
use crate::error::AssembleError; use crate::error::AssembleError;
pub mod lines;
pub mod opcode;
pub mod source_info; pub mod source_info;
pub mod token; pub mod token;
pub mod token_info;
pub mod tokeniser; pub mod tokeniser;
/// Attempts to load and open a source file, returning a [`Vec<u8>`] or an /// Attempts to load and open a source file, returning a [`Vec<u8>`] or an
@@ -16,3 +22,8 @@ pub fn load_source_bytes<P: AsRef<Path>>(p: P) -> Result<Vec<u8>, AssembleError>
Ok(std::fs::read(path)?) Ok(std::fs::read(path)?)
} }
/// Returns an iterator over the lines of any [`BufRead`] source.
///
/// Thin wrapper around [`BufRead::lines`]; the reader is consumed.
pub fn reader_lines<R: BufRead>(rdr: R) -> Lines<R> {
    BufRead::lines(rdr)
}
+76
View File
@@ -0,0 +1,76 @@
//! Enhanced lines iterator that tracks line numbers and character positions.
use std::io::{BufRead, BufReader, Cursor};
use crate::error::AssembleError;
/// Iterator that yields lines with their line numbers and character spans.
///
/// Wraps any [`BufRead`] source and produces one [`LineSpan`] per line read.
pub struct LinesWithSpans<R: BufRead> {
// The buffered reader the lines are pulled from.
reader: R,
// Number of lines yielded so far. It is incremented before a line is returned, so the
// first line is reported as line 1.
line_number: usize,
// Running sum of the counts returned by `read_line` for every line consumed so far;
// used as the start offset of the next line.
total_chars: usize,
// Scratch buffer, cleared and refilled on every call to `next`.
buffer: String,
}
/// One line of input together with its position in the source it was read from.
#[derive(Debug, Clone)]
pub struct LineSpan {
/// The line number (the first line yielded by [`LinesWithSpans`] is line 1).
pub line_number: usize,
/// The contents of the line.
pub content: String,
/// Offset of the start of this line from the start of the input. Despite the name this
/// is accumulated from the counts returned by `read_line`, i.e. it is a byte offset.
pub start_char: usize,
/// End offset (exclusive), in the same unit as `start_char`. It is advanced by the full
/// `read_line` count, so it includes the line terminator when one was read.
pub end_char: usize,
}
impl<R: BufRead> LinesWithSpans<R> {
    /// Wraps `reader` with all counters at zero and an empty scratch buffer.
    pub const fn new(reader: R) -> Self {
        let buffer = String::new();
        Self {
            buffer,
            total_chars: 0,
            line_number: 0,
            reader,
        }
    }
}
impl<R: BufRead> Iterator for LinesWithSpans<R> {
    type Item = Result<LineSpan, AssembleError>;

    /// Reads the next line and returns it with its line number and offsets.
    ///
    /// Returns `None` at end of input and `Some(Err(_))` when the underlying reader
    /// fails. The reported offsets are the byte counts returned by `read_line`, so they
    /// include line terminators even though `content` does not.
    fn next(&mut self) -> Option<Self::Item> {
        self.buffer.clear();
        let bytes_read = match self.reader.read_line(&mut self.buffer) {
            Ok(0) => return None, // EOF
            Ok(count) => count,
            Err(e) => return Some(Err(e.into())),
        };

        self.line_number += 1;
        let start_char = self.total_chars;
        self.total_chars += bytes_read;

        // Remove the line terminator for cleaner processing. Both `\n` and `\r\n` are
        // handled (matching `BufRead::lines`), so files with Windows line endings do not
        // leak a trailing `\r` into token content or diagnostics.
        let line = match self.buffer.strip_suffix('\n') {
            Some(without_lf) => without_lf.strip_suffix('\r').unwrap_or(without_lf),
            None => self.buffer.as_str(),
        };

        Some(Ok(LineSpan {
            line_number: self.line_number,
            content: line.to_string(),
            start_char,
            end_char: self.total_chars,
        }))
    }
}
/// Builds a [`LinesWithSpans`] iterator over an in-memory byte slice.
///
/// The slice is wrapped in a [`Cursor`] and a [`BufReader`]; no data is copied here.
#[must_use]
pub fn lines_with_spans(data: &[u8]) -> LinesWithSpans<BufReader<Cursor<&[u8]>>> {
    LinesWithSpans::new(BufReader::new(Cursor::new(data)))
}
+349
View File
@@ -0,0 +1,349 @@
//! This module contains instructions for tokenisation.
use std::{fmt, str::FromStr};
use common::prelude::{ITypeArgs, Instruction, Interrupt, RTypeArgs};
use crate::{
error::{AssembleError, AssembleErrorKind},
source::source_info::SourceInfo,
};
/// Every mnemonic the assembler knows about: real instructions, pseudo-instructions and
/// assembler-only meta instructions.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Opcode {
// Real instructions. `Opcode::to_opcode_value` numbers these 0x00..=0x26 in this order.
Nop,
Mov,
Movs,
Ldb,
Ldbs,
Ldh,
Ldhs,
Ldw,
Stb,
Sth,
Stw,
Lli,
Lui,
Jmp,
Jeq,
Jne,
Jgt,
Jge,
Jlt,
Jle,
Cmp,
Inc,
Dec,
Shl,
Shr,
Add,
Sub,
And,
Or,
Not,
Xor,
Nand,
Nor,
Xnor,
Int,
Irt,
Hlt,
AddI,
SubI,
// Pseudo-instructions. `Opcode::to_opcode_value` returns `None` for all of these.
Db,
Dh,
Dw,
Resb,
Resh,
Resw,
Push,
Pop,
Pusha,
Popa,
Lwi,
Call,
Return,
// Meta instructions (these aren't present in the binary as instructions).
// NOTE(review): `Segment` is nevertheless given the value 0x27 by
// `Opcode::to_opcode_value`, and neither `Data` nor `Segment` is accepted by the
// `FromStr` impl in this file.
Include,
Data,
Segment,
}
/// Error produced when a piece of text cannot be turned into an [`Opcode`].
#[derive(Debug)]
pub enum OpcodeFromStrError {
/// The named register does not exist. Not produced by the `FromStr` impl for
/// [`Opcode`] in this file.
InvalidRegister(&'static str),
/// The text is not a known mnemonic; carries the text exactly as it was supplied.
InvalidOpcode(String),
}
impl std::fmt::Display for OpcodeFromStrError {
    /// Writes a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidRegister(reg) => {
                f.write_fmt(format_args!("register does not exist: {reg}"))
            }
            Self::InvalidOpcode(op) => {
                f.write_fmt(format_args!("instruction does not exist: {op}"))
            }
        }
    }
}
// Marker impl: no source error is provided, only the `Display` text.
impl std::error::Error for OpcodeFromStrError {}
impl Opcode {
    /// Mnemonics in lower case, grouped as real instructions, pseudo-instructions and
    /// meta instructions.
    pub const OPCODES: &[&str] = &[
        // Real instructions (0x00-0x26)
        "nop", "mov", "movs", "ldb", "ldbs", "ldh", "ldhs", "ldw", "stb", "sth", "stw",
        "lli", "lui", "jmp", "jeq", "jne", "jgt", "jge", "jlt", "jle", "cmp", "inc",
        "dec", "shl", "shr", "add", "sub", "and", "or", "not", "xor", "nand", "nor",
        "xnor", "int", "irt", "hlt", "addi", "subi",
        // Pseudo-instructions
        "db", "dh", "dw", "resb", "resh", "resw", "push", "pop", "lwi", "call", "return",
        "pusha", "popa",
        // Meta instructions
        "include",
    ];

    /// Builds the `common` [`Instruction`] for this opcode with default arguments.
    ///
    /// # Errors
    ///
    /// Returns an [`AssembleError`] of kind [`AssembleErrorKind::Unimplemented`], tagged
    /// with `source_info`, for every opcode that has no arm below (the pseudo
    /// instructions, `Include` and `Data`).
    pub fn to_instruction(
        &self,
        source_info: SourceInfo,
    ) -> Result<Instruction, AssembleError> {
        let instruction = match self {
            Self::Nop => Instruction::Nop,
            Self::Mov => Instruction::Mov(RTypeArgs::default()),
            Self::Movs => Instruction::MovSigned(RTypeArgs::default()),
            Self::Ldb => Instruction::LoadByte(ITypeArgs::default()),
            Self::Ldbs => Instruction::LoadByteSigned(ITypeArgs::default()),
            Self::Ldh => Instruction::LoadHalfword(ITypeArgs::default()),
            Self::Ldhs => Instruction::LoadHalfwordSigned(ITypeArgs::default()),
            Self::Ldw => Instruction::LoadWord(ITypeArgs::default()),
            Self::Stb => Instruction::StoreByte(ITypeArgs::default()),
            Self::Sth => Instruction::StoreHalfword(ITypeArgs::default()),
            Self::Stw => Instruction::StoreWord(ITypeArgs::default()),
            Self::Lli => Instruction::LoadLowerImmediate(ITypeArgs::default()),
            Self::Lui => Instruction::LoadUpperImmediate(ITypeArgs::default()),
            Self::Jmp => Instruction::Jump(ITypeArgs::default()),
            Self::Jeq => Instruction::JumpEq(ITypeArgs::default()),
            Self::Jne => Instruction::JumpNeq(ITypeArgs::default()),
            Self::Jgt => Instruction::JumpGt(ITypeArgs::default()),
            Self::Jge => Instruction::JumpGe(ITypeArgs::default()),
            Self::Jlt => Instruction::JumpLt(ITypeArgs::default()),
            Self::Jle => Instruction::JumpLe(ITypeArgs::default()),
            Self::Cmp => Instruction::Compare(RTypeArgs::default()),
            Self::Inc => Instruction::Increment(RTypeArgs::default()),
            Self::Dec => Instruction::Decrement(RTypeArgs::default()),
            Self::Shl => Instruction::ShiftLeft(RTypeArgs::default()),
            Self::Shr => Instruction::ShiftRight(RTypeArgs::default()),
            Self::Add => Instruction::Add(RTypeArgs::default()),
            Self::Sub => Instruction::Sub(RTypeArgs::default()),
            Self::And => Instruction::And(RTypeArgs::default()),
            Self::Or => Instruction::Or(RTypeArgs::default()),
            Self::Not => Instruction::Not(RTypeArgs::default()),
            Self::Xor => Instruction::Xor(RTypeArgs::default()),
            Self::Nand => Instruction::Nand(RTypeArgs::default()),
            Self::Nor => Instruction::Nor(RTypeArgs::default()),
            Self::Xnor => Instruction::Xnor(RTypeArgs::default()),
            Self::Int => Instruction::Interrupt(Interrupt::default()),
            Self::Irt => Instruction::IntReturn,
            Self::Hlt => Instruction::Halt,
            Self::AddI => Instruction::AddImmediate(ITypeArgs::default()),
            Self::SubI => Instruction::SubImmediate(ITypeArgs::default()),
            Self::Segment => Instruction::Segment(0),
            // Everything else has no counterpart in `common`.
            _ => {
                return Err(AssembleError::new_source_error(
                    source_info,
                    AssembleErrorKind::Unimplemented(
                        "Opcode::to_instruction called on an instruction that does not exist in common.",
                    ),
                ));
            }
        };
        Ok(instruction)
    }

    /// Returns the numeric opcode, or `None` for opcodes without one.
    #[must_use]
    pub const fn to_opcode_value(&self) -> Option<u8> {
        let value: u8 = match self {
            Self::Nop => 0x00,
            Self::Mov => 0x01,
            Self::Movs => 0x02,
            Self::Ldb => 0x03,
            Self::Ldbs => 0x04,
            Self::Ldh => 0x05,
            Self::Ldhs => 0x06,
            Self::Ldw => 0x07,
            Self::Stb => 0x08,
            Self::Sth => 0x09,
            Self::Stw => 0x0A,
            Self::Lli => 0x0B,
            Self::Lui => 0x0C,
            Self::Jmp => 0x0D,
            Self::Jeq => 0x0E,
            Self::Jne => 0x0F,
            Self::Jgt => 0x10,
            Self::Jge => 0x11,
            Self::Jlt => 0x12,
            Self::Jle => 0x13,
            Self::Cmp => 0x14,
            Self::Inc => 0x15,
            Self::Dec => 0x16,
            Self::Shl => 0x17,
            Self::Shr => 0x18,
            Self::Add => 0x19,
            Self::Sub => 0x1A,
            Self::And => 0x1B,
            Self::Or => 0x1C,
            Self::Not => 0x1D,
            Self::Xor => 0x1E,
            Self::Nand => 0x1F,
            Self::Nor => 0x20,
            Self::Xnor => 0x21,
            Self::Int => 0x22,
            Self::Irt => 0x23,
            Self::Hlt => 0x24,
            Self::AddI => 0x25,
            Self::SubI => 0x26,
            // TODO: Maybe recombine pseudos?
            Self::Segment => 0x27,
            // Pseudo-instructions don't have opcode values
            _ => return None,
        };
        Some(value)
    }

    /// Reports whether this opcode is one of the data/stack pseudo-instructions listed
    /// in the body.
    ///
    /// NOTE(review): `Call`, `Return`, `Pusha` and `Popa` are grouped with the
    /// pseudo-instructions in the enum definition but are not listed here, so this
    /// returns `false` for them. Confirm whether that is intentional.
    #[must_use]
    pub const fn is_pseudo_instruction(&self) -> bool {
        matches!(
            self,
            Self::Db
                | Self::Dh
                | Self::Dw
                | Self::Resb
                | Self::Resh
                | Self::Resw
                | Self::Push
                | Self::Pop
                | Self::Lwi
        )
    }
}
impl FromStr for Opcode {
    type Err = OpcodeFromStrError;

    /// Parses a mnemonic, ignoring case.
    ///
    /// # Errors
    ///
    /// Returns [`OpcodeFromStrError::InvalidOpcode`] carrying the original text when it
    /// is not one of the mnemonics below. `Data` and `Segment` have no textual form here.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let lowered = s.to_lowercase();
        let opcode = match lowered.as_str() {
            "nop" => Self::Nop,
            "mov" => Self::Mov,
            "movs" => Self::Movs,
            "ldb" => Self::Ldb,
            "ldbs" => Self::Ldbs,
            "ldh" => Self::Ldh,
            "ldhs" => Self::Ldhs,
            "ldw" => Self::Ldw,
            "stb" => Self::Stb,
            "sth" => Self::Sth,
            "stw" => Self::Stw,
            "lli" => Self::Lli,
            "lui" => Self::Lui,
            "jmp" => Self::Jmp,
            "jeq" => Self::Jeq,
            "jne" => Self::Jne,
            "jgt" => Self::Jgt,
            "jge" => Self::Jge,
            "jlt" => Self::Jlt,
            "jle" => Self::Jle,
            "cmp" => Self::Cmp,
            "inc" => Self::Inc,
            "dec" => Self::Dec,
            "shl" => Self::Shl,
            "shr" => Self::Shr,
            "add" => Self::Add,
            "sub" => Self::Sub,
            "and" => Self::And,
            "or" => Self::Or,
            "not" => Self::Not,
            "xor" => Self::Xor,
            "nand" => Self::Nand,
            "nor" => Self::Nor,
            "xnor" => Self::Xnor,
            "int" => Self::Int,
            "irt" => Self::Irt,
            "hlt" => Self::Hlt,
            "addi" => Self::AddI,
            "subi" => Self::SubI,
            "db" => Self::Db,
            "dh" => Self::Dh,
            "dw" => Self::Dw,
            "resb" => Self::Resb,
            "resh" => Self::Resh,
            "resw" => Self::Resw,
            "push" => Self::Push,
            "pop" => Self::Pop,
            "lwi" => Self::Lwi,
            "include" => Self::Include,
            "call" => Self::Call,
            "return" => Self::Return,
            "pusha" => Self::Pusha,
            "popa" => Self::Popa,
            // Report the text as the caller wrote it, not the lower-cased copy.
            _ => return Err(OpcodeFromStrError::InvalidOpcode(s.to_string())),
        };
        Ok(opcode)
    }
}
impl fmt::Display for Opcode {
    /// Writes the lower-case mnemonic (`[SEGMENT]` for [`Opcode::Segment`]).
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mnemonic = match self {
            Self::Nop => "nop",
            Self::Mov => "mov",
            Self::Movs => "movs",
            Self::Ldb => "ldb",
            Self::Ldbs => "ldbs",
            Self::Ldh => "ldh",
            Self::Ldhs => "ldhs",
            Self::Ldw => "ldw",
            Self::Stb => "stb",
            Self::Sth => "sth",
            Self::Stw => "stw",
            Self::Lli => "lli",
            Self::Lui => "lui",
            Self::Jmp => "jmp",
            Self::Jeq => "jeq",
            Self::Jne => "jne",
            Self::Jgt => "jgt",
            Self::Jge => "jge",
            Self::Jlt => "jlt",
            Self::Jle => "jle",
            Self::Cmp => "cmp",
            Self::Inc => "inc",
            Self::Dec => "dec",
            Self::Shl => "shl",
            Self::Shr => "shr",
            Self::Add => "add",
            Self::Sub => "sub",
            Self::And => "and",
            Self::Or => "or",
            Self::Not => "not",
            Self::Xor => "xor",
            Self::Nand => "nand",
            Self::Nor => "nor",
            Self::Xnor => "xnor",
            Self::Int => "int",
            Self::Irt => "irt",
            Self::Hlt => "hlt",
            Self::AddI => "addi",
            Self::SubI => "subi",
            Self::Db => "db",
            Self::Dh => "dh",
            Self::Dw => "dw",
            Self::Resb => "resb",
            Self::Resh => "resh",
            Self::Resw => "resw",
            Self::Push => "push",
            Self::Pop => "pop",
            Self::Lwi => "lwi",
            Self::Call => "call",
            Self::Return => "return",
            Self::Pusha => "pusha",
            Self::Popa => "popa",
            // meta instructions
            Self::Include => "include",
            Self::Data => "data",
            Self::Segment => "[SEGMENT]",
        };
        f.write_str(mnemonic)
    }
}
+4
View File
@@ -0,0 +1,4 @@
//! This module contains code for handling pseudo opcodes.
/// Pseudo instructions that cannot simply be lowered to ISA instructions.
///
/// Currently a placeholder: the enum has no variants yet.
pub enum PseudoOpcode {}
+85 -6
View File
@@ -4,22 +4,101 @@
//! This will likely be attached to a [`Token`] which will in turn be attached to an AST //! This will likely be attached to a [`Token`] which will in turn be attached to an AST
//! [`Node`]. //! [`Node`].
use std::fmt::Display; use std::{
fmt::{Display, Write},
fs::File,
io::BufReader,
sync::Arc,
};
use crate::model::module::Module; use crate::{
error::{AssembleError, AssembleErrorKind, IoError, IoErrorKind},
model::module::Module,
source::lines::LinesWithSpans,
};
/// Information on where the token is within the source. /// Information on where the token is within the source.
#[derive(Debug)] #[derive(Debug, Clone)]
pub struct SourceInfo { pub struct SourceInfo {
/// The line number within the source file underpinned by `module_id`. /// The line number within the source file underpinned by `module_id`.
pub line_no: usize, pub line_number: usize,
pub module: Module, /// The [`Module`] the source code is associated with.
pub module: Arc<Module>,
/// The indexes where this token may be found (line-local). /// The indexes where this token may be found (line-local).
pub span: std::ops::Range<usize>, pub span: std::ops::Range<usize>,
} }
impl Display for SourceInfo { impl Display for SourceInfo {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.module.name) write!(
f,
"{}:{}:{}",
self.module.path.display(),
self.line_number,
self.span.start + 1
)
}
}
impl SourceInfo {
/// Creates a [`SourceInfo`] for `span` on line `line_no` of `module`.
#[must_use]
pub const fn new(
    line_no: usize,
    module: Arc<Module>,
    span: std::ops::Range<usize>,
) -> Self {
    let line_number = line_no;
    Self {
        span,
        module,
        line_number,
    }
}
/// Prints out where in the source code the error originated with an underline similar
/// to what rustc does.
pub fn print_context_with_underline(&self) -> Result<(), AssembleError> {
let f = File::open(&self.module.path)?;
let rdr = BufReader::new(f);
let mut lines = LinesWithSpans::new(rdr);
let Some(line_result) = lines.nth(self.line_number - 1) else {
// Handle a line not existing.
return Err(AssembleError::new_source_error(
self.clone(),
AssembleErrorKind::Io(IoError::new(
IoErrorKind::Other,
Some(format!(
"the line {} does not exist in input file `{}` but source info suggested otherwise!.",
self.line_number,
self.module.path.display()
)),
)),
));
};
let line_span = line_result?;
// Print the line number and line content.
println!("{:>4} | {}", self.line_number, line_span.content);
let mut pad_left = String::new();
write!(pad_left, "{:>4} ", "")?;
let mut underline = String::new();
for _ in 0..self.span.start {
pad_left.push(' ');
}
for _ in self.span.start..self.span.end.min(line_span.content.len()) {
underline.push('^');
}
// Print the underline in red and bold.
// TODO: Use a crate to make this extra portable.
println!("{pad_left}\x1b[1;31m{underline}\x1b[0m");
Ok(())
} }
} }
+21 -36
View File
@@ -2,8 +2,15 @@
//! easier to build from scratch and edit his code than it would be to try and wrangle it //! easier to build from scratch and edit his code than it would be to try and wrangle it
//! into shape. //! into shape.
use crate::source::source_info::SourceInfo; use common::prelude::*;
use crate::source::{
opcode::Opcode,
source_info::SourceInfo,
token_info::{DirectiveToken, LabelToken, RegisterToken, SymbolToken},
};
/// Represents the different types of tokens that can be produced by the tokeniser.
#[derive(Debug, Clone, PartialEq, Eq, Hash)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TokenType { pub enum TokenType {
/// Symbol reference (e.g., `loop_start`, `my_data`). /// Symbol reference (e.g., `loop_start`, `my_data`).
@@ -14,12 +21,18 @@ pub enum TokenType {
Immediate(u32), Immediate(u32),
/// String literal (e.g., `"hello world"`). /// String literal (e.g., `"hello world"`).
String(String), String(String),
/// Intermediate token for multiline strings (filtered out in final output)
StringContinuation,
/// Assembly instruction (e.g., `add`, `jmp`, `nop`). /// Assembly instruction (e.g., `add`, `jmp`, `nop`).
Instruction(InstructionToken), Instruction(Opcode),
/// Label definition (e.g., `loop_start:`). /// Label definition (e.g., `loop_start:`).
Label(LabelToken), Label(LabelToken),
/// Assembler directive (e.g., `.global`, `.section`, `.dw`). /// Assembler directive (e.g., `.global`, `.section`, `.dw`).
Directive(DirectiveToken), Directive(DirectiveToken),
/// Comment (e.g., `// this is a comment`).
Comment,
/// Comma separator.
Comma,
/// End of line. /// End of line.
Newline, Newline,
/// End of file. /// End of file.
@@ -29,34 +42,9 @@ pub enum TokenType {
#[derive(Debug)] #[derive(Debug)]
pub struct Token { pub struct Token {
/// The type of the token. /// The type of the token.
token_type: TokenType, pub token_type: TokenType,
/// Where in the source code is this [`Token`]? /// Where in the source code is this [`Token`]?
source_info: SourceInfo, pub source_info: SourceInfo,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SymbolToken {
pub name: String,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct LabelToken {
pub name: String,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct DirectiveToken {
pub directive: String,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct RegisterToken {
pub name: String,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct InstructionToken {
pub mnemonic: String,
} }
impl Token { impl Token {
@@ -79,16 +67,13 @@ impl Token {
} }
#[must_use] #[must_use]
pub const fn instruction(mnemonic: String, source_info: SourceInfo) -> Self { pub const fn instruction(op: Opcode, source_info: SourceInfo) -> Self {
Self::new( Self::new(TokenType::Instruction(op), source_info)
TokenType::Instruction(InstructionToken { mnemonic }),
source_info,
)
} }
#[must_use] #[must_use]
pub const fn register(name: String, source_info: SourceInfo) -> Self { pub const fn register(reg: Register, source_info: SourceInfo) -> Self {
Self::new(TokenType::Register(RegisterToken { name }), source_info) Self::new(TokenType::Register(RegisterToken { reg }), source_info)
} }
#[must_use] #[must_use]
+34
View File
@@ -0,0 +1,34 @@
use common::prelude::Register;
/// Payload of a symbol token.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SymbolToken {
/// The symbol's name as written in the source.
pub name: String,
}
/// Payload of a label-definition token.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct LabelToken {
/// The label's name.
pub name: String,
}
/// Payload of an assembler-directive token.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct DirectiveToken {
/// The directive keyword.
pub directive: String,
}
/// Payload of a register token.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct RegisterToken {
/// The register this token refers to.
pub reg: Register,
}
impl RegisterToken {
#[must_use]
pub const fn new(reg: Register) -> Self {
Self { reg }
}
/// Returns the name of a valid [`Register`]
#[must_use]
pub fn name(&self) -> String {
self.reg.to_string()
}
}
+416 -2
View File
@@ -1,7 +1,421 @@
//! This file contains the [`Tokeniser`], which consumes a [`Vec`] of input bytes and //! This file contains the [`Tokeniser`], which consumes a [`Vec`] of input bytes and
//! outputs a [`Vec<Token>`]. //! outputs a [`Vec<Token>`].
/// Consumes a [`Vec<u8>`] and outputs a [`Vec`] of [Token]'s. use std::{path::Path, str::FromStr, sync::Arc};
pub struct Tokeniser {}
use regex::Regex;
use common::prelude::*;
use crate::{
context::AssemblerContext,
error::{AssembleError, AssembleErrorKind},
model::module::Module,
source::{
lines::{LineSpan, lines_with_spans},
load_source_bytes,
opcode::Opcode,
source_info::SourceInfo,
token::{Token, TokenType},
token_info::{DirectiveToken, LabelToken, RegisterToken, SymbolToken},
tokeniser::error::TokeniserError,
},
};
pub mod error; pub mod error;
#[cfg(test)]
mod tests;
/// Consumes a [`Vec<u8>`] and outputs a [`Vec`] of [Token]'s.
///
/// Holds the raw source bytes, the owning [`Module`], the compiled token patterns and
/// the state needed to carry a string literal across several lines.
pub struct Tokeniser {
/// The data in the file.
pub data: Vec<u8>,
/// A copy of the Module in which the file is situated.
pub module: Arc<Module>,
// Pre-compiled regex patterns, built in `from_data`. Each one is anchored so it is
// applied to the start of the remaining input.
label_regex: Regex,
register_regex: Regex,
immediate_regex: Regex,
directive_regex: Regex,
instruction_regex: Regex,
symbol_regex: Regex,
comment_regex: Regex,
// String parsing state
// `true` while a string literal opened on an earlier line has not been closed yet.
in_string: bool,
// Text of the open string literal collected so far (lines are joined with `\n`).
string_buffer: String,
// Line and column at which the currently open string literal started.
string_start_line: usize,
string_start_column: usize,
}
impl Tokeniser {
/// Builds a [`Tokeniser`] over in-memory source bytes that belong to `module`.
///
/// All token patterns are compiled here so that matching never recompiles a regex.
///
/// # Panics
///
/// Panics if one of the hard-coded regular expressions fails to compile.
#[must_use]
pub fn from_data(data: Vec<u8>, module: Arc<Module>) -> Self {
    Self {
        data,
        module,
        label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):")
            .expect("Failed to compile label regex pattern"),
        register_regex: Regex::new(
            r"^(rg[0-9a-f]+|acc|spr|bpr|ret|idr|mmr|zero|noreg|pcx)\b",
        )
        .expect("Failed to compile register regex pattern"),
        immediate_regex: Regex::new(
            r"^(0x[0-9a-fA-F_]+|0b[0-1_]+|0o[0-7_]+|[0-9_]+)",
        )
        .expect("Failed to compile immediate regex pattern"),
        directive_regex: Regex::new(r"^(res[bwh]|d[bwh]|include|section|global|local)\b")
            .expect("Failed to compile directive regex pattern"),
        // Must accept every mnemonic `Opcode::from_str` understands. The previous
        // character-class shorthands (`[id]nc`, `i[rd]t`, `j[egl][qte]`, `and`) could
        // not match `dec`, `int`, `jne` or `nand`, so those fell through and were
        // tokenised as symbols. Inner groups are non-capturing so the mnemonic stays
        // in capture group 1.
        instruction_regex: Regex::new(
            r"^(nop|movs?|ld[bhw]s?|st[bhw]|l[lu]i|j(?:mp|eq|ne|g[te]|l[te])|cmp|inc|dec|sh[lr]|addi?|subi?|x?n?or|n?and|not|int|irt|hlt|lhwmm|lidt|pusha?|popa?|lwi|return|call)\b",
        )
        .expect("Failed to compile instruction regex pattern"),
        // Both alternatives sit inside one anchored group. Without the group `^` binds
        // only to the first alternative and a plain identifier could match anywhere
        // in the input, consuming the wrong bytes. The separator is exactly `::`
        // (`::{2}` means three colons). Capture groups are unchanged: 1 and 2 for the
        // scoped form, 3 for the plain form.
        // NOTE(review): `match_token` tries labels before symbols, so `name::scope`
        // is currently claimed by the label pattern first.
        symbol_regex: Regex::new(
            r"^(?:([a-zA-Z_][a-zA-Z0-9_]*)::([a-zA-Z0-9_]*)|([a-zA-Z_][a-zA-Z0-9_]*))",
        )
        .expect("Failed to compile symbol regex pattern"),
        comment_regex: Regex::new("^//.*")
            .expect("Failed to compile comment regex pattern"),

        // Initialize string parsing state
        in_string: false,
        string_buffer: String::new(),
        string_start_line: 0,
        string_start_column: 0,
    }
}
/// Creates a [`Tokeniser`] from a file path. Also creates the underlying [`Module`]
/// for you and adds it to the module registry held by `ctx`.
///
/// # Errors
///
/// Fails if the file cannot be read, if the [`Module`] cannot be created from the
/// path, or if the registry lock cannot be acquired.
pub fn new<P: AsRef<Path>>(
    path: P,
    ctx: &AssemblerContext,
) -> Result<Self, AssembleError> {
    let source_path = path.as_ref().to_path_buf();
    let bytes = load_source_bytes(&source_path)?;
    let module = Arc::new(Module::new(source_path)?);

    // The write guard is a temporary, so the lock is released at the end of this
    // statement.
    ctx.module_registry.write()?.add(Arc::clone(&module));

    Ok(Self::from_data(bytes, module))
}
// Note that modules are tokenised in their own threads, possibly in parallel.
pub fn tokenise(mut self) -> Result<Vec<Token>, AssembleError> {
let mut token_stream = Vec::new();
let data = self.data.clone();
let lines = lines_with_spans(&data);
// Process each line
for line_result in lines {
let line_span = line_result?;
let trimmed = line_span.content.trim();
// Skip empty lines and add newline tokens
if trimmed.is_empty() {
token_stream.push(Token::new(
TokenType::Newline,
SourceInfo::new(line_span.line_number, self.module.clone(), 0..1),
));
continue;
}
// Actually tokenise the line content
let line_tokens = self.tokenise_line(&line_span)?;
token_stream.extend(line_tokens);
// Add newline token at end of line
token_stream.push(Token::new(
TokenType::Newline,
SourceInfo::new(
line_span.line_number,
self.module.clone(),
line_span.content.len()..line_span.content.len(),
),
));
}
// Add EOF token
token_stream.push(Token::new(
TokenType::Eof,
SourceInfo::new(0, self.module.clone(), 0..0),
));
Ok(token_stream)
}
/// Splits one line into tokens.
///
/// Whitespace between tokens is skipped, a comment ends the line, and
/// string-continuation markers are consumed without being emitted. Spans are byte
/// offsets into the line.
fn tokenise_line(
    &mut self,
    line_span: &LineSpan,
) -> Result<Vec<Token>, AssembleError> {
    let line = line_span.content.as_str();
    let mut out = Vec::new();

    // `rest` is always a suffix of `line`, so the current column is simply the number
    // of bytes already consumed.
    let mut rest = line.trim_start();
    loop {
        if rest.is_empty() {
            break;
        }
        let offset = line.len() - rest.len();

        // Try to match a token.
        let (kind, width) = self.match_token(rest, line_span.line_number, offset)?;

        // A comment swallows the remainder of the line.
        if matches!(kind, TokenType::Comment) {
            break;
        }

        // String continuations only consume input; everything else is emitted.
        if !matches!(kind, TokenType::StringContinuation) {
            let source = SourceInfo::new(
                line_span.line_number,
                self.module.clone(),
                offset..offset + width,
            );
            out.push(Token::new(kind, source));
        }

        // Advance past the token and any whitespace that follows it.
        rest = rest[width..].trim_start();
    }

    Ok(out)
}
/// Matches a `//` comment at the start of `input`; the length covers the whole match.
fn try_match_comment(&self, input: &str) -> Option<(TokenType, usize)> {
    self.comment_regex
        .captures(input)
        .and_then(|caps| caps.get(0))
        .map(|whole| (TokenType::Comment, whole.len()))
}
/// Matches a label definition (`name:`) at the start of `input`. The consumed length
/// includes the colon; the stored name does not.
fn try_match_label(&self, input: &str) -> Option<(TokenType, usize)> {
    let caps = self.label_regex.captures(input)?;
    let label = LabelToken {
        name: caps.get(1)?.as_str().to_string(),
    };
    let consumed = caps.get(0)?.len();
    Some((TokenType::Label(label), consumed))
}
/// Matches a register name at the start of `input`. Text that fits the pattern but is
/// rejected by `Register::try_from` yields `None`.
fn try_match_register(&self, input: &str) -> Option<(TokenType, usize)> {
    let caps = self.register_regex.captures(input)?;
    let name = caps.get(1)?.as_str();
    let consumed = caps.get(0)?.len();

    Register::try_from(name)
        .ok()
        .map(|reg| (TokenType::Register(RegisterToken { reg }), consumed))
}
/// Matches an integer literal (`0x`, `0b`, `0o` or decimal, with optional `_`
/// separators) at the start of `input`. Literals that do not fit in a `u32` yield
/// `None`.
fn try_match_immediate(&self, input: &str) -> Option<(TokenType, usize)> {
    let caps = self.immediate_regex.captures(input)?;
    let literal = caps.get(1)?.as_str();
    let consumed = caps.get(0)?.len();

    // Remove any underscores that were inserted for readability.
    let digits = literal.replace('_', "");

    // Pick the radix from the prefix; no prefix means decimal.
    let (radix, body) = if let Some(hex_part) = digits.strip_prefix("0x") {
        (16, hex_part)
    } else if let Some(bin_part) = digits.strip_prefix("0b") {
        (2, bin_part)
    } else if let Some(oct_part) = digits.strip_prefix("0o") {
        (8, oct_part)
    } else {
        (10, digits.as_str())
    };

    let value = u32::from_str_radix(body, radix).ok()?;
    Some((TokenType::Immediate(value), consumed))
}
/// Matches a directive keyword at the start of `input`.
fn try_match_directive(&self, input: &str) -> Option<(TokenType, usize)> {
    let caps = self.directive_regex.captures(input)?;
    let token = DirectiveToken {
        directive: caps.get(1)?.as_str().to_string(),
    };
    let consumed = caps.get(0)?.len();
    Some((TokenType::Directive(token), consumed))
}
/// Matches an instruction mnemonic at the start of `input`. Text accepted by the
/// pattern but unknown to `Opcode::from_str` yields `None`.
fn try_match_instruction(&self, input: &str) -> Option<(TokenType, usize)> {
    let caps = self.instruction_regex.captures(input)?;
    let mnemonic = caps.get(1)?.as_str();
    let consumed = caps.get(0)?.len();

    Opcode::from_str(mnemonic)
        .ok()
        .map(|op| (TokenType::Instruction(op), consumed))
}
/// Matches a symbol reference. Capture groups 1 and 2 hold the two halves of a scoped
/// symbol (re-joined with `::`); group 3 holds a plain identifier.
fn try_match_symbol(&self, input: &str) -> Option<(TokenType, usize)> {
    let caps = self.symbol_regex.captures(input)?;
    let consumed = caps.get(0)?.len();

    let name = match (caps.get(1), caps.get(3)) {
        // Matched the scoped symbol pattern (name::scope).
        (Some(scoped_name), _) => {
            format!("{}::{}", scoped_name.as_str(), caps.get(2)?.as_str())
        }
        (None, Some(simple_name)) => simple_name.as_str().to_string(),
        (None, None) => return None,
    };

    Some((TokenType::Symbol(SymbolToken { name }), consumed))
}
/// Dispatches string handling: continues an open multi-line string if there is one,
/// otherwise looks for the start of a new string literal.
fn try_match_string(
    &mut self,
    input: &str,
    line_number: usize,
    column: usize,
) -> Option<(TokenType, usize)> {
    if !self.in_string {
        // Look for the start of a new string
        return self.handle_string_start(input, line_number, column);
    }
    // We're continuing a multiline string, which always consumes something.
    Some(self.handle_string_continuation(input, line_number, column))
}
/// Handles input that may begin a string literal.
///
/// Returns `None` unless `input` starts with `"`. A literal closed on the same line
/// becomes a `String` token. Otherwise the rest of the line is buffered, the start
/// position is recorded and a `StringContinuation` marker covering the whole input is
/// returned.
fn handle_string_start(
    &mut self,
    input: &str,
    line_number: usize,
    column: usize,
) -> Option<(TokenType, usize)> {
    // Everything after the opening quote.
    let body = input.strip_prefix('"')?;

    match body.find('"') {
        // Complete string on one line
        Some(close) => {
            let content = body[..close].to_string();
            Some((TokenType::String(content), close + 2)) // +2 for both quotes
        }
        // Start of multiline string
        None => {
            self.in_string = true;
            self.string_start_line = line_number;
            self.string_start_column = column;

            let mut opened = body.to_string();
            opened.push('\n'); // Add newline for multiline
            self.string_buffer = opened;

            // Consume the entire rest of the line
            Some((TokenType::StringContinuation, input.len()))
        }
    }
}
/// Handles a line while a multi-line string is open.
///
/// If a closing `"` is present the buffered text is completed and returned as a
/// `String` token. Otherwise the input is appended, followed by a newline, and a
/// `StringContinuation` marker covering the whole input is returned.
fn handle_string_continuation(
    &mut self,
    input: &str,
    _line_number: usize,
    _column: usize,
) -> (TokenType, usize) {
    let Some(close) = input.find('"') else {
        // Continue multiline string
        self.string_buffer.push_str(input);
        self.string_buffer.push('\n');
        // Consume the entire line
        return (TokenType::StringContinuation, input.len());
    };

    // End of multiline string found
    self.string_buffer.push_str(&input[..close]);
    self.in_string = false;
    let finished = std::mem::take(&mut self.string_buffer);
    (TokenType::String(finished), close + 1) // +1 for the closing quote
}
/// Matches exactly one token at the start of `input`.
///
/// Patterns are tried in a fixed order: comma, string (including multi-line
/// continuations), directive, instruction, comment, label, register, immediate,
/// symbol. Returns the token type and the number of bytes it covers.
///
/// # Errors
///
/// Returns [`TokeniserError::UnexpectedChar`] located at `column` when nothing
/// matches, or [`TokeniserError::UnexpectedEndOfInput`] when `input` is empty.
#[expect(clippy::range_plus_one, reason = "RangeInclusive is a different type!")]
fn match_token(
    &mut self,
    input: &str,
    line_number: usize,
    column: usize,
) -> Result<(TokenType, usize), AssembleError> {
    if input.starts_with(',') {
        return Ok((TokenType::Comma, 1));
    }

    // Check for string first (including multiline continuations).
    if let Some(m) = self.try_match_string(input, line_number, column) {
        return Ok(m);
    }

    // The remaining matchers only need `&self`; the first one that succeeds wins.
    let matched = self
        .try_match_directive(input)
        .or_else(|| self.try_match_instruction(input))
        .or_else(|| self.try_match_comment(input))
        .or_else(|| self.try_match_label(input))
        .or_else(|| self.try_match_register(input))
        .or_else(|| self.try_match_immediate(input))
        .or_else(|| self.try_match_symbol(input));
    if let Some(m) = matched {
        return Ok(m);
    }

    // Spans are 0-based like every token span (`SourceInfo`'s `Display` adds one when
    // printing), so the offending character sits at `column`, not `column + 1`.
    let source = SourceInfo::new(line_number, self.module.clone(), column..column + 1);

    // Handle miscellaneous characters.
    match input.chars().next() {
        Some(c) => Err(AssembleError::new_source_error(
            source,
            AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedChar(c)),
        )),
        None => Err(AssembleError::new_source_error(
            source,
            AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedEndOfInput(
                input.len(),
            )),
        )),
    }
}
}
+34 -3
View File
@@ -1,10 +1,41 @@
//! This module contains the error types for the tokeniser. //! This module contains the error types for the tokeniser.
#[derive(Debug)] #[derive(Debug, Clone, Copy)]
pub enum TokeniserError {} /// Types of errors that may be returned during tokenisation.
pub enum TokeniserError {
/// An unexpected character was found in the source code. Carries that character.
UnexpectedChar(char),
/// An unterminated string literal was found. [`SourceInfo`] will be attached if this
/// was returned.
UnterminatedString,
/// An invalid number format was encountered when parsing a literal value
/// ([`TokenType::Immediate`]). Carries the offending literal text.
InvalidNumber(&'static str),
/// An unrecognised token was encountered.
UnrecognisedToken,
/// Returned if the consumed count was lower than the length of the input file.
/// This is a sign you will need to debug some [`Tokeniser`] code to ensure that
/// [`Tokeniser::match_token`] is working as intended.
///
/// The field is the length of the input slice that was being matched when the error
/// was raised.
UnexpectedEndOfInput(usize),
}
impl TokeniserError {}
impl std::fmt::Display for TokeniserError { impl std::fmt::Display for TokeniserError {
#[rustfmt::skip]
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "TODO!!!!!!") match self {
Self::UnexpectedChar(c) => write!(f, "unexpected char '{c}' found in input")?,
Self::InvalidNumber(lit) => write!(f, "invalid integer literal \"{lit}\" found in input")?,
Self::UnrecognisedToken => write!(f, "unrecognised token found in input")?,
Self::UnterminatedString => write!(f, "unterminated string literal")?,
Self::UnexpectedEndOfInput(line_length) => write!(
f, "unexpected end of input, input length: {line_length}"
)?,
}
Ok(())
} }
} }
+418
View File
@@ -0,0 +1,418 @@
//! Unit tests for the tokenizer
use common::prelude::Register;
use crate::{
model::module::Module,
source::{
opcode::Opcode,
token::{Token, TokenType},
token_info::RegisterToken,
tokeniser::Tokeniser,
},
};
use std::{path::PathBuf, sync::Arc};
/// Builds a [`Tokeniser`] over `source`, attributing the text to a placeholder
/// `test.dsa` module.
///
/// # Panics
///
/// Panics if the [`Module`] for the placeholder path cannot be created.
fn create_tokenizer_from_source(source: &str) -> Tokeniser {
    let module = Module::new(PathBuf::from("test.dsa")).expect("Cannot create module!");
    let bytes = source.as_bytes().to_vec();
    Tokeniser::from_data(bytes, Arc::new(module))
}
/// Tokenises `source` in one step, returning the produced tokens or the
/// [`crate::error::AssembleError`] reported by the tokeniser.
fn tokenize_source(source: &str) -> Result<Vec<Token>, crate::error::AssembleError> {
    create_tokenizer_from_source(source).tokenise()
}
/// Projects a token slice onto borrowed [`TokenType`]s, preserving order.
fn extract_token_types(tokens: &[Token]) -> Vec<&TokenType> {
    let mut kinds = Vec::with_capacity(tokens.len());
    for token in tokens {
        kinds.push(&token.token_type);
    }
    kinds
}
/// An empty input is expected to yield a non-empty stream whose final token is EOF.
#[test]
fn test_empty_source() {
    let tokens = tokenize_source("").expect("Failed to tokenize empty source");

    // At minimum the EOF marker has to be present.
    assert!(!tokens.is_empty());

    let final_token = tokens.last().expect("Expected at least one token");
    assert!(matches!(final_token.token_type, TokenType::Eof));
}
/// Whitespace-only input is expected to produce newline tokens and an EOF token.
#[test]
fn test_whitespace_only() {
    let tokens = tokenize_source(" \n \n ").expect("Failed to tokenize whitespace");
    let kinds = extract_token_types(&tokens);

    let saw_newline = kinds.iter().any(|kind| matches!(kind, TokenType::Newline));
    assert!(saw_newline);

    let saw_eof = kinds.iter().any(|kind| matches!(kind, TokenType::Eof));
    assert!(saw_eof);
}
/// A lone mnemonic is expected to tokenise to an instruction token in first position.
#[test]
fn test_single_instruction() {
    let tokens = tokenize_source("add").expect("Failed to tokenize instruction");

    // An instruction token has to appear somewhere in the stream...
    let has_instruction = extract_token_types(&tokens)
        .iter()
        .any(|kind| matches!(kind, TokenType::Instruction(_)));
    assert!(has_instruction);

    // ...and the very first token must be that instruction, printing back as "add".
    let TokenType::Instruction(opcode) = &tokens[0].token_type else {
        panic!("Expected instruction token");
    };
    assert_eq!(opcode.to_string(), "add");
}
/// Each listed mnemonic is expected to tokenise to an instruction that prints back
/// as the same text.
#[test]
fn test_all_instructions() {
    for mnemonic in ["add", "sub", "jmp", "call", "return", "lli", "nop", "hlt"] {
        let tokens = tokenize_source(mnemonic).expect("Failed to tokenize instruction");

        let TokenType::Instruction(opcode) = &tokens[0].token_type else {
            panic!("Expected instruction token for {mnemonic}");
        };
        assert_eq!(opcode.to_string(), mnemonic);
    }
}
/// Register names are expected to tokenise to register tokens that print back
/// as the same name.
#[test]
fn test_registers() {
    for name in ["rg0", "rgf", "pcx"] {
        let tokens = tokenize_source(name).expect("Failed to tokenize register");

        match &tokens[0].token_type {
            TokenType::Register(register) => assert_eq!(register.reg.to_string(), name),
            _ => panic!("Expected register token for {name}"),
        }
    }
}
/// Decimal, hexadecimal, octal and binary literals are expected to tokenise to
/// immediates carrying the corresponding numeric value.
#[test]
fn test_immediates() {
    let literals = [
        ("42", 42),
        ("0", 0),
        ("0xFF", 255),
        ("0x1234", 0x1234),
        ("0xDEADBEEF", 0xDEAD_BEEF),
        ("0o12", 0o12),
        ("0b101", 0b101),
    ];

    for (literal, expected) in literals {
        let tokens = tokenize_source(literal).expect("Failed to tokenize immediate");

        let TokenType::Immediate(value) = &tokens[0].token_type else {
            panic!("Expected immediate token for {literal}");
        };
        assert_eq!(*value, expected);
    }
}
/// A `name:` definition is expected to tokenise to a label whose name omits the colon.
#[test]
fn test_labels() {
    let definitions = [
        ("loop_start:", "loop_start"),
        ("main:", "main"),
        ("_private_label:", "_private_label"),
        ("Label123:", "Label123"),
    ];

    for (definition, expected_name) in definitions {
        let tokens = tokenize_source(definition).expect("Failed to tokenize label");

        match &tokens[0].token_type {
            TokenType::Label(label) => assert_eq!(label.name, expected_name),
            _ => panic!("Expected label token for {definition}"),
        }
    }
}
/// The visibility/section keywords are expected to tokenise to directive tokens
/// carrying the keyword text.
#[test]
fn test_directives() {
    for keyword in ["global", "section", "local"] {
        let tokens = tokenize_source(keyword).expect("Failed to tokenize directive");

        let TokenType::Directive(parsed) = &tokens[0].token_type else {
            panic!("Expected directive token for {keyword}");
        };
        assert_eq!(parsed.directive, keyword);
    }
}
/// Plain identifiers are expected to tokenise to symbol tokens carrying the
/// identifier text unchanged.
#[test]
fn test_symbols() {
    for identifier in ["my_symbol", "_private", "Symbol123", "camelCase"] {
        let tokens = tokenize_source(identifier).expect("Failed to tokenize symbol");

        match &tokens[0].token_type {
            TokenType::Symbol(symbol) => assert_eq!(symbol.name, identifier),
            _ => panic!("Expected symbol token for {identifier}"),
        }
    }
}
/// Checks the leading token kinds of a three-operand instruction line.
#[test]
fn test_complex_instruction_line() {
    let tokens =
        tokenize_source("addi rg1, rg2, 0xFF").expect("Failed to tokenise complex instruction");

    // Six tokens are inspected below; any trailing newline/EOF tokens are not checked.
    assert!(tokens.len() >= 6);
    let kind_at = |index: usize| &tokens[index].token_type;

    // Expected prefix: instruction, register, comma, register, comma, immediate.
    assert!(matches!(kind_at(0), TokenType::Instruction(_)));
    assert!(matches!(kind_at(1), TokenType::Register(_)));
    assert!(matches!(kind_at(2), TokenType::Comma));
    assert!(matches!(kind_at(3), TokenType::Register(_)));
    assert!(matches!(kind_at(4), TokenType::Comma));
    assert!(matches!(kind_at(5), TokenType::Immediate(_)));
}
/// Tokenises two instruction lines, the first ending in a `//` comment, and
/// compares the complete token-type sequence against the expected one.
#[test]
fn test_multiline_with_comments() {
    // The comment text contributes no tokens; each line ends in a newline token
    // and the stream is closed by EOF.
    const EXPECTED_TOKEN_TYPES: [TokenType; 11] = [
        TokenType::Instruction(Opcode::Add),
        TokenType::Register(RegisterToken::new(Register::Rg0)),
        TokenType::Comma,
        TokenType::Register(RegisterToken::new(Register::Rg1)),
        TokenType::Newline,
        TokenType::Instruction(Opcode::SubI),
        TokenType::Register(RegisterToken::new(Register::Rg2)),
        TokenType::Comma,
        TokenType::Immediate(10),
        TokenType::Newline,
        TokenType::Eof,
    ];
    const SOURCE: &str = r"add rg0, rg1 // Another comment
subi rg2, 10";

    let tokens = tokenize_source(SOURCE).expect("Failed to tokenise source with comments");
    let token_types = extract_token_types(&tokens);

    // Check the length first so the pairwise `zip` below cannot silently skip
    // surplus or missing tokens.
    assert_eq!(
        token_types.len(),
        EXPECTED_TOKEN_TYPES.len(),
        "{token_types:#?}"
    );

    for (expected, got) in EXPECTED_TOKEN_TYPES.iter().zip(token_types.iter()) {
        assert_eq!(expected, *got, "Expected {expected:?}, got {got:?}");
    }
}
/// Smoke test: the bundled `bf.dsa` program must tokenise without error and the
/// resulting stream must be terminated by an EOF token.
#[test]
fn test_tokenise_brainf_interpreter() {
    const SOURCE: &str = include_str!("../../../../resources/dsa/bf.dsa");

    let tokens = tokenize_source(SOURCE).expect("Failed to tokenise the brainfuck compiler!");

    // Assert on the stream instead of dumping it with `dbg!`, which floods the
    // test output and verifies nothing.
    assert!(matches!(
        tokens.last().map(|token| &token.token_type),
        Some(TokenType::Eof)
    ));
}
/// Double-quoted literals are expected to tokenise to string tokens holding the
/// text between the quotes, including the empty string.
#[test]
fn test_string_literals() {
    let cases = [
        (r#""hello world""#, "hello world"),
        (
            r#""++++++++++++++++++++++++++++++++++++++++++++""#,
            "++++++++++++++++++++++++++++++++++++++++++++",
        ),
        (r#""Invalid Instruction!""#, "Invalid Instruction!"),
        (r#""""#, ""),
    ];

    cases.iter().for_each(|(input, expected)| {
        let tokens = tokenize_source(input).expect("Failed to tokenize string literal");

        let TokenType::String(value) = &tokens[0].token_type else {
            panic!("Expected string token for {input}");
        };
        assert_eq!(value, expected);
    });
}
/// The data-declaration keywords are expected to tokenise to directive tokens
/// carrying the keyword text.
#[test]
fn test_data_directives() {
    for keyword in ["db", "dw", "resb"] {
        let tokens = tokenize_source(keyword).expect("Failed to tokenize data declaration");

        match &tokens[0].token_type {
            TokenType::Directive(decl) => assert_eq!(decl.directive, keyword),
            _ => panic!("Expected data declaration token for {keyword}"),
        }
    }
}
/// An `include <name> "<path>"` line is expected to start with directive, symbol
/// and string tokens, in that order.
#[test]
fn test_include_directive() {
    let tokens = tokenize_source(r#"include print "./lib/print.dsa""#)
        .expect("Failed to tokenize include directive");

    assert!(tokens.len() >= 3);
    let kind_at = |index: usize| &tokens[index].token_type;

    assert!(matches!(kind_at(0), TokenType::Directive(_)));
    assert!(matches!(kind_at(1), TokenType::Symbol(_)));
    assert!(matches!(kind_at(2), TokenType::String(_)));
}
/// Address-sized hexadecimal literals are expected to tokenise to immediates with
/// the matching value.
#[test]
fn test_hex_addresses() {
    for (literal, expected) in [("0x10000", 0x10000), ("0x30000", 0x30000)] {
        let tokens = tokenize_source(literal).expect("Failed to tokenize hex address");

        match &tokens[0].token_type {
            TokenType::Immediate(value) => assert_eq!(*value, expected),
            _ => panic!("Expected immediate token for {literal}"),
        }
    }
}
/// A two-register load is expected to start with instruction, register, comma
/// and register tokens.
#[test]
fn test_memory_operations() {
    let tokens = tokenize_source("ldw rg1, rg2").expect("Failed to tokenize memory operation");

    assert!(tokens.len() >= 4);
    let kind_at = |index: usize| &tokens[index].token_type;

    assert!(matches!(kind_at(0), TokenType::Instruction(_)));
    assert!(matches!(kind_at(1), TokenType::Register(_)));
    assert!(matches!(kind_at(2), TokenType::Comma));
    assert!(matches!(kind_at(3), TokenType::Register(_)));
}
/// A `call` with a `::`-qualified target is expected to start with an instruction
/// token and to contain at least one symbol token.
#[test]
fn test_function_calls() {
    let tokens = tokenize_source("call print::print").expect("Failed to tokenize function call");

    assert!(tokens.len() >= 2);
    assert!(matches!(tokens[0].token_type, TokenType::Instruction(_)));

    // How `::` splits the target is deliberately left unchecked; only the
    // presence of some symbol token is required.
    let symbol_position = tokens
        .iter()
        .position(|token| matches!(token.token_type, TokenType::Symbol(_)));
    assert!(symbol_position.is_some());
}
/// Text after `//` must not add instruction tokens: two source lines give exactly
/// two instructions.
#[test]
fn test_comments_are_ignored() {
    let tokens = tokenize_source("add rg0, rg1 // this is a comment\nsub rg2, rg3")
        .expect("Failed to tokenize with comments");

    // Tally instruction tokens by hand; the words inside the comment must not count.
    let mut instruction_count = 0;
    for token in &tokens {
        if matches!(token.token_type, TokenType::Instruction(_)) {
            instruction_count += 1;
        }
    }
    assert_eq!(instruction_count, 2);
}
/// Input lacking a trailing line break is still expected to yield a newline token,
/// with EOF as the final token.
#[test]
fn test_newline_always_present() {
    // Deliberately no `\n` at the end of the input.
    let tokens = tokenize_source("add rg0, rg1").expect("Failed to tokenize without newline");

    let has_newline = tokens
        .iter()
        .map(|token| &token.token_type)
        .any(|kind| matches!(kind, TokenType::Newline));
    assert!(
        has_newline,
        "Expected newline to be added even when missing from input"
    );

    // The stream has to be closed by EOF.
    let final_token = tokens.last().expect("Expected at least one token");
    assert!(matches!(final_token.token_type, TokenType::Eof));
}
/// Counts instruction and symbol tokens in a short compare-and-branch sequence.
#[test]
fn test_complex_branching_code() {
    let source = r"
cmp rg3, rg8
jeq increment
cmp rg3, rg9
jeq decrement";
    let tokens = tokenize_source(source).expect("Failed to tokenize branching code");

    // Counts the tokens whose type satisfies `predicate`.
    let count_where = |predicate: fn(&TokenType) -> bool| {
        tokens
            .iter()
            .filter(|token| predicate(&token.token_type))
            .count()
    };

    // Two `cmp` and two `jeq`.
    let instruction_count = count_where(|kind| matches!(kind, TokenType::Instruction(_)));
    assert_eq!(instruction_count, 4);

    // The branch targets `increment` and `decrement`.
    let symbol_count = count_where(|kind| matches!(kind, TokenType::Symbol(_)));
    assert_eq!(symbol_count, 2);
}
/// Four stack-manipulation lines are expected to yield exactly four instruction tokens.
#[test]
fn test_stack_operations() {
    let tokens = tokenize_source("push rg2\npop zero\npusha 2\npopa 2")
        .expect("Failed to tokenize stack operations");

    // Each instruction token contributes 1 to the sum, everything else 0.
    let instruction_count: usize = tokens
        .iter()
        .map(|token| usize::from(matches!(token.token_type, TokenType::Instruction(_))))
        .sum();
    assert_eq!(instruction_count, 4);
}
+17 -16
View File
@@ -46,12 +46,11 @@ impl SymbolTable {
&& let Some(existing) = self.symbols.get(&existing_id) && let Some(existing) = self.symbols.get(&existing_id)
&& existing.module_id == module_id && existing.module_id == module_id
{ {
return Err(AssembleError::new_other_error( return Err(std::io::Error::new(
crate::error::AssembleErrorKind::IO(std::io::Error::new( std::io::ErrorKind::AlreadyExists,
std::io::ErrorKind::AlreadyExists, format!("Symbol '{name}' already defined in module"),
format!("Symbol '{name}' already defined in module"), )
)), .into());
));
} }
// Add to all mappings // Add to all mappings
@@ -63,19 +62,22 @@ impl SymbolTable {
} }
/// Gets the [`Symbol`] by its [`SymbolId`]. /// Gets the [`Symbol`] by its [`SymbolId`].
#[must_use] pub fn get(&self, id: &SymbolId) -> Option<&Symbol> { #[must_use]
pub fn get(&self, id: &SymbolId) -> Option<&Symbol> {
self.symbols.get(id) self.symbols.get(id)
} }
/// Gets the [`Symbol`] by its name. /// Gets the [`Symbol`] by its name.
#[must_use] pub fn get_by_name(&self, name: &str) -> Option<&Symbol> { #[must_use]
pub fn get_by_name(&self, name: &str) -> Option<&Symbol> {
self.name_to_id self.name_to_id
.get(name) .get(name)
.and_then(|id| self.symbols.get(id)) .and_then(|id| self.symbols.get(id))
} }
/// Gets all [`Symbol`]s in a module. /// Gets all [`Symbol`]s in a module.
#[must_use] pub fn get_module_symbols(&self, module_id: &ModuleId) -> Vec<&Symbol> { #[must_use]
pub fn get_module_symbols(&self, module_id: &ModuleId) -> Vec<&Symbol> {
self.module_symbols self.module_symbols
.get(module_id) .get(module_id)
.map(|ids| ids.iter().filter_map(|id| self.symbols.get(id)).collect()) .map(|ids| ids.iter().filter_map(|id| self.symbols.get(id)).collect())
@@ -83,7 +85,8 @@ impl SymbolTable {
} }
/// Gets all the public symbols. /// Gets all the public symbols.
#[must_use] pub fn get_public_symbols(&self) -> Vec<&Symbol> { #[must_use]
pub fn get_public_symbols(&self) -> Vec<&Symbol> {
self.symbols self.symbols
.values() .values()
.filter(|sym| matches!(sym.visibility, Visibility::Public)) .filter(|sym| matches!(sym.visibility, Visibility::Public))
@@ -104,12 +107,10 @@ impl SymbolTable {
} }
Ok(()) Ok(())
} else { } else {
Err(AssembleError::new_other_error( Err(
crate::error::AssembleErrorKind::IO(std::io::Error::new( std::io::Error::new(std::io::ErrorKind::NotFound, "Symbol not found")
std::io::ErrorKind::NotFound, .into(),
"Symbol not found", )
)),
))
} }
} }
} }
+1 -1
View File
@@ -2,7 +2,7 @@ pub mod logging;
use std::io::Write; use std::io::Write;
pub fn input(prompt: &str) -> String { pub fn _input(prompt: &str) -> String {
print!("{prompt}\n > "); print!("{prompt}\n > ");
std::io::stdout().flush().expect("Failed to flush stdout"); std::io::stdout().flush().expect("Failed to flush stdout");
let mut input = String::new(); let mut input = String::new();
+2 -1
View File
@@ -1,9 +1,10 @@
use crate::{instructions::encode::Encode, prelude::*}; use crate::{instructions::encode::Encode, prelude::*};
#[derive(Copy, Clone, Debug, PartialEq, Eq)] #[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
pub enum Interrupt { pub enum Interrupt {
Software(u8), Software(u8),
Breakpoint, Breakpoint,
#[default]
HardFault, HardFault,
} }
+14 -9
View File
@@ -1,4 +1,5 @@
//! Various types of arguments that instructions can take, alongside encoding and decoding logic. //! Various types of arguments that instructions can take, alongside encoding and decoding
//! logic.
use crate::{ use crate::{
instructions::{RegisterParseError, encode::Encode}, instructions::{RegisterParseError, encode::Encode},
@@ -35,18 +36,20 @@ impl std::fmt::Display for ArgsDecodeError {
impl std::error::Error for ArgsDecodeError {} impl std::error::Error for ArgsDecodeError {}
#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
/// Used by instructions with 2 registers and an immediate argument. /// Used by instructions with 2 registers and an immediate argument.
pub struct ITypeArgs { pub struct ITypeArgs {
pub immediate: u16, pub immediate: u16,
pub r1: Register, pub r1: Register,
/// May not actually be used by some instructions taking an immediate e.g. LUI. This is solved by making the constructor take Options. /// May not actually be used by some instructions taking an immediate e.g. LUI. This
/// is solved by making the constructor take Options.
pub r2: Register, pub r2: Register,
} }
impl ITypeArgs { impl ITypeArgs {
#[must_use] #[must_use]
/// Creates a new [`ITypeArgs`]. If r1 or r2 is unset, they will be replaced with [`Register::NoReg`]. /// Creates a new [`ITypeArgs`]. If r1 or r2 is unset, they will be replaced with
/// [`Register::NoReg`].
pub fn new(immediate: u16, r1: Option<Register>, r2: Option<Register>) -> Self { pub fn new(immediate: u16, r1: Option<Register>, r2: Option<Register>) -> Self {
let r1 = r1.unwrap_or_default(); let r1 = r1.unwrap_or_default();
let r2 = r2.unwrap_or_default(); let r2 = r2.unwrap_or_default();
@@ -56,8 +59,8 @@ impl ITypeArgs {
} }
impl Encode for ITypeArgs { impl Encode for ITypeArgs {
/// Encodes an I-type instruction from its fields. These must have some unused high-order /// Encodes an I-type instruction from its fields. These must have some unused
/// bits set to 0 else the bit shifting logic gets fucked. /// high-order bits set to 0 else the bit shifting logic gets fucked.
fn encode(self, opcode: u8) -> u32 { fn encode(self, opcode: u8) -> u32 {
let opcode = u32::from(opcode); let opcode = u32::from(opcode);
let r1 = self.r1 as u32; let r1 = self.r1 as u32;
@@ -84,7 +87,7 @@ impl TryFrom<u32> for ITypeArgs {
} }
/// Used by instructions not using immediates (besides 5 bit shift values). /// Used by instructions not using immediates (besides 5 bit shift values).
#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct RTypeArgs { pub struct RTypeArgs {
pub sr1: Register, pub sr1: Register,
pub sr2: Register, pub sr2: Register,
@@ -95,7 +98,8 @@ pub struct RTypeArgs {
impl RTypeArgs { impl RTypeArgs {
#[must_use] #[must_use]
/// Creates a new [`RTypeArgs`]. If any registers are unset, they will be replaced with [`Register::NoReg`]. If `shamt` is unset, it will be set to 0. /// Creates a new [`RTypeArgs`]. If any registers are unset, they will be replaced
/// with [`Register::NoReg`]. If `shamt` is unset, it will be set to 0.
pub fn new( pub fn new(
sr1: Option<Register>, sr1: Option<Register>,
sr2: Option<Register>, sr2: Option<Register>,
@@ -122,7 +126,8 @@ impl Encode for RTypeArgs {
/// ///
/// # Arguments /// # Arguments
/// ///
/// - `shamt`: The amount to shift value (used only in shift instructions, otherwise 0). /// - `shamt`: The amount to shift value (used only in shift instructions, otherwise
/// 0).
fn encode(self, opcode: u8) -> u32 { fn encode(self, opcode: u8) -> u32 {
let opcode = u32::from(opcode); let opcode = u32::from(opcode);
let sr1 = self.sr1 as u32; let sr1 = self.sr1 as u32;
+3 -1
View File
@@ -39,7 +39,9 @@ impl std::fmt::Display for InstructionDecodeError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self { match self {
Self::InvalidOpcode(code) => write!(f, "invalid opcode, got {code:x}")?, Self::InvalidOpcode(code) => write!(f, "invalid opcode, got {code:x}")?,
Self::InvalidArgument(err) => write!(f, "invalid arguments, got an error {err}")?, Self::InvalidArgument(err) => {
write!(f, "invalid arguments, got an error {err}")?;
}
} }
Ok(()) Ok(())
+6 -5
View File
@@ -160,11 +160,12 @@ impl CodeEditor {
/// Stick to bottom /// Stick to bottom
/// The scroll handle will stick to the bottom position even while the content size /// The scroll handle will stick to the bottom position even while the content size
/// changes dynamically. This can be useful to simulate terminal UIs or log/info scrollers. /// changes dynamically. This can be useful to simulate terminal UIs or log/info
/// The scroll handle remains stuck until user manually changes position. Once "unstuck" /// scrollers. The scroll handle remains stuck until user manually changes
/// it will remain focused on whatever content viewport the user left it on. If the scroll /// position. Once "unstuck" it will remain focused on whatever content viewport
/// handle is dragged to the bottom it will again become stuck and remain there until manually /// the user left it on. If the scroll handle is dragged to the bottom it will
/// pulled from the end position. /// again become stuck and remain there until manually pulled from the end
/// position.
/// ///
/// **Default: false** /// **Default: false**
pub fn stick_to_bottom(self, stick_to_bottom: bool) -> Self { pub fn stick_to_bottom(self, stick_to_bottom: bool) -> Self {
+1
View File
@@ -5,6 +5,7 @@ use std::{
path::{Path, PathBuf}, path::{Path, PathBuf},
}; };
use assembler::compiler_engine::CompilerEngine;
use common::prelude::Instruction; use common::prelude::Instruction;
use egui::{Align, Context, Key, Layout, Ui}; use egui::{Align, Context, Key, Layout, Ui};
+5 -5
View File
@@ -5,7 +5,7 @@
include print "./lib/print.dsa" include print "./lib/print.dsa"
// "print hello world" // "print hello world"
db program: "++++++++++++++++++++++++++++++++++++++++++++ db program "++++++++++++++++++++++++++++++++++++++++++++
>++++++++++++++++++++++++++++++++ >++++++++++++++++++++++++++++++++
>++++++++++++++++ >++++++++++++++++
> >
@@ -35,10 +35,10 @@ db program: "++++++++++++++++++++++++++++++++++++++++++++
] ]
<<++..." <<++..."
db error: "Invalid Instruction!" db error "Invalid Instruction!"
dw stack: 0x10000 dw stack 0x10000
dw input: 0x30000 dw input 0x30000
resb data: 1024 resb data 1024
// set up a stack so we can call functions // set up a stack so we can call functions
_init_stack: _init_stack: