diff --git a/assembler/Cargo.toml b/assembler/Cargo.toml
index a9c19f8..d920624 100644
--- a/assembler/Cargo.toml
+++ b/assembler/Cargo.toml
@@ -4,5 +4,13 @@ version.workspace = true
 edition.workspace = true
 authors.workspace = true
 
+[[bin]]
+name = "assembler_runner"
+path = "src/main.rs"
+
+[lib]
+name = "assembler"
+path = "src/lib.rs"
+
 [dependencies]
 common = { path = "../common" }
\ No newline at end of file
diff --git a/assembler/src/lexer.rs b/assembler/src/lexer.rs
new file mode 100644
index 0000000..fc08910
--- /dev/null
+++ b/assembler/src/lexer.rs
@@ -0,0 +1,160 @@
+//! Tokeniser for the assembly language: turns source text into a flat list of [`Token`]s.
+
+use crate::{AssembleError, parser::Opcode};
+use common::prelude::Register;
+
+/// Name of a label or other identifier appearing in the source.
+pub type Symbol = String;
+
+/// A single lexical unit of an assembly program.
+#[derive(Debug, Clone)]
+pub enum Token {
+    /// Label (trailing `:` removed) or bare identifier.
+    Symbol(Symbol),
+    /// Register operand.
+    Register(Register),
+    /// Unsigned numeric literal (decimal, `0x`, `0o` or `0b`).
+    Immediate(u32),
+    /// Contents of a double-quoted string, without the quotes.
+    StringLit(String),
+    /// Instruction or pseudo-instruction mnemonic.
+    Opcode(Opcode),
+}
+
+/// Signature shared by all `parse_*` token classifiers.
+type Classifier = fn(&str) -> Result<Option<Token>, AssembleError>;
+
+/// Classifiers in priority order; the first one that recognises a word wins.
+const CLASSIFIERS: [Classifier; 8] = [
+    parse_register,
+    parse_opcode,
+    parse_hex,
+    parse_octal,
+    parse_binary,
+    parse_decimal,
+    parse_label,
+    parse_symbol,
+];
+
+/// Splits `program` into tokens.
+///
+/// Whitespace and commas separate tokens, a token starting with `//` begins a
+/// comment that runs to the end of the line, and double-quoted strings are kept
+/// verbatim (including any commas, repeated spaces or `//` inside them).
+///
+/// # Errors
+///
+/// Returns [`AssembleError::Generic`] for an unterminated string literal, a
+/// malformed numeric literal, or a word no classifier recognises.
+pub fn lexer(program: String) -> Result<Vec<Token>, AssembleError> {
+    let mut tokens = Vec::new();
+
+    for line in program.lines() {
+        lex_line(line, &mut tokens)?;
+    }
+
+    Ok(tokens)
+}
+
+/// Tokenises a single source line, appending the results to `tokens`.
+fn lex_line(line: &str, tokens: &mut Vec<Token>) -> Result<(), AssembleError> {
+    let mut rest = line.trim_start_matches(is_separator);
+
+    while !rest.is_empty() && !rest.starts_with("//") {
+        if let Some(body) = rest.strip_prefix('"') {
+            // Take everything up to the closing quote exactly as written.
+            let end = body.find('"').ok_or(AssembleError::Generic)?;
+            tokens.push(Token::StringLit(body[..end].to_string()));
+            rest = &body[end + 1..];
+        } else {
+            // A word runs until the next separator or the start of a string.
+            let end = rest
+                .find(|c: char| is_separator(c) || c == '"')
+                .unwrap_or(rest.len());
+            let (word, tail) = rest.split_at(end);
+            tokens.push(classify(word)?);
+            rest = tail;
+        }
+
+        rest = rest.trim_start_matches(is_separator);
+    }
+
+    Ok(())
+}
+
+/// Returns `true` for characters that separate tokens.
+fn is_separator(c: char) -> bool {
+    c == ',' || c.is_whitespace()
+}
+
+/// Runs `word` through [`CLASSIFIERS`] and returns the first match.
+fn classify(word: &str) -> Result<Token, AssembleError> {
+    for parse in CLASSIFIERS.iter() {
+        if let Some(token) = parse(word)? {
+            return Ok(token);
+        }
+    }
+
+    Err(AssembleError::Generic)
+}
+
+/// Recognises a register name.
+pub fn parse_register(token: &str) -> Result<Option<Token>, AssembleError> {
+    Ok(Register::try_from(token).map(Token::Register).ok())
+}
+
+/// Recognises an instruction mnemonic, ignoring ASCII case.
+pub fn parse_opcode(token: &str) -> Result<Option<Token>, AssembleError> {
+    Ok(Opcode::from_str(token).map(Token::Opcode))
+}
+
+/// Shared implementation of the prefixed-literal parsers.
+///
+/// Yields `Ok(None)` unless `token` is `prefix` followed by at least one character,
+/// and an error when what follows is not a valid `u32` in the given `radix`.
+fn parse_prefixed(token: &str, prefix: &str, radix: u32) -> Result<Option<Token>, AssembleError> {
+    match token.strip_prefix(prefix) {
+        Some(digits) if !digits.is_empty() => u32::from_str_radix(digits, radix)
+            .map(|value| Some(Token::Immediate(value)))
+            .map_err(|_| AssembleError::Generic),
+        _ => Ok(None),
+    }
+}
+
+/// Parses a hexadecimal literal such as `0x1F`.
+pub fn parse_hex(token: &str) -> Result<Option<Token>, AssembleError> {
+    parse_prefixed(token, "0x", 16)
+}
+
+/// Parses an octal literal such as `0o17`.
+pub fn parse_octal(token: &str) -> Result<Option<Token>, AssembleError> {
+    parse_prefixed(token, "0o", 8)
+}
+
+/// Parses a binary literal such as `0b1010`.
+pub fn parse_binary(token: &str) -> Result<Option<Token>, AssembleError> {
+    parse_prefixed(token, "0b", 2)
+}
+
+/// Parses an unsigned decimal literal that fits in a `u32`.
+pub fn parse_decimal(token: &str) -> Result<Option<Token>, AssembleError> {
+    Ok(token.parse::<u32>().ok().map(Token::Immediate))
+}
+
+/// Recognises a label definition (`name:`) and strips the colon.
+///
+/// A lone `:` is rejected instead of producing an empty symbol.
+pub fn parse_label(token: &str) -> Result<Option<Token>, AssembleError> {
+    match token.strip_suffix(':') {
+        Some("") => Err(AssembleError::Generic),
+        Some(name) => Ok(Some(Token::Symbol(name.to_string()))),
+        None => Ok(None),
+    }
+}
+
+/// Accepts any non-empty word that does not start with a numeric character.
+pub fn parse_symbol(token: &str) -> Result<Option<Token>, AssembleError> {
+    let is_symbol = matches!(token.chars().next(), Some(first) if !first.is_numeric());
+
+    Ok(is_symbol.then(|| Token::Symbol(token.to_string())))
+}
diff --git a/assembler/src/lib.rs b/assembler/src/lib.rs
index 0e15029..7470dd0 100644
--- a/assembler/src/lib.rs
+++ b/assembler/src/lib.rs
@@ -1,5 +1,12 @@
+use core::fmt;
+
 use common::prelude::Instruction;
 
+use crate::{lexer::Token, parser::TokenType};
+
+pub mod lexer;
+pub mod parser;
+
 pub fn assemble(src: &str) -> Vec<Instruction> {
     todo!()
 }
@@ -12,3 +19,30 @@ pub fn disassemble(binary: Vec) -> String {
     // sequences that are expansions of pseduo-instructions and reversing this to produce near enough the original source code.
     todo!()
 }
+
+/// Errors reported while tokenising or parsing an assembly program.
+#[derive(Debug)]
+pub enum AssembleError {
+    /// Unspecified lexing failure (unknown word, malformed literal, ...).
+    Generic,
+    /// A token of the wrong kind was found; carries the token and the kind expected.
+    UnexpectedToken(Token, TokenType),
+    /// The input ended while a token of the given kind was still required.
+    UnexpectedEof(TokenType),
+}
+
+impl fmt::Display for AssembleError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            AssembleError::Generic => write!(f, "Generic error"),
+            AssembleError::UnexpectedToken(tok, expected) => {
+                write!(f, "Unexpected token {:?}, expected {:?}", tok, expected)
+            }
+            AssembleError::UnexpectedEof(expected) => {
+                write!(f, "Unexpected end of input, expected {:?}", expected)
+            }
+        }
+    }
+}
+
+impl std::error::Error for AssembleError {}
diff --git a/assembler/src/main.rs b/assembler/src/main.rs
new file mode 100644
index 0000000..8277af7
--- /dev/null
+++ b/assembler/src/main.rs
@@ -0,0 +1,38 @@
+//! Debug driver: tokenises and parses one assembly file and dumps the result.
+
+use std::{env, fs, process::ExitCode};
+
+use assembler::{lexer, parser::Parser};
+
+/// Program assembled when no path is given on the command line.
+const DEFAULT_PROGRAM: &str = "../resources/dsa/print.dsa";
+
+fn main() -> ExitCode {
+    let path = env::args()
+        .nth(1)
+        .unwrap_or_else(|| DEFAULT_PROGRAM.to_string());
+
+    let program = match fs::read_to_string(&path) {
+        Ok(program) => program,
+        Err(err) => {
+            eprintln!("failed to read {}: {}", path, err);
+            return ExitCode::FAILURE;
+        }
+    };
+
+    let tokens = match lexer::lexer(program) {
+        Ok(tokens) => tokens,
+        Err(err) => {
+            eprintln!("failed to tokenise {}: {}", path, err);
+            return ExitCode::FAILURE;
+        }
+    };
+
+    println!("{:?}", tokens);
+
+    for node in Parser::new(tokens) {
+        println!("{:?}", node);
+    }
+
+    ExitCode::SUCCESS
+}
diff --git a/assembler/src/parser.rs b/assembler/src/parser.rs
new file mode 100644
index 0000000..ad4d21f
--- /dev/null
+++ b/assembler/src/parser.rs
@@ -0,0 +1,413 @@
+//! Groups the flat token stream produced by the lexer into instruction [`Node`]s.
+
+use crate::AssembleError;
+use crate::lexer::{Symbol, Token};
+
+/// Turns a token list into [`Node`]s, one per iteration step.
+pub struct Parser {
+    /// Remaining tokens in reverse source order, so the next one is popped off the end.
+    tokens: Vec<Token>,
+}
+
+/// Payload-free mirror of [`Token`], used to name the kind of token that was expected.
+#[derive(Debug, PartialEq, Copy, Clone)]
+pub enum TokenType {
+    Symbol,
+    Register,
+    Immediate,
+    StringLit,
+    Opcode,
+}
+
+impl TokenType {
+    /// Returns the kind of `token`.
+    fn from_token(token: &Token) -> TokenType {
+        match token {
+            Token::Symbol(_) => TokenType::Symbol,
+            Token::Register(_) => TokenType::Register,
+            Token::Immediate(_) => TokenType::Immediate,
+            Token::StringLit(_) => TokenType::StringLit,
+            Token::Opcode(_) => TokenType::Opcode,
+        }
+    }
+}
+
+// TODO: MAKE SURE I DO THE BIT SHIFT FOR LUI CODEGEN
+
+/// One parsed statement: its optional label, its opcode and its operand tokens.
+#[derive(Debug)]
+pub struct Node(Option<Symbol>, Opcode, Vec<Token>);
+
+impl Iterator for Parser {
+    type Item = Result<Node, AssembleError>;
+
+    /// Parses the next statement, or returns `None` once every token is consumed.
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.tokens.is_empty() {
+            return None;
+        }
+
+        Some(self.parse_instruction())
+    }
+}
+
+impl Parser {
+    /// Creates a parser over `tokens`, given in source order.
+    pub fn new(tokens: Vec<Token>) -> Parser {
+        Parser {
+            tokens: tokens.into_iter().rev().collect(),
+        }
+    }
+
+    /// Parses one statement: an optional label, an opcode and that opcode's operands.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`AssembleError::UnexpectedToken`] when a token has the wrong kind and
+    /// [`AssembleError::UnexpectedEof`] when the tokens run out mid-statement.
+    fn parse_instruction(&mut self) -> Result<Node, AssembleError> {
+        // The lexer emits labels as plain symbols, so a symbol in statement position is a label.
+        let label = match self.maybe_expect(&[TokenType::Symbol]) {
+            Some(Token::Symbol(name)) => Some(name),
+            _ => None,
+        };
+
+        let opcode = match self.expect(TokenType::Opcode)? {
+            Token::Opcode(opcode) => opcode,
+            _ => unreachable!("expect() only returns tokens of the requested kind"),
+        };
+
+        let args = match opcode {
+            // Two register operands.
+            Opcode::Mov | Opcode::Movs | Opcode::Not | Opcode::Cmp => {
+                vec![self.expect_reg()?, self.expect_reg()?]
+            }
+
+            // Loads and stores: base, destination and an optional offset (default 0),
+            // stored as [base, offset, dest].
+            Opcode::Ldb
+            | Opcode::Ldbs
+            | Opcode::Ldh
+            | Opcode::Ldhs
+            | Opcode::Ldw
+            | Opcode::Stb
+            | Opcode::Sth
+            | Opcode::Stw => {
+                let base = self.expect_reg()?;
+                let dest = self.expect_reg()?;
+                let offset = self
+                    .maybe_expect(&[TokenType::Register, TokenType::Immediate])
+                    .unwrap_or(Token::Immediate(0));
+                vec![base, offset, dest]
+            }
+
+            // Three register operands: src1, src2, dest.
+            Opcode::Add
+            | Opcode::Sub
+            | Opcode::And
+            | Opcode::Or
+            | Opcode::Xor
+            | Opcode::Nand
+            | Opcode::Nor
+            | Opcode::Xnor => {
+                vec![self.expect_reg()?, self.expect_reg()?, self.expect_reg()?]
+            }
+
+            // Register followed by an immediate.
+            Opcode::Shl | Opcode::Shr => {
+                vec![self.expect_reg()?, self.expect(TokenType::Immediate)?]
+            }
+
+            // Single register operand.
+            Opcode::Inc | Opcode::Dec | Opcode::Push | Opcode::Pop => vec![self.expect_reg()?],
+
+            // Jumps take an immediate or a symbol.
+            Opcode::Jmp
+            | Opcode::Jeq
+            | Opcode::Jne
+            | Opcode::Jgt
+            | Opcode::Jge
+            | Opcode::Jlt
+            | Opcode::Jle => vec![self.expect_imm_or_symbol()?],
+
+            // Written `imm, reg` in the source but stored as [reg, imm].
+            Opcode::Lui | Opcode::Lli | Opcode::Lwi | Opcode::Iadd | Opcode::Isub => {
+                let imm = self.expect_imm_or_symbol()?;
+                let reg = self.expect(TokenType::Register)?;
+                vec![reg, imm]
+            }
+
+            // A single immediate operand.
+            Opcode::Resb | Opcode::Resh | Opcode::Resw | Opcode::Int => {
+                vec![self.expect(TokenType::Immediate)?]
+            }
+
+            // Data definitions: a name followed by values.
+            Opcode::Db | Opcode::Dh | Opcode::Dw => self.parse_data_definition(opcode)?,
+
+            // No operands.
+            Opcode::Hlt | Opcode::Nop | Opcode::Irt => Vec::new(),
+        };
+
+        Ok(Node(label, opcode, args))
+    }
+
+    /// Parses the operands of `db` / `dh` / `dw`: a symbol naming the data, then its values.
+    ///
+    /// Values are consumed while they fit the element width (`db` also accepts string
+    /// literals); the first token that does not fit is left for the next statement.
+    fn parse_data_definition(&mut self, opcode: Opcode) -> Result<Vec<Token>, AssembleError> {
+        let (max, strings_allowed) = match opcode {
+            Opcode::Db => (u32::from(u8::MAX), true),
+            Opcode::Dh => (u32::from(u16::MAX), false),
+            Opcode::Dw => (u32::MAX, false),
+            _ => unreachable!("only called for db, dh and dw"),
+        };
+
+        // Keep the name as the first operand so it is not lost from the node.
+        let mut values = vec![self.expect(TokenType::Symbol)?];
+
+        loop {
+            let accepted = match self.tokens.last() {
+                Some(Token::StringLit(_)) => strings_allowed,
+                Some(Token::Immediate(value)) => *value <= max,
+                _ => false,
+            };
+            if !accepted {
+                break;
+            }
+            values.extend(self.tokens.pop());
+        }
+
+        Ok(values)
+    }
+
+    /// Consumes a register operand; a symbol is accepted in its place.
+    fn expect_reg(&mut self) -> Result<Token, AssembleError> {
+        self.expect_any(&[TokenType::Register, TokenType::Symbol])
+    }
+
+    /// Consumes an immediate operand; a symbol is accepted in its place.
+    fn expect_imm_or_symbol(&mut self) -> Result<Token, AssembleError> {
+        self.expect_any(&[TokenType::Immediate, TokenType::Symbol])
+    }
+
+    /// Consumes the next token, requiring it to be of kind `type_`.
+    fn expect(&mut self, type_: TokenType) -> Result<Token, AssembleError> {
+        self.expect_any(&[type_])
+    }
+
+    /// Consumes the next token, requiring its kind to be one of `types`.
+    ///
+    /// On a mismatch the offending token is still consumed and is returned inside the error,
+    /// which reports `types[0]` as the expected kind.
+    fn expect_any(&mut self, types: &[TokenType]) -> Result<Token, AssembleError> {
+        let tok = self
+            .tokens
+            .pop()
+            .ok_or_else(|| AssembleError::UnexpectedEof(types[0]))?;
+
+        if types.contains(&TokenType::from_token(&tok)) {
+            Ok(tok)
+        } else {
+            Err(AssembleError::UnexpectedToken(tok, types[0]))
+        }
+    }
+
+    /// Consumes and returns the next token only if its kind is one of `types`.
+    fn maybe_expect(&mut self, types: &[TokenType]) -> Option<Token> {
+        let wanted = self
+            .tokens
+            .last()
+            .map_or(false, |tok| types.contains(&TokenType::from_token(tok)));
+
+        if wanted {
+            self.tokens.pop()
+        } else {
+            None
+        }
+    }
+}
+
+/// Every mnemonic the assembler understands.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Opcode {
+    // Real instructions (0x00-0x26)
+    Nop,
+    Mov,
+    Movs,
+    Ldb,
+    Ldbs,
+    Ldh,
+    Ldhs,
+    Ldw,
+    Stb,
+    Sth,
+    Stw,
+    Lli,
+    Lui,
+    Jmp,
+    Jeq,
+    Jne,
+    Jgt,
+    Jge,
+    Jlt,
+    Jle,
+    Cmp,
+    Inc,
+    Dec,
+    Shl,
+    Shr,
+    Add,
+    Sub,
+    And,
+    Or,
+    Not,
+    Xor,
+    Nand,
+    Nor,
+    Xnor,
+    Int,
+    Irt,
+    Hlt,
+    Iadd,
+    Isub,
+    // Pseudo-instructions
+    Db,
+    Dh,
+    Dw,
+    Resb,
+    Resh,
+    Resw,
+    Push,
+    Pop,
+    Lwi,
+}
+
+impl Opcode {
+    /// Lower-case spelling of every mnemonic, in declaration order.
+    pub const OPCODES: &'static [&'static str] = &[
+        // Real instructions (0x00-0x26)
+        "nop", "mov", "movs", "ldb", "ldbs", "ldh", "ldhs", "ldw", "stb", "sth", "stw", "lli",
+        "lui", "jmp", "jeq", "jne", "jgt", "jge", "jlt", "jle", "cmp", "inc", "dec", "shl", "shr",
+        "add", "sub", "and", "or", "not", "xor", "nand", "nor", "xnor", "int", "irt", "hlt",
+        "iadd", "isub",
+        // Pseudo-instructions
+        "db", "dh", "dw", "resb", "resh", "resw", "push", "pop", "lwi",
+    ];
+
+    /// Looks up a mnemonic, ignoring ASCII case; returns `None` for anything else.
+    pub fn from_str(s: &str) -> Option<Self> {
+        match s.to_ascii_lowercase().as_str() {
+            "nop" => Some(Self::Nop),
+            "mov" => Some(Self::Mov),
+            "movs" => Some(Self::Movs),
+            "ldb" => Some(Self::Ldb),
+            "ldbs" => Some(Self::Ldbs),
+            "ldh" => Some(Self::Ldh),
+            "ldhs" => Some(Self::Ldhs),
+            "ldw" => Some(Self::Ldw),
+            "stb" => Some(Self::Stb),
+            "sth" => Some(Self::Sth),
+            "stw" => Some(Self::Stw),
+            "lli" => Some(Self::Lli),
+            "lui" => Some(Self::Lui),
+            "jmp" => Some(Self::Jmp),
+            "jeq" => Some(Self::Jeq),
+            "jne" => Some(Self::Jne),
+            "jgt" => Some(Self::Jgt),
+            "jge" => Some(Self::Jge),
+            "jlt" => Some(Self::Jlt),
+            "jle" => Some(Self::Jle),
+            "cmp" => Some(Self::Cmp),
+            "inc" => Some(Self::Inc),
+            "dec" => Some(Self::Dec),
+            "shl" => Some(Self::Shl),
+            "shr" => Some(Self::Shr),
+            "add" => Some(Self::Add),
+            "sub" => Some(Self::Sub),
+            "and" => Some(Self::And),
+            "or" => Some(Self::Or),
+            "not" => Some(Self::Not),
+            "xor" => Some(Self::Xor),
+            "nand" => Some(Self::Nand),
+            "nor" => Some(Self::Nor),
+            "xnor" => Some(Self::Xnor),
+            "int" => Some(Self::Int),
+            "irt" => Some(Self::Irt),
+            "hlt" => Some(Self::Hlt),
+            "iadd" => Some(Self::Iadd),
+            "isub" => Some(Self::Isub),
+            "db" => Some(Self::Db),
+            "dh" => Some(Self::Dh),
+            "dw" => Some(Self::Dw),
+            "resb" => Some(Self::Resb),
+            "resh" => Some(Self::Resh),
+            "resw" => Some(Self::Resw),
+            "push" => Some(Self::Push),
+            "pop" => Some(Self::Pop),
+            "lwi" => Some(Self::Lwi),
+            _ => None,
+        }
+    }
+
+    /// Returns the opcode value of a real instruction, or `None` for a pseudo-instruction.
+    pub fn to_opcode_value(&self) -> Option<u8> {
+        match self {
+            Self::Nop => Some(0x00),
+            Self::Mov => Some(0x01),
+            Self::Movs => Some(0x02),
+            Self::Ldb => Some(0x03),
+            Self::Ldbs => Some(0x04),
+            Self::Ldh => Some(0x05),
+            Self::Ldhs => Some(0x06),
+            Self::Ldw => Some(0x07),
+            Self::Stb => Some(0x08),
+            Self::Sth => Some(0x09),
+            Self::Stw => Some(0x0A),
+            Self::Lli => Some(0x0B),
+            Self::Lui => Some(0x0C),
+            Self::Jmp => Some(0x0D),
+            Self::Jeq => Some(0x0E),
+            Self::Jne => Some(0x0F),
+            Self::Jgt => Some(0x10),
+            Self::Jge => Some(0x11),
+            Self::Jlt => Some(0x12),
+            Self::Jle => Some(0x13),
+            Self::Cmp => Some(0x14),
+            Self::Inc => Some(0x15),
+            Self::Dec => Some(0x16),
+            Self::Shl => Some(0x17),
+            Self::Shr => Some(0x18),
+            Self::Add => Some(0x19),
+            Self::Sub => Some(0x1A),
+            Self::And => Some(0x1B),
+            Self::Or => Some(0x1C),
+            Self::Not => Some(0x1D),
+            Self::Xor => Some(0x1E),
+            Self::Nand => Some(0x1F),
+            Self::Nor => Some(0x20),
+            Self::Xnor => Some(0x21),
+            Self::Int => Some(0x22),
+            Self::Irt => Some(0x23),
+            Self::Hlt => Some(0x24),
+            Self::Iadd => Some(0x25),
+            Self::Isub => Some(0x26),
+            // Pseudo-instructions don't have opcode values
+            Self::Db
+            | Self::Dh
+            | Self::Dw
+            | Self::Resb
+            | Self::Resh
+            | Self::Resw
+            | Self::Push
+            | Self::Pop
+            | Self::Lwi => None,
+        }
+    }
+
+    /// Returns `true` for mnemonics that have no opcode value of their own.
+    pub fn is_pseudo_instruction(&self) -> bool {
+        self.to_opcode_value().is_none()
+    }
+}