diff options
| author | Thomas Voss <mail@thomasvoss.com> | 2026-03-29 23:09:46 +0200 |
|---|---|---|
| committer | Thomas Voss <mail@thomasvoss.com> | 2026-03-29 23:09:46 +0200 |
| commit | da65ee39162d0323321340b2a9cef9a013ad36ef (patch) | |
| tree | 127f6afd6bb418c5df3216e1ad83239aa693ef77 /oryxc/src | |
| parent | db11ea02d777a33fedb6af4ee056e85f52fbb008 (diff) | |
Beginning sema work
Diffstat (limited to 'oryxc/src')
| -rw-r--r-- | oryxc/src/compiler.rs | 210 | ||||
| -rw-r--r-- | oryxc/src/intern.rs | 12 | ||||
| -rw-r--r-- | oryxc/src/main.rs | 1 | ||||
| -rw-r--r-- | oryxc/src/parser.rs | 18 | ||||
| -rw-r--r-- | oryxc/src/prelude.rs | 89 | ||||
| -rw-r--r-- | oryxc/src/sema/mod.rs | 3 | ||||
| -rw-r--r-- | oryxc/src/sema/typecheck.rs | 192 | ||||
| -rw-r--r-- | oryxc/src/unistr.rs | 7 |
8 files changed, 436 insertions, 96 deletions
diff --git a/oryxc/src/compiler.rs b/oryxc/src/compiler.rs index 6119915..7e61237 100644 --- a/oryxc/src/compiler.rs +++ b/oryxc/src/compiler.rs @@ -20,6 +20,7 @@ use std::{ thread, }; +use boxcar; use crossbeam_deque::{ Injector, Steal, @@ -31,7 +32,10 @@ use soa_rs::Soa; use crate::errors::OryxError; use crate::intern::Interner; use crate::lexer::Token; -use crate::parser::Ast; +use crate::parser::{ + Ast, + AstType, +}; use crate::prelude::*; use crate::unistr::UniStr; use crate::{ @@ -39,6 +43,7 @@ use crate::{ err, lexer, parser, + sema, }; #[allow(dead_code)] @@ -47,7 +52,7 @@ pub struct FileData { pub buffer: String, pub tokens: OnceLock<Soa<Token>>, pub ast: OnceLock<Ast>, - pub scopes: OnceLock<Vec<Scope>>, + pub scopes: OnceLock<boxcar::Vec<Scope>>, } impl FileData { @@ -76,8 +81,20 @@ impl FileData { #[allow(dead_code)] #[derive(Clone)] pub enum JobType { - Lex { file: FileId, fdata: Arc<FileData> }, - Parse { file: FileId, fdata: Arc<FileData> }, + Lex { + file: FileId, + fdata: Arc<FileData>, + }, + Parse { + file: FileId, + fdata: Arc<FileData>, + }, + TypecheckConstant { + file: FileId, + fdata: Arc<FileData>, + scope: ScopeId, + node: u32, + }, } mkidtype!(JobId); @@ -88,17 +105,18 @@ pub struct Job { kind: JobType, } -struct CompilerState<'a> { +pub struct CompilerState<'a> { #[allow(dead_code)] globalq: Injector<Job>, njobs: AtomicUsize, flags: Flags, worker_threads: OnceLock<Box<[thread::Thread]>>, - /* Files needs to be after interner, so that the files get dropped - * after the interner. This is because the interner holds references - * to substrings of file buffers, so we want to control the drop - * order to avoid any potential undefined behaviour. */ - interner: Interner<UniStr<'a>, SymbolId>, + /* Files needs to be after the identifier interner, so that the files + * get dropped after the interner. This is because the interner holds + * references to substrings of file buffers, so we want to control the + * drop order to avoid any potential undefined behaviour. */ + pub ident_intr: Interner<UniStr<'a>, SymbolId>, + pub type_intr: Interner<OryxType, TypeId>, files: Vec<Arc<FileData>>, next_id: AtomicU32, } @@ -152,6 +170,116 @@ impl<'a> CompilerState<'a> { fn job_complete(&self) -> usize { return self.njobs.fetch_sub(1, Ordering::Release) - 1; } + + fn job_dispatch(&self, queue: &Worker<Job>, job: Job) -> bool { + match job.kind { + JobType::Lex { file, fdata } => 'blk: { + let tokens = match lexer::tokenize(&fdata.buffer) { + Ok(xs) => xs, + Err(e) => { + emit_errors(&fdata, iter::once(e)); + break 'blk false; + }, + }; + + if self.flags.debug_lexer { + let mut handle = io::stderr().lock(); + for t in tokens.iter() { + let _ = write!(handle, "{t:?}\n"); + } + } + + fdata.tokens.set(tokens).unwrap(); + self.job_push( + &queue, + self.job_new(JobType::Parse { file, fdata }), + ); + true + }, + + JobType::Parse { file, fdata } => 'blk: { + let tokens = fdata.tokens.get().unwrap(); + let ast = match parser::parse(tokens) { + Ok(ast) => ast, + Err(errs) => { + emit_errors(&fdata, errs); + break 'blk false; + }, + }; + + if self.flags.debug_parser { + let mut handle = io::stderr().lock(); + for n in ast.nodes.iter() { + let _ = write!(handle, "{n:?}\n"); + } + } + + /* Rust autism */ + fdata.ast.set(ast).unwrap(); + let ast = fdata.ast.get().unwrap(); + + let scopes = boxcar::vec![Scope::new(ScopeId::INVALID)]; + fdata.scopes.set(scopes).unwrap(); + + let root = ast.nodes.len() - 1; + let SubNodes(beg, len) = ast.nodes.sub()[root]; + let gscope = &fdata.scopes.get().unwrap()[0]; + + for i in 0..len { + let stmt = ast.extra[(beg + i) as usize]; + if ast.nodes.kind()[stmt as usize] != AstType::MultiDefBind + { + continue; + } + + let beg = ast.nodes.sub()[stmt as usize].0 as usize; + let nidents = ast.extra[beg] as usize; + for i in 0..nidents { + let tok = ast.extra[beg + 1 + i * 2] as usize; + let view = tokens.view()[tok]; + + /* Pointer fuckery to bypass the borrow checker */ + let s = UniStr(unsafe { + &*(&fdata.buffer[view.0..view.1] as *const str) + }); + let sid = self.ident_intr.intern(s); + let symbol = Symbol::new(SymbolType::Constant); + gscope.symtab.insert(sid, symbol); + } + + self.job_push( + &queue, + self.job_new(JobType::TypecheckConstant { + file, + fdata: fdata.clone(), + scope: ScopeId::GLOBAL, + node: stmt, + }), + ); + } + + true + }, + + JobType::TypecheckConstant { + file, + fdata, + scope, + node, + } => { + let ast = fdata.ast.get().unwrap(); + let tokens = fdata.tokens.get().unwrap(); + match sema::typecheck_multi_def_bind(&self, &fdata, scope, node) + { + Ok(()) => true, + Err(e) => { + emit_errors(&fdata, iter::once(e)); + false + }, + } + }, + } + } } /// Initialize compiler state and drive all source files through the @@ -187,7 +315,8 @@ where njobs: AtomicUsize::new(njobs), flags, worker_threads: OnceLock::new(), - interner: Interner::new(), + ident_intr: Interner::new(), + type_intr: Interner::new(), next_id: AtomicU32::new(njobs as u32), }); @@ -259,66 +388,9 @@ fn worker_loop( thread::park(); continue; }; - - let result = match job.kind { - JobType::Lex { file, fdata } => 'blk: { - let tokens = match lexer::tokenize(&fdata.buffer) { - Ok(xs) => xs, - Err(e) => { - emit_errors(&fdata, iter::once(e)); - break 'blk false; - }, - }; - - if c_state.flags.debug_lexer { - let mut handle = io::stderr().lock(); - for t in tokens.iter() { - let _ = write!(handle, "{t:?}\n"); - } - } - - fdata.tokens.set(tokens).unwrap(); - c_state.job_push( - &queue, - c_state.job_new(JobType::Parse { file, fdata }), - ); - true - }, - - JobType::Parse { file, fdata } => 'blk: { - let ast = match parser::parse(fdata.tokens.get().unwrap()) { - Ok(ast) => ast, - Err(errs) => { - emit_errors(&fdata, errs); - break 'blk false; - }, - }; - - if c_state.flags.debug_parser { - let mut handle = io::stderr().lock(); - for n in ast.nodes.iter() { - let _ = write!(handle, "{n:?}\n"); - } - } - - fdata.ast.set(ast).unwrap(); - fdata.scopes.set(Vec::new()).unwrap(); - - // c_state.job_push( - // &queue, - // c_state.job_new(JobType::IndexScopeConstants { - // fdata, - // block: root, - // parent: ScopeId::INVALID, - // }), - // ); - true - }, - }; - if !result { + if !c_state.job_dispatch(&queue, job) { ok = false; } - if c_state.job_complete() == 0 { c_state.wake_all(); return ok; diff --git a/oryxc/src/intern.rs b/oryxc/src/intern.rs index 5fb5cd0..86088a8 100644 --- a/oryxc/src/intern.rs +++ b/oryxc/src/intern.rs @@ -5,7 +5,7 @@ use dashmap::DashMap; pub struct Interner<V, I> where - V: Copy + Eq + Hash, + V: Clone + Eq + Hash, I: Copy + From<usize> + Into<usize>, { map: DashMap<V, I>, @@ -14,7 +14,7 @@ where impl<V, I> Interner<V, I> where - V: Copy + Eq + Hash, + V: Clone + Eq + Hash, I: Copy + From<usize> + Into<usize>, { pub fn new() -> Self { @@ -24,15 +24,15 @@ where }; } - pub fn get(&self, key: I) -> V { - return self.store[key.into()]; + pub fn get(&self, key: I) -> &V { + return &self.store[key.into()]; } pub fn intern(&self, value: V) -> I { if let Some(key) = self.map.get(&value) { return *key; } - let key = self.store.push(value).into(); + let key = self.store.push(value.clone()).into(); self.map.insert(value, key); return key; } @@ -66,7 +66,7 @@ mod tests { let ys = xs.iter().map(|&x| interner.intern(x)).collect::<Vec<_>>(); for i in 0..ys.len() { - assert_eq!(interner.get(ys[i]), xs[0]); + assert_eq!(*interner.get(ys[i]), xs[0]); } } } diff --git a/oryxc/src/main.rs b/oryxc/src/main.rs index 8d38ab1..9710ad5 100644 --- a/oryxc/src/main.rs +++ b/oryxc/src/main.rs @@ -6,6 +6,7 @@ mod intern; mod lexer; mod parser; mod prelude; +mod sema; mod size; mod unicode; mod unistr; diff --git a/oryxc/src/parser.rs b/oryxc/src/parser.rs index 8c24df1..93adc9f 100644 --- a/oryxc/src/parser.rs +++ b/oryxc/src/parser.rs @@ -1,3 +1,5 @@ +use std::iter; + use soa_rs::{ Soa, Soars, @@ -36,6 +38,16 @@ pub enum AstType { UnaryOperator, /* (rhs, _) */ } +/// The number of AstType variants that represent expressions/binops/etc. In +/// the places where we match/switch on just the subset of AstTypes that are +/// expressions/etc., we can static assert that this equals its current value. +/// This is useful because if a new AstType of the given subset is added and +/// these numbers are incremented, the static asserts fail everywhere where code +/// changes are required. +pub const NEXPRKINDS: usize = 10; +pub const NUNARYKINDS: usize = 5; +pub const NBINARYKINDS: usize = 23; + #[derive(Soars)] #[soa_derive(Debug)] pub struct AstNode { @@ -48,7 +60,7 @@ pub struct AstNode { pub struct Ast { pub nodes: Soa<AstNode>, pub extra: Vec<u32>, - pub types: Box<[TypeId]>, + pub types: Box<[TypeCell]>, pub textra: boxcar::Vec<TypeId>, } @@ -923,7 +935,9 @@ pub fn parse(tokens: &Soa<Token>) -> Result<Ast, Vec<OryxError>> { return Ok(Ast { nodes: p.ast, extra: p.extra_data, - types: vec![TypeId::INVALID; nodecnt].into_boxed_slice(), + types: iter::repeat_with(|| TypeCell::new(TypeId::INVALID)) + .take(nodecnt) + .collect::<Box<[_]>>(), /* TODO: Get the parser to try to compute the required capacity */ textra: boxcar::vec![], }); diff --git a/oryxc/src/prelude.rs b/oryxc/src/prelude.rs index 5628c87..9ee42f1 100644 --- a/oryxc/src/prelude.rs +++ b/oryxc/src/prelude.rs @@ -1,29 +1,49 @@ -use std::collections::HashMap; +use std::cell::UnsafeCell; use std::fmt::{ self, Debug, Formatter, }; +use std::sync::Arc; + +use dashmap::DashMap; macro_rules! mkidtype { ($name:ident) => { #[repr(transparent)] #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] pub struct $name(pub u32); + impl $name { #[allow(dead_code)] pub const INVALID: Self = Self(u32::MAX); } + impl From<usize> for $name { fn from(n: usize) -> Self { return Self(n as u32); } } + impl Into<usize> for $name { fn into(self) -> usize { return self.0 as usize; } } + + impl<T> std::ops::Index<$name> for [T] { + type Output = T; + + fn index(&self, index: $name) -> &Self::Output { + return &self[index.0 as usize]; + } + } + + impl<T> std::ops::IndexMut<$name> for [T] { + fn index_mut(&mut self, index: $name) -> &mut Self::Output { + return &mut self[index.0 as usize]; + } + } }; } pub(crate) use mkidtype; @@ -33,37 +53,50 @@ mkidtype!(ScopeId); mkidtype!(SymbolId); mkidtype!(TypeId); -impl ScopeId { - pub const GLOBAL: Self = Self(0); -} +/* Bypass Rust autism */ +#[repr(transparent)] +#[derive(Debug)] +pub struct TypeCell(UnsafeCell<TypeId>); -impl TypeId { - const MVMSK: u32 = 1 << 31; +impl TypeCell { + pub fn new(id: TypeId) -> Self { + return Self(UnsafeCell::new(id)); + } - pub fn to_mvindex(&self) -> Option<usize> { - return if self.0 & Self::MVMSK != 0 { - Some((self.0 & !Self::MVMSK) as usize) - } else { - None - }; + pub fn get(&self) -> TypeId { + return unsafe { *self.0.get() }; } - pub fn from_mvindex(i: usize) -> Self { - return Self(i as u32 & Self::MVMSK); + pub fn set(&self, id: TypeId) { + return unsafe { *self.0.get() = id }; } } +unsafe impl Sync for TypeCell {} + +impl ScopeId { + pub const GLOBAL: Self = Self(0); +} + +/// A fully-qualified symbol name +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct QualifiedId { + pub file: FileId, + pub scope: ScopeId, + pub sym: SymbolId, +} + #[derive(Debug)] pub struct Scope { pub parent: ScopeId, - pub symtab: HashMap<SymbolId, Symbol>, + pub symtab: DashMap<SymbolId, Symbol>, } impl Scope { pub fn new(parent: ScopeId) -> Self { return Self { parent, - symtab: HashMap::new(), + symtab: DashMap::new(), }; } } @@ -82,6 +115,17 @@ pub enum ResolutionState { pub struct Symbol { pub state: ResolutionState, pub kind: SymbolType, + pub vtype: TypeId, +} + +impl Symbol { + pub fn new(kind: SymbolType) -> Self { + return Self { + state: ResolutionState::Unresolved, + kind, + vtype: TypeId::INVALID, + }; + } } #[repr(u8)] @@ -91,20 +135,27 @@ pub enum SymbolType { FuncParam, } +#[derive(Clone, Eq, PartialEq, Hash)] pub enum OryxType { + /* Untyped types – the types of constant literals */ + UBoolean, + UNumber, + UString, + Boolean, Function { - args: Box<[TypeId]>, - rets: Box<[TypeId]>, + args: Arc<[TypeId]>, + rets: Arc<[TypeId]>, }, Integer { bits: usize, signed: bool, }, Pointer { - base: u32, + base: TypeId, }, Type(TypeId), + MultiValue(Arc<[TypeId]>), } #[derive(Clone, Copy)] diff --git a/oryxc/src/sema/mod.rs b/oryxc/src/sema/mod.rs new file mode 100644 index 0000000..9d55486 --- /dev/null +++ b/oryxc/src/sema/mod.rs @@ -0,0 +1,3 @@ +mod typecheck; + +pub use typecheck::*; diff --git a/oryxc/src/sema/typecheck.rs b/oryxc/src/sema/typecheck.rs new file mode 100644 index 0000000..27c3e31 --- /dev/null +++ b/oryxc/src/sema/typecheck.rs @@ -0,0 +1,192 @@ +use crate::compiler::{ + CompilerState, + FileData, +}; +use crate::errors::OryxError; +use crate::parser::{ + self, + AstType, +}; +use crate::prelude::*; +use crate::unistr::UniStr; + +pub fn typecheck_multi_def_bind( + c_state: &CompilerState, + fdata: &FileData, + scope_id: ScopeId, + node: u32, +) -> Result<(), OryxError> { + let ast = fdata.ast.get().unwrap(); + let tokens = fdata.tokens.get().unwrap(); + let scope = &fdata.scopes.get().unwrap()[scope_id.into()]; + + let SubNodes(decls, exprs) = ast.nodes.sub()[node as usize]; + let (decls, exprs) = (decls as usize, exprs as usize); + let nidents = ast.extra[decls] as usize; + let nexprs = ast.extra[exprs] as usize; + + /* Mark our identifiers as ‘resolving’ */ + let mut circularp = false; + for i in 0..nidents { + let itok = ast.extra[decls + 1 + i * 2] as usize; + let view = tokens.view()[itok]; + /* Pointer fuckery to bypass the borrow checker */ + let s = + UniStr(unsafe { &*(&fdata.buffer[view.0..view.1] as *const str) }); + let id = c_state.ident_intr.intern(s); + let mut sym = scope.symtab.get_mut(&id).unwrap(); + match sym.state { + ResolutionState::Unresolved => sym.state = ResolutionState::Resolving, + ResolutionState::Resolving | ResolutionState::Poisoned => { + sym.state = ResolutionState::Poisoned; + circularp = true; + }, + _ => unreachable!(), + } + } + + if circularp { + return Err(OryxError::new( + tokens.view()[ast.nodes.tok()[node as usize] as usize], + format!("circular dependency of multiple symbol definition"), + )); + } + + let mut types = Vec::with_capacity(nexprs); + for i in 0..nexprs { + let expr = ast.extra[exprs + 1 + i]; + let id = typecheck_expr(c_state, fdata, expr)?; + match c_state.type_intr.get(id) { + OryxType::MultiValue(xs) => { + /* TODO: There is probaby a method for this + * (append_from_slice()?) */ + for &x in xs.into_iter() { + types.push(x); + } + }, + _ => types.push(id), + }; + } + + if types.len() != nidents { + return Err(OryxError::new( + tokens.view()[ast.nodes.tok()[node as usize] as usize], + format!( + "attempted to assign {} values to {} symbols", + types.len(), + nidents + ), + )); + } + + for i in 0..nidents { + let (ii, ti) = (decls + 1 + i * 2, decls + 2 + i * 2); + let itok = ast.extra[ii] as usize; + let tnode = ast.extra[ti]; + + let alleged_type = if tnode == u32::MAX { + /* Inferred type */ + TypeId::INVALID + } else { + /* In ‘def a, b int = x, y;’, the type node for A and B are the + * same, so we need to make sure we only typecheck once. */ + let id = match ast.types[tnode as usize].get() { + TypeId::INVALID => typecheck_expr(c_state, fdata, tnode)?, + n => n, + }; + + let OryxType::Type(id) = c_state.type_intr.get(id) else { + return Err(OryxError::new( + tokens.view()[ast.nodes.tok()[tnode as usize] as usize], + "expected a type expression", + )); + }; + *id + }; + + let expr_type = types[i]; + let vtype = if alleged_type == TypeId::INVALID { + expr_type + } else if type_implicitly_casts_p(c_state, expr_type, alleged_type) { + alleged_type + } else { + return Err(OryxError::new( + tokens.view()[itok], + "expression type is not compatible with the explicitly provided symbol type", + )); + }; + + let view = tokens.view()[itok]; + /* Pointer fuckery to bypass the borrow checker */ + let s = + UniStr(unsafe { &*(&fdata.buffer[view.0..view.1] as *const str) }); + let id = c_state.ident_intr.intern(s); + let mut sym = scope.symtab.get_mut(&id).unwrap(); + sym.state = ResolutionState::Resolved; + sym.vtype = vtype; + } + + return Ok(()); +} + +fn typecheck_expr( + c_state: &CompilerState, + fdata: &FileData, + node: u32, +) -> Result<TypeId, OryxError> { + let ast = fdata.ast.get().unwrap(); + let tokens = fdata.tokens.get().unwrap(); + + const { + assert!(parser::NEXPRKINDS == 10, "Missing expression case"); + }; + let expr_type = match ast.nodes.kind()[node as usize] { + AstType::BinaryOperator => todo!(), + AstType::Dereference => { + let pointer = ast.nodes.sub()[node as usize].0; + let ptype = typecheck_expr(c_state, fdata, pointer)?; + let &OryxType::Pointer { base } = c_state.type_intr.get(ptype) + else { + let tok = ast.nodes.tok()[pointer as usize] as usize; + return Err(OryxError::new( + tokens.view()[tok], + "cannot deference a non-pointer type (expression has type {})", + )); + }; + base + }, + AstType::FunCall => todo!(), + AstType::FunProto => todo!(), + AstType::Function => todo!(), + AstType::Identifier => todo!(), + AstType::Number => c_state.type_intr.intern(OryxType::UNumber), + AstType::Pointer => { + let pointee = ast.nodes.sub()[node as usize].0; + let base = typecheck_expr(c_state, fdata, pointee)?; + c_state.type_intr.intern(OryxType::Pointer { base }) + }, + AstType::String => todo!(), + AstType::UnaryOperator => todo!(), + _ => unreachable!(), + }; + + ast.types[node as usize].set(expr_type); + return Ok(expr_type); +} + +fn type_implicitly_casts_p( + c_state: &CompilerState, + from: TypeId, + to: TypeId, +) -> bool { + if from == to { + return true; + } + + /* TODO: Handle other types */ + return match (c_state.type_intr.get(from), c_state.type_intr.get(to)) { + (OryxType::UNumber, OryxType::Integer { .. }) => true, + (OryxType::UBoolean, OryxType::Boolean { .. }) => true, + _ => false, + }; +} diff --git a/oryxc/src/unistr.rs b/oryxc/src/unistr.rs index f9ea3d0..9a204cf 100644 --- a/oryxc/src/unistr.rs +++ b/oryxc/src/unistr.rs @@ -1,3 +1,4 @@ +use std::fmt::{self, Display, Formatter}; use std::hash::{ Hash, Hasher, @@ -13,6 +14,12 @@ use unicode_normalization::{ #[derive(Copy, Clone, Debug, Eq)] pub struct UniStr<'a>(pub &'a str); +impl Display for UniStr<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), fmt::Error> { + return self.0.fmt(f); + } +} + impl Hash for UniStr<'_> { fn hash<H: Hasher>(&self, state: &mut H) { /* In the ASCII common case we use .bytes() to avoid decoding |