From a20abd65acf16286fa861a39dcfb0a32dfe9b06b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Norman=20K=C3=B6hring?= Date: Sun, 13 Jul 2025 14:56:27 +0200 Subject: [PATCH] Logos based lexer, initial benchmark with Criterion --- Cargo.lock | 328 +++++++++++++++++++++++++++++++++ Cargo.toml | 17 ++ benches/lexer.rs | 46 +++++ sample2.nrs | 3 - src/emitter.rs | 0 src/emitter/js.rs | 24 +++ src/emitter/mod.rs | 1 + src/lexer.rs | 419 +++++++++++++++++++++++++++++++++--------- src/lexer_.rs | 95 ++++++++++ src/lib.rs | 1 + src/main.rs | 26 +-- src/parser.rs | 38 ++-- src/transformer/js.rs | 105 +++++++++-- 13 files changed, 974 insertions(+), 129 deletions(-) create mode 100644 benches/lexer.rs delete mode 100644 src/emitter.rs create mode 100644 src/emitter/js.rs create mode 100644 src/emitter/mod.rs create mode 100644 src/lexer_.rs create mode 100644 src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 705b2ce..cba09ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -29,6 +29,18 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" + [[package]] name = "anyhow" version = "1.0.98" @@ -79,6 +91,12 @@ dependencies = [ "vsimd", ] +[[package]] +name = "beef" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" + [[package]] name = "better_scoped_tls" version = "1.0.1" @@ -131,6 +149,12 @@ dependencies = [ "serde", ] +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + [[package]] name = "castaway" version = "0.2.3" @@ -169,6 +193,58 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.5.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be92d32e80243a54711e5d7ce823c35c41c9d929dc4ab58e1276f625841aadf9" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707eab41e9622f9139419d573eca0900137718000c517d47da73045f54331c3d" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" + [[package]] name = "compact_str" version = "0.7.1" @@ -182,6 +258,70 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "criterion" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bf7af66b0989381bd0be551bd7cc91912a655a58c6918420c9527b1fd8b4679" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "itertools 0.13.0", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools 0.10.5", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + [[package]] name = "data-encoding" version = "2.9.0" @@ -221,6 +361,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "foldhash" version = "0.1.5" @@ -253,6 +399,16 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" +[[package]] +name = "half" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +dependencies = [ + "cfg-if", + "crunchy", +] + [[package]] name = "hashbrown" version = "0.14.5" @@ -419,6 +575,24 @@ dependencies = [ "syn", ] +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.15" @@ -459,6 +633,40 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +[[package]] +name = "logos" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab6f536c1af4c7cc81edf73da1f8029896e7e1e16a219ef09b184e76a296f3db" +dependencies = [ + "logos-derive", +] + +[[package]] +name = "logos-codegen" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "189bbfd0b61330abea797e5e9276408f2edbe4f822d7ad08685d67419aafb34e" +dependencies = [ + "beef", + "fnv", + "lazy_static", + "proc-macro2", + "quote", + "regex-syntax 0.8.5", + "rustc_version", + "syn", +] + +[[package]] +name = "logos-derive" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebfe8e1a19049ddbfccbd14ac834b215e11b85b90bab0c2dba7c7b92fb5d5cba" +dependencies = [ + "logos-codegen", +] + [[package]] name = "memchr" version = "2.7.5" @@ -506,6 +714,12 @@ version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + [[package]] name = "outref" version = "0.5.2" @@ -566,6 +780,34 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + [[package]] name = "potential_utf" version = "0.1.2" @@ -643,6 +885,26 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "regex" version = "1.11.1" @@ -695,6 +957,15 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustversion" version = "1.0.21" @@ -713,12 +984,27 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd29631678d6fb0903b69223673e122c32e9ae559d0960a38d574695ebc0ea15" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "scoped-tls" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" +[[package]] +name = "semver" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" + [[package]] name = "serde" version = "1.0.219" @@ -782,7 +1068,10 @@ dependencies = [ "anyhow", "ariadne", "chumsky", + "criterion", "lazy_static", + "logos", + "serde_json", "swc_common", "swc_ecma_ast", "swc_ecma_codegen", @@ -1045,6 +1334,16 @@ dependencies = [ "zerovec", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tracing" version = "0.1.41" @@ -1149,6 +1448,16 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "wasm-bindgen" version = "0.2.100" @@ -1207,6 +1516,25 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "web-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys", +] + [[package]] name = "windows-sys" version = "0.59.0" diff --git a/Cargo.toml b/Cargo.toml index 6d2c49e..941e9a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,24 @@ anyhow = "1.0.98" ariadne = "0.5.1" chumsky = "0.10.1" lazy_static = "1.5.0" +logos = "0.15.0" +serde_json = "1.0.140" swc_common = "13.0.2" swc_ecma_ast = "13.0.0" swc_ecma_codegen = "15.0.1" thiserror = "2.0.12" + +[dev-dependencies] +criterion = "0.6.0" + +[lib] +name = "solace" +path = "src/lib.rs" + +[[bin]] +name = "solace" +path = "src/main.rs" + +[[bench]] +name = "lexer" +harness = false diff --git a/benches/lexer.rs b/benches/lexer.rs new file mode 100644 index 0000000..ae629e7 --- /dev/null +++ b/benches/lexer.rs @@ -0,0 +1,46 @@ +use criterion::{Criterion, criterion_group, criterion_main}; +use logos::Logos; +use solace::lexer::Token; +use std::hint::black_box; + +pub fn criterion_benchmark(c: &mut Criterion) { + let input = " + const NUMBER = 1_000_000; + const HEX_NUMBER = 0x7C1; + const BIN_NUMBER = 0b10111; + const OCT_NUMBER = 0o27; + const INVALID_NUMBER = 0o28; + const MORE_TOKENS = \"More tokens to fill the 100 Tokens!\"; + + fn easterEgg() -> (output: string) { + /* + * Someone found the easter egg! + * Lets celebrate that with a comment! + */ + + output = \"Yeah, you found the easter egg!\"; + } + + fn main(args: string[]) -> ArgumentError!string { + if args.length <= 2 { + return Err(\"Not enough Arguments\", ArgumentError); + } + return match args.length { + 3 => \"This is actually just one argument\", + 4 => \"Two arguments. Good!\", + NUMBER => easterEgg(), + _ => \"You're overdoing it... maybe?\" + } + } + "; + + c.bench_function("Lexer", |b| { + b.iter(|| { + let mut lexer = Token::lexer(black_box(input)); + while let Some(_) = lexer.next() {} + }) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/sample2.nrs b/sample2.nrs index 43c4ea7..c28e76b 100644 --- a/sample2.nrs +++ b/sample2.nrs @@ -1,6 +1,3 @@ -let x = 2; -let y = 3; - fn mul(x, y) { let a = x*2; a * y diff --git a/src/emitter.rs b/src/emitter.rs deleted file mode 100644 index e69de29..0000000 diff --git a/src/emitter/js.rs b/src/emitter/js.rs new file mode 100644 index 0000000..8712ecc --- /dev/null +++ b/src/emitter/js.rs @@ -0,0 +1,24 @@ +use swc_common::{SourceMap, sync::Lrc}; +use swc_ecma_ast::Module; +use swc_ecma_codegen::{Config, Emitter, text_writer::JsWriter}; + +pub struct JsEmitter; + +impl JsEmitter { + pub fn new() -> Self { + Self + } + pub fn emit(&self, ast: Module) -> (String, Lrc) { + let sm = Lrc::new(SourceMap::default()); + let mut buf = vec![]; + let mut emitter = Emitter { + cfg: Config::default(), + cm: sm.clone(), + comments: None, + wr: JsWriter::new(sm.clone(), "\n", &mut buf, None), + }; + + emitter.emit_module(&ast).unwrap(); + (String::from_utf8(buf).unwrap(), sm) + } +} diff --git a/src/emitter/mod.rs b/src/emitter/mod.rs new file mode 100644 index 0000000..cb71828 --- /dev/null +++ b/src/emitter/mod.rs @@ -0,0 +1 @@ +pub mod js; diff --git a/src/lexer.rs b/src/lexer.rs index c3fddc9..7333ea5 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -1,95 +1,336 @@ -use chumsky::prelude::*; -use std::fmt::{Display, Formatter, Result}; +use logos::Logos; +use serde_json; -// A few type definitions to be used by our parsers below -pub type Span = SimpleSpan; -pub type Spanned = (T, Span); +fn parse_radix(s: &str, radix: u32) -> Result { + let s = s.replace('_', ""); + let (sign, num) = if s.starts_with('-') { + (-1.0, &s[3..]) // skip "-0x", "-0b" or "-0o" + } else { + (1.0, &s[2..]) + }; -#[derive(Clone, Debug, PartialEq)] -pub enum Token<'src> { - None, - Bool(bool), - Num(f64), - Str(&'src str), - Op(&'src str), - Ctrl(char), - Ident(&'src str), - Fn, - Var, - If, - Else, -} - -impl Display for Token<'_> { - fn fmt(&self, f: &mut Formatter) -> Result { - match self { - Token::None => write!(f, "none"), - Token::Bool(x) => write!(f, "{x}"), - Token::Num(n) => write!(f, "{n}"), - Token::Str(s) => write!(f, "{s}"), - Token::Op(s) => write!(f, "{s}"), - Token::Ctrl(c) => write!(f, "{c}"), - Token::Ident(s) => write!(f, "{s}"), - Token::Fn => write!(f, "fn"), - Token::Var => write!(f, "var"), - Token::If => write!(f, "if"), - Token::Else => write!(f, "else"), - } + match u64::from_str_radix(num, radix) { + Ok(val) => Ok(sign * val as f64), + Err(_) => Err(format!( + "Failed to parse number \"{}\" with radix {}", + s, radix + )), } } -pub fn lexer<'src>() --> impl Parser<'src, &'src str, Vec>>, extra::Err>> { - // A parser for numbers - let num = text::int(10) - .then(just('.').then(text::digits(10)).or_not()) - .to_slice() - .from_str() - .unwrapped() - .map(Token::Num); - - // A parser for strings - let str_ = just('"') - .ignore_then(none_of('"').repeated().to_slice()) - .then_ignore(just('"')) - .map(Token::Str); - - // A parser for operators - let op = one_of("+*-/!=") - .repeated() - .at_least(1) - .to_slice() - .map(Token::Op); - - // A parser for control characters (delimiters, semicolons, etc.) - let ctrl = one_of("()[]{};,").map(Token::Ctrl); - - // A parser for identifiers and keywords - let ident = text::ascii::ident().map(|ident: &str| match ident { - "fn" => Token::Fn, - "var" => Token::Var, - "let" => Token::Var, // var and let are synonyms - "if" => Token::If, - "else" => Token::Else, - "true" => Token::Bool(true), - "false" => Token::Bool(false), - "none" => Token::None, - _ => Token::Ident(ident), - }); - - // A single token can be one of the above - let token = num.or(str_).or(op).or(ctrl).or(ident); - - let comment = just("//") - .then(any().and_is(just('\n').not()).repeated()) - .padded(); - - token - .map_with(|tok, e| (tok, e.span())) - .padded_by(comment.repeated()) - .padded() - // If we encounter an error, skip and attempt to lex the next character as a token instead - .recover_with(skip_then_retry_until(any().ignored(), end())) - .repeated() - .collect() +fn parse_number(s: &str) -> Result { + let s = s.replace('_', ""); + s.parse::() + .map_err(|_| format!("Failed to parse number \"{}\"", s)) +} + +#[derive(Logos, Debug, PartialEq)] +// #[logos(extras = (u32, u32))] +#[logos(skip r"\s+")] +pub enum Token<'src> { + #[regex(r"-?0[xX][0-9a-fA-F_]+", |lex| parse_radix(lex.slice(), 16))] + #[regex(r"-?0[bB][01_]+", |lex| parse_radix(lex.slice(), 2))] + #[regex(r"-?0[oO][0-7_]+", |lex| parse_radix(lex.slice(), 8))] + #[regex(r"-?(?:0|[1-9][0-9_]*)(?:\.\d+)?(?:[eE][+-]?\d+)?", |lex| parse_number(lex.slice()))] + Number(Result), + + #[token("NaN")] + NaN, + + #[regex(r#"("[^"\\\x00-\x1F]*(?:\\.[^"\\\x00-\x1F]*)*")|('[^'\\\x00-\x1F]*(?:\\.[^'\\\x00-\x1F]*)*')"#, + |lex| { let slice = lex.slice(); slice[1..slice.len()-1].to_owned() })] + String(String), // "string" or 'string' + + #[token("undefined")] + Undefined, // undefined (value not initialized or not existing) + #[token("None")] + None, // none - optional with no value + #[token("Some")] + Some, // Some(value) - optional with value + #[token("Err")] + Err, // Err(Error) - result with error + #[token("Ok")] + Ok, // Ok(Value) - result with value + #[token("false", |_| false)] + #[token("true", |_| true)] + Bool(bool), + + #[token("fn")] + Fn, // keyword for functions + #[token("var")] + Var, // variable + #[token("let")] + Let, // synonymous to var + #[token("const")] + Const, // constants + #[token("live")] + Live, // live variables / signals + #[token("if")] + If, + #[token("else")] + Else, + #[token("match")] + Match, + #[token("for")] + For, + #[token("while")] + While, + #[token("return")] + Return, + + // Range and other multi char operators + #[token("..=")] + RangeIncl, + #[token("..<")] + RangeExcl, + #[token("==")] + Eq, + #[token("!=")] + Ne, + #[token("<=")] + Le, + #[token(">=")] + Ge, + #[token("++")] + Inc, + #[token("--")] + Dec, + #[token("**")] + Pow, + #[token("+=")] + AddEq, + #[token("-=")] + SubEq, + #[token("*=")] + MulEq, + #[token("/=")] + DivEq, + #[token("&&")] + And, + #[token("||")] + Or, + #[token("=>")] + FatArrow, + #[token("->")] + Arrow, + + // Single character operators + #[token(".")] + Dot, + #[token("!")] + ExclamationMark, + #[token("?")] + QuestionMark, + #[token("&")] + BAnd, + #[token("|")] + BOr, + #[token("<")] + Lt, + #[token(">")] + Gt, + #[token("=")] + Assign, + #[token(":")] + Colon, + #[token(",")] + Comma, + #[token("+")] + Add, + #[token("-")] + Sub, + #[token("*")] + Mul, + #[token("/")] + Div, + #[token("%")] + Mod, + // Parentheses + #[token("(")] + ParenOpen, + #[token(")")] + ParenClose, + #[token("{")] + BraceOpen, + #[token("}")] + BraceClose, + #[token("[")] + BracketOpen, + #[token("]")] + BracketClose, + + #[token("_")] + Default, + #[token(";")] + Semicolon, + + #[regex(r"([a-zA-Z$][a-zA-Z0-9_$]*)|(_[a-zA-Z0-9_$]+)")] + Identifier(&'src str), // Identifiers start with letters, _ or $ and can contain numbers + + // Comments + #[regex(r"//[^\n]*")] + LineComment(&'src str), + #[regex(r"/\*([^*]|\*[^/])*\*/")] + BlockComment(&'src str), +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_keywords() { + let mut lex = Token::lexer("let var const fn match"); + assert_eq!(lex.next(), Some(Ok(Token::Let))); + assert_eq!(lex.next(), Some(Ok(Token::Var))); + assert_eq!(lex.next(), Some(Ok(Token::Const))); + assert_eq!(lex.next(), Some(Ok(Token::Fn))); + assert_eq!(lex.next(), Some(Ok(Token::Match))); + } + + #[test] + fn test_operators() { + let mut lex = Token::lexer("** * == += + ="); + assert_eq!(lex.next(), Some(Ok(Token::Pow))); + assert_eq!(lex.next(), Some(Ok(Token::Mul))); + assert_eq!(lex.next(), Some(Ok(Token::Eq))); + assert_eq!(lex.next(), Some(Ok(Token::AddEq))); + assert_eq!(lex.next(), Some(Ok(Token::Add))); + assert_eq!(lex.next(), Some(Ok(Token::Assign))); + } + + #[test] + fn test_declaration() { + let mut lex = Token::lexer("const foo = 42;"); + assert_eq!(lex.next(), Some(Ok(Token::Const))); + assert_eq!(lex.next(), Some(Ok(Token::Identifier("foo")))); + assert_eq!(lex.next(), Some(Ok(Token::Assign))); + assert_eq!(lex.next(), Some(Ok(Token::Number(Ok(42.0))))); + assert_eq!(lex.next(), Some(Ok(Token::Semicolon))); + } + #[test] + fn test_numbers() { + let mut lex = Token::lexer("42 * -0.2 + 4e3 - 0xFF / 0b1010 + 1_000_000;"); + assert_eq!(lex.next(), Some(Ok(Token::Number(Ok(42.0))))); + assert_eq!(lex.next(), Some(Ok(Token::Mul))); + assert_eq!(lex.next(), Some(Ok(Token::Number(Ok(-0.2))))); + assert_eq!(lex.next(), Some(Ok(Token::Add))); + assert_eq!(lex.next(), Some(Ok(Token::Number(Ok(4000.0))))); + assert_eq!(lex.next(), Some(Ok(Token::Sub))); + assert_eq!(lex.next(), Some(Ok(Token::Number(Ok(255.0))))); + assert_eq!(lex.next(), Some(Ok(Token::Div))); + assert_eq!(lex.next(), Some(Ok(Token::Number(Ok(10.0))))); + assert_eq!(lex.next(), Some(Ok(Token::Add))); + assert_eq!(lex.next(), Some(Ok(Token::Number(Ok(1000000.0))))); + assert_eq!(lex.next(), Some(Ok(Token::Semicolon))); + } + + #[test] + fn test_strings() { + let mut lex = Token::lexer("\"Foo\" 'Single' 'Sin\\'Esq\\'gle'"); + assert_eq!(lex.next(), Some(Ok(Token::String("Foo".to_owned())))); + assert_eq!(lex.next(), Some(Ok(Token::String("Single".to_owned())))); + assert_eq!( + lex.next(), + Some(Ok(Token::String("Sin'Esq'gle".to_owned()))) + ); + } + + #[test] + fn test_full_syntax_example() { + let mut lex = Token::lexer( + " + fn main(args: string[]) -> ArgumentError!string { + if args.length <= 2 { + return Err(\"Not enough Arguments\", ArgumentError); + } + return match args.length { + 3 => \"This is actually just one argument\", + 4 => \"Two arguments. Good!\", + _ => \"You're overdoing it!\" + } + } + ", + ); + // FIRST LINE + assert_eq!(lex.next(), Some(Ok(Token::Fn))); + assert_eq!(lex.next(), Some(Ok(Token::Identifier("main")))); + assert_eq!(lex.next(), Some(Ok(Token::ParenOpen))); + assert_eq!(lex.next(), Some(Ok(Token::Identifier("args")))); + assert_eq!(lex.next(), Some(Ok(Token::Colon))); + assert_eq!(lex.next(), Some(Ok(Token::Identifier("string")))); + assert_eq!(lex.next(), Some(Ok(Token::BracketOpen))); + assert_eq!(lex.next(), Some(Ok(Token::BracketClose))); + assert_eq!(lex.next(), Some(Ok(Token::ParenClose))); + assert_eq!(lex.next(), Some(Ok(Token::Arrow))); + assert_eq!(lex.next(), Some(Ok(Token::Identifier("ArgumentError")))); + assert_eq!(lex.next(), Some(Ok(Token::ExclamationMark))); + assert_eq!(lex.next(), Some(Ok(Token::Identifier("string")))); + assert_eq!(lex.next(), Some(Ok(Token::BraceOpen))); + + // SECOND LINE + assert_eq!(lex.next(), Some(Ok(Token::If))); + assert_eq!(lex.next(), Some(Ok(Token::Identifier("args")))); + assert_eq!(lex.next(), Some(Ok(Token::Dot))); + assert_eq!(lex.next(), Some(Ok(Token::Identifier("length")))); + assert_eq!(lex.next(), Some(Ok(Token::Le))); + assert_eq!(lex.next(), Some(Ok(Token::Number(Ok(2.0))))); + assert_eq!(lex.next(), Some(Ok(Token::BraceOpen))); + + // THIRD LINE + assert_eq!(lex.next(), Some(Ok(Token::Return))); + assert_eq!(lex.next(), Some(Ok(Token::Err))); + assert_eq!(lex.next(), Some(Ok(Token::ParenOpen))); + assert_eq!( + lex.next(), + Some(Ok(Token::String("Not enough Arguments".to_owned()))) + ); + assert_eq!(lex.next(), Some(Ok(Token::Comma))); + assert_eq!(lex.next(), Some(Ok(Token::Identifier("ArgumentError")))); + assert_eq!(lex.next(), Some(Ok(Token::ParenClose))); + assert_eq!(lex.next(), Some(Ok(Token::Semicolon))); + + // FOURTH LINE + assert_eq!(lex.next(), Some(Ok(Token::BraceClose))); + + // FIFTH LINE + assert_eq!(lex.next(), Some(Ok(Token::Return))); + assert_eq!(lex.next(), Some(Ok(Token::Match))); + assert_eq!(lex.next(), Some(Ok(Token::Identifier("args")))); + assert_eq!(lex.next(), Some(Ok(Token::Dot))); + assert_eq!(lex.next(), Some(Ok(Token::Identifier("length")))); + assert_eq!(lex.next(), Some(Ok(Token::BraceOpen))); + + // SIXTH LINE + assert_eq!(lex.next(), Some(Ok(Token::Number(Ok(3.0))))); + assert_eq!(lex.next(), Some(Ok(Token::FatArrow))); + assert_eq!( + lex.next(), + Some(Ok(Token::String( + "This is actually just one argument".to_owned() + ))) + ); + assert_eq!(lex.next(), Some(Ok(Token::Comma))); + + // SEVENTH LINE + assert_eq!(lex.next(), Some(Ok(Token::Number(Ok(4.0))))); + assert_eq!(lex.next(), Some(Ok(Token::FatArrow))); + assert_eq!( + lex.next(), + Some(Ok(Token::String("Two arguments. Good!".to_owned()))) + ); + assert_eq!(lex.next(), Some(Ok(Token::Comma))); + + // EIGHTH LINE + assert_eq!(lex.next(), Some(Ok(Token::Default))); + assert_eq!(lex.next(), Some(Ok(Token::FatArrow))); + assert_eq!( + lex.next(), + Some(Ok(Token::String("You're overdoing it!".to_owned()))) + ); + + // NINTH AND TENTH LINE + assert_eq!(lex.next(), Some(Ok(Token::BraceClose))); + assert_eq!(lex.next(), Some(Ok(Token::BraceClose))); + } } diff --git a/src/lexer_.rs b/src/lexer_.rs new file mode 100644 index 0000000..c3fddc9 --- /dev/null +++ b/src/lexer_.rs @@ -0,0 +1,95 @@ +use chumsky::prelude::*; +use std::fmt::{Display, Formatter, Result}; + +// A few type definitions to be used by our parsers below +pub type Span = SimpleSpan; +pub type Spanned = (T, Span); + +#[derive(Clone, Debug, PartialEq)] +pub enum Token<'src> { + None, + Bool(bool), + Num(f64), + Str(&'src str), + Op(&'src str), + Ctrl(char), + Ident(&'src str), + Fn, + Var, + If, + Else, +} + +impl Display for Token<'_> { + fn fmt(&self, f: &mut Formatter) -> Result { + match self { + Token::None => write!(f, "none"), + Token::Bool(x) => write!(f, "{x}"), + Token::Num(n) => write!(f, "{n}"), + Token::Str(s) => write!(f, "{s}"), + Token::Op(s) => write!(f, "{s}"), + Token::Ctrl(c) => write!(f, "{c}"), + Token::Ident(s) => write!(f, "{s}"), + Token::Fn => write!(f, "fn"), + Token::Var => write!(f, "var"), + Token::If => write!(f, "if"), + Token::Else => write!(f, "else"), + } + } +} + +pub fn lexer<'src>() +-> impl Parser<'src, &'src str, Vec>>, extra::Err>> { + // A parser for numbers + let num = text::int(10) + .then(just('.').then(text::digits(10)).or_not()) + .to_slice() + .from_str() + .unwrapped() + .map(Token::Num); + + // A parser for strings + let str_ = just('"') + .ignore_then(none_of('"').repeated().to_slice()) + .then_ignore(just('"')) + .map(Token::Str); + + // A parser for operators + let op = one_of("+*-/!=") + .repeated() + .at_least(1) + .to_slice() + .map(Token::Op); + + // A parser for control characters (delimiters, semicolons, etc.) + let ctrl = one_of("()[]{};,").map(Token::Ctrl); + + // A parser for identifiers and keywords + let ident = text::ascii::ident().map(|ident: &str| match ident { + "fn" => Token::Fn, + "var" => Token::Var, + "let" => Token::Var, // var and let are synonyms + "if" => Token::If, + "else" => Token::Else, + "true" => Token::Bool(true), + "false" => Token::Bool(false), + "none" => Token::None, + _ => Token::Ident(ident), + }); + + // A single token can be one of the above + let token = num.or(str_).or(op).or(ctrl).or(ident); + + let comment = just("//") + .then(any().and_is(just('\n').not()).repeated()) + .padded(); + + token + .map_with(|tok, e| (tok, e.span())) + .padded_by(comment.repeated()) + .padded() + // If we encounter an error, skip and attempt to lex the next character as a token instead + .recover_with(skip_then_retry_until(any().ignored(), end())) + .repeated() + .collect() +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..fc84151 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1 @@ +pub mod lexer; diff --git a/src/main.rs b/src/main.rs index 7c7be11..5e8c541 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,7 @@ -mod emitter; mod lexer; -mod parser; -mod transformer; +// mod emitter; +// mod parser; +// mod transformer; use anyhow::Result; use std::{env, fs}; @@ -23,20 +23,20 @@ fn main() -> Result<()> { let file_path = &args[1]; let input = fs::read_to_string(file_path).expect(&format!("Cannot read file '{}'!", file_path)); + /* // Parse Solace Code - if let Some((ast, span)) = parser::parse(file_path.to_string(), &input) { + if let Some((ast, span)) = parser::parse(file_path.to_string(), &input, false) { // Transform from Solace AST to SWC AST let js_transformer = transformer::js::JsTransformer::new(); - let js_ast = js_transformer.transform(ast); + let js_ast = js_transformer.transform(ast, span); + + // Emit JavaScript + let js_emitter = emitter::js::JsEmitter::new(); + let (js_code, _source_map) = js_emitter.emit(js_ast); + + // Write Output to stdout + println!("{}", js_code); } - - /* - // Emit JavaScript - let js_emitter = emitter::js::JsEmitter::new(); - let js_code = js_emitter.emit(js_ast)?; - - // Write Output to stdout - println!("{}", js_code); */ Ok(()) diff --git a/src/parser.rs b/src/parser.rs index f9566b7..f576f2a 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -44,27 +44,36 @@ pub enum BinaryOp { NotEq, } -// An expression node in the AST. Children are spanned so we can generate useful runtime errors. #[derive(Debug)] pub enum Expr<'src> { Error, Value(Value<'src>), List(Vec>), Local(&'src str), - Var(&'src str, Box>, Box>), + If(Box>, Box>, Box>), Then(Box>, Box>), Binary(Box>, BinaryOp, Box>), Call(Box>, Spanned>>), - If(Box>, Box>, Box>), - Print(Box>), +} + +#[derive(Debug)] +pub struct Block<'src> { + stmts: Vec>>, + expr: Option>>>, +} + +#[derive(Debug)] +pub enum Stmt<'src> { + Var(&'src str, Box>, Box>), + Expr(Box>>), } // A function node in the AST. #[derive(Debug)] pub struct Func<'src> { - args: Vec<&'src str>, - span: Span, - body: Spanned>, + pub args: Vec<&'src str>, + pub span: Span, + pub body: Spanned>, } fn expr_parser<'tokens, 'src: 'tokens, I>() @@ -98,7 +107,7 @@ where .then(inline_expr) .then_ignore(just(Token::Ctrl(';'))) .then(expr.clone()) - .map(|((name, val), body)| Expr::Var(name, Box::new(val), Box::new(body))); + .map(|((name, val), body)| Stmt::Var(name, Box::new(val), Box::new(body))); let list = items .clone() @@ -329,6 +338,7 @@ where pub fn parse<'src>( filename: String, src: &'src str, + debug: bool, ) -> Option<(HashMap<&'src str, Func<'src>>, SimpleSpan)> { let (tokens, lex_errs) = lexer().parse(src).into_output_errors(); @@ -342,11 +352,13 @@ pub fn parse<'src>( ) .into_output_errors(); - if let Some((funcs, _file_span)) = ast - .as_ref() - .filter(|_| lex_errs.len() + parse_errs.len() == 0) - { - println!("{funcs:#?}") + if debug { + if let Some((funcs, _file_span)) = ast + .as_ref() + .filter(|_| lex_errs.len() + parse_errs.len() == 0) + { + println!("{funcs:#?}") + } } (ast, parse_errs) diff --git a/src/transformer/js.rs b/src/transformer/js.rs index 4f3e706..fbd2995 100644 --- a/src/transformer/js.rs +++ b/src/transformer/js.rs @@ -1,25 +1,108 @@ -use crate::parser::Func; +use crate::parser; +use chumsky; use std::collections::HashMap; -use swc_common::DUMMY_SP; -use swc_ecma_ast as js_ast; +use swc_common::{BytePos, DUMMY_SP, Span, SyntaxContext}; +use swc_ecma_ast::{ + AssignExpr, BinaryOp, BindingIdent, BlockStmt, Decl, Expr, FnDecl, Function, Ident, Module, + ModuleItem, Param, Pat, Stmt, +}; + +pub trait ToSWC { + fn to_swc(&self) -> T; +} + +impl ToSWC for chumsky::span::SimpleSpan { + fn to_swc(&self) -> Span { + Span::new(BytePos(self.start as u32), BytePos(self.end as u32)) + } +} + +impl ToSWC for parser::BinaryOp { + fn to_swc(&self) -> BinaryOp { + match self { + parser::BinaryOp::Add => BinaryOp::Add, + parser::BinaryOp::Sub => BinaryOp::Sub, + parser::BinaryOp::Mul => BinaryOp::Mul, + parser::BinaryOp::Div => BinaryOp::Div, + parser::BinaryOp::Eq => BinaryOp::EqEqEq, + parser::BinaryOp::NotEq => BinaryOp::NotEqEq, + // TODO: implement all members of BinaryOp + } + } +} + +impl ToSWC for parser::Func<'_> { + fn to_swc(&self) -> Function { + Function { + params: self + .args + .iter() + .map(|name| Param { + span: DUMMY_SP, + decorators: vec![], + pat: Pat::Ident(BindingIdent { + id: name.to_string().into(), + type_ann: None, + }), + }) + .collect(), + decorators: vec![], + span: self.span.to_swc(), + body: Some(BlockStmt { + span: DUMMY_SP, + ctxt: SyntaxContext::empty(), + stmts: vec![], //TODO! + }), + is_generator: false, + is_async: false, + type_params: None, + return_type: None, + ctxt: SyntaxContext::empty(), + } + } +} + +impl ToSWC for parser::Expr<'_> { + fn to_swc(&self) -> Expr { + match self { + parser::Expr::Var(name, val, body) => Expr::Assign(AssignExpr { + span: Span::new(BytePos(self.1.1.start as u32), BytePos(self.2.1.end as u32)), + }), + } + } +} pub struct JsTransformer; impl<'src> JsTransformer { pub fn new() -> Self { - Self + Self {} } - pub fn transform(&self, solace_ast: HashMap<&'src str, Func<'_>>) -> js_ast::Module { - js_ast::Module { - span: DUMMY_SP, - body: solace_ast + pub fn transform( + &self, + source_ast: HashMap<&'src str, parser::Func<'src>>, + span: chumsky::span::SimpleSpan, + ) -> Module { + Module { + span: span.to_swc(), + body: source_ast .into_iter() - .map(|(name, func)| self.transform_func(name, func)) + .map(|(name, func)| self.transform_func_stmt(name, func)) .collect(), shebang: None, } } - pub fn transform_func(&self, name: &str, func: Func<'_>) -> js_ast::Function { - todo!("Implement me") + pub fn transform_func_stmt(&self, ident: &str, func: parser::Func<'_>) -> ModuleItem { + ModuleItem::Stmt(Stmt::Decl(Decl::Fn(FnDecl { + ident: Ident::new(ident.into(), DUMMY_SP, SyntaxContext::empty()), + declare: false, + function: Box::new(func.to_swc()), + }))) } + + // pub fn transform_expr(&self, expr: parser::Expr) -> Stmt { + // match expr { + // Expr::Var => Stmt::Decl(Decl::Var(())), + // } + // } }