Thu, 19 Oct 2023 23:25:34 -0500
Implement a more expansible parser
// The main documentation is in the README. #![doc = include_str!("../README.md")] #![feature(trait_upcasting)] use std::io; use std::fs::File; use std::io::{BufWriter, BufRead, BufReader}; use std::io::Write; use clap::Parser; /// Command line parameters #[derive(Parser, Debug)] #[clap( about = env!("CARGO_PKG_DESCRIPTION"), author = env!("CARGO_PKG_AUTHORS"), version = env!("CARGO_PKG_VERSION"), )] struct CommandLineArgs { /// Input file (default is stdin) input : Option<String>, /// Output file (defalt is stdout) #[arg(long, short = 'o')] output : Option<String>, #[clap(flatten)] config : Config } #[derive(Parser, Debug)] struct Config { #[arg(long, short = 'c')] /// Strip comments strip_comments : bool, #[arg(long, short = 'w')] /// Strip unnecessary whitespace strip_whitespace : bool, } struct Context { lineno : usize, input_only_ws : bool, cli : Config } type AnyChainRule = Box<dyn ChainRule>; type AnyNestedRule = Box<dyn NestedRule>; trait ChainRule { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule; fn flush(self : Box<Self>, ctx : &Context); } trait NestedRule : ChainRule { fn produce(&mut self, c : char, ctx : &Context); fn next(self : Box<Self>) -> AnyChainRule; fn produce_string(&mut self, s : String, ctx : &Context) { s.chars().for_each(|c| self.produce(c, ctx)); } fn start_ignored_comment(&mut self, c : char); } impl<W : Write + 'static> ChainRule for Out<W> { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { basic_consume(self, c, ctx, true) } fn flush(mut self : Box<Self>, _ctx : &Context) { self.output.flush().unwrap(); } } impl<W : Write + 'static> NestedRule for Out<W> { fn produce(&mut self, c : char, ctx : &Context) { if c == '\n' { self.line_end(ctx.cli.strip_whitespace, ctx.input_only_ws) } else if c.is_whitespace() { self.stored_whitespace.push(c); } else { write!(self.output, "{}{}", self.stored_whitespace, c).unwrap(); self.stored_whitespace.clear(); self.only_whitespace = false; self.whitespace_satisfied = false; self.par_satisfied = false; } } fn next(self : Box<Self>) -> AnyChainRule { self } fn start_ignored_comment(&mut self, c : char) { if self.stored_whitespace.is_empty() && !self.only_whitespace { // The marker needs to be inserted if there is to be no whitespace inserted write!(self.output, "{c}").unwrap(); self.whitespace_satisfied = false; self.par_satisfied = false; self.only_whitespace = false; } else if self.only_whitespace { self.ignored_comment_only_line = true } } } fn basic_consume(mut s : AnyNestedRule, c : char, ctx : &Context, print_end : bool) -> AnyChainRule { match c { '{' => { s.produce(c, ctx); Box::new(Group(s)) }, '}' => { if print_end { s.produce(c, ctx); } s.next() }, '\\' => { Box::new(CommandName{parent : s, command : "\\".to_string()}) }, '%' => { if !ctx.cli.strip_comments { s.produce(c, ctx); Box::new(Comment(s)) } else { s.start_ignored_comment(c); Box::new(IgnoreComment(s)) } }, _ => { s.produce(c, ctx); s } } } struct CommandName { parent : AnyNestedRule, command : String } impl ChainRule for CommandName { fn consume(mut self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { match c { '}' | '{' | '\\' if self.command.len() <= 1 => { self.command.push(c); self.handle(ctx) }, c if c.is_alphanumeric() => { self.command.push(c); self }, c => { self.handle(ctx) .consume(c, ctx) } } } fn flush(self : Box<Self>, ctx : &Context) { self.handle(ctx) .flush(ctx) } } impl CommandName { fn handle(mut self, ctx : &Context) -> AnyChainRule { match self.command.as_str() { "\\added" => { Scan::new(Added(self.parent)) }, "\\replaced" => { Scan::new(Replaced(self.parent)) }, "\\deleted" => { Scan::new(Deleted(self.parent)) }, _ => { self.parent.produce_string(self.command, ctx); self.parent } } } } struct Comment(AnyNestedRule); impl ChainRule for Comment { fn consume(mut self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { if c == '\n' { self.0.consume(c, ctx) } else { self.0.produce(c, ctx); self } } fn flush(self : Box<Self>, ctx : &Context) { self.0.flush(ctx) } } struct IgnoreComment(AnyChainRule); impl ChainRule for IgnoreComment { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { if c == '\n' { self.0.consume(c, ctx) } else { self } } fn flush(self : Box<Self>, ctx : &Context) { self.0.flush(ctx) } } struct Group(AnyNestedRule); impl ChainRule for Group { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { basic_consume(self, c, ctx, true) } fn flush(self : Box<Self>, ctx : &Context) { self.0.flush(ctx) } } impl NestedRule for Group { fn produce(&mut self, c : char, ctx : &Context) { self.0.produce(c, ctx) } fn next(self : Box<Self>) -> AnyChainRule { self.0 } fn start_ignored_comment(&mut self, c : char) { self.0.start_ignored_comment(c) } } struct Added(AnyNestedRule); impl ChainRule for Added { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { basic_consume(self, c, ctx, false) } fn flush(self : Box<Self>, ctx : &Context) { self.0.flush(ctx) } } impl NestedRule for Added { fn produce(&mut self, c : char, ctx : &Context) { self.0.produce(c, ctx) } fn next(self : Box<Self>) -> AnyChainRule { self.0 } fn start_ignored_comment(&mut self, c : char) { self.0.start_ignored_comment(c) } } struct Deleted(AnyNestedRule); impl ChainRule for Deleted { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { basic_consume(self, c, ctx, false) } fn flush(self : Box<Self>, ctx : &Context) { self.0.flush(ctx) } } impl NestedRule for Deleted { fn produce(&mut self, _c : char, _ctx : &Context) { } fn next(self : Box<Self>) -> AnyChainRule { self.0 } fn start_ignored_comment(&mut self, c : char) { self.0.start_ignored_comment(c) } } struct Replaced(AnyNestedRule); impl ChainRule for Replaced { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { basic_consume(self, c, ctx, false) } fn flush(self : Box<Self>, ctx : &Context) { self.0.flush(ctx) } } impl NestedRule for Replaced { fn produce(&mut self, c : char, ctx : &Context) { self.0.produce(c, ctx) } fn next(self : Box<Self>) -> AnyChainRule { Scan::new(Deleted(self.0)) } fn start_ignored_comment(&mut self, c : char) { self.0.start_ignored_comment(c) } } struct Scan(AnyNestedRule); impl ChainRule for Scan { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { if c.is_whitespace() || c == '\n' { self } else if c == '{' { self.0 } else if c == '%' { Box::new(IgnoreComment(self)) } else { panic!("Non-whitespace character ({c}) separating arguments on \ line {lineno}", lineno = ctx.lineno) } } fn flush(self : Box<Self>, ctx : &Context) { self.0.flush(ctx) } } impl Scan { fn new<R : NestedRule + 'static>(r : R) -> Box<dyn ChainRule> { Box::new(Scan(Box::new(r))) } } struct Out<W : Write> { only_whitespace : bool, stored_whitespace : String, output : W, whitespace_satisfied : bool, par_satisfied : bool, ignored_comment_only_line : bool } impl<W : Write> Out<W> { pub fn line_end(&mut self, strip_ws : bool, input_only_ws : bool) { let skip_linefeed = if input_only_ws { // Need a paragraph break strip_ws && self.par_satisfied } else if strip_ws { self.only_whitespace && self.whitespace_satisfied } else { // Skip comment-only lines if the comment is ignored self.ignored_comment_only_line }; if !skip_linefeed { if !strip_ws { write!(self.output, "{}", self.stored_whitespace).unwrap(); } write!(self.output, "\n").unwrap(); self.whitespace_satisfied = true; self.par_satisfied = self.only_whitespace; } self.stored_whitespace.clear(); self.only_whitespace = true; self.ignored_comment_only_line = false; } } fn main() { let cli = CommandLineArgs::parse(); let input = cli.input.map_or_else( || Box::new(BufReader::new(io::stdin())) as Box<dyn BufRead>, |f| Box::new(BufReader::new(File::open(f).unwrap())) as Box<dyn BufRead> ); let output = cli.output.map_or_else( || Box::new(BufWriter::new(io::stdout())) as Box<dyn Write>, |f| Box::new(BufWriter::new(File::create(f).unwrap())) as Box<dyn Write> ); let mut rule : Box<dyn ChainRule> = Box::new(Out { only_whitespace : true, stored_whitespace : String::new(), output, whitespace_satisfied : true, par_satisfied : true, ignored_comment_only_line : false }); let mut ctx = Context{ lineno : 0, cli : cli.config, input_only_ws : true}; for l in input.lines().map(|l| l.unwrap()) { ctx.lineno += 1; ctx.input_only_ws = true; for c in l.chars() { ctx.input_only_ws = ctx.input_only_ws && c.is_whitespace(); rule = rule.consume(c, &ctx); } rule = rule.consume('\n', &ctx); } rule.flush(&ctx); }