Fri, 03 May 2024 14:53:53 -0500
Properly handle \%
// The main documentation is in the README. #![doc = include_str!("../README.md")] #![feature(trait_upcasting)] use std::io; use std::fs::File; use std::io::{BufWriter, BufRead, BufReader, Write, Read}; use std::fmt::Debug; use clap::Parser; /// Command line parameters #[derive(Parser, Debug, Clone)] #[clap( about = env!("CARGO_PKG_DESCRIPTION"), author = env!("CARGO_PKG_AUTHORS"), version = env!("CARGO_PKG_VERSION"), )] struct CommandLineArgs { /// Input file (default is stdin) input : Option<String>, /// Output file (defalt is stdout) #[arg(long, short = 'o')] output : Option<String>, #[clap(flatten)] config : Config } #[derive(Parser, Debug, Clone)] struct Config { #[arg(long, short = 'c')] /// Strip comments strip_comments : bool, #[arg(long, short = 'w')] /// Strip unnecessary whitespace strip_whitespace : bool, } struct Context { lineno : usize, input_only_ws : bool, cli : Config } type AnyChainRule<W> = Box<dyn ChainRule<W>>; type AnyNestedRule<W> = Box<dyn NestedRule<W>>; trait ChainRule<W> { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule<W>; fn flush(self : Box<Self>, ctx : &Context) -> W; } trait NestedRule<W : Write> : ChainRule<W> { fn produce(&mut self, c : char, ctx : &Context); fn next(self : Box<Self>) -> AnyChainRule<W>; fn produce_string(&mut self, s : String, ctx : &Context) { s.chars().for_each(|c| self.produce(c, ctx)); } fn start_ignored_comment(&mut self, c : char); } impl<W : Write + 'static> ChainRule<W> for Out<W> { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule<W> { basic_consume(self, c, ctx, true) } fn flush(mut self : Box<Self>, _ctx : &Context) -> W { self.output.flush().unwrap(); self.output } } impl<W : Write + 'static> NestedRule<W> for Out<W> { fn produce(&mut self, c : char, ctx : &Context) { if c == '\n' { self.line_end(ctx.cli.strip_whitespace, ctx.input_only_ws) } else if c.is_whitespace() { self.stored_whitespace.push(c); } else { write!(self.output, "{}{}", self.stored_whitespace, c).unwrap(); self.stored_whitespace.clear(); self.only_whitespace = false; self.whitespace_satisfied = false; self.par_satisfied = false; } } fn next(self : Box<Self>) -> AnyChainRule<W> { self } fn start_ignored_comment(&mut self, c : char) { if self.stored_whitespace.is_empty() && !self.only_whitespace { // The marker needs to be inserted if there is to be no whitespace inserted write!(self.output, "{c}").unwrap(); self.whitespace_satisfied = false; self.par_satisfied = false; self.only_whitespace = false; } else if self.only_whitespace { self.ignored_comment_only_line = true } } } fn basic_consume<W : Write + 'static>( mut s : AnyNestedRule<W>, c : char, ctx : &Context, print_end : bool ) -> AnyChainRule<W> { match c { '{' => { s.produce(c, ctx); Box::new(Group(s)) }, '}' => { if print_end { s.produce(c, ctx); } s.next() }, '\\' => { Box::new(CommandName{parent : s, command : "\\".to_string()}) }, '%' => { if !ctx.cli.strip_comments { s.produce(c, ctx); Box::new(Comment(s)) } else { s.start_ignored_comment(c); Box::new(IgnoreComment(s)) } }, _ => { s.produce(c, ctx); s } } } struct CommandName<W : Write> { parent : AnyNestedRule<W>, command : String } impl<W : Write + 'static> ChainRule<W> for CommandName<W> { fn consume(mut self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule<W> { match c { '}' | '{' | '\\' | '%' if self.command.len() <= 1 => { self.command.push(c); self.handle(ctx) }, c if c.is_alphanumeric() => { self.command.push(c); self }, c => { self.handle(ctx) .consume(c, ctx) } } } fn flush(self : Box<Self>, ctx : &Context) -> W { self.handle(ctx) .flush(ctx) } } impl<W : Write + 'static> CommandName<W> { fn handle(mut self, ctx : &Context) -> AnyChainRule<W> { match self.command.as_str() { "\\added" => { Scan::new(Added(self.parent)) }, "\\replaced" => { Scan::new(Replaced(self.parent)) }, "\\deleted" => { Scan::new(Deleted(self.parent)) }, _ => { self.parent.produce_string(self.command, ctx); self.parent } } } } struct Comment<W : Write>(AnyNestedRule<W>); impl<W : Write + 'static> ChainRule<W> for Comment<W> { fn consume(mut self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule<W> { if c == '\n' { self.0.consume(c, ctx) } else { self.0.produce(c, ctx); self } } fn flush(self : Box<Self>, ctx : &Context) -> W { self.0.flush(ctx) } } struct IgnoreComment<W : Write>(AnyChainRule<W>); impl<W : Write +'static> ChainRule<W> for IgnoreComment<W> { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule<W> { if c == '\n' { self.0.consume(c, ctx) } else { self } } fn flush(self : Box<Self>, ctx : &Context) -> W { self.0.flush(ctx) } } struct Group<W : Write>(AnyNestedRule<W>); impl<W : Write + 'static> ChainRule<W> for Group<W> { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule<W> { basic_consume(self, c, ctx, true) } fn flush(self : Box<Self>, ctx : &Context) -> W { self.0.flush(ctx) } } impl<W : Write + 'static> NestedRule<W> for Group<W> { fn produce(&mut self, c : char, ctx : &Context) { self.0.produce(c, ctx) } fn next(self : Box<Self>) -> AnyChainRule<W> { self.0 } fn start_ignored_comment(&mut self, c : char) { self.0.start_ignored_comment(c) } } struct Added<W : Write>(AnyNestedRule<W>); impl<W : Write + 'static> ChainRule<W> for Added<W> { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule<W> { basic_consume(self, c, ctx, false) } fn flush(self : Box<Self>, ctx : &Context) -> W { self.0.flush(ctx) } } impl<W : Write + 'static> NestedRule<W> for Added<W> { fn produce(&mut self, c : char, ctx : &Context) { self.0.produce(c, ctx) } fn next(self : Box<Self>) -> AnyChainRule<W> { self.0 } fn start_ignored_comment(&mut self, c : char) { self.0.start_ignored_comment(c) } } struct Deleted<W : Write>(AnyNestedRule<W>); impl<W : Write + 'static> ChainRule<W> for Deleted<W> { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule<W> { basic_consume(self, c, ctx, false) } fn flush(self : Box<Self>, ctx : &Context) -> W { self.0.flush(ctx) } } impl<W : Write + 'static> NestedRule<W> for Deleted<W> { fn produce(&mut self, _c : char, _ctx : &Context) { } fn next(self : Box<Self>) -> AnyChainRule<W> { self.0 } fn start_ignored_comment(&mut self, c : char) { self.0.start_ignored_comment(c) } } struct Replaced<W : Write>(AnyNestedRule<W>); impl<W : Write + 'static> ChainRule<W> for Replaced<W> { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule<W> { basic_consume(self, c, ctx, false) } fn flush(self : Box<Self>, ctx : &Context) -> W { self.0.flush(ctx) } } impl<W : Write + 'static> NestedRule<W> for Replaced<W> { fn produce(&mut self, c : char, ctx : &Context) { self.0.produce(c, ctx) } fn next(self : Box<Self>) -> AnyChainRule<W> { Scan::new(Deleted(self.0)) } fn start_ignored_comment(&mut self, c : char) { self.0.start_ignored_comment(c) } } struct Scan<W : Write>(AnyNestedRule<W>); impl<W : Write + 'static> ChainRule<W> for Scan<W> { fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule<W> { if c.is_whitespace() || c == '\n' { self } else if c == '{' { self.0 } else if c == '%' { Box::new(IgnoreComment(self)) } else { panic!("Non-whitespace character ({c}) separating arguments on \ line {lineno}", lineno = ctx.lineno) } } fn flush(self : Box<Self>, ctx : &Context) -> W { self.0.flush(ctx) } } impl<W : Write + 'static> Scan<W> { fn new<R : NestedRule<W> + 'static>(r : R) -> Box<dyn ChainRule<W>> { Box::new(Scan(Box::new(r))) } } struct Out<W : Write> { only_whitespace : bool, stored_whitespace : String, output : W, whitespace_satisfied : bool, par_satisfied : bool, ignored_comment_only_line : bool } impl<W : Write> Out<W> { pub fn line_end(&mut self, strip_ws : bool, input_only_ws : bool) { let skip_linefeed = if input_only_ws { // Need a paragraph break strip_ws && self.par_satisfied } else if strip_ws { self.only_whitespace && self.whitespace_satisfied } else { // Skip comment-only lines if the comment is ignored self.ignored_comment_only_line }; if !skip_linefeed { if !strip_ws { write!(self.output, "{}", self.stored_whitespace).unwrap(); } write!(self.output, "\n").unwrap(); self.whitespace_satisfied = true; self.par_satisfied = self.only_whitespace; } self.stored_whitespace.clear(); self.only_whitespace = true; self.ignored_comment_only_line = false; } } fn main() { let cli = CommandLineArgs::parse(); match (cli.input, cli.output) { (None, None) => { process_buffered(cli.config, io::stdin(), io::stdout()); }, (None, Some(o)) => { process_buffered(cli.config, io::stdin(), File::create(o).unwrap()); } (Some(i), None) => { process_buffered(cli.config, File::open(i).unwrap(), io::stdout()); } (Some(i), Some(o)) => { process_buffered(cli.config, File::open(i).unwrap(), File::create(o).unwrap()); } } } fn process_buffered<I : Read, O : Debug + Write + 'static>( config : Config, input : I, output : O ) -> O { process(config, BufReader::new(input), BufWriter::new(output)).into_inner().unwrap() } fn process<I : BufRead, O : Write + 'static>(config : Config, input : I, output : O) -> O { let mut rule : Box<dyn ChainRule<O>> = Box::new(Out { only_whitespace : true, stored_whitespace : String::new(), output, whitespace_satisfied : true, par_satisfied : true, ignored_comment_only_line : false }); let mut ctx = Context{ lineno : 0, cli : config, input_only_ws : true}; for l in input.lines().map(|l| l.unwrap()) { ctx.lineno += 1; ctx.input_only_ws = true; for c in l.chars() { ctx.input_only_ws = ctx.input_only_ws && c.is_whitespace(); rule = rule.consume(c, &ctx); } rule = rule.consume('\n', &ctx); } rule.flush(&ctx) } #[cfg(test)] mod tests { use super::*; use std::io::Cursor; fn process_str(config : Config, input : &str) -> String { let b = process(config, Cursor::new(input.as_bytes()), Vec::new()); String::from_utf8(b).unwrap() } #[test] fn test1_changes() { let p = |s| process_str(Config { strip_comments : false, strip_whitespace : false }, s); assert_eq!(p("x\\added{y}z\n"), "xyz\n"); assert_eq!(p("x\\deleted{y}z\n"), "xz\n"); assert_eq!(p("x\\replaced{y}{q}z\n"), "xyz\n"); // TODO: working as intended, but not how would be good for LaTeX consistency assert_eq!(p("x\\added{y{q}\\bar}z\n"), "xy{q}\\barz\n"); assert_eq!(p("x\\deleted{y{q}\\bar}z\n"), "xz\n"); assert_eq!(p("x\\added{y\na}z\n"), "xy\naz\n"); assert_eq!(p("x\\deleted{y\na}z\n"), "xz\n"); assert_eq!(p("x\\replaced{y\na}{q}z\n"), "xy\naz\n"); assert_eq!(p("x\\added{y\\test{\na}}z\n"), "xy\\test{\na}z\n"); assert_eq!(p("x\\deleted{y\\test{\na}}z\n"), "xz\n"); assert_eq!(p("x\\deleted{y\\added{a}}z\n"), "xz\n"); assert_eq!(p("\\added{%\n\\begin{a}\n x\n y\n z\\end{a}}\n"), "%\n\\begin{a}\n x\n y\n z\\end{a}\n"); assert_eq!(p("\\added\n\n {q}\n"), "q\n"); assert_eq!(p("\\replaced{\\{q}\n \n{z}\n"), "\\{q\n"); assert_eq!(p("\\replaced{q\\}}\n \n{z}\n"), "q\\}\n"); } #[test] fn test2_comments() { let p = |s| process_str(Config { strip_comments : true, strip_whitespace : false }, s); assert_eq!(p("\\added{%\n\\begin{a}\n x\n y\n z\\end{a}}\n"), "\\begin{a}\n x\n y\n z\\end{a}\n"); assert_eq!(p(" test % comments"), " test \n"); assert_eq!(p(" test% comments"), " test%\n"); assert_eq!(p(" % comments\nline"), "line\n"); } #[test] fn test3_whitespace() { let p = |s| process_str(Config { strip_comments : false, strip_whitespace : true }, s); assert_eq!(p("a\n\n\n\nb\n"), "a\n\nb\n"); assert_eq!(p("a\n\n \n\nb\n"), "a\n\nb\n"); assert_eq!(p("a \n\n\n\nb\n"), "a\n\nb\n"); assert_eq!(p("a\n\n% comment\n\nb\n"), "a\n\n% comment\n\nb\n"); assert_eq!(p("a\n\n % comment \n\nb\n"), "a\n\n % comment\n\nb\n"); assert_eq!(p("a % comment \n\n\n\nb\n"), "a % comment\n\nb\n"); } #[test] fn test4_comments_whitespace() { let p = |s| process_str(Config { strip_comments : true, strip_whitespace : true }, s); assert_eq!(p("\\added{%\n\\begin{a}\n x\n y\n z\\end{a}}\n"), "\\begin{a}\n x\n y\n z\\end{a}\n"); assert_eq!(p(" test % comments"), " test\n"); assert_eq!(p(" % comments\nline"), "line\n"); assert_eq!(p("a\n\n% comment\n\nb\n"), "a\n\nb\n"); assert_eq!(p("a\n\n % comment \n\nb\n"), "a\n\nb\n"); assert_eq!(p("a % comment \n\n\n\nb\n"), "a\n\nb\n"); assert_eq!(p("\\added{a % comment \n\n\n\nb}\n"), "a\n\nb\n"); assert_eq!(p("c\\added{a % comment \n\n\n\nb}\n"), "ca\n\nb\n"); } }