Thu, 19 Oct 2023 23:25:34 -0500
Implement a more expansible parser
Cargo.toml | file | annotate | diff | comparison | revisions | |
src/main.rs | file | annotate | diff | comparison | revisions |
--- a/Cargo.toml	Thu Oct 19 15:40:04 2023 -0500
+++ b/Cargo.toml	Thu Oct 19 23:25:34 2023 -0500
@@ -1,6 +1,6 @@
 [package]
 name = "strip-changes-markup"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 authors = ["Tuomo Valkonen <tuomov@iki.fi>"]
 description = "Removes changes-markup from LaTeX documents"
--- a/src/main.rs Thu Oct 19 15:40:04 2023 -0500 +++ b/src/main.rs Thu Oct 19 23:25:34 2023 -0500 @@ -1,6 +1,8 @@ // The main documentation is in the README. #![doc = include_str!("../README.md")] +#![feature(trait_upcasting)] + use std::io; use std::fs::File; use std::io::{BufWriter, BufRead, BufReader}; @@ -22,6 +24,12 @@ #[arg(long, short = 'o')] output : Option<String>, + #[clap(flatten)] + config : Config +} + +#[derive(Parser, Debug)] +struct Config { #[arg(long, short = 'c')] /// Strip comments strip_comments : bool, @@ -31,91 +39,332 @@ strip_whitespace : bool, } -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -enum Element { - Added, - Deleted, - Replaced, - Other, - Comment, +struct Context { + lineno : usize, + input_only_ws : bool, + cli : Config +} + +type AnyChainRule = Box<dyn ChainRule>; +type AnyNestedRule = Box<dyn NestedRule>; + +trait ChainRule { + fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule; + fn flush(self : Box<Self>, ctx : &Context); +} + +trait NestedRule : ChainRule { + fn produce(&mut self, c : char, ctx : &Context); + fn next(self : Box<Self>) -> AnyChainRule; + fn produce_string(&mut self, s : String, ctx : &Context) { + s.chars().for_each(|c| self.produce(c, ctx)); + } + fn start_ignored_comment(&mut self, c : char); +} + +impl<W : Write + 'static> ChainRule for Out<W> { + fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { + basic_consume(self, c, ctx, true) + } + fn flush(mut self : Box<Self>, _ctx : &Context) { + self.output.flush().unwrap(); + } +} + +impl<W : Write + 'static> NestedRule for Out<W> { + fn produce(&mut self, c : char, ctx : &Context) { + if c == '\n' { + self.line_end(ctx.cli.strip_whitespace, ctx.input_only_ws) + } else if c.is_whitespace() { + self.stored_whitespace.push(c); + } else { + write!(self.output, "{}{}", self.stored_whitespace, c).unwrap(); + self.stored_whitespace.clear(); + self.only_whitespace = false; + self.whitespace_satisfied = false; + 
self.par_satisfied = false; + } + } + + fn next(self : Box<Self>) -> AnyChainRule { + self + } + + fn start_ignored_comment(&mut self, c : char) { + if self.stored_whitespace.is_empty() && !self.only_whitespace { + // The marker needs to be inserted if there is to be no whitespace inserted + write!(self.output, "{c}").unwrap(); + self.whitespace_satisfied = false; + self.par_satisfied = false; + self.only_whitespace = false; + } else if self.only_whitespace { + self.ignored_comment_only_line = true + } + } +} + +fn basic_consume(mut s : AnyNestedRule, c : char, ctx : &Context, print_end : bool) +-> AnyChainRule { + match c { + '{' => { + s.produce(c, ctx); + Box::new(Group(s)) + }, + '}' => { + if print_end { + s.produce(c, ctx); + } + s.next() + }, + '\\' => { + Box::new(CommandName{parent : s, command : "\\".to_string()}) + }, + '%' => { + if !ctx.cli.strip_comments { + s.produce(c, ctx); + Box::new(Comment(s)) + } else { + s.start_ignored_comment(c); + Box::new(IgnoreComment(s)) + } + }, + _ => { + s.produce(c, ctx); + s + } + } +} + +struct CommandName { + parent : AnyNestedRule, + command : String +} + +impl ChainRule for CommandName { + fn consume(mut self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { + match c { + '}' | '{' | '\\' if self.command.len() <= 1 => { + self.command.push(c); + self.handle(ctx) + }, + c if c.is_alphanumeric() => { + self.command.push(c); + self + }, + c => { + self.handle(ctx) + .consume(c, ctx) + } + } + } + + fn flush(self : Box<Self>, ctx : &Context) { + self.handle(ctx) + .flush(ctx) + } +} + +impl CommandName { + fn handle(mut self, ctx : &Context) -> AnyChainRule { + match self.command.as_str() { + "\\added" => { + Scan::new(Added(self.parent)) + }, + "\\replaced" => { + Scan::new(Replaced(self.parent)) + }, + "\\deleted" => { + Scan::new(Deleted(self.parent)) + }, + _ => { + self.parent.produce_string(self.command, ctx); + self.parent + } + } + } } -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -enum Status { - 
Output(Element), - Ignore(Element), - Scan(Element, bool), +struct Comment(AnyNestedRule); + +impl ChainRule for Comment { + fn consume(mut self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { + if c == '\n' { + self.0.consume(c, ctx) + } else { + self.0.produce(c, ctx); + self + } + } + fn flush(self : Box<Self>, ctx : &Context) { + self.0.flush(ctx) + } +} + +struct IgnoreComment(AnyChainRule); + +impl ChainRule for IgnoreComment { + fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { + if c == '\n' { + self.0.consume(c, ctx) + } else { + self + } + } + fn flush(self : Box<Self>, ctx : &Context) { + self.0.flush(ctx) + } +} + +struct Group(AnyNestedRule); + +impl ChainRule for Group { + fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { + basic_consume(self, c, ctx, true) + } + fn flush(self : Box<Self>, ctx : &Context) { + self.0.flush(ctx) + } +} + +impl NestedRule for Group { + fn produce(&mut self, c : char, ctx : &Context) { + self.0.produce(c, ctx) + } + fn next(self : Box<Self>) -> AnyChainRule { + self.0 + } + fn start_ignored_comment(&mut self, c : char) { + self.0.start_ignored_comment(c) + } +} + +struct Added(AnyNestedRule); + +impl ChainRule for Added { + fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { + basic_consume(self, c, ctx, false) + } + fn flush(self : Box<Self>, ctx : &Context) { + self.0.flush(ctx) + } } -use Status::*; -use Element::*; +impl NestedRule for Added { + fn produce(&mut self, c : char, ctx : &Context) { + self.0.produce(c, ctx) + } + fn next(self : Box<Self>) -> AnyChainRule { + self.0 + } + fn start_ignored_comment(&mut self, c : char) { + self.0.start_ignored_comment(c) + } +} +struct Deleted(AnyNestedRule); + +impl ChainRule for Deleted { + fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { + basic_consume(self, c, ctx, false) + } + fn flush(self : Box<Self>, ctx : &Context) { + self.0.flush(ctx) + } +} + +impl NestedRule for 
Deleted { + fn produce(&mut self, _c : char, _ctx : &Context) { + } + fn next(self : Box<Self>) -> AnyChainRule { + self.0 + } + fn start_ignored_comment(&mut self, c : char) { + self.0.start_ignored_comment(c) + } +} + +struct Replaced(AnyNestedRule); + +impl ChainRule for Replaced { + fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { + basic_consume(self, c, ctx, false) + } + fn flush(self : Box<Self>, ctx : &Context) { + self.0.flush(ctx) + } +} + +impl NestedRule for Replaced { + fn produce(&mut self, c : char, ctx : &Context) { + self.0.produce(c, ctx) + } + fn next(self : Box<Self>) -> AnyChainRule { + Scan::new(Deleted(self.0)) + } + fn start_ignored_comment(&mut self, c : char) { + self.0.start_ignored_comment(c) + } +} + +struct Scan(AnyNestedRule); + +impl ChainRule for Scan { + fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule { + if c.is_whitespace() || c == '\n' { + self + } else if c == '{' { + self.0 + } else if c == '%' { + Box::new(IgnoreComment(self)) + } else { + panic!("Non-whitespace character ({c}) separating arguments on \ + line {lineno}", lineno = ctx.lineno) + } + } + fn flush(self : Box<Self>, ctx : &Context) { + self.0.flush(ctx) + } +} + +impl Scan { + fn new<R : NestedRule + 'static>(r : R) -> Box<dyn ChainRule> { + Box::new(Scan(Box::new(r))) + } +} + struct Out<W : Write> { only_whitespace : bool, stored_whitespace : String, output : W, - stack : Vec<Status>, whitespace_satisfied : bool, par_satisfied : bool, + ignored_comment_only_line : bool } impl<W : Write> Out<W> { - fn current(&self) -> Status { - self.stack.last().map_or(Output(Other), |s| *s) - } - - fn raw_out(&mut self, c : char) { - write!(self.output, "{}", c).unwrap(); - } - - pub fn out(&mut self, c : char) { - self.only_whitespace = false; - write!(self.output, "{}{}", self.stored_whitespace, c).unwrap(); - self.stored_whitespace.clear(); - self.whitespace_satisfied = false; - self.par_satisfied = false; - } - - pub fn 
whitespace(&mut self, c : char) { - self.stored_whitespace.push(c); - } - pub fn line_end(&mut self, strip_ws : bool, input_only_ws : bool) { - let cur = self.current(); let skip_linefeed = if input_only_ws { // Need a paragraph break strip_ws && self.par_satisfied } else if strip_ws { self.only_whitespace && self.whitespace_satisfied - } else if let Ignore(Comment) = cur { + } else { // Skip comment-only lines if the comment is ignored - self.only_whitespace - } else if let Ignore(_) = cur { - // Skip line feeds in ignored bits - true - } else { - false + self.ignored_comment_only_line }; if !skip_linefeed { if !strip_ws { write!(self.output, "{}", self.stored_whitespace).unwrap(); } - self.raw_out('\n'); + write!(self.output, "\n").unwrap(); self.whitespace_satisfied = true; self.par_satisfied = self.only_whitespace; } - if let Ignore(Comment) | Output(Comment) = cur { - self.stack.pop(); - } - self.stored_whitespace.clear(); self.only_whitespace = true; - } - - pub fn flush(&mut self) { - self.output.flush().unwrap(); + self.ignored_comment_only_line = false; } } @@ -130,135 +379,26 @@ |f| Box::new(BufWriter::new(File::create(f).unwrap())) as Box<dyn Write> ); - let mut o = Out { + let mut rule : Box<dyn ChainRule> = Box::new(Out { only_whitespace : true, stored_whitespace : String::new(), output, - stack : Vec::new(), whitespace_satisfied : true, par_satisfied : true, - }; + ignored_comment_only_line : false + }); - let mut lineno = 0; + let mut ctx = Context{ lineno : 0, cli : cli.config, input_only_ws : true}; for l in input.lines().map(|l| l.unwrap()) { - lineno += 1; - let mut chars = l.chars(); - let mut maybe_next_char = None; - let mut input_only_ws = true; - - 'process_line: loop { - let next_char = match maybe_next_char { - None => chars.next(), - Some(c) => { - maybe_next_char = None; - Some(c) - } - }; - input_only_ws = input_only_ws && next_char.map_or(true, |c| c.is_whitespace()); - match(o.current(), next_char) { - (_, None) => { - break 
'process_line; - }, - (st @ (Output(e) | Ignore(e)), Some('\\')) if e != Comment => { - let mut command = String::new(); - let mut first = true; - maybe_next_char = 'scan_command: loop { - match chars.next() { - Some(c) if first && (c=='{' || c=='}' || c=='\\') => { - command.push(c); - break 'scan_command None; - }, - Some(c) if c.is_alphanumeric() => { - command.push(c); - }, - maybe_c => { - break 'scan_command maybe_c; - } - } - first = false; - }; - let output_guard = if let Ignore(_) = st { false } else { true }; - match command.as_str() { - "added" => { - o.stack.push(Scan(Added, true && output_guard)); - }, - "replaced" => { - o.stack.push(Scan(Replaced, true && output_guard)); - }, - "deleted" => { - o.stack.push(Scan(Deleted, false)); - }, - _ => { - if output_guard { - o.out('\\'); - command.chars().for_each(|c| o.out(c.clone())); - } - } - }; - }, - (Scan(next, out), Some(c)) => { - match c { - '{' => { - o.stack.pop(); - o.stack.push(if out { Output(next) } else { Ignore(next) }); - }, - ' ' => { - }, - _ => panic!("Non-whitespace character ({c}) separating arguments on\ - line {lineno}"), - } - }, - (Output(e), Some('{')) if e != Comment => { - o.out('{'); - o.stack.push(Output(Other)); - }, - (Ignore(e), Some('{')) if e != Comment => { - o.stack.push(Ignore(Other)); - }, - (Output(Added) | Ignore(Added) | Output(Deleted) | Ignore(Deleted), Some('}')) => { - o.stack.pop(); - }, - (Output(Replaced) | Ignore(Replaced), Some('}')) => { - o.stack.pop(); - o.stack.push(Scan(Deleted, false)); - }, - (Output(Other), Some('}')) => { - o.out('}'); - o.stack.pop(); - }, - (Ignore(e), Some('}')) if e != Comment => { - o.stack.pop(); - }, - (Output(e), Some('%')) if e != Comment=> { - if cli.strip_comments { - if o.stored_whitespace.is_empty() && !o.only_whitespace { - // Output comment marker if it is required to maintain - // lack of whitespace. 
- o.out('%'); - } - o.stack.push(Ignore(Comment)); - } else { - o.out('%'); - o.stack.push(Output(Comment)); - } - }, - (Ignore(e), Some('%')) if e != Comment => { - o.stack.push(Ignore(Comment)); - }, - (Output(_), Some(c)) if c.is_whitespace() => { - o.whitespace(c); - }, - (Output(_), Some(c)) => { - o.out(c); - }, - (Ignore(_), Some(_)) => { - }, - }; + ctx.lineno += 1; + ctx.input_only_ws = true; + for c in l.chars() { + ctx.input_only_ws = ctx.input_only_ws && c.is_whitespace(); + rule = rule.consume(c, &ctx); } - - o.line_end(cli.strip_whitespace, input_only_ws); + rule = rule.consume('\n', &ctx); } - o.flush(); + rule.flush(&ctx); }