Implement a more expansible parser

Thu, 19 Oct 2023 23:25:34 -0500

author
Tuomo Valkonen <tuomov@iki.fi>
date
Thu, 19 Oct 2023 23:25:34 -0500
changeset 6
de1cf8032322
parent 5
3716d0eaa356
child 7
68538da191c7

Implement a more expansible parser

Cargo.toml file | annotate | diff | comparison | revisions
src/main.rs file | annotate | diff | comparison | revisions
--- a/Cargo.toml	Thu Oct 19 15:40:04 2023 -0500
+++ b/Cargo.toml	Thu Oct 19 23:25:34 2023 -0500
@@ -1,6 +1,6 @@
 [package]
 name = "strip-changes-markup"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 authors = ["Tuomo Valkonen <tuomov@iki.fi>"]
 description = "Removes changes-markup from LaTeX documents"
--- a/src/main.rs	Thu Oct 19 15:40:04 2023 -0500
+++ b/src/main.rs	Thu Oct 19 23:25:34 2023 -0500
@@ -1,6 +1,8 @@
 // The main documentation is in the README.
 #![doc = include_str!("../README.md")]
 
+#![feature(trait_upcasting)]
+
 use std::io;
 use std::fs::File;
 use std::io::{BufWriter, BufRead, BufReader};
@@ -22,6 +24,12 @@
     #[arg(long, short = 'o')]
     output : Option<String>,
 
+    #[clap(flatten)]
+    config : Config
+}
+
+#[derive(Parser, Debug)]
+struct Config {
     #[arg(long, short = 'c')]
     /// Strip comments
     strip_comments : bool,
@@ -31,91 +39,332 @@
     strip_whitespace : bool,
 }
 
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-enum Element {
-    Added,
-    Deleted,
-    Replaced,
-    Other,
-    Comment,
+struct Context {
+    lineno : usize,
+    input_only_ws : bool,
+    cli : Config
+}
+
+type AnyChainRule = Box<dyn ChainRule>;
+type AnyNestedRule = Box<dyn NestedRule>;
+
+trait ChainRule {
+    fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule;
+    fn flush(self : Box<Self>, ctx : &Context);
+}
+
+trait NestedRule : ChainRule {
+    fn produce(&mut self, c : char, ctx : &Context);
+    fn next(self : Box<Self>) -> AnyChainRule;
+    fn produce_string(&mut self, s : String, ctx : &Context) {
+        s.chars().for_each(|c| self.produce(c, ctx));
+    }
+    fn start_ignored_comment(&mut self, c : char);
+}
+
+impl<W : Write + 'static> ChainRule for Out<W> {
+    fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule {
+        basic_consume(self, c, ctx, true)
+    }
+    fn flush(mut self : Box<Self>, _ctx : &Context) {
+        self.output.flush().unwrap();
+    }
+}
+
+impl<W : Write + 'static> NestedRule for Out<W> {
+    fn produce(&mut self, c : char, ctx : &Context) {
+        if c == '\n' {
+            self.line_end(ctx.cli.strip_whitespace, ctx.input_only_ws)
+        } else if c.is_whitespace() {
+            self.stored_whitespace.push(c);
+        } else {
+            write!(self.output, "{}{}", self.stored_whitespace, c).unwrap();
+            self.stored_whitespace.clear();
+            self.only_whitespace = false;
+            self.whitespace_satisfied = false;
+            self.par_satisfied = false;
+        }
+    }
+
+    fn next(self : Box<Self>) -> AnyChainRule {
+        self
+    }
+
+    fn start_ignored_comment(&mut self, c : char) {
+        if self.stored_whitespace.is_empty() && !self.only_whitespace {
+            // The marker must still be output here, since dropping it would introduce whitespace
+            write!(self.output, "{c}").unwrap();
+            self.whitespace_satisfied = false;
+            self.par_satisfied = false;
+            self.only_whitespace = false;
+        } else if self.only_whitespace {
+            self.ignored_comment_only_line = true
+        }
+    }
+}
+
+fn basic_consume(mut s : AnyNestedRule, c : char, ctx : &Context, print_end : bool)
+-> AnyChainRule {
+    match c {
+        '{' => {
+            s.produce(c, ctx);
+            Box::new(Group(s))
+        },
+        '}' => {
+            if print_end {
+                s.produce(c, ctx);
+            }
+            s.next()
+        },
+        '\\' => {
+            Box::new(CommandName{parent : s, command : "\\".to_string()})
+        },
+        '%' => {
+            if !ctx.cli.strip_comments {
+                s.produce(c, ctx);
+                Box::new(Comment(s))
+            } else {
+                s.start_ignored_comment(c);
+                Box::new(IgnoreComment(s))
+            }
+        },
+        _ => {
+            s.produce(c, ctx);
+            s
+        }
+    }
+}
+
+struct CommandName {
+    parent : AnyNestedRule,
+    command : String
+}
+
+impl ChainRule for CommandName {
+    fn consume(mut self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule {
+        match c {
+            '}' | '{' | '\\' if self.command.len() <= 1 => {
+                self.command.push(c);
+                self.handle(ctx)
+            },
+            c if c.is_alphanumeric() => {
+                self.command.push(c);
+                self
+            },
+            c => {
+                self.handle(ctx)
+                    .consume(c, ctx)
+            }
+        }
+    }
+
+    fn flush(self : Box<Self>, ctx : &Context) {
+        self.handle(ctx)
+            .flush(ctx)
+    }
+}
+
+impl CommandName {
+    fn handle(mut self, ctx : &Context) -> AnyChainRule {
+        match self.command.as_str() {
+            "\\added" => {
+                Scan::new(Added(self.parent))
+            },
+            "\\replaced" => {
+                Scan::new(Replaced(self.parent))
+            },
+            "\\deleted" => {
+                Scan::new(Deleted(self.parent))
+            },
+            _ => {
+                self.parent.produce_string(self.command, ctx);
+                self.parent
+            }
+        }
+    }
 }
 
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-enum Status {
-    Output(Element),
-    Ignore(Element),
-    Scan(Element, bool),
+struct Comment(AnyNestedRule);
+
+impl ChainRule for Comment {
+    fn consume(mut self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule {
+        if c == '\n' {
+            self.0.consume(c, ctx)
+        } else {
+            self.0.produce(c, ctx);
+            self
+        }
+    }
+    fn flush(self : Box<Self>, ctx : &Context) {
+        self.0.flush(ctx)
+    }
+}
+
+struct IgnoreComment(AnyChainRule);
+
+impl ChainRule for IgnoreComment {
+    fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule {
+        if c == '\n' {
+            self.0.consume(c, ctx)
+        } else {
+            self
+        }
+    }
+    fn flush(self : Box<Self>, ctx : &Context) {
+        self.0.flush(ctx)
+    }
+}
+
+struct Group(AnyNestedRule);
+
+impl ChainRule for Group {
+    fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule {
+        basic_consume(self, c, ctx, true)
+    }
+    fn flush(self : Box<Self>, ctx : &Context) {
+        self.0.flush(ctx)
+    }
+}
+
+impl NestedRule for Group {
+    fn produce(&mut self, c : char, ctx : &Context) {
+        self.0.produce(c, ctx)
+    }
+    fn next(self : Box<Self>) -> AnyChainRule {
+        self.0
+    }
+    fn start_ignored_comment(&mut self, c : char) {
+        self.0.start_ignored_comment(c)
+    }
+}
+
+struct Added(AnyNestedRule);
+
+impl ChainRule for Added {
+    fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule {
+        basic_consume(self, c, ctx, false)
+    }
+    fn flush(self : Box<Self>, ctx : &Context) {
+        self.0.flush(ctx)
+    }
 }
 
-use Status::*;
-use Element::*;
+impl NestedRule for Added {
+    fn produce(&mut self, c : char, ctx : &Context) {
+        self.0.produce(c, ctx)
+    }
+    fn next(self : Box<Self>) -> AnyChainRule {
+        self.0
+    }
+    fn start_ignored_comment(&mut self, c : char) {
+        self.0.start_ignored_comment(c)
+    }
+}
+struct Deleted(AnyNestedRule);
+
+impl ChainRule for Deleted {
+    fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule {
+        basic_consume(self, c, ctx, false)
+    }
+    fn flush(self : Box<Self>, ctx : &Context) {
+        self.0.flush(ctx)
+    }
+}
+
+impl NestedRule for Deleted {
+    fn produce(&mut self, _c : char, _ctx : &Context) {
+    }
+    fn next(self : Box<Self>) -> AnyChainRule {
+        self.0
+    }
+    fn start_ignored_comment(&mut self, c : char) {
+        self.0.start_ignored_comment(c)
+    }
+}
+
+struct Replaced(AnyNestedRule);
+
+impl ChainRule for Replaced {
+    fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule {
+        basic_consume(self, c, ctx, false)
+    }
+    fn flush(self : Box<Self>, ctx : &Context) {
+        self.0.flush(ctx)
+    }
+}
+
+impl NestedRule for Replaced {
+    fn produce(&mut self, c : char, ctx : &Context) {
+        self.0.produce(c, ctx)
+    }
+    fn next(self : Box<Self>) -> AnyChainRule {
+        Scan::new(Deleted(self.0))
+    }
+    fn start_ignored_comment(&mut self, c : char) {
+        self.0.start_ignored_comment(c)
+    }
+}
+
+struct Scan(AnyNestedRule);
+
+impl ChainRule for Scan {
+    fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule {
+        if c.is_whitespace() || c == '\n' {
+            self
+        } else if c == '{' {
+            self.0
+        } else if c == '%' {
+            Box::new(IgnoreComment(self))
+        } else {
+            panic!("Non-whitespace character ({c}) separating arguments on \
+                    line {lineno}", lineno = ctx.lineno)
+        }
+    }
+    fn flush(self : Box<Self>, ctx : &Context) {
+        self.0.flush(ctx)
+    }
+}
+
+impl Scan {
+    fn new<R : NestedRule + 'static>(r : R) -> Box<dyn ChainRule> {
+        Box::new(Scan(Box::new(r)))
+    }
+}
+
 
 struct Out<W : Write> {
     only_whitespace : bool,
     stored_whitespace : String,
     output : W,
-    stack : Vec<Status>,
     whitespace_satisfied : bool,
     par_satisfied : bool,
+    ignored_comment_only_line : bool
 }
 
 impl<W : Write> Out<W> {
-    fn current(&self) -> Status {
-        self.stack.last().map_or(Output(Other), |s| *s)
-    }
-
-    fn raw_out(&mut self, c : char) {
-        write!(self.output, "{}", c).unwrap();
-    }
-
-    pub fn out(&mut self, c : char) {
-        self.only_whitespace = false;
-        write!(self.output, "{}{}", self.stored_whitespace, c).unwrap();
-        self.stored_whitespace.clear();
-        self.whitespace_satisfied = false;
-        self.par_satisfied = false;
-    }
-
-    pub fn whitespace(&mut self, c : char) {
-        self.stored_whitespace.push(c);
-    }
-
     pub fn line_end(&mut self, strip_ws : bool, input_only_ws : bool) {
-        let cur = self.current();
         let skip_linefeed = if input_only_ws {
             // Need a paragraph break
             strip_ws && self.par_satisfied
         } else if strip_ws {
             self.only_whitespace && self.whitespace_satisfied
-        } else if let Ignore(Comment) = cur {
+        } else {
             // Skip comment-only lines if the comment is ignored
-            self.only_whitespace
-        } else if let Ignore(_) = cur {
-            // Skip line feeds in ignored bits
-            true
-        } else {
-            false
+            self.ignored_comment_only_line
         };
 
         if !skip_linefeed {
             if !strip_ws {
                 write!(self.output, "{}", self.stored_whitespace).unwrap();
             }
-            self.raw_out('\n');
+            write!(self.output, "\n").unwrap();
             self.whitespace_satisfied = true;
             self.par_satisfied = self.only_whitespace;
         }
 
-        if let Ignore(Comment) | Output(Comment) = cur {
-            self.stack.pop();
-        }
-
         self.stored_whitespace.clear();
         self.only_whitespace = true;
-    }
-
-    pub fn flush(&mut self) {
-        self.output.flush().unwrap();
+        self.ignored_comment_only_line = false;
     }
 }
 
@@ -130,135 +379,26 @@
         |f| Box::new(BufWriter::new(File::create(f).unwrap())) as Box<dyn Write>
     );
     
-    let mut o = Out {
+    let mut rule : Box<dyn ChainRule> = Box::new(Out {
         only_whitespace : true,
         stored_whitespace : String::new(),
         output,
-        stack : Vec::new(),
         whitespace_satisfied : true,
         par_satisfied : true,
-    };
+        ignored_comment_only_line : false
+    });
 
-    let mut lineno = 0;
+    let mut ctx = Context{ lineno : 0, cli : cli.config, input_only_ws : true};
 
     for l in input.lines().map(|l| l.unwrap()) {
-        lineno += 1;
-        let mut chars = l.chars();
-        let mut maybe_next_char = None;
-        let mut input_only_ws = true;
-
-        'process_line: loop {
-            let next_char = match maybe_next_char {
-                None => chars.next(),
-                Some(c) => {
-                    maybe_next_char = None;
-                    Some(c)
-                }
-            };
-            input_only_ws = input_only_ws && next_char.map_or(true, |c| c.is_whitespace());
-            match(o.current(), next_char) {
-                (_, None) => {
-                    break 'process_line;
-                },
-                (st @ (Output(e) | Ignore(e)), Some('\\')) if e != Comment => {
-                    let mut command = String::new();
-                    let mut first = true;
-                    maybe_next_char = 'scan_command: loop {
-                        match chars.next() {
-                            Some(c) if first && (c=='{' || c=='}' || c=='\\') => {
-                                command.push(c);
-                                break 'scan_command None;
-                            },
-                            Some(c) if c.is_alphanumeric() => {
-                                command.push(c);
-                            },
-                            maybe_c => {
-                                break 'scan_command maybe_c;
-                            }
-                        }
-                        first = false;
-                    };
-                    let output_guard = if let Ignore(_) = st { false } else { true };
-                    match command.as_str() {
-                        "added" => {
-                            o.stack.push(Scan(Added, true && output_guard));
-                        },
-                        "replaced" => {
-                            o.stack.push(Scan(Replaced, true && output_guard));
-                        },
-                        "deleted" => {
-                            o.stack.push(Scan(Deleted, false));
-                        },
-                        _ => {
-                            if output_guard {
-                                o.out('\\');
-                                command.chars().for_each(|c| o.out(c.clone()));
-                            }
-                        }
-                    };
-                },
-                (Scan(next, out), Some(c)) => {
-                    match c {
-                        '{' => {
-                            o.stack.pop();
-                            o.stack.push(if out { Output(next) } else { Ignore(next) });
-                        },
-                        ' ' => {
-                        },
-                        _ => panic!("Non-whitespace character ({c}) separating arguments on\
-                                     line {lineno}"),
-                    }
-                },
-                (Output(e), Some('{')) if e != Comment => {
-                    o.out('{');
-                    o.stack.push(Output(Other));
-                },
-                (Ignore(e), Some('{')) if e != Comment => {
-                    o.stack.push(Ignore(Other));
-                },
-                (Output(Added) | Ignore(Added) | Output(Deleted) | Ignore(Deleted), Some('}')) => {
-                    o.stack.pop();
-                },
-                (Output(Replaced) | Ignore(Replaced), Some('}')) => {
-                    o.stack.pop();
-                    o.stack.push(Scan(Deleted, false));
-                },
-                (Output(Other), Some('}')) => {
-                    o.out('}');
-                    o.stack.pop();
-                },
-                (Ignore(e), Some('}')) if e != Comment => {
-                    o.stack.pop();
-                },
-                (Output(e), Some('%')) if e != Comment=> {
-                    if cli.strip_comments {
-                        if o.stored_whitespace.is_empty() && !o.only_whitespace {
-                            // Output comment marker if it is required to maintain
-                            // lack of whitespace.
-                            o.out('%');
-                        }
-                        o.stack.push(Ignore(Comment));
-                    } else {
-                        o.out('%');
-                        o.stack.push(Output(Comment));
-                    }
-                },
-                (Ignore(e), Some('%')) if e != Comment => {
-                    o.stack.push(Ignore(Comment));
-                },
-                (Output(_), Some(c)) if c.is_whitespace() => {
-                    o.whitespace(c);
-                },
-                (Output(_), Some(c)) => {
-                    o.out(c);
-                },
-                (Ignore(_), Some(_)) => {
-                },
-            };
+        ctx.lineno += 1;
+        ctx.input_only_ws = true;
+        for c in l.chars() {
+            ctx.input_only_ws = ctx.input_only_ws && c.is_whitespace();
+            rule = rule.consume(c, &ctx);
         }
-
-        o.line_end(cli.strip_whitespace, input_only_ws);
+        rule = rule.consume('\n', &ctx);
     }
 
-    o.flush();
+    rule.flush(&ctx);
 }

mercurial