Add whitespace and comment stripping

Thu, 19 Oct 2023 15:10:12 -0500

author
Tuomo Valkonen <tuomov@iki.fi>
date
Thu, 19 Oct 2023 15:10:12 -0500
changeset 2
254e1e4bd795
parent 1
a88aed2bdf13
child 3
cec573b16b46

Add whitespace and comment stripping

Cargo.toml file | annotate | diff | comparison | revisions
src/main.rs file | annotate | diff | comparison | revisions
--- a/Cargo.toml	Thu Oct 19 12:08:07 2023 -0500
+++ b/Cargo.toml	Thu Oct 19 15:10:12 2023 -0500
@@ -13,4 +13,5 @@
 ]
 
 [dependencies]
+clap = { version = "~4.0.27", features = ["derive", "unicode", "wrap_help"] }
 
--- a/src/main.rs	Thu Oct 19 12:08:07 2023 -0500
+++ b/src/main.rs	Thu Oct 19 15:10:12 2023 -0500
@@ -1,6 +1,27 @@
+// The main documentation is in the README.
+#![doc = include_str!("../README.md")]
+
 use std::io;
 use std::io::BufWriter;
 use std::io::Write;
+use clap::Parser;
+
+/// Command line parameters
+#[derive(Parser, Debug)]
+#[clap(
+    about = env!("CARGO_PKG_DESCRIPTION"),
+    author = env!("CARGO_PKG_AUTHORS"),
+    version = env!("CARGO_PKG_VERSION"),
+)]
+struct CommandLineArgs {
+    #[arg(long, short = 'c')]
+    /// Strip comments
+    strip_comments : bool,
+
+    #[arg(long, short = 'w')]
+    /// Strip unnecessary whitespace
+    strip_whitespace : bool,
+}
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 enum Element {
@@ -18,23 +39,99 @@
     Scan(Element, bool),
 }
 
-fn main() {
-    let input = io::stdin();
-    let mut output = BufWriter::new(io::stdout());
-    let mut status_stack = Vec::new();
+use Status::*;
+use Element::*;
+
+struct Out<W : Write> {
+    only_whitespace : bool,
+    stored_whitespace : String,
+    output : W,
+    stack : Vec<Status>,
+    whitespace_satisfied : bool,
+    par_satisfied : bool,
+}
+
+impl<W : Write> Out<W> {
+    fn current(&self) -> Status {
+        self.stack.last().map_or(Output(Other), |s| *s)
+    }
+
+    fn raw_out(&mut self, c : char) {
+        write!(self.output, "{}", c).unwrap();
+    }
+
+    pub fn out(&mut self, c : char) {
+        self.only_whitespace = false;
+        write!(self.output, "{}{}", self.stored_whitespace, c).unwrap();
+        self.stored_whitespace.clear();
+        self.whitespace_satisfied = false;
+        self.par_satisfied = false;
+    }
+
+    pub fn whitespace(&mut self, c : char) {
+        self.stored_whitespace.push(c);
+    }
 
-    use Status::*;
-    use Element::*;
+    pub fn line_end(&mut self, strip_ws : bool, input_only_ws : bool) {
+        let cur = self.current();
+        let skip_linefeed = if input_only_ws {
+            // Need a paragraph break
+            strip_ws && self.par_satisfied
+        } else if strip_ws {
+            self.only_whitespace && self.whitespace_satisfied
+        } else if let Ignore(Comment) = cur {
+            // Skip comment-only lines if the comment is ignored
+            self.only_whitespace
+        } else if let Ignore(_) = cur {
+            // Skip line feeds in ignored bits
+            true
+        } else {
+            false
+        };
+
+        if !skip_linefeed {
+            if !strip_ws {
+                write!(self.output, "{}", self.stored_whitespace).unwrap();
+            }
+            self.raw_out('\n');
+            self.whitespace_satisfied = true;
+            self.par_satisfied = self.only_whitespace;
+        }
 
-    let current = |s : &Vec<Status>| s.last().map_or(Output(Other), |s| *s);
-    let mut out = |c : char| { write!(output, "{}", c).unwrap(); };
+        if let Ignore(Comment) | Output(Comment) = cur {
+            self.stack.pop();
+        }
+
+        self.stored_whitespace.clear();
+        self.only_whitespace = true;
+    }
+
+    pub fn flush(&mut self) {
+        self.output.flush().unwrap();
+    }
+}
+
+fn main() {
+    let cli = CommandLineArgs::parse();
+    let input = io::stdin();
+
+    let mut o = Out {
+        only_whitespace : true,
+        stored_whitespace : String::new(),
+        output : BufWriter::new(io::stdout()),
+        stack : Vec::new(),
+        whitespace_satisfied : true,
+        par_satisfied : true,
+    };
+
     let mut lineno = 0;
 
     for l in input.lines().map(|l| l.unwrap()) {
         lineno += 1;
         let mut chars = l.chars();
-        let started_ignore = if let Ignore(_) = current(&status_stack) { true } else { false };
         let mut maybe_next_char = None;
+        let mut input_only_ws = true;
+
         'process_line: loop {
             let next_char = match maybe_next_char {
                 None => chars.next(),
@@ -43,7 +140,8 @@
                     Some(c)
                 }
             };
-            match(current(&status_stack), next_char) {
+            input_only_ws = input_only_ws && next_char.map_or(true, |c| c.is_whitespace());
+            match(o.current(), next_char) {
                 (_, None) => {
                     break 'process_line;
                 },
@@ -68,27 +166,27 @@
                     let output_guard = if let Ignore(_) = st { false } else { true };
                     match command.as_str() {
                         "added" => {
-                            status_stack.push(Scan(Added, true && output_guard));
+                            o.stack.push(Scan(Added, true && output_guard));
                         },
                         "replaced" => {
-                            status_stack.push(Scan(Replaced, true && output_guard));
+                            o.stack.push(Scan(Replaced, true && output_guard));
                         },
                         "deleted" => {
-                            status_stack.push(Scan(Deleted, false));
+                            o.stack.push(Scan(Deleted, false));
                         },
                         _ => {
                             if output_guard {
-                                out('\\');
-                                command.chars().for_each(|c| out(c.clone()));
+                                o.out('\\');
+                                command.chars().for_each(|c| o.out(c.clone()));
                             }
                         }
                     };
                 },
-                (Scan(next, o), Some(c)) => {
+                (Scan(next, out), Some(c)) => {
                     match c {
                         '{' => {
-                            status_stack.pop();
-                            status_stack.push(if o { Output(next) } else { Ignore(next) });
+                            o.stack.pop();
+                            o.stack.push(if out { Output(next) } else { Ignore(next) });
                         },
                         ' ' => {
                         },
@@ -97,59 +195,55 @@
                     }
                 },
                 (Output(e), Some('{')) if e != Comment => {
-                    out('{');
-                    status_stack.push(Output(Other));
+                    o.out('{');
+                    o.stack.push(Output(Other));
                 },
                 (Ignore(e), Some('{')) if e != Comment => {
-                    status_stack.push(Ignore(Other));
+                    o.stack.push(Ignore(Other));
                 },
                 (Output(Added) | Ignore(Added) | Output(Deleted) | Ignore(Deleted), Some('}')) => {
-                    status_stack.pop();
+                    o.stack.pop();
                 },
                 (Output(Replaced) | Ignore(Replaced), Some('}')) => {
-                    status_stack.pop();
-                    status_stack.push(Scan(Deleted, false));
+                    o.stack.pop();
+                    o.stack.push(Scan(Deleted, false));
                 },
                 (Output(Other), Some('}')) => {
-                    out('}');
-                    status_stack.pop();
+                    o.out('}');
+                    o.stack.pop();
                 },
-                (Ignore(_), Some('}')) => {
-                    status_stack.pop();
+                (Ignore(e), Some('}')) if e != Comment => {
+                    o.stack.pop();
                 },
                 (Output(e), Some('%')) if e != Comment=> {
-                    out('%');
-                    status_stack.push(Output(Comment));
+                    if cli.strip_comments {
+                        if o.stored_whitespace.is_empty() && !o.only_whitespace {
+                            // Output comment marker if it is required to maintain
+                            // lack of whitespace.
+                            o.out('%');
+                        }
+                        o.stack.push(Ignore(Comment));
+                    } else {
+                        o.out('%');
+                        o.stack.push(Output(Comment));
+                    }
                 },
                 (Ignore(e), Some('%')) if e != Comment => {
-                    status_stack.push(Ignore(Comment));
+                    o.stack.push(Ignore(Comment));
+                },
+                (Output(_), Some(c)) if c.is_whitespace() => {
+                    o.whitespace(c);
                 },
                 (Output(_), Some(c)) => {
-                    out(c);
+                    o.out(c);
                 },
                 (Ignore(_), Some(_)) => {
                 },
             };
         }
-        match current(&status_stack) {
-            Ignore(e) => {
-                if !started_ignore {
-                    out('\n');
-                }
-                if e == Comment {
-                    status_stack.pop();
-                }
-            },
-            Output(e) => {
-                out('\n');
-                if e == Comment {
-                    status_stack.pop();
-                }
-            },
-            Scan(_, _) => {
-            },
-        }
+
+        o.line_end(cli.strip_whitespace, input_only_ws);
     }
 
-    output.flush().unwrap();
+    o.flush();
 }

mercurial