src/main.rs

Fri, 03 May 2024 14:53:53 -0500

author
Tuomo Valkonen <tuomov@iki.fi>
date
Fri, 03 May 2024 14:53:53 -0500
changeset 9
87d632a18fc2
parent 8
945a396340d2
permissions
-rw-r--r--

Properly handle \%

// The main documentation is in the README.
#![doc = include_str!("../README.md")]

#![feature(trait_upcasting)]

use std::io;
use std::fs::File;
use std::io::{BufWriter, BufRead, BufReader, Write, Read};
use std::fmt::Debug;
use clap::Parser;

/// Command line parameters
#[derive(Parser, Debug, Clone)]
#[clap(
    about = env!("CARGO_PKG_DESCRIPTION"),
    author = env!("CARGO_PKG_AUTHORS"),
    version = env!("CARGO_PKG_VERSION"),
)]
struct CommandLineArgs {
    // NOTE: clap's derive macro uses the `///` comments on the fields below as
    // the `--help` text, so they are user-facing strings, not just source docs.

    /// Input file (default is stdin)
    input : Option<String>,

    /// Output file (default is stdout)
    #[arg(long, short = 'o')]
    output : Option<String>,

    // The processing switches live in `Config` and are flattened into this
    // parser so that they can be handed on to the rules separately from the
    // input/output selection.
    #[clap(flatten)]
    config : Config
}

// Processing switches. Flattened into `CommandLineArgs` and stored in
// `Context::cli`, where the rules read them while consuming input.
// (Plain `//` is used here on purpose: `///` comments are consumed by clap.)
#[derive(Parser, Debug, Clone)]
struct Config {
    /// Strip comments
    #[arg(long, short = 'c')]
    strip_comments: bool,

    /// Strip unnecessary whitespace
    #[arg(long, short = 'w')]
    strip_whitespace: bool,
}

/// Read-only state handed to every rule together with each input character.
struct Context {
    /// Number of the input line currently being consumed; `process` increments
    /// it before feeding the characters of a line, so the first line is 1.
    lineno : usize,
    /// `true` as long as every character seen so far on the current input line
    /// (including the one being consumed) is whitespace.
    input_only_ws : bool,
    /// The processing switches given on the command line.
    cli : Config
}

/// Boxed rule that can consume further input characters.
type AnyChainRule<W> = Box<dyn ChainRule<W>>;
/// Boxed rule that can additionally receive output from a rule nested in it.
type AnyNestedRule<W> = Box<dyn NestedRule<W>>;

/// One state of the character-driven processing chain.
///
/// A rule is consumed by every call: `consume` returns the rule that handles
/// the next character (possibly the same rule again), and `flush` ends the
/// processing and gives back the output sink `W`.
trait ChainRule<W> {
    /// Handles the input character `c` and returns the rule for the next one.
    fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule<W>;
    /// Finishes processing at the end of the input and returns the writer.
    fn flush(self : Box<Self>, ctx : &Context) -> W;
}

/// A rule that other rules can be stacked on top of.
///
/// Besides consuming input, such a rule accepts the output produced by the
/// rule nested inside it and decides what happens once that rule is done.
trait NestedRule<W: Write>: ChainRule<W> {
    /// Passes the output character `c` towards the writer.
    fn produce(&mut self, c: char, ctx: &Context);

    /// Returns the rule that takes over after this rule's closing `}`
    /// (see `basic_consume`).
    fn next(self: Box<Self>) -> AnyChainRule<W>;

    /// Produces every character of `s`, in order, through `produce`.
    fn produce_string(&mut self, s: String, ctx: &Context) {
        for ch in s.chars() {
            self.produce(ch, ctx);
        }
    }

    /// Called by `basic_consume` with the comment character when a comment
    /// that is going to be stripped starts.
    fn start_ignored_comment(&mut self, c: char);
}

impl<W : Write + 'static> ChainRule<W> for Out<W> {
    fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule<W> {
        basic_consume(self, c, ctx, true)
    }
    fn flush(mut self : Box<Self>, _ctx : &Context) -> W {
        self.output.flush().unwrap();
        self.output
    }
}

impl<W: Write + 'static> NestedRule<W> for Out<W> {
    /// Writes `c` to the output, with whitespace and newlines held back:
    /// a newline is delegated to `line_end`, other whitespace is buffered in
    /// `stored_whitespace`, and any other character is written together with
    /// the whitespace buffered before it.
    ///
    /// Panics if writing to the output fails.
    fn produce(&mut self, c: char, ctx: &Context) {
        match c {
            '\n' => self.line_end(ctx.cli.strip_whitespace, ctx.input_only_ws),
            blank if blank.is_whitespace() => self.stored_whitespace.push(blank),
            visible => {
                write!(self.output, "{}{}", self.stored_whitespace, visible).unwrap();
                self.stored_whitespace.clear();
                // The line now has visible content.
                self.only_whitespace = false;
                self.whitespace_satisfied = false;
                self.par_satisfied = false;
            }
        }
    }

    /// `Out` is the bottom of the rule stack, so it simply continues.
    fn next(self: Box<Self>) -> AnyChainRule<W> {
        self
    }

    /// Reacts to the start of a comment that will not be copied to the output.
    fn start_ignored_comment(&mut self, c: char) {
        if self.only_whitespace {
            // Nothing visible on this line yet: remember that the line
            // consisted of an ignored comment only.
            self.ignored_comment_only_line = true;
            return;
        }
        if self.stored_whitespace.is_empty() {
            // The marker needs to be inserted if there is to be no whitespace inserted
            write!(self.output, "{c}").unwrap();
            self.whitespace_satisfied = false;
            self.par_satisfied = false;
            // `only_whitespace` is already false on this path.
        }
    }
}

/// Shared input handling for all rules that process ordinary text.
///
/// * `\` starts collecting a command name (`CommandName`).
/// * `%` starts a comment that is either copied (`Comment`) or dropped
///   (`IgnoreComment`), depending on `ctx.cli.strip_comments`.
/// * `{` is produced and opens a nested `Group`.
/// * `}` ends the rule `s`; it is produced only if `print_end` is set, and
///   `s.next()` decides which rule continues.
/// * Every other character is produced and `s` stays active.
fn basic_consume<W: Write + 'static>(
    mut s: AnyNestedRule<W>,
    c: char,
    ctx: &Context,
    print_end: bool,
) -> AnyChainRule<W> {
    if c == '\\' {
        return Box::new(CommandName { parent: s, command: String::from("\\") });
    }

    if c == '%' {
        return if ctx.cli.strip_comments {
            s.start_ignored_comment(c);
            Box::new(IgnoreComment(s))
        } else {
            s.produce(c, ctx);
            Box::new(Comment(s))
        };
    }

    // Everything else is echoed, except a closing brace that must stay hidden.
    if c != '}' || print_end {
        s.produce(c, ctx);
    }

    match c {
        '{' => Box::new(Group(s)),
        '}' => s.next(),
        _ => s,
    }
}

/// Rule that collects the name of a command after a backslash.
struct CommandName<W: Write> {
    /// Rule that was active when the backslash was seen.
    parent: AnyNestedRule<W>,
    /// Command collected so far, including the leading backslash.
    command: String,
}

impl<W: Write + 'static> ChainRule<W> for CommandName<W> {
    fn consume(mut self: Box<Self>, c: char, ctx: &Context) -> AnyChainRule<W> {
        let escapable = matches!(c, '}' | '{' | '\\' | '%');

        if escapable && self.command.len() <= 1 {
            // Directly after the backslash these characters form a complete
            // two-character command such as `\{` or `\%`.
            self.command.push(c);
            self.handle(ctx)
        } else if c.is_alphanumeric() {
            // Still inside the command name.
            self.command.push(c);
            self
        } else {
            // The name has ended; `c` belongs to whatever rule comes next.
            self.handle(ctx).consume(c, ctx)
        }
    }

    /// Input ended while a command name was being read: act on what has been
    /// collected and flush the resulting rule.
    fn flush(self: Box<Self>, ctx: &Context) -> W {
        let follow_up = self.handle(ctx);
        follow_up.flush(ctx)
    }
}

impl<W : Write + 'static> CommandName<W> {
    fn handle(mut self, ctx : &Context) -> AnyChainRule<W> {
        match self.command.as_str() {
            "\\added" => {
                Scan::new(Added(self.parent))
            },
            "\\replaced" => {
                Scan::new(Replaced(self.parent))
            },
            "\\deleted" => {
                Scan::new(Deleted(self.parent))
            },
            _ => {
                self.parent.produce_string(self.command, ctx);
                self.parent
            }
        }
    }
}

/// Rule active inside a comment that is copied to the output.
struct Comment<W: Write>(AnyNestedRule<W>);

impl<W: Write + 'static> ChainRule<W> for Comment<W> {
    fn consume(mut self: Box<Self>, c: char, ctx: &Context) -> AnyChainRule<W> {
        match c {
            // The newline ends the comment; the wrapped rule handles it and
            // takes over again.
            '\n' => self.0.consume(c, ctx),
            // Comment text is passed through without being interpreted.
            _ => {
                self.0.produce(c, ctx);
                self
            }
        }
    }

    fn flush(self: Box<Self>, ctx: &Context) -> W {
        let Comment(parent) = *self;
        parent.flush(ctx)
    }
}

/// Rule active inside a comment that is dropped from the output.
struct IgnoreComment<W: Write>(AnyChainRule<W>);

impl<W: Write + 'static> ChainRule<W> for IgnoreComment<W> {
    fn consume(self: Box<Self>, c: char, ctx: &Context) -> AnyChainRule<W> {
        match c {
            // The newline ends the comment and is handled by the wrapped rule.
            '\n' => self.0.consume(c, ctx),
            // Everything else on the line is discarded.
            _ => self,
        }
    }

    fn flush(self: Box<Self>, ctx: &Context) -> W {
        let IgnoreComment(parent) = *self;
        parent.flush(ctx)
    }
}

/// Rule for an ordinary `{`…`}` group; it wraps the rule that was active when
/// the group was opened and forwards all output to it.
struct Group<W: Write>(AnyNestedRule<W>);

impl<W: Write + 'static> ChainRule<W> for Group<W> {
    fn consume(self: Box<Self>, c: char, ctx: &Context) -> AnyChainRule<W> {
        // The closing brace of an ordinary group is part of the output.
        let print_end = true;
        basic_consume(self, c, ctx, print_end)
    }

    fn flush(self: Box<Self>, ctx: &Context) -> W {
        let Group(outer) = *self;
        outer.flush(ctx)
    }
}

impl<W: Write + 'static> NestedRule<W> for Group<W> {
    fn produce(&mut self, c: char, ctx: &Context) {
        let Group(outer) = self;
        outer.produce(c, ctx)
    }

    /// After the closing brace the wrapped rule continues.
    fn next(self: Box<Self>) -> AnyChainRule<W> {
        let Group(outer) = *self;
        outer
    }

    fn start_ignored_comment(&mut self, c: char) {
        let Group(outer) = self;
        outer.start_ignored_comment(c)
    }
}

/// Rule for the argument of `\added`: the content is passed on to the wrapped
/// rule, only the closing brace of the argument is dropped.
struct Added<W : Write>(AnyNestedRule<W>);

impl<W : Write + 'static> ChainRule<W> for Added<W> {
    fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule<W> {
        // `false`: do not print the `}` that ends the argument.
        basic_consume(self, c, ctx, false)
    }
    fn flush(self : Box<Self>, ctx : &Context) -> W {
        self.0.flush(ctx)
    }
}

impl<W : Write + 'static> NestedRule<W> for Added<W> {
    // Added text is kept, so output is forwarded unchanged.
    fn produce(&mut self, c : char, ctx : &Context) {
        self.0.produce(c, ctx)
    }
    // After the argument the wrapped rule continues.
    fn next(self : Box<Self>) -> AnyChainRule<W> {
        self.0
    }
    fn start_ignored_comment(&mut self, c : char) {
        self.0.start_ignored_comment(c)
    }
}
/// Rule for text that is to be removed: the argument of `\deleted` and the
/// second argument of `\replaced`. Wraps the rule that continues afterwards.
struct Deleted<W : Write>(AnyNestedRule<W>);

impl<W : Write + 'static> ChainRule<W> for Deleted<W> {
    fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule<W> {
        // `false`: do not print the `}` that ends the argument.
        basic_consume(self, c, ctx, false)
    }
    fn flush(self : Box<Self>, ctx : &Context) -> W {
        self.0.flush(ctx)
    }
}

impl<W : Write + 'static> NestedRule<W> for Deleted<W> {
    /// Discards `c`: nothing inside deleted text reaches the output.
    fn produce(&mut self, _c : char, _ctx : &Context) {
    }
    /// After the closing brace of the deleted argument the wrapped rule
    /// continues.
    fn next(self : Box<Self>) -> AnyChainRule<W> {
        self.0
    }
    /// Does nothing: a stripped comment inside deleted text must not leave any
    /// trace in the output.
    fn start_ignored_comment(&mut self, _c : char) {
        // Deliberately NOT forwarded to the wrapped rule. Forwarding lets the
        // output rule either write a bare `%` marker or flag the current line
        // as an ignored-comment-only line. Both assume that the newline ending
        // the comment reaches the output rule too, but `produce` above
        // discards it. The marker would then comment out whatever follows the
        // deleted text on the same output line (`x\deleted{y% c⏎z}w` became
        // `x%w`), and the flag could suppress an unrelated later line break.
    }
}

/// Rule for the first argument of `\replaced` (the new text): the content is
/// passed on to the wrapped rule, and afterwards the second argument is
/// scanned for and discarded.
struct Replaced<W : Write>(AnyNestedRule<W>);

impl<W : Write + 'static> ChainRule<W> for Replaced<W> {
    fn consume(self : Box<Self>, c : char, ctx : &Context) -> AnyChainRule<W> {
        // `false`: do not print the `}` that ends the argument.
        basic_consume(self, c, ctx, false)
    }
    fn flush(self : Box<Self>, ctx : &Context) -> W {
        self.0.flush(ctx)
    }
}

impl<W : Write + 'static> NestedRule<W> for Replaced<W> {
    // The replacement text is kept, so output is forwarded unchanged.
    fn produce(&mut self, c : char, ctx : &Context) {
        self.0.produce(c, ctx)
    }
    // Once the first argument is closed, look for the opening brace of the
    // second argument and treat its content as deleted.
    fn next(self : Box<Self>) -> AnyChainRule<W> {
        Scan::new(Deleted(self.0))
    }
    fn start_ignored_comment(&mut self, c : char) {
        self.0.start_ignored_comment(c)
    }
}

/// Rule that looks for the `{` opening a command argument and then activates
/// the wrapped rule for the argument's content.
struct Scan<W: Write>(AnyNestedRule<W>);

impl<W: Write + 'static> ChainRule<W> for Scan<W> {
    /// Skips whitespace (including newlines) and comments in front of the
    /// argument.
    ///
    /// Panics, reporting the current line number, on any other character that
    /// is not the opening brace.
    fn consume(self: Box<Self>, c: char, ctx: &Context) -> AnyChainRule<W> {
        match c {
            // Argument found: its content is handled by the wrapped rule.
            '{' => self.0,
            // A comment between command and argument is always dropped;
            // scanning resumes on the next line.
            '%' => Box::new(IgnoreComment(self)),
            _ if c.is_whitespace() => self,
            _ => panic!(
                "Non-whitespace character ({c}) separating arguments on line {lineno}",
                lineno = ctx.lineno
            ),
        }
    }

    fn flush(self: Box<Self>, ctx: &Context) -> W {
        let Scan(pending) = *self;
        pending.flush(ctx)
    }
}

impl<W: Write + 'static> Scan<W> {
    /// Boxes `r` and wraps it in a `Scan` that waits for the argument of `r`.
    fn new<R: NestedRule<W> + 'static>(r: R) -> Box<dyn ChainRule<W>> {
        let pending: AnyNestedRule<W> = Box::new(r);
        Box::new(Scan(pending))
    }
}


/// Bottom of the rule stack: owns the writer and decides which whitespace and
/// line breaks actually reach it.
struct Out<W: Write> {
    /// No non-whitespace character has been written on the current output line.
    only_whitespace: bool,
    /// Whitespace seen since the last written character, not yet written.
    stored_whitespace: String,
    /// Destination of the processed text.
    output: W,
    /// A line break has been written and nothing visible has followed it yet.
    whitespace_satisfied: bool,
    /// The last written line break ended a line without visible content, and
    /// nothing visible has followed it yet.
    par_satisfied: bool,
    /// The current line so far consists only of a comment that is being dropped.
    ignored_comment_only_line: bool,
}

impl<W: Write> Out<W> {
    /// Handles the end of a line.
    ///
    /// `strip_ws` tells whether unnecessary whitespace is to be removed, and
    /// `input_only_ws` whether the input line consisted of whitespace only.
    /// Depending on these and on the recorded state, the line break is either
    /// written (preceded by the pending whitespace unless `strip_ws` is set)
    /// or dropped. The per-line state is reset in both cases.
    ///
    /// Panics if writing to the output fails.
    pub fn line_end(&mut self, strip_ws: bool, input_only_ws: bool) {
        let emit_linefeed = match (input_only_ws, strip_ws) {
            // Blank input line: it is a paragraph break, needed only once.
            (true, true) => !self.par_satisfied,
            (true, false) => true,
            // Output line without content, directly after a line break.
            (false, true) => !(self.only_whitespace && self.whitespace_satisfied),
            // Skip comment-only lines if the comment is ignored
            (false, false) => !self.ignored_comment_only_line,
        };

        if emit_linefeed {
            if strip_ws {
                writeln!(self.output).unwrap();
            } else {
                writeln!(self.output, "{}", self.stored_whitespace).unwrap();
            }
            self.whitespace_satisfied = true;
            self.par_satisfied = self.only_whitespace;
        }

        // Start the next line with a clean slate.
        self.ignored_comment_only_line = false;
        self.only_whitespace = true;
        self.stored_whitespace.clear();
    }
}

/// Program entry point: parses the command line, selects the input and the
/// output (standard streams unless file names were given) and runs the filter.
///
/// If a given file cannot be opened or created, a message naming the file is
/// printed to standard error and the process exits with status 1, instead of
/// panicking on the I/O error.
fn main() {
    let cli = CommandLineArgs::parse();

    // Opens the input file, or reports the failure and exits.
    let open_input = |path : &str| File::open(path).unwrap_or_else(|err| {
        eprintln!("Cannot open input file '{path}': {err}");
        std::process::exit(1)
    });
    // Creates (or truncates) the output file, or reports the failure and exits.
    let create_output = |path : &str| File::create(path).unwrap_or_else(|err| {
        eprintln!("Cannot create output file '{path}': {err}");
        std::process::exit(1)
    });

    // The four combinations are spelled out because `process_buffered` is
    // generic over the concrete reader and writer types. Arguments are
    // evaluated left to right, so the output file is only created after the
    // input has been opened successfully.
    match (cli.input.as_deref(), cli.output.as_deref()) {
        (None, None) => {
            process_buffered(cli.config, io::stdin(), io::stdout());
        },
        (None, Some(o)) => {
            process_buffered(cli.config, io::stdin(), create_output(o));
        }
        (Some(i), None) => {
            process_buffered(cli.config, open_input(i), io::stdout());
        }
        (Some(i), Some(o)) => {
            process_buffered(cli.config, open_input(i), create_output(o));
        }
    }
}

/// Runs `process` with buffering added around `input` and `output`.
///
/// Returns `output` once all buffered data has been written to it.
/// Panics if that final write fails.
fn process_buffered<I: Read, O: Debug + Write + 'static>(
    config: Config,
    input: I,
    output: O,
) -> O {
    let reader = BufReader::new(input);
    let writer = BufWriter::new(output);
    let finished = process(config, reader, writer);
    finished.into_inner().unwrap()
}

fn process<I : BufRead, O : Write + 'static>(config : Config, input : I, output : O) -> O {
    
    let mut rule : Box<dyn ChainRule<O>> = Box::new(Out {
        only_whitespace : true,
        stored_whitespace : String::new(),
        output,
        whitespace_satisfied : true,
        par_satisfied : true,
        ignored_comment_only_line : false
    });

    let mut ctx = Context{ lineno : 0, cli : config, input_only_ws : true};

    for l in input.lines().map(|l| l.unwrap()) {
        ctx.lineno += 1;
        ctx.input_only_ws = true;
        for c in l.chars() {
            ctx.input_only_ws = ctx.input_only_ws && c.is_whitespace();
            rule = rule.consume(c, &ctx);
        }
        rule = rule.consume('\n', &ctx);
    }

    rule.flush(&ctx)
}


// End-to-end tests: each case runs `process` on an in-memory input and
// compares the complete output, once for every combination of the two
// `Config` switches.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    // Runs `process` on `input` with an in-memory writer and returns the
    // output as a string.
    fn process_str(config : Config, input : &str) -> String {
        let b = process(config, Cursor::new(input.as_bytes()), Vec::new());
        String::from_utf8(b).unwrap()
    }

    // Handling of `\added`, `\deleted` and `\replaced` with nothing stripped.
    #[test]
    fn test1_changes() {
        let p = |s| process_str(Config { strip_comments : false, strip_whitespace : false }, s);

        // Basic one-line cases.
        assert_eq!(p("x\\added{y}z\n"), "xyz\n");
        assert_eq!(p("x\\deleted{y}z\n"), "xz\n");
        assert_eq!(p("x\\replaced{y}{q}z\n"), "xyz\n");

        // Nested groups and commands inside an argument.
        // TODO: works as intended, but the result is not ideal for LaTeX consistency
        assert_eq!(p("x\\added{y{q}\\bar}z\n"), "xy{q}\\barz\n");
        assert_eq!(p("x\\deleted{y{q}\\bar}z\n"), "xz\n");

        // Arguments spanning several lines.
        assert_eq!(p("x\\added{y\na}z\n"), "xy\naz\n");
        assert_eq!(p("x\\deleted{y\na}z\n"), "xz\n");
        assert_eq!(p("x\\replaced{y\na}{q}z\n"), "xy\naz\n");

        assert_eq!(p("x\\added{y\\test{\na}}z\n"), "xy\\test{\na}z\n");
        assert_eq!(p("x\\deleted{y\\test{\na}}z\n"), "xz\n");

        // A change command nested in deleted text is deleted as well.
        assert_eq!(p("x\\deleted{y\\added{a}}z\n"), "xz\n");

        // A comment inside an argument is kept when comments are not stripped.
        assert_eq!(p("\\added{%\n\\begin{a}\n  x\n  y\n  z\\end{a}}\n"),
                     "%\n\\begin{a}\n  x\n  y\n  z\\end{a}\n");

        // Whitespace and newlines between a command and its arguments are
        // skipped; escaped braces do not open or close an argument.
        assert_eq!(p("\\added\n\n  {q}\n"), "q\n");
        assert_eq!(p("\\replaced{\\{q}\n  \n{z}\n"), "\\{q\n");
        assert_eq!(p("\\replaced{q\\}}\n  \n{z}\n"), "q\\}\n");
    }

    // Comment stripping with whitespace left alone.
    #[test]
    fn test2_comments() {
        let p = |s| process_str(Config { strip_comments : true, strip_whitespace : false }, s);
        
        assert_eq!(p("\\added{%\n\\begin{a}\n  x\n  y\n  z\\end{a}}\n"),
                     "\\begin{a}\n  x\n  y\n  z\\end{a}\n");

        // Whitespace in front of the stripped comment is kept.
        assert_eq!(p("  test % comments"), "  test \n");

        // A comment directly after text leaves its `%` marker behind.
        assert_eq!(p("  test% comments"), "  test%\n");

        // A line holding only a stripped comment disappears completely.
        assert_eq!(p("  % comments\nline"), "line\n");
    }

    // Whitespace stripping with comments left alone.
    #[test]
    fn test3_whitespace() {
        let p = |s| process_str(Config { strip_comments : false, strip_whitespace : true }, s);
        
        // Runs of blank lines collapse into one; trailing whitespace is removed.
        assert_eq!(p("a\n\n\n\nb\n"), "a\n\nb\n");
        assert_eq!(p("a\n\n   \n\nb\n"), "a\n\nb\n");
        assert_eq!(p("a   \n\n\n\nb\n"), "a\n\nb\n");

        // Kept comments count as content; only their trailing whitespace goes.
        assert_eq!(p("a\n\n% comment\n\nb\n"), "a\n\n% comment\n\nb\n");
        assert_eq!(p("a\n\n   % comment   \n\nb\n"), "a\n\n   % comment\n\nb\n");
        assert_eq!(p("a   % comment  \n\n\n\nb\n"), "a   % comment\n\nb\n");
    }

    // Both comments and whitespace stripped.
    #[test]
    fn test4_comments_whitespace() {
        let p = |s| process_str(Config { strip_comments : true, strip_whitespace : true }, s);
        
        assert_eq!(p("\\added{%\n\\begin{a}\n  x\n  y\n  z\\end{a}}\n"),
                     "\\begin{a}\n  x\n  y\n  z\\end{a}\n");

        assert_eq!(p("  test % comments"), "  test\n");

        assert_eq!(p("  % comments\nline"), "line\n");

        // Comment-only lines vanish and the blank lines around them collapse.
        assert_eq!(p("a\n\n% comment\n\nb\n"), "a\n\nb\n");
        assert_eq!(p("a\n\n   % comment   \n\nb\n"), "a\n\nb\n");
        assert_eq!(p("a   % comment  \n\n\n\nb\n"), "a\n\nb\n");

        // The same inside an `\added` argument.
        assert_eq!(p("\\added{a   % comment  \n\n\n\nb}\n"), "a\n\nb\n");
        assert_eq!(p("c\\added{a   % comment  \n\n\n\nb}\n"), "ca\n\nb\n");
    }
}

mercurial