From 4754b420ced2503eb2641d6ddf678736e1aa7369 Mon Sep 17 00:00:00 2001 From: Max Audron Date: Sun, 17 Oct 2021 17:07:09 +0200 Subject: replace sedregex crate This replaces the sedregex crate with our own implementation for multiple reasons: 1. We required to access the parsed regex, this required a patch to the sedregex crate which did not get merged due to an inactive dev, blocking us from publishing on crates.Io 2. We wanted to highlight the changes done in bold 3. We want to add execution of multiple chained sed commands in the future which would require more modification --- src/hooks/sed/parser.rs | 344 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 src/hooks/sed/parser.rs (limited to 'src/hooks/sed/parser.rs') diff --git a/src/hooks/sed/parser.rs b/src/hooks/sed/parser.rs new file mode 100644 index 0000000..eb7ef3e --- /dev/null +++ b/src/hooks/sed/parser.rs @@ -0,0 +1,344 @@ +use std::{borrow::Cow, str::Chars}; + +use bitflags::bitflags; +use regex::Regex; + +use crate::util::formatting::Formatting; + +type Commands = Vec; + +#[derive(Debug, Clone)] +pub struct Command { + left: Regex, + right: String, + flags: Flags, +} + +impl PartialEq for Command { + fn eq(&self, other: &Self) -> bool { + self.left.as_str() == other.left.as_str() + && self.right == other.right + && self.flags == other.flags + } +} + +#[derive(Debug, Clone, PartialEq, thiserror::Error)] +pub enum ParseError { + #[error("not a sed command, does not start with 's/'")] + NotSedCommand, + #[error("unknown flag")] + InvalidFlag, + #[error(transparent)] + InvalidRegex(#[from] regex::Error), +} + +impl Command { + pub fn from_str(input: &str) -> Result { + let mut chars = input.chars(); + + if chars.next().unwrap() == 's' && chars.next().unwrap() == '/' { + let left = Command::parse_segment(&mut chars)?; + let right = Command::parse_segment(&mut chars)?.bold(); + let flags = Flags::from_chars(&mut chars)?; + + let left = Regex::new(&format!("(?{}){}", flags.to_string(), left)) + .map_err(|err| ParseError::InvalidRegex(err))?; + + return Ok(Command { left, right, flags }); + } else { + return Err(ParseError::NotSedCommand); + } + } + + pub fn from_str_multiple(input: &str) -> Result { + let mut commands = Commands::new(); + + let mut chars = input.chars(); + + loop { + let s = chars.next(); + let slash = chars.next(); + + if s.is_some() && slash.is_some() { + if s.unwrap() == 's' && slash.unwrap() == '/' { + let left = Command::parse_segment(&mut chars)?; + let right = Command::parse_segment(&mut chars)?.bold(); + let flags = Flags::from_chars(&mut chars)?; + + let left = Regex::new(&format!("(?{}){}", flags.to_string(), left)) + .map_err(|err| ParseError::InvalidRegex(err))?; + + commands.push(Command { left, right, flags }); + } else { + return Err(ParseError::NotSedCommand); + } + } else { + break; + } + } + + Ok(commands) + } + + fn parse_segment(chars: &mut Chars) -> Result { + let mut last_char = '/'; + let mut output = String::new(); + + while let Some(c) = chars.next() { + if c == '/' && last_char != '\\' { + break; + } else if c == '/' && last_char == '\\' { + output.pop().unwrap(); + } + + output.push(c); + last_char = c; + } + + Ok(output) + } + + pub fn execute(self, target: &str) -> Cow { + let result: Cow; + + if self.flags.contains(Flags::GLOBAL) { + result = self.left.replace_all(target, self.right); + } else { + result = self.left.replace(target, self.right); + } + + return result; + } + + pub fn regex(&self) -> &Regex { + &self.left + } +} + +bitflags! { + /// i case-insensitive: letters match both upper and lower case + /// m multi-line mode: ^ and $ match begin/end of line + /// s allow . to match \n + /// U swap the meaning of x* and x*? + /// x ignore whitespace and allow line comments (starting with `#`) + struct Flags: u32 { + const GLOBAL = 0b00000001; + const CASE_INSENSITIVE = 0b00000010; + const SINGLE_LINE = 0b00001000; + const UNGREEDY = 0b00010000; + const EXTENDED = 0b00100000; + } +} + +impl Flags { + pub fn to_string(&self) -> String { + let mut result = String::new(); + + result.push('m'); + + if self.contains(Flags::CASE_INSENSITIVE) { + result.push('i'); + } + + if self.contains(Flags::SINGLE_LINE) { + result.push('s'); + } + + if self.contains(Flags::UNGREEDY) { + result.push('U'); + } + + if self.contains(Flags::EXTENDED) { + result.push('x'); + } + + return result; + } + + pub fn from_chars(chars: &mut Chars) -> Result { + let mut flags: Flags = Flags::empty(); + + while let Some(c) = chars.next() { + match c { + 'g' => { + flags = flags | Flags::GLOBAL; + } + 'i' => { + flags = flags | Flags::CASE_INSENSITIVE; + } + 's' => { + flags = flags | Flags::SINGLE_LINE; + } + 'U' => { + flags = flags | Flags::UNGREEDY; + } + 'x' => { + flags = flags | Flags::EXTENDED; + } + ';' => return Ok(flags), + _ => return Err(ParseError::InvalidFlag), + }; + } + + Ok(flags) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + const COMMAND_SIMPLE: &str = "s/replace/replacee/ig"; + const COMMAND_MULTIPLE: &str = "s/replace/replacee/ig;s/two/tworeplace/i"; + + #[test] + fn test_parse_segment() -> Result<(), ParseError> { + let mut chars = "replace/replacee/ig".chars(); + + let left = "replace"; + let right = Command::parse_segment(&mut chars)?; + + assert_eq!(left, right); + + let left = "replacee"; + let right = Command::parse_segment(&mut chars)?; + + assert_eq!(left, right); + + Ok(()) + } + + #[test] + fn test_flags_from_chars() -> Result<(), ParseError> { + let mut chars = "ig".chars(); + + let left = Flags::CASE_INSENSITIVE | Flags::GLOBAL; + let right = Flags::from_chars(&mut chars)?; + + assert_eq!(left, right); + + let mut chars = "igf".chars(); + let right = Flags::from_chars(&mut chars); + + assert_eq!(Err(ParseError::InvalidFlag), right); + + Ok(()) + } + + #[test] + fn test_flags_from_chars_with_terminator() -> Result<(), ParseError> { + let mut chars = "ig;bla".chars(); + + let left = Flags::CASE_INSENSITIVE | Flags::GLOBAL; + let right = Flags::from_chars(&mut chars)?; + + assert_eq!(left, right); + assert_eq!("bla", chars.as_str()); + + Ok(()) + } + + #[test] + fn test_new_command_simple() -> Result<(), ParseError> { + let left = Command::from_str(COMMAND_SIMPLE)?; + let right = Command { + left: Regex::new("(?mi)replace").unwrap(), + right: "\x02replacee\x02".to_string(), + flags: Flags::CASE_INSENSITIVE | Flags::GLOBAL, + }; + + assert_eq!(left, right); + + Ok(()) + } + + #[test] + fn test_new_command_simple_escaped_slash() -> Result<(), ParseError> { + let left = Command::from_str(r#"s/repl\/ace/replacee"#)?; + let right = Command { + left: Regex::new("(?m)repl/ace").unwrap(), + right: "\x02replacee\x02".to_string(), + flags: Flags::empty(), + }; + + assert_eq!(left, right); + + Ok(()) + } + + #[test] + fn test_new_command_simple_no_terminating_slash() -> Result<(), ParseError> { + let left = Command::from_str("s/replace/replacee")?; + let right = Command { + left: Regex::new("(?m)replace").unwrap(), + right: "\x02replacee\x02".to_string(), + flags: Flags::empty(), + }; + + assert_eq!(left, right); + + Ok(()) + } + + #[test] + fn test_new_command_complex_regex() -> Result<(), ParseError> { + let left = + Command::from_str(r#"s/http(?:s?):\/\/regex101\.com\/r\/([a-zA-Z0-9]{1,6})?$/$1/g"#)?; + let right = Command { + left: Regex::new(r#"(?m)http(?:s?)://regex101\.com/r/([a-zA-Z0-9]{1,6})?$"#).unwrap(), + right: "\x02$1\x02".to_string(), + flags: Flags::GLOBAL, + }; + + assert_eq!(left, right); + + Ok(()) + } + + #[test] + fn test_new_command_multiple_fail() -> Result<(), ParseError> { + let left = Command::from_str_multiple(COMMAND_SIMPLE)?; + let right = vec![Command { + left: Regex::new("(?mi)replace").unwrap(), + right: "\x02replacee\x02".to_string(), + flags: Flags::CASE_INSENSITIVE | Flags::GLOBAL, + }]; + + assert_eq!(left, right); + + Ok(()) + } + + #[test] + fn test_new_command_multiple() -> Result<(), ParseError> { + let left = Command::from_str_multiple(COMMAND_MULTIPLE)?; + let right = vec![ + Command { + left: Regex::new("(?mi)replace").unwrap(), + right: "\x02replacee\x02".to_string(), + flags: Flags::CASE_INSENSITIVE | Flags::GLOBAL, + }, + Command { + left: Regex::new("(?mi)two").unwrap(), + right: "\x02tworeplace\x02".to_string(), + flags: Flags::CASE_INSENSITIVE, + }, + ]; + + assert_eq!(left, right); + + Ok(()) + } + + #[test] + fn test_run_regex() -> Result<(), ParseError> { + let cmd = Command::from_str(COMMAND_SIMPLE)?; + + let left = "this is a sentence to \x02replacee\x02 text in"; + let right = cmd.execute("this is a sentence to replace text in"); + + assert_eq!(left, right); + + Ok(()) + } +} -- cgit v1.2.3