From 10fc132ed443ccf0b910f06257f6b4643d913a26 Mon Sep 17 00:00:00 2001 From: Vedang Manerikar Date: Tue, 12 Mar 2024 13:57:52 +0530 Subject: [PATCH 1/3] Simplify pre-compiled regular expressions for sigils MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I discovered `tagref` today, when looking for a system inspired from GHC notes. I was delighted and tried it on a code-base where I have been writing notes, but it didn't work! Let me describe the problem: * Setup: Here is my sysinfo, as published by `neofetch`: OS: macOS 14.3.1 23D60 arm64 Host: MacBookAir10,1 Shell: fish 3.7.0 Terminal: tmux CPU: Apple M1 On this system, `tagref` did not recognize any tags in my code, even when I had tags. ``` ─> tagref 0 tags, 0 tag references, 0 file references, and 0 directory references validated in 464 files. ``` The problem seems to be in the capture group used in the regular expressions, which I verified using `rg` and `gsed`: ``` ─> rg "\[\s*tag\s*:.*\s*\]" components/whatsapp/src/ai/salher/whatsapp/core.clj 267:;; ## [tag:GIFs need to be converted to video/mp4 format] components/chapters/src/ai/salher/chapters/core.clj 173:;;; ## [tag:Rules for deciding when we should update a chapter record] 185:;;; ## [tag:Rules to mark a chapter as DONE for now] ... ``` The current capture group does not support spaces in the identifier name! ``` ─> cat components/whatsapp/src/ai/salher/whatsapp/core.clj | gsed -En '/\[\s*tag\s*:\s*([^\]\s]*)\]/p' ─> cat components/whatsapp/src/ai/salher/whatsapp/core.clj | gsed -En '/\[\s*tag\s*:\s*.*\]/p' ;; ## [tag:GIFs need to be converted to video/mp4 format] ─> cat components/whatsapp/src/ai/salher/whatsapp/core.clj | gsed -En 's/.*\[\s*tag\s*:\s*(.*)\s*\].*/\1/p' GIFs need to be converted to video/mp4 format ``` On seeing this, I tried to understand what the expressions are trying to do, and add support for spaces. My current understanding is as follows: 1. Look for a starting square-bracket `[` 2. There can be 0 or more spaces between the square bracket and the sigil, ignore them. 3. Match the sigil 4. There can be 0 or more spaces between the sigil and `:` token, ignore them 5. There can be 0 or more spaces between the `:` token and the actual identifier, ignore them 6. Capture the identifier. 7. There can be 0 or more spaces between the end of the identifier and the closing square-bracket, ignore them. 8. Stop at the closing square-bracket `]` With this in mind, I changed the capture group to a lazy-capture (from `([^\\]\\s]*)` to `(\\w.*?)`. This ensures that everything is captured until the closing square bracket (ignoring all empty space immediately before the closing bracket). The only constraint it imposes is that the first character of the identifier should be a letter, digit or underscore (via the `\w`). This is to ensure that `[tag:]` is not treated as a valid tag. After this change, `tagref` is now working correctly on my local system. I ran `cargo tests` (all passing) and also tested on my own project, which now shows: ``` ─> tagref 21 tags, 24 tag references, 0 file references, and 0 directory references validated in 464 files. ``` Update: based on feedback received in the PR #218, I have changed the capture group from `(\\w.*?)` to `(.+?)`, to support identifiers like `[file:.gitignore]`. This means a tag needs to be at least one character which is not a line-break. --- src/main.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main.rs b/src/main.rs index b8ed9e7..cad7138 100644 --- a/src/main.rs +++ b/src/main.rs @@ -209,22 +209,22 @@ fn entry() -> Result<(), String> { // Compile the regular expressions in advance. let tag_regex: Regex = Regex::new(&format!( - "(?i)\\[\\s*{}\\s*:\\s*([^\\]\\s]*)\\s*\\]", + "(?i)\\[\\s*{}\\s*:\\s*(.+?)\\s*\\]", // [ref:identifier_regex_fmt_str] escape(&settings.tag_sigil), )) .unwrap(); // Safe by manual inspection let ref_regex: Regex = Regex::new(&format!( - "(?i)\\[\\s*{}\\s*:\\s*([^\\]\\s]*)\\s*\\]", + "(?i)\\[\\s*{}\\s*:\\s*(.+?)\\s*\\]", // [ref:identifier_regex_fmt_str] escape(&settings.ref_sigil), )) .unwrap(); // Safe by manual inspection let file_regex: Regex = Regex::new(&format!( - "(?i)\\[\\s*{}\\s*:\\s*([^\\]\\s]*)\\s*\\]", + "(?i)\\[\\s*{}\\s*:\\s*(.+?)\\s*\\]", // [ref:identifier_regex_fmt_str] escape(&settings.file_sigil), )) .unwrap(); // Safe by manual inspection let dir_regex: Regex = Regex::new(&format!( - "(?i)\\[\\s*{}\\s*:\\s*([^\\]\\s]*)\\s*\\]", + "(?i)\\[\\s*{}\\s*:\\s*(.+?)\\s*\\]", // [ref:identifier_regex_fmt_str] escape(&settings.dir_sigil), )) .unwrap(); // Safe by manual inspection From 0f2c99d9cafdb375b1e8624c502ed80c843e7e13 Mon Sep 17 00:00:00 2001 From: Vedang Manerikar Date: Thu, 14 Mar 2024 12:24:53 +0530 Subject: [PATCH 2/3] Internal whitespace support: Add tests, documentation and update regex Based on feedback from @stepchowfun in PR #218: - Update regex to explicitly decree that tag cannot contain `]`: this is clearer and easier to maintain. - Implement tests to exercise the change in behaviour we introduced - Update the README section on naming tags Since the regular expression for identifying a reference needs to be updated in multiple places, we introduce a directive for it. This will ensure that any future change to the regular expression remembers to update all the appropriate places. --- README.md | 2 +- src/directive.rs | 24 ++++++++++++------------ src/main.rs | 15 ++++++++++----- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index f451f92..534d379 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ A directory reference guarantees that the given directory exists. For example: ## Tag names -The name of a tag may consist of any UTF-8 text except whitespace and the right square bracket `]`. For example, `[tag:foo_bar]` and `[tag:ほげ〜ふが]` are valid, but `[tag:foo bar]` is not. Tag names are case-sensitive, so `[tag:foo]` and `[tag:Foo]` are different tags. +The name of a tag may consist of any UTF-8 text except whitespace and the right square bracket `]`. Internal whitespace (as in `[tag:foo bar]`) is allowed, and surrounding whitespace (as in `[tag: baz ]`) is ignored. More examples of valid tags: `[tag:foo_bar]` and `[tag:ほげ〜ふが]`. Tag names are case-sensitive, so `[tag:foo]` and `[tag:Foo]` are different tags. You can use any naming convention you like. The Tagref authors prefer to use lowercase words separated by underscores `_`, like `[tag:important_note]`. diff --git a/src/directive.rs b/src/directive.rs index 3d96c61..0547932 100644 --- a/src/directive.rs +++ b/src/directive.rs @@ -132,10 +132,10 @@ mod tests { std::path::Path, }; - const TAG_REGEX: &str = "(?i)\\[\\s*tag\\s*:\\s*([^\\]\\s]*)\\s*\\]"; - const REF_REGEX: &str = "(?i)\\[\\s*ref\\s*:\\s*([^\\]\\s]*)\\s*\\]"; - const FILE_REGEX: &str = "(?i)\\[\\s*file\\s*:\\s*([^\\]\\s]*)\\s*\\]"; - const DIR_REGEX: &str = "(?i)\\[\\s*dir\\s*:\\s*([^\\]\\s]*)\\s*\\]"; + const TAG_REGEX: &str = "(?i)\\[\\s*tag\\s*:\\s*([^\\]]*?)\\s*\\]"; // [ref:directive_regex] + const REF_REGEX: &str = "(?i)\\[\\s*ref\\s*:\\s*([^\\]]*?)\\s*\\]"; // [ref:directive_regex] + const FILE_REGEX: &str = "(?i)\\[\\s*file\\s*:\\s*([^\\]]*?)\\s*\\]"; // [ref:directive_regex] + const DIR_REGEX: &str = "(?i)\\[\\s*dir\\s*:\\s*([^\\]]*?)\\s*\\]"; // [ref:directive_regex] #[test] fn parse_empty() { @@ -409,10 +409,10 @@ mod tests { fn parse_whitespace() { let path = Path::new("file.rs").to_owned(); let contents = r" - [ ?tag : label ] - [ ?ref : label ] - [ ?file : foo/bar/baz.txt ] - [ ?dir : foo/bar/baz ] + [ ?tag : foo bar ] + [ ?ref : foo bar ] + [ ?file : foo bar/baz qux.txt ] + [ ?dir : foo bar/baz qux ] " .trim() .replace('?', "") @@ -435,25 +435,25 @@ mod tests { assert_eq!(directives.tags.len(), 1); assert_eq!(directives.tags[0].r#type, Type::Tag); - assert_eq!(directives.tags[0].label, "label"); + assert_eq!(directives.tags[0].label, "foo bar"); assert_eq!(directives.tags[0].path, path); assert_eq!(directives.tags[0].line_number, 1); assert_eq!(directives.refs.len(), 1); assert_eq!(directives.refs[0].r#type, Type::Ref); - assert_eq!(directives.refs[0].label, "label"); + assert_eq!(directives.refs[0].label, "foo bar"); assert_eq!(directives.refs[0].path, path); assert_eq!(directives.refs[0].line_number, 2); assert_eq!(directives.files.len(), 1); assert_eq!(directives.files[0].r#type, Type::File); - assert_eq!(directives.files[0].label, "foo/bar/baz.txt"); + assert_eq!(directives.files[0].label, "foo bar/baz qux.txt"); assert_eq!(directives.files[0].path, path); assert_eq!(directives.files[0].line_number, 3); assert_eq!(directives.dirs.len(), 1); assert_eq!(directives.dirs[0].r#type, Type::Dir); - assert_eq!(directives.dirs[0].label, "foo/bar/baz"); + assert_eq!(directives.dirs[0].label, "foo bar/baz qux"); assert_eq!(directives.dirs[0].path, path); assert_eq!(directives.dirs[0].line_number, 4); } diff --git a/src/main.rs b/src/main.rs index cad7138..a6b3cc0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -207,24 +207,29 @@ fn entry() -> Result<(), String> { // Parse the command-line options. let settings = settings(); - // Compile the regular expressions in advance. + // [tag:directive_regex] Compile the regular expressions in + // advance. The string literal used here in the format macro is + // also used in other places. IF you change the literal, make sure + // to change all references to the tag as well. See: + // https://github.com/rust-lang/rust/issues/69133 for why format + // needs a string literal. let tag_regex: Regex = Regex::new(&format!( - "(?i)\\[\\s*{}\\s*:\\s*(.+?)\\s*\\]", // [ref:identifier_regex_fmt_str] + "(?i)\\[\\s*{}\\s*:\\s*([^\\]]*?)\\s*\\]", // [ref:directive_regex] escape(&settings.tag_sigil), )) .unwrap(); // Safe by manual inspection let ref_regex: Regex = Regex::new(&format!( - "(?i)\\[\\s*{}\\s*:\\s*(.+?)\\s*\\]", // [ref:identifier_regex_fmt_str] + "(?i)\\[\\s*{}\\s*:\\s*([^\\]]*?)\\s*\\]", // [ref:directive_regex] escape(&settings.ref_sigil), )) .unwrap(); // Safe by manual inspection let file_regex: Regex = Regex::new(&format!( - "(?i)\\[\\s*{}\\s*:\\s*(.+?)\\s*\\]", // [ref:identifier_regex_fmt_str] + "(?i)\\[\\s*{}\\s*:\\s*([^\\]]*?)\\s*\\]", // [ref:directive_regex] escape(&settings.file_sigil), )) .unwrap(); // Safe by manual inspection let dir_regex: Regex = Regex::new(&format!( - "(?i)\\[\\s*{}\\s*:\\s*(.+?)\\s*\\]", // [ref:identifier_regex_fmt_str] + "(?i)\\[\\s*{}\\s*:\\s*([^\\]]*?)\\s*\\]", // [ref:directive_regex] escape(&settings.dir_sigil), )) .unwrap(); // Safe by manual inspection From 4f65fe6c2bfd354276d498e3b09353c951a02a81 Mon Sep 17 00:00:00 2001 From: Vedang Manerikar Date: Thu, 14 Mar 2024 12:57:03 +0530 Subject: [PATCH 3/3] Bump up the version of tagref to 1.10.0 --- CHANGELOG.md | 5 +++++ Cargo.lock | 2 +- Cargo.toml | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f510836..9b29359 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.10.0] - 2024-03-14 + +### Changed +- Tagref now supports whitespace in tag names and in paths in file and directory references. + ## [1.9.1] - 2024-02-21 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index c3b3241..68f35e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -273,7 +273,7 @@ dependencies = [ [[package]] name = "tagref" -version = "1.9.1" +version = "1.10.0" dependencies = [ "atty", "clap", diff --git a/Cargo.toml b/Cargo.toml index 626c26a..3733f90 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tagref" -version = "1.9.1" +version = "1.10.0" authors = ["Stephan Boyer "] edition = "2021" description = "Tagref helps you maintain cross-references in your code."