Skip to content

Commit

Permalink
New: apply unicode normalization while resolving notes
Browse files Browse the repository at this point in the history
The Unicode standard allows certain visually identical characters to
be represented in different ways.

For example the character ä may be represented as a single combined
codepoint "Latin Small Letter A with Diaeresis" (U+00E4) or by the
combination of "Latin Small Letter A" (U+0061) followed by "Combining
Diaeresis" (U+0308).

When encoded with UTF-8, these are represented as respectively the two
bytes 0xC3 0xA4, and the three bytes 0x61 0xCC 0x88.

A user linking to notes with these characters in their titles would
expect these two variants to link to the same file, given they are
visually identical and have the exact same semantic meaning.

The unicode standard defines a method to deconstruct and normalize these
forms, so that a byte comparison on the normalized forms of these
variants ends up comparing the same thing. This is called Unicode
Normalization, defined in Unicode® Standard Annex #15
(http://www.unicode.org/reports/tr15/).

The W3C Working Group has written an excellent explanation of the
problems regarding string matching, and how unicode normalization helps
with this process: https://www.w3.org/TR/charmod-norm/#unicodeNormalization

With this change, obsidian-export performs Unicode normalization
(specifically normalization form C, also known as NFC) on all note titles
while looking up link references, ensuring that visually identical links
resolve to the same note even if they were encoded as different
variants.

A special thanks to Hans Raaf (@oderwat) for reporting and helping track
down this issue.

---

Closes #126
  • Loading branch information
zoni committed Nov 19, 2022
1 parent c5ba5b7 commit b5b2ea2
Show file tree
Hide file tree
Showing 3 changed files with 281 additions and 11 deletions.
182 changes: 182 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,10 @@ regex = "1.7.0"
serde_yaml = "0.9.14"
slug = "0.1.4"
snafu = "0.7.3"
unicode-normalization = "0.1.22"

[dev-dependencies]
pretty_assertions = "1.3.0"
rstest = "0.15.0"
tempfile = "3.3.0"
walkdir = "2.3.2"
108 changes: 97 additions & 11 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ use std::io::prelude::*;
use std::io::ErrorKind;
use std::path::{Path, PathBuf};
use std::str;
use unicode_normalization::UnicodeNormalization;

/// A series of markdown [`Event`]s generated while traversing an Obsidian markdown note.
pub type MarkdownEvents<'a> = Vec<Event<'a>>;
Expand Down Expand Up @@ -706,22 +707,33 @@ impl<'a> Exporter<'a> {
}
}

/// Get the full path for the given filename when it's contained in vault_contents, taking into
/// account:
///
/// 1. Standard Obsidian note references not including a .md extension.
/// 2. Case-insensitive matching
/// 3. Unicode normalization rules using normalization form C
/// (https://www.w3.org/TR/charmod-norm/#unicodeNormalization)
///
/// Returns the first entry of `vault_contents` that matches, or `None` when nothing matches.
fn lookup_filename_in_vault<'a>(
filename: &str,
vault_contents: &'a [PathBuf],
) -> Option<&'a PathBuf> {
// Markdown files don't have their .md extension added by Obsidian, but other files (images,
// PDFs, etc) do so we match on both possibilities.
//
// References can also refer to notes in a different case (to lowercase text in a
// sentence even if the note is capitalized for example) so we also try a case-insensitive
// lookup.
let filename = PathBuf::from(filename);
let filename_normalized = filename.to_string_lossy().nfc().collect::<String>();

vault_contents.iter().find(|path| {
let path_lowered = PathBuf::from(path.to_string_lossy().to_lowercase());
path.ends_with(filename)
|| path_lowered.ends_with(&filename.to_lowercase())
|| path.ends_with(format!("{}.md", &filename))
|| path_lowered.ends_with(format!("{}.md", &filename.to_lowercase()))
let path_normalized_str = path.to_string_lossy().nfc().collect::<String>();
let path_normalized = PathBuf::from(&path_normalized_str);
let path_normalized_lowered = PathBuf::from(&path_normalized_str.to_lowercase());

// It would be convenient if we could just do `filename.set_extension("md")` at the start
// of this function so we don't need multiple separate + ".md" match cases here, however
// that would break with a reference of `[[Note.1]]` linking to `[[Note.1.md]]`.

path_normalized.ends_with(&filename_normalized)
|| path_normalized.ends_with(filename_normalized.clone() + ".md")
|| path_normalized_lowered.ends_with(&filename_normalized.to_lowercase())
|| path_normalized_lowered.ends_with(filename_normalized.to_lowercase() + ".md")
})
}

Expand Down Expand Up @@ -876,3 +888,77 @@ fn codeblock_kind_to_owned<'a>(codeblock_kind: CodeBlockKind) -> CodeBlockKind<'
CodeBlockKind::Fenced(cowstr) => CodeBlockKind::Fenced(CowStr::from(cowstr.into_string())),
}
}

#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
use rstest::rstest;

lazy_static! {
    /// Fixture vault listing shared by the lookup tests in this module.
    static ref VAULT: Vec<std::path::PathBuf> = [
        "NoteA.md",
        "Document.pdf",
        "Note.1.md",
        "nested/NoteA.md",
        // Written with the precomposed code point U+00E4 (Noteä.md); see also encodings() below.
        "Note\u{E4}.md",
    ]
    .iter()
    .copied()
    .map(PathBuf::from)
    .collect();
}

#[test]
/// Pins down the two UTF-8 encodings of the visually identical character ä that the lookup
/// tests in this module rely on.
///
/// Every expected value is spelled with explicit `\u{...}` escapes rather than a raw literal:
/// the precomposed and decomposed forms look the same in source, so a raw literal would
/// silently change meaning if an editor or tool normalized this file.
fn encodings() {
    // Precomposed form: "Latin Small Letter A with Diaeresis" (U+00E4).
    // Encoded in UTF-8 as two bytes: 0xC3 0xA4
    assert_eq!(String::from_utf8(vec![0xC3, 0xA4]).unwrap(), "\u{E4}");

    // Decomposed form: basic (ASCII) lowercase a (U+0061) followed by "Combining Diaeresis"
    // (U+0308). Renders the same visual appearance but is encoded in UTF-8 as three bytes:
    // 0x61 0xCC 0x88
    assert_eq!(String::from_utf8(vec![0x61, 0xCC, 0x88]).unwrap(), "a\u{308}");
    assert_eq!("\u{61}\u{308}", "a\u{308}");

    // Without normalization the two forms are different strings, which is exactly why the
    // vault lookup has to normalize before comparing.
    assert_ne!("\u{E4}", "a\u{308}");

    // For more examples and a better explanation of this concept, see
    // https://www.w3.org/TR/charmod-norm/#aringExample
}

#[rstest]
// --- Exact matches ---
#[case("NoteA.md", "NoteA.md")]
#[case("NoteA", "NoteA.md")]
// A note with the same name lives in a subdirectory; a path-qualified reference must pick it.
#[case("nested/NoteA.md", "nested/NoteA.md")]
#[case("nested/NoteA", "nested/NoteA.md")]
// --- Extension handling ---
#[case("Document.pdf", "Document.pdf")]
#[case("Note.1", "Note.1.md")]
#[case("Note.1.md", "Note.1.md")]
// --- Case-insensitive matches ---
#[case("notea.md", "NoteA.md")]
#[case("notea", "NoteA.md")]
#[case("NESTED/notea.md", "nested/NoteA.md")]
#[case("NESTED/notea", "nested/NoteA.md")]
// --- Precomposed U+00E4 ("Latin Small Letter A with Diaeresis") ---
#[case("Note\u{E4}.md", "Note\u{E4}.md")]
#[case("Note\u{E4}", "Note\u{E4}.md")]
// --- Decomposed: ASCII a (U+0061) followed by Combining Diaeresis (U+0308) ---
// Different UTF-8 bytes, identical rendering, so the same file is expected.
#[case("Note\u{61}\u{308}.md", "Note\u{E4}.md")]
#[case("Note\u{61}\u{308}", "Note\u{E4}.md")]
// --- Normalization combined with lowercasing: an uppercase Ä reference finds Noteä ---
// Precomposed uppercase (U+00C4)
#[case("Note\u{C4}.md", "Note\u{E4}.md")]
#[case("Note\u{C4}", "Note\u{E4}.md")]
// Decomposed uppercase: A (U+0041) + Combining Diaeresis (U+0308)
#[case("Note\u{41}\u{308}.md", "Note\u{E4}.md")]
#[case("Note\u{41}\u{308}", "Note\u{E4}.md")]
fn test_lookup_filename_in_vault(#[case] input: &str, #[case] expected: &str) {
    let found = lookup_filename_in_vault(input, &VAULT);

    // Diagnostic output, shown by the test harness when a case fails.
    let placeholder = PathBuf::from("");
    println!("Test input: {:?}", input);
    println!("Expecting: {:?}", expected);
    println!("Got: {:?}", found.unwrap_or(&placeholder));

    let wanted = PathBuf::from(expected);
    assert_eq!(found, Some(&wanted))
}
}

0 comments on commit b5b2ea2

Please sign in to comment.