add synology note station

marph91 · Apr 13, 2024 · d42ae17 · d42ae17
1 parent 8fe9b1f
commit d42ae17
Show file tree

Hide file tree

Showing 19 changed files with 433 additions and 19 deletions.
diff --git a/readme.md b/readme.md
@@ -56,6 +56,7 @@ There are many more apps supported implicitly if they export text files to a fol
 | [Notion](https://www.notion.so/) | notion | [link](https://www.notion.so/de-de/help/export-your-content) [1] |
 | [Obsidian](https://obsidian.md/) | obsidian | |
 | [Simplenote](https://simplenote.com/) | simplenote | [link](https://simplenote.com/help/#export) |
+| [Synology Note Station](https://www.synology.com/en-global/dsm/feature/note_station) | synology_note_station | [link](https://kb.synology.com/en-global/DSM/help/NoteStation/note_station_managing_notes?version=7#t7) |
 | [TiddlyWiki](https://tiddlywiki.com/) | tiddlywiki | [JSON only](https://tiddlywiki.com/static/How%2520to%2520export%2520tiddlers.html) [2] |
 | [Todo.txt](http://todotxt.org/) | todo_txt | |
 | [Todoist](https://todoist.com/) | todoist | [link](https://todoist.com/de/help/articles/introduction-to-backups-ywaJeQbN) [3] |

diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,6 @@
 joppy
 platformdirs
+puremagic
 pypandoc_binary
 python-frontmatter
 pytodotxt
diff --git a/setup.cfg b/setup.cfg
@@ -18,6 +18,8 @@ warn_unused_ignores = True
 no_implicit_reexport = True
 strict_equality = True
 extra_checks = True
+[mypy-puremagic.*]
+ignore_missing_imports = True
 [mypy-pypandoc.*]
 ignore_missing_imports = True
 [mypy-pytodotxt.*]

diff --git a/src/apps/nimbus_note.py b/src/apps/nimbus_note.py
@@ -3,8 +3,7 @@
 from pathlib import Path
 import zipfile
 
-import pypandoc
-
+import common
 import converter
 import intermediate_format as imf
 
@@ -22,10 +21,7 @@ def convert(self, file_or_folder: Path):
                 for html_note in html_notes:
                     with zip_ref.open(html_note) as zip_note:
                         note_body_html = zip_note.read().decode("UTF-8")
-                    # Don't use "commonmark_x". There would be too many noise.
-                    note_body_markdown = pypandoc.convert_text(
-                        note_body_html, "markdown_strict-raw_html", format="html"
-                    )
+                    note_body_markdown = common.html_text_to_markdown(note_body_html)
                     note_joplin = imf.Note(
                         {
                             "title": file_.stem,

diff --git a/src/apps/synology_note_station.py b/src/apps/synology_note_station.py
@@ -0,0 +1,158 @@
+"""Convert Synology Note Station notes to the intermediate format."""
+
+from dataclasses import dataclass
+import hashlib
+import json
+from pathlib import Path
+import re
+import zipfile
+
+import common
+import converter
+import intermediate_format as imf
+
+
+@dataclass
+class Attachment:
+    """Represents a Note Station attachment.
+
+    Pairs an attachment file of the export with its MD5 digest. "ref" and
+    "title" start empty and are filled from the note metadata later.
+    """
+
+    # Path of the attachment file inside the export folder.
+    filename: Path
+    # Hex MD5 digest of the file content. Used to match note metadata to files.
+    md5: str
+    # ID used inside the note body to reference this attachment (if any).
+    ref: str | None = None
+    # Attachment name taken from the note metadata (if known).
+    title: str | None = None
+
+
+class Converter(converter.BaseConverter):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.available_resources = []
+
+    def prepare_input(self, input_: Path) -> Path | None:
+        if input_.suffix.lower() in (".nsx", ".zip"):
+            temp_folder = common.get_temp_folder()
+            with zipfile.ZipFile(input_) as zip_ref:
+                zip_ref.extractall(temp_folder)
+            return temp_folder
+        if input_.is_dir():
+            return input_
+        self.logger.error(f"Unsupported format for {self.app}")
+        return None
+
+    def find_parent_notebook(self, parent_id) -> imf.Notebook:
+        for notebook in self.root_notebook.child_notebooks:
+            if notebook.original_id == parent_id:
+                return notebook
+        self.logger.debug(f"Couldn't find parent notebook with id {parent_id}")
+        return self.root_notebook
+
+    def handle_markdown_links(self, body: str) -> tuple[list, list]:
+        resources = []
+        for file_prefix, description, url in common.get_markdown_links(body):
+            if url.startswith("http") or url.startswith("mailto:"):
+                continue  # web link / mail
+            original_text = f"{file_prefix}[{description}]({url})"
+            # resource
+            # Find resource file by "ref".
+            matched_resources = [
+                res for res in self.available_resources if res.ref == url
+            ]
+            if len(matched_resources) != 1:
+                self.logger.debug(
+                    "Found too less or too many resource: {len(matched_resources)}"
+                )
+                continue
+            resource = matched_resources[0]
+            resources.append(
+                imf.Resource(
+                    resource.filename, original_text, description or resource.title
+                )
+            )
+        return resources, []
+
+    def convert_notebooks(self, input_json: dict):
+        for notebook_id in input_json["notebook"]:
+            notebook = json.loads((self.root_path / notebook_id).read_text())
+
+            self.root_notebook.child_notebooks.append(
+                imf.Notebook(
+                    {
+                        "title": notebook["title"],
+                        "user_created_time": notebook["ctime"],
+                        "user_updated_time": notebook["mtime"],
+                    },
+                    original_id=notebook_id,
+                )
+            )
+
+    def map_resources_by_hash(self, note: dict) -> list[imf.Resource]:
+        resources = []
+        for note_resource in note.get("attachment", {}).values():
+            for file_resource in self.available_resources:
+                if note_resource["md5"] == file_resource.md5:
+                    if (ref := note_resource.get("ref")) is not None:
+                        file_resource.ref = ref
+                        file_resource.title = note_resource["name"]
+                    else:
+                        # The attachment is not referenced. Add it here.
+                        # Referenced attachments are added later.
+                        resources.append(
+                            imf.Resource(
+                                file_resource.filename, title=note_resource["name"]
+                            )
+                        )
+                    break
+        return resources
+
+    def convert(self, file_or_folder: Path):
+        self.root_path = self.prepare_input(file_or_folder)
+        if self.root_path is None:
+            return
+        input_json = json.loads((self.root_path / "config.json").read_text())
+
+        # TODO: What is input_json["shortcut"]?
+        # TODO: Are nested notebooks possible?
+
+        self.convert_notebooks(input_json)
+
+        # dirty hack: Only option to map the files from file system
+        # to the note content is by MD5 hash.
+        for item in self.root_path.iterdir():
+            if item.is_file() and item.stem.startswith("file_"):
+                self.available_resources.append(
+                    Attachment(item, hashlib.md5(item.read_bytes()).hexdigest())
+                )
+
+        for note_id in input_json["note"]:
+            note = json.loads((self.root_path / note_id).read_text())
+
+            # resources / attachments
+            resources = self.map_resources_by_hash(note)
+
+            data = {
+                "title": note["title"],
+                "user_created_time": note["ctime"],
+                "user_updated_time": note["mtime"],
+                "source_application": self.app,
+            }
+            if (content_html := note.get("content")) is not None:
+                # dirty hack: In the original data, the attachment_id is stored in the
+                # "ref" attribute. Mitigate by storing it in the "src" attribute.
+                content_html = re.sub("<img.*?ref=", "<img src=", content_html)
+                content_markdown = common.html_text_to_markdown(content_html)
+                resources_referenced, _ = self.handle_markdown_links(content_markdown)
+                resources.extend(resources_referenced)
+                data["body"] = content_markdown
+
+            common.try_transfer_dicts(
+                note, data, ["latitude", "longitude", "source_url"]
+            )
+
+            parent_notebook = self.find_parent_notebook(note["parent_id"])
+            parent_notebook.child_notes.append(
+                imf.Note(
+                    data,
+                    tags=[imf.Tag({"title": tag}, tag) for tag in note.get("tag", [])],
+                    resources=resources,
+                    original_id=note_id,
+                )
+            )
diff --git a/src/common.py b/src/common.py
@@ -7,6 +7,8 @@
 import tempfile
 import time
 
+import pypandoc
+
 
 LOGGER = logging.getLogger("joplin_custom_importer")
 
@@ -15,7 +17,23 @@
 # operations on note body
 ###########################################################
 
-MARKDOWN_LINK_REGEX = re.compile(r"(!)?\[([^\]]+)\]\(([^)]+)\)")
+
+def try_transfer_dicts(source: dict, target: dict, keys: list[str | tuple[str, str]]):
+    """Try to transfer values from one to another dict if they exist."""
+    for key in keys:
+        if isinstance(key, tuple):
+            source_key, target_key = key
+        else:
+            source_key = target_key = key
+        if (value := source.get(source_key)) is not None:
+            target[target_key] = value
+
+
+###########################################################
+# operations on note body
+###########################################################
+
+MARKDOWN_LINK_REGEX = re.compile(r"(!)?\[([^\]]*)\]\(([^)]+)\)")
 WIKILINK_LINK_REGEX = re.compile(r"(!)?\[\[(.+?)(?:\|(.+?))?\]\]")
 
 
@@ -46,6 +64,13 @@ def get_inline_tags(text: str, start_characters: list[str]) -> list[str]:
     return list(tags)
 
 
+def html_text_to_markdown(html_text: str) -> str:
+    # Don't use "commonmark_x". There would be too many noise.
+    return pypandoc.convert_text(
+        html_text, "markdown_strict+pipe_tables-raw_html", format="html"
+    )
+
+
 ###########################################################
 # folder operations
 ###########################################################

diff --git a/src/importer.py b/src/importer.py
@@ -48,7 +48,7 @@ def import_note(self, note: imf.Note):
             )
             if resource.original_text is None:
                 # append
-                note.data["body"] = f"{note.data['body']}\n{resource_markdown}"
+                note.data["body"] = f"{note.data.get('body', '')}\n{resource_markdown}"
             else:
                 # replace existing link
                 note.data["body"] = note.data["body"].replace(

diff --git a/src/intermediate_format.py b/src/intermediate_format.py
@@ -4,6 +4,7 @@
 
 from dataclasses import dataclass, field
 from pathlib import Path
+import puremagic
 
 
 @dataclass
@@ -28,19 +29,15 @@ class Resource:
     original_text: str | None = None
     # [title_or_filename](:/resource_id)
     title: str | None = None
+    is_image: bool = field(init=False)
 
-    @property
-    def is_image(self) -> bool:
-        # Just take the supported image types of Joplin:
+    def __post_init__(self):
+        # Supported image types of Joplin:
         # https://github.com/laurent22/joplin/blob/a3eec19b32684b86202c751c94c092c7339c6307/packages/lib/models/utils/resourceUtils.ts#L40-L43
-        return self.filename.suffix.lower() in (
-            ".jpg",
-            ".jpeg",
-            ".png",
-            ".gif",
-            ".svg",
-            ".webp",
-            ".avif",
+        # We can't simply match by extension, because sometimes the files/images
+        # are stored as binary blob without extension.
+        self.is_image = puremagic.from_file(self.filename, mime=True).startswith(
+            "image/"
         )
 
 
@@ -72,3 +69,4 @@ class Notebook:
     data: dict
     child_notebooks: list[Notebook] = field(default_factory=list)
     child_notes: list[Note] = field(default_factory=list)
+    original_id: str | None = None
diff --git a/test/example_commands.sh b/test/example_commands.sh
@@ -19,5 +19,6 @@ python src/joplin_custom_importer.py test_inputs/dynalist --app dynalist
 python src/joplin_custom_importer.py test_inputs/Google\ Keep --app google_keep
 python src/joplin_custom_importer.py test_inputs/obsidian_vault --app obsidian
 python src/joplin_custom_importer.py test_inputs/simplenote --app simplenote
+python src/joplin_custom_importer.py test_inputs/synology_note_station --app synology_note_station
 python src/joplin_custom_importer.py test_inputs/todo_txt/examples_from_readme.txt --app todo_txt
 python src/joplin_custom_importer.py test_inputs/todoist/Privates.csv --app todoist
diff --git a/test/test_app.py b/test/test_app.py
@@ -65,6 +65,12 @@ def test_simplenote(self):
         stats = self.get_stats([TEST_INPUTS / "simplenote"], "simplenote")
         self.assert_stats(stats, notes=2, tags=2, note_links=1)
 
+    def test_synology_note_station(self):
+        stats = self.get_stats(
+            [TEST_INPUTS / "synology_note_station"], "synology_note_station"
+        )
+        self.assert_stats(stats, notebooks=3, notes=4, tags=6, resources=2)
+
     def test_tiddlywiki(self):
         # TODO
         self.skipTest("no public test data yet")

diff --git a/test_inputs/synology_note_station/1026_4OIJ8PR6215E9637KLADHU3K5S b/test_inputs/synology_note_station/1026_4OIJ8PR6215E9637KLADHU3K5S
@@ -0,0 +1 @@
+{"category":"notebook","ctime":1581622991,"mtime":1581622991,"stack":"","title":"Test-Book 2"}
diff --git a/test_inputs/synology_note_station/1026_89F091GQG53B992ODLNRUEAVKS b/test_inputs/synology_note_station/1026_89F091GQG53B992ODLNRUEAVKS
@@ -0,0 +1 @@
+{"ctime":1581623497,"encrypt":false,"latitude":51.3497041,"longitude":6.4225708,"mtime":1581623507,"parent_id":"1026_4OIJ8PR6215E9637KLADHU3K5S","title":"Empty note"}
diff --git a/test_inputs/synology_note_station/1026_95U5E7L2IH3B73EF6RA152KJ3S b/test_inputs/synology_note_station/1026_95U5E7L2IH3B73EF6RA152KJ3S
@@ -0,0 +1 @@
+{"category":"notebook","ctime":1581019360,"mtime":1581622970,"stack":"","title":"Test-Book 1"}
diff --git a/test_inputs/synology_note_station/1026_9OKGI8LVSP1AR2F5FQR7V5RT6S b/test_inputs/synology_note_station/1026_9OKGI8LVSP1AR2F5FQR7V5RT6S
@@ -0,0 +1 @@
+{"attachment":{"_X6UIjCvWr1GScrqvI1bLGw":{"md5":"4b41a3475132bd861b30a878e30aa56a","name":"sample.pdf","rotate":true,"size":3028,"type":"binary"}},"ctime":1581622674,"encrypt":false,"latitude":51.3497041,"longitude":6.4225708,"mtime":1581622807,"parent_id":"1026_95U5E7L2IH3B73EF6RA152KJ3S","source_url":"http://www.africau.edu/images/default/sample.pdf","tag":["pdf-only"],"thumb":null,"title":"note with pdf attached"}
diff --git a/test_inputs/synology_note_station/1026_9SLT09NT4P2A3DO2KQHUPT34QG b/test_inputs/synology_note_station/1026_9SLT09NT4P2A3DO2KQHUPT34QG
@@ -0,0 +1 @@
+{"brief":"This is a simple test note without any attachment.\nbullet point 1\nbullet point 2","content":"<div>This is a simple test note without any attachment.</div><ul><li>bullet point 1</li><li>bullet point 2</li></ul>","ctime":1577746800,"encrypt":false,"latitude":51.3497041,"longitude":6.4225708,"mtime":1581623094,"parent_id":"1026_4OIJ8PR6215E9637KLADHU3K5S","tag":["test_tag_1","test_tag_2"],"title":"Note without any attachment"}
diff --git a/...43F456724EA95C09995C6D6B20650B28058CA1FCD3D2A9CF2F726B821E71156ED03DA074A700732C505E2190E b/...43F456724EA95C09995C6D6B20650B28058CA1FCD3D2A9CF2F726B821E71156ED03DA074A700732C505E2190E
@@ -0,0 +1 @@
+{"attachment":{"_uWhCuJkW2p3GYY1In8x82A":{"ext":"jpg","height":239,"md5":"4774521912a394266afdd8dc9510992e","name":"tractor-3-1386656.jpg","ref":"MTU4MTYyMzM3NTcyOHRyYWN0b3ItMy0xMzg2NjU2LmpwZw==","rotate":true,"size":25350,"type":"image","width":360}},"brief":"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod ","content":"<div>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</div><div></div><div></div><div><span style=\"font-size: 12pt;\">Some jpg</span></div><div></div><div><img class=\"syno-notestation-image-object\" src=\"webman/3rdparty/NoteStation/images/transparent.gif\" border=\"0\" ref=\"MTU4MTYyMzM3NTcyOHRyYWN0b3ItMy0xMzg2NjU2LmpwZw==\" adjust=\"true\" /></div><div></div><div>Screenshot</div><div></div><div></div><div><table style=\"border-collapse: collapse; width: 120pt;\" border=\"0\" width=\"160\" cellspacing=\"0\" cellpadding=\"0\"><tbody><tr style=\"height: 15.0pt;\"><td style=\"height: 15.0pt; width: 60pt;\" width=\"80\" height=\"20\">Test1</td><td style=\"width: 60pt;\" width=\"80\">Test2</td></tr><tr style=\"height: 15.0pt;\"><td style=\"height: 15.0pt;\" align=\"right\" height=\"20\">1</td><td>a</td></tr><tr style=\"height: 15.0pt;\"><td style=\"height: 15.0pt;\" align=\"right\" height=\"20\">2</td><td>b</td></tr><tr style=\"height: 15.0pt;\"><td style=\"height: 15.0pt;\" align=\"right\" height=\"20\">3</td><td>c</td></tr><tr style=\"height: 15.0pt;\"><td style=\"height: 15.0pt;\" 
align=\"right\" height=\"20\">4</td><td>d</td></tr><tr style=\"height: 15.0pt;\"><td style=\"height: 15.0pt;\" align=\"right\" height=\"20\">5</td><td>e</td></tr></tbody></table></div>","ctime":655671600,"latitude":0,"longitude":0,"mtime":1582401173,"parent_id":"1026_95U5E7L2IH3B73EF6RA152KJ3S","source_url":"","tag":["test_tag_1","test_tag_2","test & tag % 3"],"thumb":"_uWhCuJkW2p3GYY1In8x82A","title":"Note with some regular text, some pictures, and some tags"}
diff --git a/test_inputs/synology_note_station/config.json b/test_inputs/synology_note_station/config.json
@@ -0,0 +1,22 @@
+{
+  "note": [
+    "1026_DFB638943F456724EA95C09995C6D6B20650B28058CA1FCD3D2A9CF2F726B821E71156ED03DA074A700732C505E2190E",
+    "1026_89F091GQG53B992ODLNRUEAVKS",
+    "1026_9SLT09NT4P2A3DO2KQHUPT34QG",
+    "1026_9OKGI8LVSP1AR2F5FQR7V5RT6S"
+  ],
+  "notebook": [
+    "1026_95U5E7L2IH3B73EF6RA152KJ3S",
+    "1026_4OIJ8PR6215E9637KLADHU3K5S"
+  ],
+  "shortcut": {
+    "id": [
+      "1026_B2AB2ACD445B4049127284D71C3BB6C796356DB3198C02E5790A676F66AE32D7DE1D92007909C6DFF8A25C184E272830",
+      "1026_A7A13E7021E0651D3970131E85B5DA88E37AF4E55E36B360C7011F67BFA1C915F9F83D345A7FBA4BD49E90F61032C3C4",
+      "1026_AF02E6F04AAF28C39912518354DE88AEC587EC92340C145741410D68B4B7CDB1A556C2FA7D21A8DF2487B0269BC70264",
+      "1026_LM0IOCP9QH51B0AQTNCS4RVHP4"
+    ],
+    "stack": [],
+    "tag": []
+  }
+}
diff --git a/test_inputs/synology_note_station/file_4774521912a394266afdd8dc9510992e b/test_inputs/synology_note_station/file_4774521912a394266afdd8dc9510992e