add support for Day One

marph91 · Jul 24, 2024 · 18835dd · 18835dd
1 parent 798fa1a
commit 18835dd
Show file tree

Hide file tree

Showing 4 changed files with 249 additions and 0 deletions.
diff --git a/docs/formats/day_one.md b/docs/formats/day_one.md
@@ -0,0 +1,23 @@
+- [Website](https://dayoneapp.com/)
+- Typical extension: `.zip`
+
+## Export Instructions
+
+- <https://dayoneapp.com/guides/tips-and-tutorials/exporting-entries/>
+- Choose "Day One JSON (.zip)"
+
+## Import to Joplin
+
+Example: `jimmy-cli-linux Export-Tagebuch.zip --format day_one`
+
+## Import Structure
+
+- Each day is converted to a notebook.
+- Entries are converted to notes and grouped into the corresponding notebook of that day.
+- Referenced photos are imported as attachments.
+
+## Known Limitations
+
+- Unreferenced photos are not imported.
+- Photos that are referenced by multiple notes are only imported once (i.e. in one note). This seems to be a bug in the Day One export.
+- Audio, PDF and video attachments are not imported. They are a Day One premium feature. If you would like to see support, please provide an example file.
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -39,6 +39,7 @@ nav:
     - Cacher: formats/cacher.md
     - Cherrytree: formats/cherrytree.md
     - Clipto: formats/clipto.md
+    - Day One: formats/day_one.md
     - Dynalist: formats/dynalist.md
     - FuseBase: formats/fusebase.md
     - Google Keep: formats/google_keep.md

diff --git a/src/formats/day_one.py b/src/formats/day_one.py
@@ -0,0 +1,224 @@
+"""Convert Day One notes to the intermediate format."""
+
+import datetime as dt
+from pathlib import Path
+import json
+
+import common
+import converter
+import intermediate_format as imf
+
+
+def guess_title(body):
+    for line in body.split("\n"):
+        if line.startswith("!["):
+            continue
+        if not line.strip():
+            continue
+        return line.lstrip("#").strip()
+    return ""
+
+
+class Converter(converter.BaseConverter):
+    accepted_extensions = [".zip"]
+
+    def prepare_input(self, input_: Path) -> Path:
+        return common.extract_zip(input_)
+
+    def parse_rich_text(self, json_rich_text):
+        # TODO: WIP
+        md_content = []
+        for element in json_rich_text:
+            element_text = element.get("text", "")
+            for attribute, value in element.get("attributes", {}).items():
+                match attribute:
+                    case "autolink":
+                        element_text = f"<{element_text}>"
+                    case "bold" | "highlightedColor":
+                        element_text = f"**{element_text}**"
+                    case "inlineCode":
+                        element_text = f"`{element_text}`"
+                    case "italic":
+                        element_text = f"*{element_text}*"
+                    case "line":
+                        if (header := value.get("header", 0)) > 0:
+                            element_text = f"{'#' * header} {element_text}*"
+                        elif (list_style := value.get("listStyle")) is not None:
+                            match list_style:
+                                case "bulleted":
+                                    bullet = "-"
+                                case "numbered":
+                                    bullet = "1."
+                                case "checkbox":
+                                    if value.get("checked"):
+                                        bullet = "- [x]"
+                                    else:
+                                        bullet = "- [ ]"
+                                case _:
+                                    self.logger.warning(
+                                        f"Unsupported list style {list_style}"
+                                    )
+                                    bullet = "-"
+                            indentation = "    " * (value["indentLevel"] - 1)
+                            element_text = f"{indentation}{bullet} {element_text}"
+                        elif value.get("quote", False):
+                            indentation = "> " * (value["indentLevel"])
+                            element_text = f"{indentation}{element_text}"
+                        else:
+                            self.logger.warning(value, element)
+                    case "linkURL":
+                        if "://" in value:
+                            element_text = f"[{element_text}]({value})"
+                        else:
+                            # assume this is a link to the dayone homepage
+                            element_text = (
+                                f"[{element_text}](https://dayoneapp.com"
+                                f"/guides/tips-and-tutorials/{value})"
+                            )
+                    case _:
+                        self.logger.warning(
+                            f"Unsupported rich text attribute {attribute}"
+                        )
+            md_content.append(element_text)
+        return "".join(md_content)
+
+    def create_notebook_hierarchy(self, date_):
+        def find_or_create_child_notebook(title, parent_notebook):
+            for child_notebook in parent_notebook.child_notebooks:
+                if child_notebook.data["title"] == title:
+                    return child_notebook
+            new_notebook = imf.Notebook({"title": title})
+            parent_notebook.child_notebooks.append(new_notebook)
+            return new_notebook
+
+        return find_or_create_child_notebook(
+            date_.strftime("%Y-%m-%d"), self.root_notebook
+        )
+
+    def get_resource_maps(self, entries):
+        # Create "global" maps. The resources are attached to single entries, but they
+        # can be referenced. For example when copying the same photo to another note,
+        # the same photo gets another id. But both IDs are referenced at the first note
+        # photos...
+        audio_ids = []
+        pdf_ids = []
+        photo_id_filename_map = {}
+        video_ids = []
+
+        assert self.root_path is not None  # for mypy
+        for entry in entries:
+            for audio in entry.get("audios", []):
+                # premium feature - not yet supported
+                audio_ids.append(audio["identifier"])
+            for pdf in entry.get("pdfAttachments", []):
+                # premium feature - not yet supported
+                pdf_ids.append(pdf["identifier"])
+            for photo in entry.get("photos", []):
+                potential_matches = list(
+                    (self.root_path / "photos").glob(f"{photo['md5']}.*")
+                )
+                if len(potential_matches) == 0:
+                    self.logger.warning(f"Couldn't find photo {photo['md5']}")
+                elif len(potential_matches) == 1:
+                    photo_id_filename_map[photo["identifier"]] = Path(
+                        potential_matches[0]
+                    )
+                else:
+                    self.logger.debug(f"Ambiguous photo {photo['md5']}")
+                    photo_id_filename_map[photo["identifier"]] = Path(
+                        potential_matches[0]
+                    )
+            for video in entry.get("videos", []):
+                # premium feature - not yet supported
+                video_ids.append(video["identifier"])
+
+        if audio_ids or pdf_ids or video_ids:
+            self.logger.warning(
+                "Audio/PDF/Video attachments are a Day One premium feature and not "
+                "yet implemented. Please provide an example file if you would like "
+                "to see support."
+            )
+
+        return photo_id_filename_map
+
+    def handle_markdown_links(
+        self, body: str, photo_id_filename_map: dict
+    ) -> tuple[list, list]:
+        assert self.root_path is not None  # for mypy
+
+        resources = []
+        note_links = []
+        for link in common.get_markdown_links(body):
+            if link.is_web_link or link.is_mail_link:
+                continue  # keep the original links
+            if link.url.startswith("dayone2://view?entryId="):
+                # internal link
+                original_id = link.url.replace("dayone2://view?entryId=", "")
+                note_links.append(imf.NoteLink(str(link), original_id, link.text))
+            elif link.url.startswith("dayone-moment://"):
+                # image
+                original_id = link.url.replace("dayone-moment://", "")
+                if original_id not in photo_id_filename_map:
+                    self.logger.warning(f"Couldn't find resource id {original_id}")
+                    continue
+                source_path = (
+                    self.root_path / "photos" / photo_id_filename_map[original_id]
+                )
+                if not source_path.is_file():
+                    continue
+                resources.append(imf.Resource(source_path, str(link), link.text))
+            else:
+                self.logger.warning(f"Unknown URL protocol {link.url}")
+        return resources, note_links
+
+    def convert(self, file_or_folder: Path):
+        self.root_path = self.prepare_input(file_or_folder)
+
+        potential_sources = list(self.root_path.glob("*.json"))
+        if len(potential_sources) != 1:
+            self.logger.warning(
+                f"Found to many or less json files {len(potential_sources)}"
+            )
+            return
+
+        file_dict = json.loads(potential_sources[0].read_text(encoding="utf-8"))
+
+        photo_id_filename_map = self.get_resource_maps(file_dict["entries"])
+
+        for entry in file_dict["entries"]:
+            # TODO: attach non-referenced photos, videos, audios, pdfAttachments
+
+            note_body = entry.get("text", "")
+            note_data = {
+                "title": guess_title(note_body),
+                "body": note_body,  # TODO: Is there any advantage of rich text?
+                "user_created_time": common.iso_to_unix_ms(entry["creationDate"]),
+                "user_updated_time": common.iso_to_unix_ms(entry["modifiedDate"]),
+                "source_application": self.format,
+            }
+
+            common.try_transfer_dicts(
+                entry.get("location", {}), note_data, ["latitude", "longitude"]
+            )
+
+            tags = entry.get("tags", [])
+            if entry.get("starred"):
+                tags.append("day-one-starred")
+            if entry.get("pinned"):
+                tags.append("day-one-pinned")
+
+            resources, note_links = self.handle_markdown_links(
+                note_body, photo_id_filename_map
+            )
+
+            note_joplin = imf.Note(
+                note_data,
+                resources=resources,
+                tags=[imf.Tag({"title": tag}) for tag in tags],
+                note_links=note_links,
+                original_id=entry["uuid"],
+            )
+
+            creation_date = dt.datetime.fromisoformat(entry["creationDate"])
+            parent_notebook = self.create_notebook_hierarchy(creation_date)
+            parent_notebook.child_notes.append(note_joplin)
diff --git a/test/example_commands.sh b/test/example_commands.sh
@@ -15,6 +15,7 @@ $EXECUTABLE "$CACHE/bear/backup.bear2bk" --format bear
 $EXECUTABLE "$CACHE/cacher/cacher-export-202406182304.json" --format cacher
 $EXECUTABLE "$CACHE/cherrytree/cherry.ctb.ctd" --format cherrytree
 $EXECUTABLE "$CACHE/clipto/clipto_backup_240401_105154.json" --format clipto
+$EXECUTABLE "$CACHE/day_one/Export-Tagebuch.zip" --format day_one
 $EXECUTABLE "$CACHE/dynalist/dynalist-backup-2024-04-12.zip" --format dynalist
 $EXECUTABLE "$CACHE/google_keep/takeout-20240401T160516Z-001.zip" --format google_keep
 $EXECUTABLE "$CACHE/google_keep/takeout-20240401T160556Z-001.tgz" --format google_keep