Skip to content

Commit 3122c6a

Browse files
committed
⚗️ Example of how to extract text from a base64-encoded Yjs document
1 parent ac86a4e commit 3122c6a

File tree

2 files changed

+36
-0
lines changed

2 files changed

+36
-0
lines changed

src/backend/core/tests/documents/test_api_documents_retrieve.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@
22
Tests for Documents API endpoint in impress's core app: retrieve
33
"""
44

5+
import base64
6+
from bs4 import BeautifulSoup
57
import pytest
8+
import y_py as Y
9+
610
from rest_framework.test import APIClient
711

812
from core import factories, models
@@ -581,3 +585,33 @@ def test_api_documents_retrieve_authenticated_related_team_owners(
581585
"created_at": document.created_at.isoformat().replace("+00:00", "Z"),
582586
"updated_at": document.updated_at.isoformat().replace("+00:00", "Z"),
583587
}
588+
589+
590+
def test_read_document_yjs_blocknote():
    """Extract the plain text of a BlockNote document from its base64 Yjs payload."""
    # "Hello world" was typed in the BlockNote editor; this is the base64
    # string of the resulting Yjs document as saved in Minio.
    encoded_update = "ARCymr/3DgAHAQ5kb2N1bWVudC1zdG9yZQMKYmxvY2tHcm91cAcAspq/9w4AAw5ibG9ja0NvbnRhaW5lcgcAspq/9w4BAwlwYXJhZ3JhcGgHALKav/cOAgYEALKav/cOAwFIKACymr/3DgINdGV4dEFsaWdubWVudAF3BGxlZnQoALKav/cOAQJpZAF3DmluaXRpYWxCbG9ja0lkKACymr/3DgEJdGV4dENvbG9yAXcHZGVmYXVsdCgAspq/9w4BD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSHspq/9w4BAw5ibG9ja0NvbnRhaW5lcgcAspq/9w4JAwlwYXJhZ3JhcGgoALKav/cOCg10ZXh0QWxpZ25tZW50AXcEbGVmdCgAspq/9w4JAmlkAXckMTFjYTgzYmEtZGM3OS00N2Q3LTllNzYtNmM4OTQwNzc1ZjE3KACymr/3DgkJdGV4dENvbG9yAXcHZGVmYXVsdCgAspq/9w4JD2JhY2tncm91bmRDb2xvcgF3B2RlZmF1bHSEspq/9w4EC2VsbG8gd29ybGQgAA=="

    # Rebuild the Yjs document by applying the decoded update to an empty doc.
    ydoc = Y.YDoc()
    Y.apply_update(ydoc, bytearray(base64.b64decode(encoded_update)))

    # The serialized "document-store" element will look like this:
    # <UNDEFINED>
    #   <blockGroup>
    #     <blockContainer "backgroundColor"="default" "id"="initialBlockId" "textColor"="default">
    #       <paragraph "textAlignment"="left">Hello world </paragraph>
    #     </blockContainer>
    #     <blockContainer "id"="11ca83ba-dc79-47d7-9e76-6c8940775f17" "backgroundColor"="default" "textColor"="default">
    #       <paragraph "textAlignment"="left"></paragraph>
    #     </blockContainer>
    #   </blockGroup>
    # </UNDEFINED>
    xml_markup = str(ydoc.get_xml_element("document-store"))

    # BeautifulSoup pulls the text nodes out of the markup above.
    parsed_markup = BeautifulSoup(xml_markup, "html.parser")
    extracted_text = parsed_markup.get_text(separator=" ").strip()

    assert extracted_text == "Hello world"

src/backend/pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ license = { file = "LICENSE" }
2525
readme = "README.md"
2626
requires-python = ">=3.10"
2727
dependencies = [
28+
"beautifulsoup4==4.12.3",
2829
"boto3==1.35.10",
2930
"Brotli==1.1.0",
3031
"celery[redis]==5.4.0",
@@ -57,6 +58,7 @@ dependencies = [
5758
"WeasyPrint>=60.2",
5859
"whitenoise==6.7.0",
5960
"mozilla-django-oidc==4.0.1",
61+
"y-py==0.5.5",
6062
]
6163

6264
[project.urls]

0 commit comments

Comments
 (0)