Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pydatalab/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ server = [
"Flask-PyMongo ~= 2.3",
"Flask-Mail ~= 0.10",
"Flask-Compress ~= 1.15",
"APScheduler ~= 3.10",
"Werkzeug ~= 3.0",
"python-dotenv ~= 1.0",
"pillow ~= 11.0",
Expand Down
169 changes: 169 additions & 0 deletions pydatalab/src/pydatalab/export_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
import json
import shutil
import tempfile
import zipfile
from datetime import datetime, timezone
from pathlib import Path

from bson import ObjectId

from pydatalab.config import CONFIG
from pydatalab.mongo import flask_mongo


def _convert_value(value):
    """Return a JSON-serialisable copy of a single, possibly nested, value.

    Datetimes become ISO 8601 strings, ObjectIds become their string form,
    and dicts and lists are converted recursively. Any other value is
    returned unchanged.
    """
    # The handled types are mutually exclusive, so the order of these checks
    # does not affect the result.
    if isinstance(value, datetime):
        return value.isoformat()
    if isinstance(value, dict):
        return _convert_objectids_in_dict(value)
    if isinstance(value, list):
        # Recurse per element so that lists nested inside lists are also
        # converted, rather than only the first level.
        return [_convert_value(element) for element in value]
    if isinstance(value, ObjectId):
        return str(value)
    return value


def _convert_objectids_in_dict(d: dict) -> dict:
    """Recursively convert ObjectIds and datetimes in a dictionary to strings.

    Parameters:
        d: The dictionary to convert. It is not modified.

    Returns:
        A new dictionary with the same keys, in which every ObjectId is
        replaced by ``str(value)`` and every datetime by ``value.isoformat()``,
        at any depth of nested dicts and lists.
    """
    return {key: _convert_value(value) for key, value in d.items()}


def generate_ro_crate_metadata(collection_data: dict, child_items: list[dict]) -> dict:
    """Build the RO-Crate JSON-LD document that describes an exported collection.

    The resulting graph contains, in order: the descriptor for
    ``ro-crate-metadata.json`` itself, the root dataset for the collection,
    and one dataset per child item. An item gets a ``hasPart`` list only when
    at least one of its ``file_ObjectIds`` resolves to a record in the
    ``files`` database collection.

    Parameters:
        collection_data: The collection document; its ``title``,
            ``collection_id`` and ``description`` keys are read.
        child_items: The item documents in the collection; each must have
            an ``item_id``.

    Returns:
        A dictionary with the ``@context`` and ``@graph`` keys.
    """
    item_refs: list[dict] = [{"@id": f"./{entry['item_id']}/"} for entry in child_items]

    descriptor = {
        "@id": "ro-crate-metadata.json",
        "@type": "CreativeWork",
        "about": {"@id": "./"},
        "conformsTo": {"@id": "https://w3id.org/ro/crate/1.1"},
        "dateCreated": datetime.now(tz=timezone.utc).isoformat(),
        "sdPublisher": {
            "@id": CONFIG.IDENTIFIER_PREFIX or "https://github.com/datalab-org/datalab"
        },
    }
    root_dataset = {
        "@id": "./",
        "@type": "Dataset",
        "name": collection_data.get("title", collection_data.get("collection_id")),
        "description": collection_data.get("description", ""),
        "hasPart": item_refs,
    }
    entities: list[dict] = [descriptor, root_dataset]

    for entry in child_items:
        item_id = entry["item_id"]

        # Only datetime values are serialised; any other value (including a
        # missing date, i.e. None) is passed through untouched.
        created = entry.get("date")
        if isinstance(created, datetime):
            created = created.isoformat()

        dataset = {
            "@id": f"./{item_id}/",
            "@type": "Dataset",
            "name": entry.get("name", item_id),
            "identifier": item_id,
            "dateCreated": created,
        }

        attached_ids = entry.get("file_ObjectIds")
        if attached_ids:
            # Look up each file record in turn; IDs with no record are dropped.
            records = (
                flask_mongo.db.files.find_one({"_id": ObjectId(file_id)})
                for file_id in attached_ids
            )
            parts = [{"@id": f"./{item_id}/{record['name']}"} for record in records if record]
            if parts:
                dataset["hasPart"] = parts

        entities.append(dataset)

    return {
        "@context": "https://w3id.org/ro/crate/1.1/context",
        "@graph": entities,
    }


def create_eln_file(collection_id: str, output_path: str) -> None:
    """Create a .eln file (a zipped RO-Crate) for a collection.

    The archive has a single root folder named after the collection. It holds
    ``ro-crate-metadata.json`` and one sub-folder per item, each containing a
    ``metadata.json`` plus copies of the item's attached files that could be
    found on disk.

    Parameters:
        collection_id: ID of the collection to export
        output_path: Path where the .eln file should be saved

    Raises:
        ValueError: If no collection with the given ID exists.
    """
    import logging

    logger = logging.getLogger(__name__)

    collection_data = flask_mongo.db.collections.find_one({"collection_id": collection_id})
    if not collection_data:
        raise ValueError(f"Collection {collection_id} not found")

    # Items are linked to their collection via a relationship that carries the
    # collection's immutable database ID.
    collection_immutable_id = collection_data["_id"]
    child_items = list(
        flask_mongo.db.items.find(
            {
                "relationships": {
                    "$elemMatch": {"type": "collections", "immutable_id": collection_immutable_id}
                }
            }
        )
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        root_folder = temp_path / collection_id
        root_folder.mkdir()

        ro_crate_metadata = generate_ro_crate_metadata(collection_data, child_items)
        with open(root_folder / "ro-crate-metadata.json", "w", encoding="utf-8") as f:
            json.dump(ro_crate_metadata, f, indent=2, ensure_ascii=False)

        for item in child_items:
            item_folder = root_folder / item["item_id"]
            item_folder.mkdir()

            # The database ID and the raw file references are left out of the
            # per-item metadata; everything else is made JSON-serialisable.
            item_metadata = _convert_objectids_in_dict(
                {k: v for k, v in item.items() if k not in ("_id", "file_ObjectIds")}
            )
            with open(item_folder / "metadata.json", "w", encoding="utf-8") as f:
                json.dump(item_metadata, f, indent=2, ensure_ascii=False)

            for file_id in item.get("file_ObjectIds") or []:
                file_data = flask_mongo.db.files.find_one({"_id": ObjectId(file_id)})
                if not file_data:
                    continue

                source_path = Path(file_data["location"])
                # Require a regular file: a directory at this location would
                # make shutil.copy2 raise and abort the whole export.
                if not source_path.is_file():
                    # NOTE(review): generate_ro_crate_metadata still lists this
                    # file under "hasPart", as it does not check the disk.
                    logger.warning(
                        "File missing on disk or not a regular file: %s",
                        file_data["location"],
                    )
                    continue

                shutil.copy2(source_path, item_folder / file_data["name"])

        # The archive must be written before the temporary directory is removed.
        with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zipf:
            for file_path in root_folder.rglob("*"):
                if file_path.is_file():
                    # Paths are stored relative to the temporary directory, so
                    # every entry sits under the collection's root folder.
                    zipf.write(file_path, file_path.relative_to(temp_path))
34 changes: 34 additions & 0 deletions pydatalab/src/pydatalab/models/export_task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from datetime import datetime, timezone
from enum import Enum

from pydantic import BaseModel, Field


class ExportStatus(str, Enum):
    """Status of an export task.

    The enum also subclasses ``str``, so each member compares equal to its
    plain string value (e.g. ``ExportStatus.READY == "ready"``).
    """

    # Default status of a newly constructed ExportTask.
    PENDING = "pending"
    PROCESSING = "processing"
    READY = "ready"
    # ExportTask.error_message is documented as being set for this status.
    ERROR = "error"


class ExportTask(BaseModel):
    """Model for an export task.

    ``task_id``, ``collection_id`` and ``creator_id`` are required. ``status``
    defaults to pending and ``created_at`` to the current UTC time, while
    ``completed_at``, ``file_path`` and ``error_message`` default to ``None``.
    """

    task_id: str = Field(..., description="Unique identifier for the export task")
    collection_id: str = Field(..., description="ID of the collection being exported")
    status: ExportStatus = Field(
        default=ExportStatus.PENDING, description="Current status of the task"
    )
    creator_id: str = Field(..., description="ID of the user who created the export")
    # A factory is used so the timestamp is taken when each instance is
    # created, not once when the class is defined; it is timezone-aware (UTC).
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(tz=timezone.utc),
        description="When the task was created",
    )
    completed_at: datetime | None = Field(None, description="When the task was completed")
    file_path: str | None = Field(None, description="Path to the generated .eln file")
    error_message: str | None = Field(None, description="Error message if status is ERROR")

    # Store the enum's plain value (e.g. "pending") on the model rather than
    # the ExportStatus member itself.
    # NOTE(review): a nested ``Config`` class is the pydantic v1 spelling and is
    # deprecated in v2 in favour of ``model_config`` - confirm the pinned version.
    class Config:
        use_enum_values = True
11 changes: 11 additions & 0 deletions pydatalab/src/pydatalab/mongo.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,4 +208,15 @@ def create_user_fts():
db.users.drop_index(user_fts_name)
ret += create_user_fts()

ret += db.export_tasks.create_index(
"task_id", unique=True, name="unique task ID", background=background
)
ret += db.export_tasks.create_index(
"creator_id", name="export task creator", background=background
)
ret += db.export_tasks.create_index(
"created_at", name="export task created at", background=background
)
ret += db.export_tasks.create_index("status", name="export task status", background=background)

return ret
2 changes: 2 additions & 0 deletions pydatalab/src/pydatalab/routes/v0_1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .auth import AUTH, OAUTH, OAUTH_PROXIES
from .blocks import BLOCKS
from .collections import COLLECTIONS
from .export import EXPORT
from .files import FILES
from .graphs import GRAPHS
from .healthcheck import HEALTHCHECK
Expand All @@ -25,6 +26,7 @@
HEALTHCHECK,
INFO,
GRAPHS,
EXPORT,
)

__all__ = ("BLUEPRINTS", "OAUTH", "__api_version__", "OAUTH_PROXIES")
Loading