diff --git a/backend/src/impl/db_utils/system_db_utils.py b/backend/src/impl/db_utils/system_db_utils.py index d05ab8ec..ae0df579 100644 --- a/backend/src/impl/db_utils/system_db_utils.py +++ b/backend/src/impl/db_utils/system_db_utils.py @@ -90,17 +90,6 @@ def system_from_dict( document["system_id"] = str(document.pop("_id")) if document.get("is_private") is None: document["is_private"] = True - if document.get("dataset_metadata_id") and document.get("dataset") is None: - dataset = DatasetDBUtils.find_dataset_by_id(document["dataset_metadata_id"]) - if dataset: - split = document.get("dataset_split") - # this check only valid for create - if split and split not in dataset.split: - abort_with_error_message( - 400, f"{split} is not a valid split for {dataset.dataset_name}" - ) - document["dataset"] = dataset.to_dict() - document.pop("dataset_metadata_id") # Parse the shared users shared_users = document.get("shared_users", None) @@ -216,15 +205,15 @@ def find_systems( if task: search_conditions.append({"task": task}) if dataset_name: - search_conditions.append({"system_info.dataset_name": dataset_name}) + search_conditions.append({"dataset.dataset_name": dataset_name}) if subdataset_name: - search_conditions.append({"system_info.sub_dataset_name": subdataset_name}) + search_conditions.append({"dataset.sub_dataset": subdataset_name}) if source_language: search_conditions.append({"source_language": source_language}) if target_language: search_conditions.append({"target_language": target_language}) if split: - search_conditions.append({"system_info.dataset_split": split}) + search_conditions.append({"dataset.split": split}) if creator: search_conditions.append({"creator": creator}) if shared_users: @@ -233,9 +222,9 @@ def find_systems( if dataset_list: dataset_dicts = [ { - "system_info.dataset_name": ds[0], - "system_info.sub_dataset_name": ds[1], - "system_info.dataset_split": ds[2], + "dataset.dataset_name": ds[0], + "dataset.sub_dataset": ds[1], + "dataset.split": ds[2], } for ds in dataset_list ] @@ -274,7 +263,7 @@ def _load_sys_output( dataset_file_type=FileType(custom_dataset.file_type), output_file_type=FileType(system_output.file_type), ).load() - else: + elif system.dataset: return ( get_loader_class(task=metadata.task) .from_datalab( @@ -290,6 +279,7 @@ def _load_sys_output( ) .load() ) + raise ValueError("neither dataset or custom_dataset is available") @staticmethod def _process( @@ -358,33 +348,57 @@ def create_system( """ Create a system given metadata and outputs, etc. """ - document = metadata.to_dict() - # -- parse the system details if getting a string from the frontend - SystemDBUtils._parse_system_details_in_doc(document, metadata) + def _validate_and_create_system(): + system = {} - # -- set the creator - user = get_user() - document["creator"] = user.id - - # -- set the preferred_username to conform with the return schema - document["preferred_username"] = user.preferred_username - - # -- create the object defined in openapi.yaml from only uploaded metadata - system: System = SystemDBUtils.system_from_dict(document) + user = get_user() + system["creator"] = user.id + system["preferred_username"] = user.preferred_username - # -- validate the dataset metadata - if metadata.dataset_metadata_id: - if not system.dataset: - abort_with_error_message( - 400, f"dataset: {metadata.dataset_metadata_id} does not exist" - ) - if metadata.task not in system.dataset.tasks: - abort_with_error_message( - 400, - f"dataset {system.dataset.dataset_name} cannot be used for " - f"{metadata.task} tasks", + if metadata.dataset_metadata_id: + if not metadata.dataset_split: + abort_with_error_message( + 400, "dataset split is required if a dataset is chosen" + ) + if custom_dataset: + abort_with_error_message( + 400, + "both datalab dataset and custom dataset are " + "provided. please only select one.", + ) + dataset = DatasetDBUtils.find_dataset_by_id( + metadata.dataset_metadata_id ) + if dataset: + if metadata.dataset_split not in dataset.split: + abort_with_error_message( + 400, + f"{metadata.dataset_split} is not a valid split " + f"for {dataset.dataset_name}", + ) + if metadata.task not in dataset.tasks: + abort_with_error_message( + 400, + f"dataset {dataset.dataset_name} cannot be used for " + f"{metadata.task} tasks", + ) + system["dataset"] = { + "dataset_id": dataset.dataset_id, + "split": metadata.dataset_split, + "dataset_name": dataset.dataset_name, + "sub_dataset": dataset.sub_dataset, + } + else: + abort_with_error_message( + 400, f"dataset: {metadata.dataset_metadata_id} cannot be found" + ) + system.update(metadata.to_dict()) + # -- parse the system details if getting a string from the frontend + SystemDBUtils._parse_system_details_in_doc(system, metadata) + return SystemDBUtils.system_from_dict(system) + + system = _validate_and_create_system() try: # -- find the dataset and grab custom features if they exist @@ -428,10 +442,6 @@ def db_operations(session: ClientSession) -> str: document = general_to_dict(system) document.pop("system_id") document.pop("preferred_username") - document["dataset_metadata_id"] = ( - system.dataset.dataset_id if system.dataset else None - ) - document.pop("dataset") system_id = DBUtils.insert_one( DBUtils.DEV_SYSTEM_METADATA, document, session=session ) diff --git a/backend/src/impl/default_controllers_impl.py b/backend/src/impl/default_controllers_impl.py index f036b0d8..8581dec4 100644 --- a/backend/src/impl/default_controllers_impl.py +++ b/backend/src/impl/default_controllers_impl.py @@ -297,30 +297,21 @@ def systems_post(body: SystemCreateProps) -> System: aborts with error if fails TODO: error handling """ - if body.metadata.dataset_metadata_id: - if not body.metadata.dataset_split: - abort_with_error_message( - 400, "dataset split is required if a dataset is chosen" - ) - if body.custom_dataset: - abort_with_error_message( - 400, - "both datalab dataset and custom dataset are " - "provided. please only select one.", - ) - try: body.system_output.data = decode_base64(body.system_output.data) - if body.custom_dataset and body.custom_dataset.data: + if body.custom_dataset: + if not body.custom_dataset.data: + abort_with_error_message(400, "custom_dataset.data cannot be empty") body.custom_dataset.data = decode_base64(body.custom_dataset.data) - system = SystemDBUtils.create_system( - body.metadata, body.system_output, body.custom_dataset - ) - return system except binascii.Error as e: abort_with_error_message( 400, f"file should be sent in plain text base64. ({e})" ) + else: + system = SystemDBUtils.create_system( + body.metadata, body.system_output, body.custom_dataset + ) + return system def systems_update_by_id(body: SystemUpdateProps, system_id: str): @@ -343,15 +334,15 @@ def system_outputs_get_by_id( system = SystemDBUtils.find_system_by_id(system_id) if not _has_read_access(system): abort_with_error_message(403, "system access denied", 40302) - if is_private_dataset( + if system.dataset and is_private_dataset( DatalabLoaderOption( - system.system_info.dataset_name, - system.system_info.sub_dataset_name, - system.system_info.dataset_split, + system.dataset.dataset_name, + system.dataset.sub_dataset, + system.dataset.split, ) ): abort_with_error_message( - 403, f"{system.system_info.dataset_name} is a private dataset", 40301 + 403, f"{system.dataset.dataset_name} is a private dataset", 40301 ) return SystemDBUtils.find_system_outputs(system_id, output_ids) @@ -368,15 +359,15 @@ def system_cases_get_by_id( system = SystemDBUtils.find_system_by_id(system_id) if not _has_read_access(system): abort_with_error_message(403, "system access denied", 40302) - if is_private_dataset( + if system.dataset and is_private_dataset( DatalabLoaderOption( - system.system_info.dataset_name, - system.system_info.sub_dataset_name, - system.system_info.dataset_split, + system.dataset.dataset_name, + system.dataset.sub_dataset, + system.dataset.split, ) ): abort_with_error_message( - 403, f"{system.system_info.dataset_name} is a private dataset", 40301 + 403, f"{system.dataset.dataset_name} is a private dataset", 40301 ) return SystemDBUtils.find_analysis_cases(system_id, level=level, case_ids=case_ids) diff --git a/frontend/src/components/SystemsTable/SystemTableContent.tsx b/frontend/src/components/SystemsTable/SystemTableContent.tsx index 9b2ca45f..90ae28c1 100644 --- a/frontend/src/components/SystemsTable/SystemTableContent.tsx +++ b/frontend/src/components/SystemsTable/SystemTableContent.tsx @@ -130,12 +130,11 @@ export function SystemTableContent({ ), }, { - dataIndex: ["system_info", "split"], + dataIndex: ["dataset", "split"], width: 110, title: "Dataset Split", fixed: "left", align: "center", - render: (_, record) => record.system_info.dataset_split || "unspecified", }, { dataIndex: "source_language", diff --git a/openapi/openapi.yaml b/openapi/openapi.yaml index 2bcc54b1..ed018ca6 100644 --- a/openapi/openapi.yaml +++ b/openapi/openapi.yaml @@ -991,19 +991,18 @@ components: dataset: description: if not specified, it is undefined type: object + nullable: true properties: dataset_id: type: string + split: + type: string dataset_name: type: string - tasks: - type: array - items: - type: string sub_dataset: type: string nullable: true - required: [dataset_id, dataset_name, tasks] + required: [dataset_id, dataset_name, split] system_details: type: object description: |