# vdp_protocol.yaml

---
# Root of the VDP Protocol schema, written against JSON Schema draft-07.
$schema: http://json-schema.org/draft-07/schema#
$id: https://github.com/instill-ai/vdp/blob/main/protocol/vdp_protocol.yaml
title: 'VDP Protocol'
description: 'VDP Protocol structs'
type: object
# Keys that are not declared under `properties` are accepted at the root.
additionalProperties: true
# A document validates when at least one of the task keys below is present.
# `anyOf` does not forbid several of them appearing together.
anyOf:
  - required: [classification]
  - required: [detection]
  - required: [keypoint]
  - required: [ocr]
  - required: [instance_segmentation]
  - required: [semantic_segmentation]
  - required: [text_to_image]
  - required: [text_generation]
  - required: [unspecified]
# One key per supported task; each one delegates its shape to the entry of
# the same name under `definitions`.
# NOTE(review): JSON Schema draft-07 tells validators to ignore keywords that
# sit next to `$ref`, so these `description` values may not be surfaced by
# strict tooling - confirm whether wrapping the refs in `allOf` is wanted.
properties:
  classification:
    $ref: '#/definitions/Classification'
    description: 'Classify an image into pre-defined categories'
  detection:
    $ref: '#/definitions/Detection'
    description: 'Detect and localise multiple objects'
  keypoint:
    $ref: '#/definitions/Keypoint'
    description: 'Detect and localise keypoints of multiple objects'
  ocr:
    $ref: '#/definitions/Ocr'
    description: 'Detect, localise and recognise texts'
  instance_segmentation:
    $ref: '#/definitions/InstanceSegmentation'
    description: 'Detect, localise and delineate multiple objects'
  semantic_segmentation:
    $ref: '#/definitions/SemanticSegmentation'
    description: 'Classify image pixels into predefined categories'
  text_to_image:
    $ref: '#/definitions/TextToImage'
    description: 'Generates images from input text'
  text_generation:
    $ref: '#/definitions/TextGeneration'
    description: 'Generate new texts from input text'
  unspecified:
    $ref: '#/definitions/Unspecified'
    description: 'Unspecified task with output in the free form'
definitions:
  # Output of a classification task: one category label and its confidence.
  # Both keys are mandatory and no other keys are accepted.
  Classification:
    type: object
    required: [category, score]
    additionalProperties: false
    properties:
      category:
        type: string
        description: 'The predicted category of the input'
      score:
        type: number
        description: 'The confidence score of the predicted category of the input'
  # Output of a detection task: a mandatory `objects` list in which every
  # entry must carry a bounding box, a category and a score.
  Detection:
    type: object
    required: [objects]
    additionalProperties: false
    properties:
      objects:
        type: array
        description: 'A list of detected objects'
        items:
          type: object
          required: [bounding_box, category, score]
          properties:
            bounding_box:
              $ref: '#/definitions/BoundingBox'
            category:
              type: string
              description: 'The predicted category of the bounding box'
            score:
              type: number
              description: 'The confidence score of the predicted category of the bounding box'
  # Output of a keypoint task: a mandatory `objects` list in which every entry
  # carries its group of keypoints and a confidence score.
  Keypoint:
    type: object
    # NOTE(review): the other task definitions in this file set this to
    # `false`; confirm that extra keys are intentionally accepted here.
    additionalProperties: true
    required:
      - objects
    properties:
      objects:
        description: "A list of keypoint objects, a keypoint object includes all the pre-defined keypoints of a detected object"
        type: array
        items:
          type: object
          required:
            - keypoints
            - score
          properties:
            keypoints:
              description: "A keypoint group is composed of a list of pre-defined keypoints of a detected object"
              type: array
              items:
                type: object
                required:
                  - "x"
                  - "y"
                  - "v"
                properties:
                  # The keys are quoted on purpose: a YAML 1.1 resolver can
                  # read a bare `y` as boolean true, which would detach this
                  # sub-schema from the "y" name listed in `required`.
                  "x":
                    description: "x coordinate of the keypoint"
                    type: number
                  "y":
                    description: "y coordinate of the keypoint"
                    type: number
                  "v":
                    description: "visibility score of the keypoint"
                    type: number
            score:
              description: "The confidence score of the predicted object"
              type: number
  # Output of an OCR task: a mandatory `objects` list in which every entry
  # pairs a bounding box with the text read from it and a confidence score.
  Ocr:
    type: object
    additionalProperties: false
    required:
      - objects
    properties:
      objects:
        description: "A list of detected bounding boxes"
        type: array
        items:
          type: object
          required:
            - bounding_box
            - text
            - score
          properties:
            bounding_box:
              "$ref": "#/definitions/BoundingBox"
            text:
              # Refers to the sibling `bounding_box` key; this schema has no
              # `bounding_boxes` field.
              description: "Text string recognised within the `bounding_box`"
              type: string
            score:
              description: "The confidence score of the predicted object"
              type: number
  # Output of an instance-segmentation task: a mandatory `objects` list in
  # which every entry has an RLE mask, a bounding box, a category and a score.
  InstanceSegmentation:
    type: object
    required: [objects]
    additionalProperties: false
    properties:
      objects:
        type: array
        description: 'A list of detected instance bounding boxes'
        items:
          type: object
          required: [rle, bounding_box, category, score]
          properties:
            rle:
              type: string
              description: 'Run Length Encoding (RLE) of instance mask within the bounding box'
            bounding_box:
              $ref: '#/definitions/BoundingBox'
            category:
              type: string
              description: 'The predicted category of the bounding box'
            score:
              type: number
              description: 'The confidence score of the predicted instance object'
  # Output of a semantic-segmentation task: a mandatory `stuffs` list in which
  # every entry pairs an RLE mask with its category.
  SemanticSegmentation:
    type: object
    required: [stuffs]
    additionalProperties: false
    properties:
      stuffs:
        type: array
        description: 'A list of RLE binary masks'
        items:
          type: object
          required: [rle, category]
          properties:
            rle:
              type: string
              description: 'Run Length Encoding (RLE) of each stuff mask within the image'
            category:
              type: string
              description: 'Category text string corresponding to each stuff mask'
  # Output of a text-to-image task: a mandatory `images` list of strings.
  TextToImage:
    type: object
    required: [images]
    additionalProperties: false
    properties:
      images:
        type: array
        description: 'A list of generated images'
        items:
          type: string
          description: 'Generated image string in base64 format'
  # Output of a text-generation task: a single mandatory `text` string.
  TextGeneration:
    type: object
    required: [text]
    additionalProperties: false
    properties:
      text:
        type: string
        description: 'Generated text string'
  # Output of an unspecified task: a mandatory `raw_outputs` list in which
  # every entry names one model output and gives its type, shape and data.
  Unspecified:
    type: object
    required: [raw_outputs]
    additionalProperties: false
    properties:
      raw_outputs:
        type: array
        description: 'The raw output of the model'
        items:
          type: object
          required: [name, data_type, shape, data]
          properties:
            name:
              type: string
              description: 'The corresponding output name of the model output data'
            data_type:
              type: string
              description: 'The type of the output data'
            shape:
              type: array
              description: 'The shape of the output data'
              items:
                type: integer
            data:
              type: array
              description: 'The output data'
              # The empty schema places no constraint on the elements.
              items: {}
  # Shared box shape referenced by the definitions that carry a
  # `bounding_box`. All four numeric keys are mandatory and no others are
  # accepted.
  BoundingBox:
    type: object
    description: 'The detected bounding box in (left, top, width, height) format'
    required: [left, top, width, height]
    additionalProperties: false
    properties:
      left:
        type: number
        description: 'Bounding box left x-axis value'
      top:
        type: number
        description: 'Bounding box top y-axis value'
      width:
        type: number
        description: 'Bounding box width value'
      height:
        type: number
        description: 'Bounding box height value'