CADC-11585: Added Ray to the science-platform (keel-dev) for evaluation purposes. #370

Open · wants to merge 7 commits into base: main
Changes from 3 commits
3 changes: 3 additions & 0 deletions deployment/k8s-config/ray/install-ray.sh
@@ -0,0 +1,3 @@
#!/bin/bash
helm -n cadc-ray install example-cluster ./ray
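The script assumes the cadc-ray namespace already exists (helm install does not create it unless --create-namespace is added). A minimal, hedged way to check that the release came up after running it:

# Operator pod plus the head/worker pods of example-cluster should appear here.
kubectl -n cadc-ray get pods
# The RayCluster custom resource created by the chart (CRD from crds/cluster_crd.yaml).
kubectl -n cadc-ray get raycluster example-cluster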

23 changes: 23 additions & 0 deletions deployment/k8s-config/ray/ray/.helmignore
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
10 changes: 10 additions & 0 deletions deployment/k8s-config/ray/ray/Chart.yaml
@@ -0,0 +1,10 @@
apiVersion: v2
name: ray
description: A Helm chart for deployments of Ray on Kubernetes.
type: application

# Chart version.
version: 0.1.0

# Ray version.
appVersion: "latest"
4,321 changes: 4,321 additions & 0 deletions deployment/k8s-config/ray/ray/crds/cluster_crd.yaml

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions deployment/k8s-config/ray/ray/templates/_helpers.tpl
@@ -0,0 +1,10 @@
{{/*
Compute clusterMaxWorkers as the sum of per-pod-type max workers.
*/}}
{{- define "ray.clusterMaxWorkers" -}}
{{- $total := 0 }}
{{- range .Values.podTypes }}
{{- $total = add $total .maxWorkers }}
{{- end }}
{{- $total }}
{{- end }}
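As a rough worked example of this helper: with the default values.yaml below, rayHeadType sets no maxWorkers (treated as 0) and rayWorkerType sets maxWorkers: 3, so clusterMaxWorkers should render as 3. One way to confirm the rendered value without installing anything:

# Render the chart locally (run from deployment/k8s-config/ray) and inspect the computed maxWorkers values.
helm template example-cluster ./ray | grep -i maxworkers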
35 changes: 35 additions & 0 deletions deployment/k8s-config/ray/ray/templates/operator_cluster_scoped.yaml
@@ -0,0 +1,35 @@
{{- if and (not .Values.namespacedOperator) (not .Values.clusterOnly) }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ray-operator
  namespace: {{ .Values.operatorNamespace }}
spec:
  replicas: 1
  selector:
    matchLabels:
      cluster.ray.io/component: operator
  template:
    metadata:
      labels:
        cluster.ray.io/component: operator
    spec:
      serviceAccountName: default
      containers:
      - name: ray
        imagePullPolicy: Always
        image: {{ .Values.operatorImage }}
        command: ["ray-operator"]
        env:
        - name: AUTOSCALER_MAX_NUM_FAILURES
          value: "inf"
        resources:
          requests:
            cpu: 1
            memory: 1Gi
            ephemeral-storage: 1Gi
          limits:
            memory: 2Gi
            cpu: 1
{{- end }}
38 changes: 38 additions & 0 deletions deployment/k8s-config/ray/ray/templates/operator_namespaced.yaml
@@ -0,0 +1,38 @@
{{- if and (.Values.namespacedOperator) (not .Values.clusterOnly) }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ray-operator
spec:
  replicas: 1
  selector:
    matchLabels:
      cluster.ray.io/component: operator
  template:
    metadata:
      labels:
        cluster.ray.io/component: operator
    spec:
      serviceAccountName: default
      containers:
      - name: ray
        imagePullPolicy: Always
        image: {{ .Values.operatorImage }}
        command: ["ray-operator"]
        env:
        - name: RAY_OPERATOR_POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: AUTOSCALER_MAX_NUM_FAILURES
          value: "inf"
        resources:
          requests:
            cpu: 1
            memory: 1Gi
            ephemeral-storage: 1Gi
          limits:
            memory: 2Gi
            cpu: 1
{{- end }}
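The chart ships both operator variants above, and values.yaml defaults to the namespaced one. If the cluster-scoped flavour were wanted instead, a hedged sketch of the install would be (the ray-operator namespace name is illustrative and must already exist):

# Cluster-scoped operator in its own namespace; it then manages RayClusters across namespaces.
helm -n cadc-ray install example-cluster ./ray \
  --set namespacedOperator=false \
  --set operatorNamespace=ray-operator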
100 changes: 100 additions & 0 deletions deployment/k8s-config/ray/ray/templates/raycluster.yaml
@@ -0,0 +1,100 @@
{{- if not .Values.operatorOnly }}
apiVersion: cluster.ray.io/v1
kind: RayCluster
metadata:
  name: {{ .Release.Name }}
spec:
  # The maximum number of worker nodes to launch in addition to the head node.
  maxWorkers: {{ include "ray.clusterMaxWorkers" . }}
  # The autoscaler will scale up the cluster faster with higher upscaling speed.
  # E.g., if the task requires adding more nodes, the autoscaler will gradually
  # scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
  # This number should be > 0.
  upscalingSpeed: {{ .Values.upscalingSpeed | default 1.0 }}
  # If a node is idle for this many minutes, it will be removed.
  idleTimeoutMinutes: {{ .Values.idleTimeoutMinutes | default 5 }}
  # Specify the pod type for the ray head node (as configured below).
  headPodType: {{ .Values.headPodType }}
  # Specify the allowed pod types for this ray cluster and the resources they provide.
  podTypes:
  {{- range $key, $val := .Values.podTypes }}
  - name: {{ $key }}
    minWorkers: {{ $val.minWorkers | default 0}}
    maxWorkers: {{ $val.maxWorkers | default 0}}
    {{- if $val.rayResources }}
    rayResources:
      {{- toYaml $val.rayResources | nindent 8 }}
    {{- end }}
    podConfig:
      apiVersion: v1
      kind: Pod
      metadata:
        generateName: {{ kebabcase $key }}-
      spec:
        restartPolicy: Never
        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp, which can cause slowdowns if it is not a shared memory volume.
        volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        containers:
        - name: ray-node
          imagePullPolicy: Always
          image: {{ $.Values.image }}
          # Do not change this command - it keeps the pod alive until it is
          # explicitly killed.
          command: ["/bin/bash", "-c", "--"]
          args: ['trap : TERM INT; sleep infinity & wait;']
          env:
          - name: RAY_gcs_server_rpc_server_thread_num
            value: "1"
          ports:
          - containerPort: 6379 # Redis port for Ray <= 1.10.0. GCS server port for Ray >= 1.11.0.
          - containerPort: 10001 # Used by Ray Client
          - containerPort: 8265 # Used by Ray Dashboard
          - containerPort: 8000 # Used by Ray Serve

          # This volumeMount exposes the shared memory volume declared above,
          # so that Ray's plasma object store does not fall back to /tmp.
          volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          resources:
            requests:
              cpu: {{ .CPU }}
              memory: {{ .memory }}
            limits:
              cpu: {{ .CPU }}
              # The maximum memory that this pod is allowed to use. The
              # limit will be detected by ray and split to use 10% for
              # redis, 30% for the shared memory object store, and the
              # rest for application memory. If this limit is not set and
              # the object store size is not set manually, ray will
              # allocate a very large object store in each pod that may
              # cause problems for other pods.
              memory: {{ .memory }}
              {{- if .GPU }}
              nvidia.com/gpu: {{ .GPU }}
              {{- end }}
        {{- if .nodeSelector }}
        nodeSelector:
          {{- toYaml $val.nodeSelector | nindent 12 }}
        {{- end }}
        {{- if $val.tolerations }}
        tolerations:
          {{- toYaml $val.tolerations | nindent 10 }}
        {{- end }}
  {{- end }}
  # Commands to start Ray on the head node. You don't need to change this.
  # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
  headStartRayCommands:
    - ray stop
    - ulimit -n 65536; ray start --head --port=6379 --no-monitor --dashboard-host 0.0.0.0
  # Commands to start Ray on worker nodes. You don't need to change this.
  workerStartRayCommands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
{{- end }}
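Given the ports declared above, the usual way to reach the cluster from outside Kubernetes is to port-forward the head node. The example-cluster-ray-head service name below is an assumption based on the legacy Ray operator's naming convention, so verify it with kubectl get svc first:

# Forward the Ray Client (10001) and Dashboard (8265) ports to localhost.
kubectl -n cadc-ray port-forward service/example-cluster-ray-head 10001:10001 8265:8265
# A client with a matching Ray version can then connect via ray://127.0.0.1:10001,
# and the dashboard is reachable at http://127.0.0.1:8265.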
111 changes: 111 additions & 0 deletions deployment/k8s-config/ray/ray/values.yaml
@@ -0,0 +1,111 @@
# Default values for Ray.

# RayCluster settings:

# image is the Ray image to use for the head and workers of this Ray cluster.
# It's recommended to build custom dependencies for your workload into this image,
# taking one of the official `rayproject/ray` images as the base.
image: rayproject/ray:latest
# The autoscaler will scale up the cluster faster with higher upscaling speed.
# If the task requires adding more nodes, the autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscalingSpeed: 1.0
# If a node is idle for this many minutes, it will be removed.
idleTimeoutMinutes: 5
# headPodType is the podType used for the Ray head node (as configured below).
headPodType: rayHeadType
# podTypes is the list of pod configurations available for use as Ray nodes.
podTypes:
  # The key for each podType is a user-defined string.
  # Since we set headPodType: rayHeadType, the Ray head pod will use the configuration
  # defined in this entry of podTypes:
  rayHeadType:
    # CPU is the number of CPUs used by this pod type.
    # (Used for both requests and limits. Must be an integer, as Ray does not support fractional CPUs.)
    CPU: 1
    # memory is the memory used by this Pod type.
    # (Used for both requests and limits.)
    memory: 1Gi
    # GPU is the number of NVIDIA GPUs used by this pod type.
    # (Optional, requires GPU nodes with appropriate setup. See https://docs.ray.io/en/master/cluster/kubernetes-gpu.html)
    GPU: 0
    # rayResources is an optional string-int mapping signalling additional resources to Ray.
    # "CPU", "GPU", and "memory" are filled automatically based on the above settings, but can be overridden;
    # For example, rayResources: {"CPU": 0} can be used in the head podType to prevent Ray from scheduling tasks on the head.
    # See https://docs.ray.io/en/master/advanced.html#dynamic-remote-parameters for an example of usage of custom resources in a Ray task.
    rayResources: {}
    # Optionally, set a node selector for this podType: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector
    nodeSelector: {}

    # tolerations for Ray pods of this podType (the head's podType in this case)
    # ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
    # Note that it is often not necessary to manually specify tolerations for GPU
    # usage on managed platforms such as AKS, EKS, and GKE.
    # ref: https://docs.ray.io/en/master/cluster/kubernetes-gpu.html
    tolerations: []
    # - key: "nvidia.com/gpu"
    #   operator: Exists
    #   effect: NoSchedule

  # The key for each podType is a user-defined string.
  rayWorkerType:
    # minWorkers is the minimum number of Ray workers of this pod type to keep running.
    minWorkers: 2
    # maxWorkers is the maximum number of Ray workers of this pod type to which Ray will scale.
    maxWorkers: 3
    # memory is the memory used by this Pod type.
    # (Used for both requests and limits.)
    memory: 1Gi
    # CPU is the number of CPUs used by this pod type.
    # (Used for both requests and limits. Must be an integer, as Ray does not support fractional CPUs.)
    CPU: 1
    # GPU is the number of NVIDIA GPUs used by this pod type.
    # (Optional, requires GPU nodes with appropriate setup. See https://docs.ray.io/en/master/cluster/kubernetes-gpu.html)
    GPU: 0
    # rayResources is an optional string-int mapping signalling additional resources to Ray.
    # "CPU", "GPU", and "memory" are filled automatically based on the above settings, but can be overridden;
    # For example, rayResources: {"CPU": 0} can be used in the head podType to prevent Ray from scheduling tasks on the head.
    # See https://docs.ray.io/en/master/advanced.html#dynamic-remote-parameters for an example of usage of custom resources in a Ray task.
    rayResources: {}
    # Optionally, set a node selector for this Pod type. See https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector
    nodeSelector: {}

    # tolerations for Ray pods of this podType
    # ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
    # Note that it is often not necessary to manually specify tolerations for GPU
    # usage on managed platforms such as AKS, EKS, and GKE.
    # ref: https://docs.ray.io/en/master/cluster/kubernetes-gpu.html
    tolerations: []
    # - key: nvidia.com/gpu
    #   operator: Exists
    #   effect: NoSchedule

  # Optionally, define more worker podTypes
  # rayWorkerType2:
  #   minWorkers: 0
  #   maxWorkers: 10
  #   memory: ...

# Operator settings:

# operatorOnly - If true, will only set up the Operator with this release,
# without launching a Ray cluster.
operatorOnly: false
# clusterOnly - If true, will only create a RayCluster resource with this release,
# without setting up the Operator.
# (Useful when launching multiple Ray clusters.)
clusterOnly: false
# namespacedOperator - If true, the operator is scoped to the Release namespace
# and only manages RayClusters in that namespace.
# By default, the operator is cluster-scoped and runs in the default namespace.
namespacedOperator: true
# operatorNamespace - If using a cluster-scoped operator (namespacedOperator: false), set the namespace
# in which to launch the operator.
operatorNamespace: default
# operatorImage - The image used in the operator deployment.
# It is recommended to use one of the official `rayproject/ray` images for the operator.
# It is recommended to use the same Ray version in the operator as in the Ray clusters managed
# by the operator. In other words, the images specified under the fields `operatorImage` and `image`
# should carry matching Ray versions.
operatorImage: rayproject/ray:latest
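For the keel-dev evaluation it may be worth pinning the image instead of using latest and sizing the worker pool explicitly. A hedged example of overriding these defaults at install time (the 1.13.0 tag is purely illustrative):

# Keep image and operatorImage on the same pinned Ray version, and widen the worker pool.
helm -n cadc-ray install example-cluster ./ray \
  --set image=rayproject/ray:1.13.0 \
  --set operatorImage=rayproject/ray:1.13.0 \
  --set podTypes.rayWorkerType.maxWorkers=5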
2 changes: 2 additions & 0 deletions deployment/k8s-config/ray/uninstall-ray.sh
@@ -0,0 +1,2 @@
kubectl -n cadc-ray delete raycluster example-cluster
helm -n cadc-ray uninstall example-cluster
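Note that Helm does not remove CRDs installed from a chart's crds/ directory on uninstall, so a complete teardown probably also needs the step below (the CRD name is inferred from the cluster.ray.io/v1 RayCluster kind used in the chart):

# Optional: remove the RayCluster CRD once no Ray clusters remain in the Kubernetes cluster.
kubectl delete crd rayclusters.cluster.ray.io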