Skip to content
This repository has been archived by the owner on Nov 27, 2023. It is now read-only.

Commit

Permalink
Merge pull request #628 from docker/machine
Browse files Browse the repository at this point in the history
Guess AWS machine type based on service resources reservations
  • Loading branch information
ndeloof authored Sep 18, 2020
2 parents 10372b7 + b22ebd6 commit 57c14e7
Show file tree
Hide file tree
Showing 3 changed files with 329 additions and 0 deletions.
2 changes: 2 additions & 0 deletions ecs/compatibility.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ var compatibleComposeAttributes = []string{
"services.deploy.resources.reservations",
"services.deploy.resources.reservations.cpus",
"services.deploy.resources.reservations.memory",
"services.deploy.resources.reservations.generic_resources",
"services.deploy.resources.reservations.generic_resources.discrete_resource_spec",
"services.deploy.update_config",
"services.deploy.update_config.parallelism",
"services.entrypoint",
Expand Down
212 changes: 212 additions & 0 deletions ecs/gpu.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
/*
Copyright 2020 Docker, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ecs

import (
"fmt"
"math"
"strconv"

"github.com/compose-spec/compose-go/types"
"github.com/docker/go-units"
)

// machine describes the capacity offered by a single EC2 instance type.
type machine struct {
// id is the instance type name, e.g. "p3.2xlarge".
id string
// cpus is the number of CPUs offered by the instance type.
cpus float64
// memory is the amount of memory offered by the instance type.
memory types.UnitBytes
// gpus is the number of GPUs offered by the instance type.
gpus int64
}

// family is an ordered list of machines. Selection keeps the list order and
// picks the first remaining entry (see filter and firstOrError).
type family []machine

// p3family lists the Amazon EC2 P3 instance types, ordered from the smallest
// to the largest so that picking the first matching entry yields the smallest
// machine able to host the workload.
//
// Figures follow the published P3 specifications: p3.2xlarge ships a single
// GPU and 61 GiB of memory.
var p3family = family{
	{
		id:     "p3.2xlarge",
		cpus:   8,
		memory: 61 * units.GiB,
		gpus:   1,
	},
	{
		id:     "p3.8xlarge",
		cpus:   32,
		memory: 244 * units.GiB,
		gpus:   4,
	},
	{
		id:     "p3.16xlarge",
		cpus:   64,
		memory: 488 * units.GiB,
		gpus:   8,
	},
}

// filterFn is a predicate over a machine; family.filter keeps the machines
// for which it returns true.
type filterFn func(machine) bool

// filter returns the machines of f accepted by fn, preserving their order.
// The result is nil when no machine is accepted.
func (f family) filter(fn filterFn) family {
	var kept family
	for i := range f {
		if !fn(f[i]) {
			continue
		}
		kept = append(kept, f[i])
	}
	return kept
}

// firstOrError returns the first machine of f. When f is empty it returns a
// zero machine and an error built from msg and args using fmt.Errorf
// formatting rules.
func (f family) firstOrError(msg string, args ...interface{}) (machine, error) {
	if len(f) > 0 {
		return f[0], nil
	}
	return machine{}, fmt.Errorf(msg, args...)
}

// guessMachineType returns the id of the first P3 instance type whose memory,
// cpus and gpus all cover the combined requirements of the project's
// gpu-bound services. It returns an error when the requirements cannot be
// computed or when no P3 instance type is large enough.
func guessMachineType(project *types.Project) (string, error) {
	// we select a machine type to match all gpus-bound services requirements
	// once https://github.com/aws/containers-roadmap/issues/631 is implemented we can define dedicated CapacityProviders per service.
	requirements, err := getResourceRequirements(project)
	if err != nil {
		return "", err
	}

	hasEnoughMemory := func(m machine) bool { return m.memory >= requirements.memory }
	hasEnoughCPUs := func(m machine) bool { return m.cpus >= requirements.cpus }
	hasEnoughGPUs := func(m machine) bool { return m.gpus >= requirements.gpus }

	candidates := p3family.
		filter(hasEnoughMemory).
		filter(hasEnoughCPUs).
		filter(hasEnoughGPUs)

	instanceType, err := candidates.firstOrError(
		"none of the Amazon EC2 P3 instance types meet the requirements for memory:%d cpu:%f gpus:%d",
		requirements.memory, requirements.cpus, requirements.gpus,
	)
	if err != nil {
		return "", err
	}
	return instanceType.id, nil
}

// resourceRequirements holds the resources reserved by a service, or the
// combined maximum over several services (see combine).
type resourceRequirements struct {
// memory is the reserved amount of memory.
memory types.UnitBytes
// cpus is the reserved number of CPUs.
cpus float64
// gpus is the reserved number of GPUs.
gpus int64
}

// getResourceRequirements computes, over all services of project that reserve
// at least one GPU, the per-resource maximum of their reservations. Services
// without GPU reservation are ignored. The result is zero-valued when no
// service reserves a GPU.
//
// It returns an error when a service's reservations cannot be parsed.
func getResourceRequirements(project *types.Project) (*resourceRequirements, error) {
	return toResourceRequirementsSlice(project).
		filter(func(requirements *resourceRequirements) bool {
			// toResourceRequirements yields nil for services declaring no
			// deploy section or no reservations; those must be skipped
			// rather than dereferenced.
			return requirements != nil && requirements.gpus != 0
		}).
		max()
}

// eitherRequirementsOrError carries either a list of requirements or the
// error that prevented computing it, so that filter and max can be chained
// and the error is only handled once, at the end of the chain.
type eitherRequirementsOrError struct {
// requirements is the collected list; entries may be nil.
requirements []*resourceRequirements
// err, when non-nil, short-circuits filter and is returned by max.
err error
}

// toResourceRequirementsSlice converts every service of project into its
// resource requirements, one entry per service in declaration order. An entry
// is nil when toResourceRequirements returns nil for that service. The first
// conversion error aborts the collection and is carried in the result instead
// of the list.
func toResourceRequirementsSlice(project *types.Project) eitherRequirementsOrError {
	var collected []*resourceRequirements
	for _, svc := range project.Services {
		req, err := toResourceRequirements(svc)
		if err != nil {
			return eitherRequirementsOrError{err: err}
		}
		collected = append(collected, req)
	}
	return eitherRequirementsOrError{requirements: collected}
}

// filter keeps the requirements accepted by fn, preserving their order. When
// r already carries an error, r is returned untouched and fn is never called.
func (r eitherRequirementsOrError) filter(fn func(*resourceRequirements) bool) eitherRequirementsOrError {
	if r.err != nil {
		return r
	}
	var kept []*resourceRequirements
	for i := range r.requirements {
		if candidate := r.requirements[i]; fn(candidate) {
			kept = append(kept, candidate)
		}
	}
	return eitherRequirementsOrError{requirements: kept}
}

// toResourceRequirements extracts the memory, cpus and gpus reserved by
// service under deploy.resources.reservations.
//
// It returns (nil, nil) when the service declares no deploy section or no
// reservations. The GPU count is taken from the first generic resource whose
// discrete_resource_spec kind is "gpus"; generic resources without a
// discrete_resource_spec are skipped. It returns an error when the cpus
// reservation is not a valid floating-point number.
func toResourceRequirements(service types.ServiceConfig) (*resourceRequirements, error) {
	if service.Deploy == nil {
		return nil, nil
	}
	reservations := service.Deploy.Resources.Reservations
	if reservations == nil {
		return nil, nil
	}

	var requiredGPUs int64
	for _, r := range reservations.GenericResources {
		// A generic resource entry may come without discrete_resource_spec,
		// in which case the pointer is nil and must not be dereferenced.
		spec := r.DiscreteResourceSpec
		if spec != nil && spec.Kind == "gpus" {
			requiredGPUs = spec.Value
			break
		}
	}

	// Despite the field name, NanoCPUs is parsed here as a plain (possibly
	// fractional) number of CPUs, which is what machine.cpus is compared to.
	var cpus float64
	if reservations.NanoCPUs != "" {
		v, err := strconv.ParseFloat(reservations.NanoCPUs, 64)
		if err != nil {
			return nil, fmt.Errorf("invalid cpus reservation %q for service %q: %w", reservations.NanoCPUs, service.Name, err)
		}
		cpus = v
	}
	return &resourceRequirements{
		memory: reservations.MemoryBytes,
		cpus:   cpus,
		gpus:   requiredGPUs,
	}, nil
}

// combine returns the field-by-field maximum of r and o. A nil o leaves r
// unchanged.
func (r resourceRequirements) combine(o *resourceRequirements) resourceRequirements {
	if o == nil {
		return r
	}
	merged := r
	merged.memory = maxUnitBytes(r.memory, o.memory)
	merged.cpus = math.Max(r.cpus, o.cpus)
	merged.gpus = maxInt64(r.gpus, o.gpus)
	return merged
}

// max folds all requirements into a single one holding, for each resource,
// the highest reserved value. It starts from the zero value, so an empty list
// yields zero requirements. When r carries an error, that error is returned
// and the result is nil.
func (r eitherRequirementsOrError) max() (*resourceRequirements, error) {
	if r.err != nil {
		return nil, r.err
	}
	var highest resourceRequirements
	for _, req := range r.requirements {
		highest = highest.combine(req)
	}
	return &highest, nil
}

// maxInt64 returns the larger of a and b.
func maxInt64(a, b int64) int64 {
	if b >= a {
		return b
	}
	return a
}

// maxUnitBytes returns the larger of a and b.
func maxUnitBytes(a, b types.UnitBytes) types.UnitBytes {
	if b >= a {
		return b
	}
	return a
}
115 changes: 115 additions & 0 deletions ecs/gpu_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/*
Copyright 2020 Docker, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ecs

import (
"testing"
)

// TestGuessMachineType checks that guessMachineType picks the smallest P3
// instance type covering the gpus, memory and cpus reserved by a service, and
// fails when no P3 instance type is large enough.
//
// The YAML fixtures are raw strings: their indentation is significant and
// must start at column 0.
func TestGuessMachineType(t *testing.T) {
	tests := []struct {
		name    string
		yaml    string
		want    string
		wantErr bool
	}{
		{
			name: "1-gpus",
			yaml: `
services:
  learning:
    image: tensorflow/tensorflow:latest-gpus
    deploy:
      resources:
        reservations:
          generic_resources:
            - discrete_resource_spec:
                kind: gpus
                value: 1
`,
			want:    "p3.2xlarge",
			wantErr: false,
		},
		{
			name: "4-gpus",
			yaml: `
services:
  learning:
    image: tensorflow/tensorflow:latest-gpus
    deploy:
      resources:
        reservations:
          generic_resources:
            - discrete_resource_spec:
                kind: gpus
                value: 4
`,
			want:    "p3.8xlarge",
			wantErr: false,
		},
		{
			name: "2-gpus, high-memory",
			yaml: `
services:
  learning:
    image: tensorflow/tensorflow:latest-gpus
    deploy:
      resources:
        reservations:
          memory: 300Gb
          generic_resources:
            - discrete_resource_spec:
                kind: gpus
                value: 2
`,
			want:    "p3.16xlarge",
			wantErr: false,
		},
		{
			name: "2-gpus, high-cpu",
			yaml: `
services:
  learning:
    image: tensorflow/tensorflow:latest-gpus
    deploy:
      resources:
        reservations:
          memory: 32Gb
          cpus: "32"
          generic_resources:
            - discrete_resource_spec:
                kind: gpus
                value: 2
`,
			want:    "p3.8xlarge",
			wantErr: false,
		},
		{
			// More gpus than the largest P3 instance type offers.
			name: "16-gpus, no matching instance type",
			yaml: `
services:
  learning:
    image: tensorflow/tensorflow:latest-gpus
    deploy:
      resources:
        reservations:
          generic_resources:
            - discrete_resource_spec:
                kind: gpus
                value: 16
`,
			want:    "",
			wantErr: true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			project := loadConfig(t, tt.yaml)
			got, err := guessMachineType(project)
			if (err != nil) != tt.wantErr {
				t.Errorf("guessMachineType() error = %v, wantErr %v", err, tt.wantErr)
				return
			}
			if got != tt.want {
				t.Errorf("guessMachineType() got = %v, want %v", got, tt.want)
			}
		})
	}
}

0 comments on commit 57c14e7

Please sign in to comment.