Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,26 @@ jobs:
- name: Run tests
run: make test

# Integration test job: checks out the repository, sets up Go, Docker Buildx
# and kind, then runs the `make integration` target.
integration:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Go
uses: actions/setup-go@v5
with:
# NOTE(review): the Dockerfile builder image in this change is golang:1.24.0;
# confirm that 1.22 is still the intended toolchain for this job.
go-version: '1.22'
cache: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Install kind
uses: helm/kind-action@v1.13.0
with:
# Pinned kind release passed to the action.
version: 'v0.20.0'
- name: Run integration tests
run: make integration
env:
# Same value as the Makefile's default DOCKER_TAG.
DOCKER_TAG: 0.0.0-dev

set-release-vars:
if: ${{ github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/') }}
runs-on: ubuntu-latest
Expand All @@ -67,14 +87,14 @@ jobs:

release-docker:
runs-on: ubuntu-latest
needs: [set-release-vars, lint, test]
needs: [set-release-vars, lint, test, integration]
if: ${{ needs.set-release-vars.outputs.release_version != '' }}
permissions:
contents: read
packages: write
strategy:
matrix:
component: [device-plugin, status-updater, kwok-gpu-device-plugin, status-exporter, topology-server, mig-faker, jupyter-notebook]
component: [device-plugin, dra-plugin-gpu, status-updater, kwok-gpu-device-plugin, status-exporter, topology-server, mig-faker, jupyter-notebook]
steps:
- uses: actions/checkout@v4
- name: Log in to GitHub Container Registry
Expand Down
35 changes: 1 addition & 34 deletions .github/workflows/synced-jira-ticket-test-design-validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,37 +31,4 @@ jobs:
exit 1
else
echo "found jira ticket: ${{steps.ticket.outputs.issue}}"
fi
- name: Setup Jira CLI
uses: atlassian/gajira-cli@v2.0.2
- name: Verify not epic and that design and test plan fields are filled out
run: |
export JIRA_API_TOKEN=${{ secrets.JIRA_SECRET }}

jira list -q "ID=${{steps.ticket.outputs.issue}} AND type!=Story AND type!=Bug" > /tmp/query1-result.txt
cat /tmp/query1-result.txt
if [ -s /tmp/query1-result.txt ]; then
echo "Rejected: Ticket: ${{steps.ticket.outputs.issue}} is not a bug and not a story"
exit 1
else
echo "Good: Ticket is a bug or story"
fi

jira list -q "(ID=${{steps.ticket.outputs.issue}} AND type=Story AND (\"Design[Paragraph]\" is EMPTY OR \"Test Plan[Paragraph]\" is EMPTY))" > /tmp/query2-result.txt
cat /tmp/query2-result.txt
if [ -s /tmp/query2-result.txt ]; then
echo "Rejected: Its a story and either design or a test plan at not filled out in ticket: ${{steps.ticket.outputs.issue}} "
exit 1
else
echo "All OK. Either its a bug, or a story where both Design and Test Plan fields are filled out in ticket ${{steps.ticket.outputs.issue}}"
fi

jira list -q "ID=${{steps.ticket.outputs.issue}} AND status=Done" > /tmp/query3-result.txt
cat /tmp/query3-result.txt
if [ -s /tmp/query3-result.txt ]; then
echo "Rejected: Ticket: ${{steps.ticket.outputs.issue}} ticket is already done"
exit 1
else
echo "Good: Ticket is not done"
fi

fi
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
# Go workspace file
go.work

# Pre-commit config
.pre-commit-config.yaml

### Go Patch ###
/vendor/
/Godeps/
Expand Down Expand Up @@ -59,4 +62,5 @@ go.work
bin/
.env
.vscode/
.local/
.local/
.cursor/
11 changes: 10 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM --platform=$BUILDPLATFORM golang:1.22.1 AS common-builder
FROM --platform=$BUILDPLATFORM golang:1.24.0 AS common-builder
WORKDIR $GOPATH/src/github.com/run-ai/fake-gpu-operator
COPY go.mod .
COPY go.sum .
Expand Down Expand Up @@ -41,6 +41,10 @@ COPY ./cmd/mig-faker/ ./cmd/mig-faker/
COPY ./internal/ ./internal/
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENTS=mig-faker

# Builder stage for the dra-plugin-gpu binary: adds the component's cmd sources
# on top of common-builder and builds only that component via `make build`.
# NOTE(review): unlike the mig-faker stage above, no ./internal/ COPY is visible
# here, yet cmd/dra-plugin-gpu imports internal/dra-plugin-gpu - confirm that
# common-builder already provides those sources.
FROM common-builder AS dra-plugin-gpu-builder
COPY ./cmd/dra-plugin-gpu/ ./cmd/dra-plugin-gpu/
# The Go build cache is kept in a BuildKit cache mount.
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENTS=dra-plugin-gpu

FROM common-builder AS preloader-builder
COPY ./cmd/preloader/ ./cmd/preloader/
RUN make build-preloader
Expand Down Expand Up @@ -74,3 +78,8 @@ ENTRYPOINT ["/bin/mig-faker"]
FROM ubuntu AS kwok-gpu-device-plugin
COPY --from=kwok-gpu-device-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/kwok-gpu-device-plugin /bin/
ENTRYPOINT ["/bin/kwok-gpu-device-plugin"]

# Runtime image for dra-plugin-gpu: ships the plugin binary together with the
# nvidia-smi binary produced by the nvidia-smi-builder stage.
FROM ubuntu AS dra-plugin-gpu
COPY --from=dra-plugin-gpu-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/dra-plugin-gpu /bin/
COPY --from=nvidia-smi-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/nvidia-smi /bin/
ENTRYPOINT ["/bin/dra-plugin-gpu"]
19 changes: 17 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
BUILD_DIR=$(shell pwd)/bin
COMPONENTS?=device-plugin status-updater kwok-gpu-device-plugin status-exporter topology-server mig-faker jupyter-notebook
COMPONENTS?=device-plugin dra-plugin-gpu status-updater kwok-gpu-device-plugin status-exporter topology-server mig-faker

DOCKER_REPO_BASE=gcr.io/run-ai-lab/fake-gpu-operator
DOCKER_REPO_BASE=ghcr.io/run-ai/fake-gpu-operator
DOCKER_TAG?=0.0.0-dev
NAMESPACE=gpu-operator

Expand Down Expand Up @@ -38,6 +38,21 @@ test: ginkgo
$(GINKGO) -r --procs=1 --output-dir=/tmp/artifacts/test-results/service-tests --compilers=1 --randomize-all --randomize-suites --fail-on-pending --keep-going --timeout=5m --race --trace --json-report=report.json
.PHONY: test

# Prepares the integration test environment by running the setup script.
setup-integration:
test/integration/setup.sh
.PHONY: setup-integration

# Runs the integration suite with ginkgo from test/integration
# (single process, 30 minute timeout, stack traces on failure).
test-integration: ginkgo
cd test/integration && $(GINKGO) --procs=1 --timeout=30m --trace
.PHONY: test-integration

# Cleans up the integration test environment by running the teardown script.
teardown-integration:
test/integration/teardown.sh
.PHONY: teardown-integration

# Full integration cycle: setup, tests, teardown.
# NOTE(review): in a serial make run without -k, a failing test-integration
# stops the build before teardown-integration runs, leaving the environment up.
integration: setup-integration test-integration teardown-integration
.PHONY: integration

clean:
rm -rf ${BUILD_DIR}
.PHONY: clean
Expand Down
174 changes: 174 additions & 0 deletions cmd/dra-plugin-gpu/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
package main

import (
"context"
"errors"
"fmt"
"os"
"os/signal"
"syscall"

"github.com/urfave/cli/v2"

"k8s.io/dynamic-resource-allocation/kubeletplugin"
"k8s.io/klog/v2"

dra_plugin_gpu "github.com/run-ai/fake-gpu-operator/internal/dra-plugin-gpu"
"sigs.k8s.io/dra-example-driver/pkg/flags"
)

// Flags collects the command-line options of the plugin binary. Every field
// is filled in by the CLI flag definitions assembled in newApp.
type Flags struct {
	// Flag groups provided by the dra-example-driver flags package.
	loggingConfig    *flags.LoggingConfig
	kubeClientConfig flags.KubeClientConfig

	// nodeName is the name of the node to be worked on.
	nodeName string
	// healthcheckPort is the gRPC healthcheck port; zero requests a random
	// port and a negative value disables the service.
	healthcheckPort int

	// Filesystem locations: where CDI files are generated and the kubelet's
	// plugin registration and plugin data directories.
	cdiRoot                       string
	kubeletRegistrarDirectoryPath string
	kubeletPluginsDirectoryPath   string
}

// main runs the CLI application with the process arguments. When the
// application returns an error it is printed to stderr and the process exits
// with status 1.
func main() {
	runErr := newApp().Run(os.Args)
	if runErr == nil {
		return
	}

	fmt.Fprintf(os.Stderr, "Error: %v\n", runErr)
	os.Exit(1)
}

// newApp builds the CLI application for the plugin binary.
//
// It declares the plugin's own flags (each also settable through the listed
// environment variable), appends the kube-client and logging flag groups, and
// wires two hooks: Before rejects positional arguments and applies the logging
// configuration; Action creates the Kubernetes client sets, assembles the
// internal plugin configuration and hands control to RunPlugin.
func newApp() *cli.App {
	// Named opts rather than flags so the imported flags package is not
	// shadowed for the rest of the function.
	opts := &Flags{
		loggingConfig: flags.NewLoggingConfig(),
	}

	cliFlags := []cli.Flag{
		&cli.StringFlag{
			Name:        "node-name",
			Usage:       "The name of the node to be worked on.",
			Required:    true,
			Destination: &opts.nodeName,
			EnvVars:     []string{"NODE_NAME"},
		},
		&cli.StringFlag{
			Name:        "cdi-root",
			Usage:       "Absolute path to the directory where CDI files will be generated.",
			Value:       "/etc/cdi",
			Destination: &opts.cdiRoot,
			EnvVars:     []string{"CDI_ROOT"},
		},
		&cli.StringFlag{
			Name:        "kubelet-registrar-directory-path",
			Usage:       "Absolute path to the directory where kubelet stores plugin registrations.",
			Value:       kubeletplugin.KubeletRegistryDir,
			Destination: &opts.kubeletRegistrarDirectoryPath,
			EnvVars:     []string{"KUBELET_REGISTRAR_DIRECTORY_PATH"},
		},
		&cli.StringFlag{
			Name:        "kubelet-plugins-directory-path",
			Usage:       "Absolute path to the directory where kubelet stores plugin data.",
			Value:       kubeletplugin.KubeletPluginsDir,
			Destination: &opts.kubeletPluginsDirectoryPath,
			EnvVars:     []string{"KUBELET_PLUGINS_DIRECTORY_PATH"},
		},
		&cli.IntFlag{
			Name:  "healthcheck-port",
			Usage: "Port to start a gRPC healthcheck service. When positive, a literal port number. When zero, a random port is allocated. When negative, the healthcheck service is disabled.",
			// Negative default: the healthcheck service is off unless requested.
			Value:       -1,
			Destination: &opts.healthcheckPort,
			EnvVars:     []string{"HEALTHCHECK_PORT"},
		},
	}
	cliFlags = append(cliFlags, opts.kubeClientConfig.Flags()...)
	cliFlags = append(cliFlags, opts.loggingConfig.Flags()...)

	return &cli.App{
		// NOTE(review): name and usage still carry the upstream example
		// driver's identity; confirm whether they should say dra-plugin-gpu.
		Name:            "dra-example-kubeletplugin",
		Usage:           "dra-example-kubeletplugin implements a DRA driver plugin.",
		ArgsUsage:       " ",
		HideHelpCommand: true,
		Flags:           cliFlags,
		Before: func(c *cli.Context) error {
			// The binary is configured through flags only.
			if c.Args().Len() > 0 {
				return fmt.Errorf("arguments not supported: %v", c.Args().Slice())
			}
			return opts.loggingConfig.Apply()
		},
		Action: func(c *cli.Context) error {
			clientSets, err := opts.kubeClientConfig.NewClientSets()
			if err != nil {
				// %w keeps the underlying error available to errors.Is/As.
				return fmt.Errorf("create client: %w", err)
			}

			config := &dra_plugin_gpu.Config{
				Flags: &dra_plugin_gpu.Flags{
					NodeName:                      opts.nodeName,
					CDIRoot:                       opts.cdiRoot,
					KubeletRegistrarDirectoryPath: opts.kubeletRegistrarDirectoryPath,
					KubeletPluginsDirectoryPath:   opts.kubeletPluginsDirectoryPath,
					HealthcheckPort:               opts.healthcheckPort,
				},
				CoreClient: clientSets.Core,
			}

			return RunPlugin(c.Context, config)
		},
	}
}

// RunPlugin starts the DRA plugin and blocks until the context is done.
//
// It creates the driver plugin directory, makes sure the CDI root exists and
// is a directory (creating it when missing), installs signal handling for
// SIGHUP, SIGINT, SIGTERM and SIGQUIT, creates the driver and sets up the node
// controller. Once the context is done it shuts the driver down.
//
// An error is returned only for setup failures. A failed driver shutdown and
// a non-cancellation context cause are logged, and nil is returned.
func RunPlugin(ctx context.Context, config *dra_plugin_gpu.Config) error {
	logger := klog.FromContext(ctx)

	if err := os.MkdirAll(config.DriverPluginPath(), 0750); err != nil {
		return fmt.Errorf("create driver plugin directory: %w", err)
	}

	cdiRoot := config.Flags.CDIRoot
	info, err := os.Stat(cdiRoot)
	switch {
	case err != nil && os.IsNotExist(err):
		if err := os.MkdirAll(cdiRoot, 0750); err != nil {
			return fmt.Errorf("create cdi root directory: %w", err)
		}
	case err != nil:
		return fmt.Errorf("stat cdi root directory: %w", err)
	case !info.IsDir():
		// err is nil in this branch, so report the offending path itself.
		return fmt.Errorf("path for cdi file generation is not a directory: '%v'", cdiRoot)
	}

	ctx, stop := signal.NotifyContext(ctx, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
	defer stop()
	ctx, cancel := context.WithCancelCause(ctx)
	// Stored on the config; presumably this lets the driver cancel the main
	// context with a cause - verify against the dra_plugin_gpu package.
	config.CancelMainCtx = cancel

	driver, err := dra_plugin_gpu.NewDriver(ctx, config)
	if err != nil {
		return err
	}

	// Set up node controller to watch for annotation changes
	if err := dra_plugin_gpu.SetupNodeController(ctx, driver.GetState(), config.Flags.NodeName); err != nil {
		logger.Error(err, "Failed to setup node controller")
		return fmt.Errorf("failed to setup node controller: %w", err)
	}

	<-ctx.Done()
	// restore default signal behavior as soon as possible in case graceful
	// shutdown gets stuck.
	stop()
	if err := context.Cause(ctx); err != nil && !errors.Is(err, context.Canceled) {
		// A canceled context is the normal case here when the process receives
		// a signal. Only log the error for more interesting cases.
		logger.Error(err, "error from context")
	}

	// Shutdown is best effort: a failure is logged but does not change the
	// function's result.
	if err := driver.Shutdown(logger); err != nil {
		logger.Error(err, "Unable to cleanly shutdown driver")
	}

	return nil
}
Loading
Loading