From be0792563a896cda3db40b08beaa497df14a7f20 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Mon, 1 Dec 2025 11:08:15 +0200 Subject: [PATCH 1/6] Add initial implementation of provision-probe service --- cmd/provision-probe/main.go | 50 ++++++++ configs/config.example.yaml | 20 +++ docker-compose.yml | 23 ++++ go.mod | 81 ++++++++++++ go.sum | 237 ++++++++++++++++++++++++++++++++++++ pkg/app/app.go | 148 ++++++++++++++++++++++ pkg/config/config.go | 168 +++++++++++++++++++++++++ pkg/db/db.go | 94 ++++++++++++++ pkg/grid/client.go | 135 ++++++++++++++++++++ pkg/grid/proxy.go | 61 ++++++++++ pkg/logger/logger.go | 29 +++++ 11 files changed, 1046 insertions(+) create mode 100644 cmd/provision-probe/main.go create mode 100644 configs/config.example.yaml create mode 100644 docker-compose.yml create mode 100644 go.mod create mode 100644 go.sum create mode 100644 pkg/app/app.go create mode 100644 pkg/config/config.go create mode 100644 pkg/db/db.go create mode 100644 pkg/grid/client.go create mode 100644 pkg/grid/proxy.go create mode 100644 pkg/logger/logger.go diff --git a/cmd/provision-probe/main.go b/cmd/provision-probe/main.go new file mode 100644 index 00000000..8acf26b6 --- /dev/null +++ b/cmd/provision-probe/main.go @@ -0,0 +1,50 @@ +package main + +import ( + "context" + "flag" + "fmt" + "os" + "os/signal" + "syscall" + + "github.com/rs/zerolog/log" + "github.com/threefoldtech/provision-probe/pkg/app" + "github.com/threefoldtech/provision-probe/pkg/config" + "github.com/threefoldtech/provision-probe/pkg/logger" +) + +func main() { + configPath := flag.String("config", "configs/config.yaml", "path to config file") + flag.Parse() + + cfg, err := config.Load(*configPath) + if err != nil { + fmt.Fprintf(os.Stderr, "Failed to load config: %v\n", err) + os.Exit(1) + } + + logger.Init(cfg.LogLevel) + + application, err := app.New(cfg) + if err != nil { + log.Fatal().Err(err).Msg("Failed to initialize app") + } + defer application.Close() + + ctx, cancel := 
context.WithCancel(context.Background()) + defer cancel() + + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) + + go func() { + <-sigChan + log.Info().Msg("Shutting down...") + cancel() + }() + + if err := application.Run(ctx); err != nil { + log.Fatal().Err(err).Msg("Service failed") + } +} diff --git a/configs/config.example.yaml b/configs/config.example.yaml new file mode 100644 index 00000000..6c83f003 --- /dev/null +++ b/configs/config.example.yaml @@ -0,0 +1,20 @@ +interval: "6h" +workers: 10 +timeout: "10m" +log_level: "info" +workload: "light" + +grid: + network: "dev" + mnemonic: "" + +nodes: + status: ["up"] + farms: [1] + nodes: [11] + exclude: [] + +score_window: "90d" + +timescaledb: + url: "postgresql://postgres:postgres@localhost:5432/provision_probe" diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..dc4f3dcd --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,23 @@ +version: '3.8' + +services: + timescaledb: + image: timescale/timescaledb:latest-pg16 + container_name: provision-probe-db + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: provision_probe + ports: + - "5432:5432" + volumes: + - timescaledb-data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 10s + timeout: 5s + retries: 5 + +volumes: + timescaledb-data: + diff --git a/go.mod b/go.mod new file mode 100644 index 00000000..25f3486d --- /dev/null +++ b/go.mod @@ -0,0 +1,81 @@ +module github.com/threefoldtech/provision-probe + +go 1.23.0 + +require ( + github.com/jackc/pgx/v5 v5.5.0 + github.com/rs/zerolog v1.34.0 + github.com/spf13/viper v1.18.2 +) + +require ( + github.com/threefoldtech/tfgrid-sdk-go/grid-client v0.17.4 + github.com/threefoldtech/tfgrid-sdk-go/grid-proxy v0.17.4 +) + +require ( + github.com/ChainSafe/go-schnorrkel v1.1.0 // indirect + github.com/cenkalti/backoff v2.2.1+incompatible // indirect + 
github.com/cenkalti/backoff/v3 v3.2.2 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/centrifuge/go-substrate-rpc-client/v4 v4.0.12 // indirect + github.com/cosmos/go-bip39 v1.0.0 // indirect + github.com/deckarep/golang-set v1.8.0 // indirect + github.com/decred/base58 v1.0.5 // indirect + github.com/decred/dcrd/crypto/blake256 v1.0.1 // indirect + github.com/decred/dcrd/dcrec/secp256k1/v4 v4.3.0 // indirect + github.com/ethereum/go-ethereum v1.11.6 // indirect + github.com/fsnotify/fsnotify v1.7.0 // indirect + github.com/getsentry/sentry-go v0.29.1 // indirect + github.com/go-stack/stack v1.8.1 // indirect + github.com/golang-jwt/jwt v3.2.2+incompatible // indirect + github.com/gomodule/redigo v2.0.0+incompatible // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/gorilla/schema v1.4.1 // indirect + github.com/gorilla/websocket v1.5.3 // indirect + github.com/gtank/merlin v0.1.1 // indirect + github.com/gtank/ristretto255 v0.1.2 // indirect + github.com/hashicorp/errwrap v1.1.0 // indirect + github.com/hashicorp/go-multierror v1.1.1 // indirect + github.com/hashicorp/hcl v1.0.0 // indirect + github.com/holiman/uint256 v1.2.3 // indirect + github.com/jackc/pgpassfile v1.0.0 // indirect + github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a // indirect + github.com/jackc/puddle/v2 v2.2.1 // indirect + github.com/jbenet/go-base58 v0.0.0-20150317085156-6237cf65f3a6 // indirect + github.com/magiconair/properties v1.8.7 // indirect + github.com/mattn/go-colorable v0.1.14 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/mimoo/StrobeGo v0.0.0-20220103164710-9a04d6ca976b // indirect + github.com/mitchellh/mapstructure v1.5.0 // indirect + github.com/pelletier/go-toml/v2 v2.1.0 // indirect + github.com/pierrec/xxHash v0.1.5 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/rs/cors v1.10.1 // indirect + github.com/sagikazarmark/locafero v0.4.0 // indirect + 
github.com/sagikazarmark/slog-shim v0.1.0 // indirect + github.com/sourcegraph/conc v0.3.0 // indirect + github.com/spf13/afero v1.11.0 // indirect + github.com/spf13/cast v1.6.0 // indirect + github.com/spf13/pflag v1.0.5 // indirect + github.com/subosito/gotenv v1.6.0 // indirect + github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20250929084418-b950278ead30 // indirect + github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go v0.17.3 // indirect + github.com/threefoldtech/zosbase v1.0.3 // indirect + github.com/vedhavyas/go-subkey v1.0.3 // indirect + go.uber.org/atomic v1.9.0 // indirect + go.uber.org/multierr v1.9.0 // indirect + golang.org/x/crypto v0.37.0 // indirect + golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c // indirect + golang.org/x/sync v0.13.0 // indirect + golang.org/x/sys v0.32.0 // indirect + golang.org/x/text v0.24.0 // indirect + golang.zx2c4.com/wireguard/wgctrl v0.0.0-20200609130330-bd2cb7843e1b // indirect + gonum.org/v1/gonum v0.16.0 // indirect + google.golang.org/protobuf v1.36.6 // indirect + gopkg.in/ini.v1 v1.67.0 // indirect + gopkg.in/natefinch/npipe.v2 v2.0.0-20160621034901-c1b8fa8bdcce // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) + +replace github.com/threefoldtech/tfgrid-sdk-go => github.com/threefoldtech/tfgrid-sdk-go v0.12.0 diff --git a/go.sum b/go.sum new file mode 100644 index 00000000..0a585e31 --- /dev/null +++ b/go.sum @@ -0,0 +1,237 @@ +github.com/ChainSafe/go-schnorrkel v1.1.0 h1:rZ6EU+CZFCjB4sHUE1jIu8VDoB/wRKZxoe1tkcO71Wk= +github.com/ChainSafe/go-schnorrkel v1.1.0/go.mod h1:ABkENxiP+cvjFiByMIZ9LYbRoNNLeBLiakC1XeTFxfE= +github.com/btcsuite/btcd v0.22.0-beta h1:LTDpDKUM5EeOFBPM8IXpinEcmZ6FWfNZbE3lfrfdnWo= +github.com/btcsuite/btcd/btcec/v2 v2.2.0 h1:fzn1qaOt32TuLjFlkzYSsBC35Q3KUjT1SwPxiMSCF5k= +github.com/btcsuite/btcd/btcec/v2 v2.2.0/go.mod h1:U7MHm051Al6XmscBQ0BoNydpOTsFAn707034b5nY8zU= +github.com/btcsuite/btcutil v1.0.3-0.20201208143702-a53e38424cce 
h1:YtWJF7RHm2pYCvA5t0RPmAaLUhREsKuKd+SLhxFbFeQ= +github.com/btcsuite/btcutil v1.0.3-0.20201208143702-a53e38424cce/go.mod h1:0DVlHczLPewLcPGEIeUEzfOJhqGPQ0mJJRDBtD307+o= +github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= +github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM= +github.com/cenkalti/backoff/v3 v3.2.2 h1:cfUAAO3yvKMYKPrvhDuHSwQnhZNk/RMHKdZqKTxfm6M= +github.com/cenkalti/backoff/v3 v3.2.2/go.mod h1:cIeZDE3IrqwwJl6VUwCN6trj1oXrTS4rc0ij+ULvLYs= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/centrifuge/go-substrate-rpc-client/v4 v4.0.12 h1:DCYWIBOalB0mKKfUg2HhtGgIkBbMA1fnlnkZp7fHB18= +github.com/centrifuge/go-substrate-rpc-client/v4 v4.0.12/go.mod h1:5g1oM4Zu3BOaLpsKQ+O8PAv2kNuq+kPcA1VzFbsSqxE= +github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/cosmos/go-bip39 v1.0.0 h1:pcomnQdrdH22njcAatO0yWojsUnCO3y2tNoV1cb6hHY= +github.com/cosmos/go-bip39 v1.0.0/go.mod h1:RNJv0H/pOIVgxw6KS7QeX2a0Uo0aKUlfhZ4xuwvCdJw= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/deckarep/golang-set v1.8.0 h1:sk9/l/KqpunDwP7pSjUg0keiOOLEnOBHzykLrsPppp4= +github.com/deckarep/golang-set v1.8.0/go.mod h1:5nI87KwE7wgsBU1F4GKAw2Qod7p5kyS383rP6+o6qqo= +github.com/deckarep/golang-set/v2 v2.1.0 h1:g47V4Or+DUdzbs8FxCCmgb6VYd+ptPAngjM6dtGktsI= +github.com/deckarep/golang-set/v2 v2.1.0/go.mod 
h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4= +github.com/decred/base58 v1.0.5 h1:hwcieUM3pfPnE/6p3J100zoRfGkQxBulZHo7GZfOqic= +github.com/decred/base58 v1.0.5/go.mod h1:s/8lukEHFA6bUQQb/v3rjUySJ2hu+RioCzLukAVkrfw= +github.com/decred/dcrd/crypto/blake256 v1.0.1 h1:7PltbUIQB7u/FfZ39+DGa/ShuMyJ5ilcvdfma9wOH6Y= +github.com/decred/dcrd/crypto/blake256 v1.0.1/go.mod h1:2OfgNZ5wDpcsFmHmCK5gZTPcCXqlm2ArzUIkw9czNJo= +github.com/decred/dcrd/dcrec/secp256k1/v4 v4.3.0 h1:rpfIENRNNilwHwZeG5+P150SMrnNEcHYvcCuK6dPZSg= +github.com/decred/dcrd/dcrec/secp256k1/v4 v4.3.0/go.mod h1:v57UDF4pDQJcEfFUCRop3lJL149eHGSe9Jvczhzjo/0= +github.com/ethereum/go-ethereum v1.11.6 h1:2VF8Mf7XiSUfmoNOy3D+ocfl9Qu8baQBrCNbo2CXQ8E= +github.com/ethereum/go-ethereum v1.11.6/go.mod h1:+a8pUj1tOyJ2RinsNQD4326YS+leSoKGiG/uVVb0x6Y= +github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= +github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= +github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/getsentry/sentry-go v0.29.1 h1:DyZuChN8Hz3ARxGVV8ePaNXh1dQ7d76AiB117xcREwA= +github.com/getsentry/sentry-go v0.29.1/go.mod h1:x3AtIzN01d6SiWkderzaH28Tm0lgkafpJ5Bm3li39O0= +github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= +github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= +github.com/go-stack/stack v1.8.1 h1:ntEHSVwIt7PNXNpgPmVfMrNhLtgjlmnZha2kOpuRiDw= +github.com/go-stack/stack v1.8.1/go.mod h1:dcoOX6HbPZSZptuspn9bctJ+N/CnF5gGygcUP3XYfe4= +github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/golang-jwt/jwt v3.2.2+incompatible 
h1:IfV12K8xAKAnZqdXVzCZ+TOjboZ2keLg81eXfW3O+oY= +github.com/golang-jwt/jwt v3.2.2+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I= +github.com/golang/mock v1.6.0 h1:ErTB+efbowRARo13NNdxyJji2egdxLGQhRaY+DUumQc= +github.com/golang/mock v1.6.0/go.mod h1:p6yTPP+5HYm5mzsMV8JkE6ZKdX+/wYM6Hr+LicevLPs= +github.com/gomodule/redigo v2.0.0+incompatible h1:K/R+8tc58AaqLkqG2Ol3Qk+DR/TlNuhuh457pBFPtt0= +github.com/gomodule/redigo v2.0.0+incompatible/go.mod h1:B4C85qUVwatsJoIUNIfCRsp7qO0iAmpGFZ4EELWSbC4= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/gofuzz v1.1.1-0.20200604201612-c04b05f3adfa h1:Q75Upo5UN4JbPFURXZ8nLKYUvF85dyFRop/vQ0Rv+64= +github.com/google/gofuzz v1.1.1-0.20200604201612-c04b05f3adfa/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/schema v1.4.1 h1:jUg5hUjCSDZpNGLuXQOgIWGdlgrIdYvgQ0wZtdK1M3E= +github.com/gorilla/schema v1.4.1/go.mod h1:Dg5SSm5PV60mhF2NFaTV1xuYYj8tV8NOPRo4FggUMnM= +github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= +github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gtank/merlin v0.1.1 h1:eQ90iG7K9pOhtereWsmyRJ6RAwcP4tHTDBHXNg+u5is= +github.com/gtank/merlin v0.1.1/go.mod h1:T86dnYJhcGOh5BjZFCJWTDeTK7XW8uE+E21Cy/bIQ+s= +github.com/gtank/ristretto255 v0.1.2 h1:JEqUCPA1NvLq5DwYtuzigd7ss8fwbYay9fi4/5uMzcc= +github.com/gtank/ristretto255 v0.1.2/go.mod 
h1:Ph5OpO6c7xKUGROZfWVLiJf9icMDwUeIvY4OmlYW69o= +github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= +github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= +github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= +github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= +github.com/holiman/uint256 v1.2.3 h1:K8UWO1HUJpRMXBxbmaY1Y8IAMZC/RsKB+ArEnnK4l5o= +github.com/holiman/uint256 v1.2.3/go.mod h1:SC8Ryt4n+UBbPbIBKaG9zbbDlp4jOru9xFZmPzLUTxw= +github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= +github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= +github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a h1:bbPeKD0xmW/Y25WS6cokEszi5g+S0QxI/d45PkRi7Nk= +github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= +github.com/jackc/pgx/v5 v5.5.0 h1:NxstgwndsTRy7eq9/kqYc/BZh5w2hHJV86wjvO+1xPw= +github.com/jackc/pgx/v5 v5.5.0/go.mod h1:Ig06C2Vu0t5qXC60W8sqIthScaEnFvojjj9dSljmHRA= +github.com/jackc/puddle/v2 v2.2.1 h1:RhxXJtFG022u4ibrCSMSiu5aOq1i77R3OHKNJj77OAk= +github.com/jackc/puddle/v2 v2.2.1/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= +github.com/jbenet/go-base58 v0.0.0-20150317085156-6237cf65f3a6 h1:4zOlv2my+vf98jT1nQt4bT/yKWUImevYPJ2H344CloE= +github.com/jbenet/go-base58 v0.0.0-20150317085156-6237cf65f3a6/go.mod h1:r/8JmuR0qjuCiEhAolkfvdZgmPiHTnJaG0UXCSeR1Zo= +github.com/jsimonetti/rtnetlink v0.0.0-20190606172950-9527aa82566a/go.mod h1:Oz+70psSo5OFh8DBl0Zv2ACw7Esh6pPUphlvZG9x7uw= +github.com/jsimonetti/rtnetlink 
v0.0.0-20200117123717-f846d4f6c1f4/go.mod h1:WGuG/smIU4J/54PblvSbh+xvCZmpJnFgr3ds6Z55XMQ= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= +github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= +github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8= +github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mdlayher/genetlink v1.0.0/go.mod h1:0rJ0h4itni50A86M2kHcgS85ttZazNt7a8H2a2cw0Gc= +github.com/mdlayher/netlink v0.0.0-20190409211403-11939a169225/go.mod h1:eQB3mZE4aiYnlUsyGGCOpPETfdQq4Jhsgf1fk3cwQaA= +github.com/mdlayher/netlink v1.0.0/go.mod h1:KxeJAFOFLG6AjpyDkQ/iIhxygIUKD+vcwqcnu43w/+M= +github.com/mdlayher/netlink v1.1.0/go.mod h1:H4WCitaheIsdF9yOYu8CFmCgQthAPIWZmcKp9uZHgmY= +github.com/mikioh/ipaddr v0.0.0-20190404000644-d465c8ab6721/go.mod h1:Ickgr2WtCLZ2MDGd4Gr0geeCH5HybhRJbonOgQpvSxc= +github.com/mimoo/StrobeGo v0.0.0-20181016162300-f8f6d4d2b643/go.mod h1:43+3pMjjKimDBf5Kr4ZFNGbLql1zKkbImw+fZbw3geM= +github.com/mimoo/StrobeGo v0.0.0-20220103164710-9a04d6ca976b h1:QrHweqAtyJ9EwCaGHBu1fghwxIPiopAHV06JlXrMHjk= +github.com/mimoo/StrobeGo 
v0.0.0-20220103164710-9a04d6ca976b/go.mod h1:xxLb2ip6sSUts3g1irPVHyk/DGslwQsNOo9I7smJfNU= +github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= +github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/pelletier/go-toml/v2 v2.1.0 h1:FnwAJ4oYMvbT/34k9zzHuZNrhlz48GB3/s6at6/MHO4= +github.com/pelletier/go-toml/v2 v2.1.0/go.mod h1:tJU2Z3ZkXwnxa4DPO899bsyIoywizdUvyaeZurnPPDc= +github.com/pierrec/xxHash v0.1.5 h1:n/jBpwTHiER4xYvK3/CdPVnLDPchj8eTJFFLUb4QHBo= +github.com/pierrec/xxHash v0.1.5/go.mod h1:w2waW5Zoa/Wc4Yqe0wgrIYAGKqRMf7czn2HNKXmuL+I= +github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4= +github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= +github.com/rs/cors v1.10.1 h1:L0uuZVXIKlI1SShY2nhFfo44TYvDPQ1w4oFkUJNfhyo= +github.com/rs/cors v1.10.1/go.mod h1:XyqrcTp5zjWr1wsJ8PIRZssZ8b/WMcMf71DJnit4EMU= +github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0= +github.com/rs/zerolog v1.34.0 h1:k43nTLIwcTVQAncfCw4KZ2VY6ukYoZaBPNOE8txlOeY= +github.com/rs/zerolog v1.34.0/go.mod h1:bJsvje4Z08ROH4Nhs5iH600c3IkWhwp44iRc54W6wYQ= +github.com/sagikazarmark/locafero v0.4.0 h1:HApY1R9zGo4DBgr7dqsTH/JJxLTTsOt7u6keLGt6kNQ= 
+github.com/sagikazarmark/locafero v0.4.0/go.mod h1:Pe1W6UlPYUk/+wc/6KFhbORCfqzgYEpgQ3O5fPuL3H4= +github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE= +github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= +github.com/shirou/gopsutil v3.21.11+incompatible h1:+1+c1VGhc88SSonWP6foOcLhvnKlUeu/erjjvaPEYiI= +github.com/shirou/gopsutil v3.21.11+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA= +github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= +github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0= +github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8= +github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY= +github.com/spf13/cast v1.6.0 h1:GEiTHELF+vaR5dhz3VqZfFSzZjYbgeKDpBxQVS4GYJ0= +github.com/spf13/cast v1.6.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/viper v1.18.2 h1:LUXCnvUvSM6FXAsj6nnfc8Q2tp1dIgUfY9Kc8GsSOiQ= +github.com/spf13/viper v1.18.2/go.mod h1:EKmWIqdnk5lOcmR72yw6hS+8OPYcwD0jteitLMVB+yk= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod 
h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= +github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= +github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20250929084418-b950278ead30 h1:sH/hiHxCEpeIm2gJsmu4GxKskfQVPZMz9PAgDwk1BfY= +github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20250929084418-b950278ead30/go.mod h1:cOL5YgHUmDG5SAXrsZxFjUECRQQuAqOoqvXhZG5sEUw= +github.com/threefoldtech/tfgrid-sdk-go/grid-client v0.17.4 h1:eIKmYyFL+ta10ZL/bJhUaMsTJOEBeI++qBrlfdUUxUg= +github.com/threefoldtech/tfgrid-sdk-go/grid-client v0.17.4/go.mod h1:KuuHrJjabagNnIzVEJfWLj7ZB1G2GTFvEorunJzAH3Q= +github.com/threefoldtech/tfgrid-sdk-go/grid-proxy v0.17.4 h1:KR6nAAq/at1qvR+P3jTX4BqkM2OJPSRr0FFIlbqfSyk= +github.com/threefoldtech/tfgrid-sdk-go/grid-proxy v0.17.4/go.mod h1:q49Rjz05v3Nz6UxJ//9rv4PoeQoQ+CsLDQfsVocY6b8= +github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go v0.17.3 h1:NkDEy9pzuGpO6fp5t6UnhLMsmDim5t02jvoCxMlx004= +github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go v0.17.3/go.mod h1:T+PZydVl3fxywqoUhCmzs+hUarfE1q9IMRl9xa+GIYo= +github.com/threefoldtech/zosbase v1.0.3 h1:e03oz+KTvuu8Hsm2hDpf/nOIkCz1K6xsXsVvZaahVBc= +github.com/threefoldtech/zosbase v1.0.3/go.mod h1:3tWVlnT9nZ0r/u1L1JCIZpwSlnvi20FG6Z/yTOCT/7U= +github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= +github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= +github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk= +github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= 
+github.com/vedhavyas/go-subkey v1.0.3 h1:iKR33BB/akKmcR2PMlXPBeeODjWLM90EL98OrOGs8CA= +github.com/vedhavyas/go-subkey v1.0.3/go.mod h1:CloUaFQSSTdWnINfBRFjVMkWXZANW+nd8+TI5jYcl6Y= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= +github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= +go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= +go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= +go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191002192127-34f69633bfdc/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200204104054-c9f3fb736b72/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE= +golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc= +golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c h1:7dEasQXItcW1xKJ2+gg5VOiBnqWrJc+rq0DPKyvvdbY= +golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c/go.mod h1:NQtJDoLvd6faHhE7m4T/1IY708gDefGGjR/iUW8yQQ8= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net 
v0.0.0-20191003171128-d98b1b443823/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20191007182048-72f939374954/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= +golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190411185658-b44545bcd369/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191003212358-c178f38b412c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= +golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0= 
+golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.zx2c4.com/wireguard v0.0.20200121/go.mod h1:P2HsVp8SKwZEufsnezXZA4GRX/T49/HlU7DGuelXsU4= +golang.zx2c4.com/wireguard/wgctrl v0.0.0-20200609130330-bd2cb7843e1b h1:l4mBVCYinjzZuR5DtxHuBD6wyd4348TGiavJ5vLrhEc= +golang.zx2c4.com/wireguard/wgctrl v0.0.0-20200609130330-bd2cb7843e1b/go.mod h1:UdS9frhv65KTfwxME1xE8+rHYoFpbm36gOud1GhBe9c= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= +google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= +gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/natefinch/npipe.v2 v2.0.0-20160621034901-c1b8fa8bdcce h1:+JknDZhAj8YMt7GC73Ei8pv4MzjDUNPHgQWJdtMAaDU= +gopkg.in/natefinch/npipe.v2 v2.0.0-20160621034901-c1b8fa8bdcce/go.mod h1:5AcXVHNjg+BDxry382+8OKon8SEWiKktQR07RKPsv1c= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/pkg/app/app.go b/pkg/app/app.go new file 
mode 100644 index 00000000..10e9f671 --- /dev/null +++ b/pkg/app/app.go @@ -0,0 +1,148 @@ +package app + +import ( + "context" + "fmt" + "time" + + "github.com/rs/zerolog/log" + "github.com/threefoldtech/provision-probe/pkg/config" + "github.com/threefoldtech/provision-probe/pkg/db" + "github.com/threefoldtech/provision-probe/pkg/grid" +) + +type App struct { + cfg *config.Config + database *db.DB + gridClient *grid.Client +} + +func New(cfg *config.Config) (*App, error) { + database, err := db.New(context.Background(), cfg.TimescaleDB.URL) + if err != nil { + return nil, fmt.Errorf("failed to connect to database: %w", err) + } + + gridClient, err := grid.NewClient(cfg.Grid.Network, cfg.Grid.Mnemonic, cfg.LogLevel) + if err != nil { + database.Close() + return nil, fmt.Errorf("failed to create grid client: %w", err) + } + + return &App{ + cfg: cfg, + database: database, + gridClient: gridClient, + }, nil +} + +func (a *App) Run(ctx context.Context) error { + ticker := time.NewTicker(a.cfg.Interval()) + defer ticker.Stop() + + log.Info(). + Str("interval", a.cfg.Interval().String()). + Str("network", a.cfg.Grid.Network). + Msg("Starting provision probe service") + + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + if err := a.runCycle(ctx); err != nil { + log.Error().Err(err).Msg("Cycle failed") + } + } + } +} + +func (a *App) Close() { + a.database.Close() +} + +func (a *App) runCycle(ctx context.Context) error { + log.Info().Msg("Starting deployment cycle") + + filters := a.buildFilters() + proxyClient := a.gridClient.GetProxyClient() + nodes, err := grid.GetNodes(ctx, proxyClient, filters) + if err != nil { + return fmt.Errorf("failed to get nodes: %w", err) + } + + if len(nodes) == 0 { + log.Warn().Msg("No eligible nodes found") + return nil + } + + node := nodes[0] + cpu, memoryMB, diskMB := a.cfg.GetWorkload() + + log.Info(). + Int("node_id", node.NodeID). + Int("farm_id", node.FarmID). + Str("workload", a.cfg.Workload). 
+ Msg("Deploying VM to node") + + timeoutCtx, cancel := context.WithTimeout(ctx, a.cfg.Timeout()) + defer cancel() + + result, err := a.gridClient.DeployVM(timeoutCtx, uint32(int(node.NodeID)), cpu, memoryMB, diskMB) + + attempt := db.Attempt{ + Time: time.Now().Unix(), + NodeID: int64(int(node.NodeID)), + FarmID: int64(int(node.FarmID)), + WorkloadType: a.cfg.Workload, + Status: "failed", + } + + if err != nil { + errorCode := "unknown_error" + if result != nil { + errorCode = result.ErrorCode + attempt.DeployDurationMs = &result.DeployDurationMs + attempt.TotalDurationMs = &result.TotalDurationMs + } + attempt.ErrorCode = &errorCode + log.Error().Err(err).Str("error_code", errorCode).Msg("Deployment failed") + } else if result != nil { + attempt.Status = "success" + attempt.DeployDurationMs = &result.DeployDurationMs + attempt.StartDurationMs = &result.StartDurationMs + attempt.TotalDurationMs = &result.TotalDurationMs + log.Info().Msg("Deployment succeeded") + } + + if err := a.database.RecordAttempt(ctx, attempt); err != nil { + return fmt.Errorf("failed to record attempt: %w", err) + } + + return nil +} + +func (a *App) buildFilters() grid.NodeFilters { + var status *string + if len(a.cfg.Nodes.Status) > 0 { + s := a.cfg.Nodes.Status[0] + status = &s + } + + var farmIDs []uint64 + for _, id := range a.cfg.Nodes.Farms { + farmIDs = append(farmIDs, uint64(id)) + } + + var nodeIDs []uint64 + for _, id := range a.cfg.Nodes.Nodes { + nodeIDs = append(nodeIDs, uint64(id)) + } + + return grid.NodeFilters{ + Status: status, + FarmIDs: farmIDs, + NodeIDs: nodeIDs, + Exclude: a.cfg.Nodes.Exclude, + } +} diff --git a/pkg/config/config.go b/pkg/config/config.go new file mode 100644 index 00000000..29b39f64 --- /dev/null +++ b/pkg/config/config.go @@ -0,0 +1,168 @@ +package config + +import ( + "fmt" + "regexp" + "strconv" + "time" + + "github.com/spf13/viper" +) + +const ( + WorkloadLightCPU = 1 + WorkloadLightMemory = 1 + WorkloadLightDisk = 10 + + WorkloadMediumCPU = 2 
+ WorkloadMediumMemory = 4 + WorkloadMediumDisk = 50 + + WorkloadHeavyCPU = 4 + WorkloadHeavyMemory = 8 + WorkloadHeavyDisk = 100 +) + +type Config struct { + IntervalStr string `mapstructure:"interval"` + Workers int `mapstructure:"workers"` + TimeoutStr string `mapstructure:"timeout"` + LogLevel string `mapstructure:"log_level"` + Grid GridConfig `mapstructure:"grid"` + Nodes NodesConfig `mapstructure:"nodes"` + Workload string `mapstructure:"workload"` + ScoreWindowStr string `mapstructure:"score_window"` + TimescaleDB TimescaleDB `mapstructure:"timescaledb"` + + interval time.Duration + timeout time.Duration + scoreWindow time.Duration +} + +type GridConfig struct { + Network string `mapstructure:"network"` + Mnemonic string `mapstructure:"mnemonic"` +} + +type NodesConfig struct { + Status []string `mapstructure:"status"` + Farms []int `mapstructure:"farms"` + Nodes []int `mapstructure:"nodes"` + Exclude []int `mapstructure:"exclude"` +} + +type TimescaleDB struct { + URL string `mapstructure:"url"` +} + +func Load(configPath string) (*Config, error) { + viper.SetConfigFile(configPath) + viper.SetConfigType("yaml") + + if err := viper.ReadInConfig(); err != nil { + return nil, fmt.Errorf("failed to read config file: %w", err) + } + + var cfg Config + if err := viper.Unmarshal(&cfg); err != nil { + return nil, fmt.Errorf("failed to unmarshal config: %w", err) + } + + if cfg.IntervalStr != "" { + d, err := parseDuration(cfg.IntervalStr) + if err != nil { + return nil, fmt.Errorf("invalid interval format: %w", err) + } + cfg.interval = d + } + + if cfg.TimeoutStr != "" { + d, err := parseDuration(cfg.TimeoutStr) + if err != nil { + return nil, fmt.Errorf("invalid timeout format: %w", err) + } + cfg.timeout = d + } + + if cfg.ScoreWindowStr != "" { + d, err := parseDuration(cfg.ScoreWindowStr) + if err != nil { + return nil, fmt.Errorf("invalid score_window format: %w", err) + } + cfg.scoreWindow = d + } + + if cfg.Workload == "" { + cfg.Workload = "light" + } + + 
if err := cfg.validate(); err != nil { + return nil, fmt.Errorf("config validation failed: %w", err) + } + + return &cfg, nil +} + +func parseDuration(s string) (time.Duration, error) { + re := regexp.MustCompile(`(\d+)d`) + if re.MatchString(s) { + matches := re.FindStringSubmatch(s) + if len(matches) == 2 { + days, err := strconv.Atoi(matches[1]) + if err != nil { + return 0, fmt.Errorf("invalid days value: %s", matches[1]) + } + hours := days * 24 + s = re.ReplaceAllString(s, fmt.Sprintf("%dh", hours)) + } + } + return time.ParseDuration(s) +} + +func (c *Config) GetWorkload() (cpu uint8, memoryMB uint64, diskMB uint64) { + switch c.Workload { + case "medium": + return WorkloadMediumCPU, uint64(WorkloadMediumMemory * 1024), uint64(WorkloadMediumDisk * 1024) + case "heavy": + return WorkloadHeavyCPU, uint64(WorkloadHeavyMemory * 1024), uint64(WorkloadHeavyDisk * 1024) + default: + return WorkloadLightCPU, uint64(WorkloadLightMemory * 1024), uint64(WorkloadLightDisk * 1024) + } +} + +func (c *Config) Interval() time.Duration { + return c.interval +} + +func (c *Config) Timeout() time.Duration { + return c.timeout +} + +func (c *Config) ScoreWindow() time.Duration { + return c.scoreWindow +} + +func (c *Config) validate() error { + if c.interval <= 0 { + return fmt.Errorf("interval must be positive") + } + if c.Workers <= 0 { + return fmt.Errorf("workers must be positive") + } + if c.timeout <= 0 { + return fmt.Errorf("timeout must be positive") + } + if c.Grid.Network == "" { + return fmt.Errorf("grid.network is required") + } + if c.Grid.Mnemonic == "" { + return fmt.Errorf("grid.mnemonic is required") + } + if c.TimescaleDB.URL == "" { + return fmt.Errorf("timescaledb.url is required") + } + if c.Workload != "light" && c.Workload != "medium" && c.Workload != "heavy" { + return fmt.Errorf("workload must be light, medium, or heavy") + } + return nil +} diff --git a/pkg/db/db.go b/pkg/db/db.go new file mode 100644 index 00000000..3ee9a825 --- /dev/null +++ 
b/pkg/db/db.go @@ -0,0 +1,94 @@ +package db + +import ( + "context" + "fmt" + + "github.com/jackc/pgx/v5/pgxpool" +) + +type DB struct { + pool *pgxpool.Pool +} + +func New(ctx context.Context, url string) (*DB, error) { + pool, err := pgxpool.New(ctx, url) + if err != nil { + return nil, fmt.Errorf("failed to create connection pool: %w", err) + } + + if err := pool.Ping(ctx); err != nil { + return nil, fmt.Errorf("failed to ping database: %w", err) + } + + db := &DB{pool: pool} + if err := db.initSchema(ctx); err != nil { + return nil, fmt.Errorf("failed to initialize schema: %w", err) + } + + return db, nil +} + +func (d *DB) initSchema(ctx context.Context) error { + queries := []string{ + `CREATE TABLE IF NOT EXISTS provision_attempts ( + time TIMESTAMPTZ NOT NULL, + node_id BIGINT NOT NULL, + farm_id BIGINT NOT NULL, + workload_type VARCHAR(20) NOT NULL, + status VARCHAR(20) NOT NULL, + deploy_duration_ms INTEGER, + start_duration_ms INTEGER, + total_duration_ms INTEGER, + error_code VARCHAR(100) + )`, + `SELECT create_hypertable('provision_attempts', 'time', if_not_exists => TRUE)`, + `CREATE INDEX IF NOT EXISTS idx_node_time ON provision_attempts(node_id, time DESC)`, + `CREATE INDEX IF NOT EXISTS idx_farm_time ON provision_attempts(farm_id, time DESC)`, + } + + for _, query := range queries { + if _, err := d.pool.Exec(ctx, query); err != nil { + return fmt.Errorf("failed to execute query: %w", err) + } + } + + return nil +} + +func (d *DB) RecordAttempt(ctx context.Context, attempt Attempt) error { + query := `INSERT INTO provision_attempts + (time, node_id, farm_id, workload_type, status, deploy_duration_ms, start_duration_ms, total_duration_ms, error_code) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)` + + _, err := d.pool.Exec(ctx, query, + attempt.Time, + attempt.NodeID, + attempt.FarmID, + attempt.WorkloadType, + attempt.Status, + attempt.DeployDurationMs, + attempt.StartDurationMs, + attempt.TotalDurationMs, + attempt.ErrorCode, + ) + + return err +} 
+ +func (d *DB) Close() { + d.pool.Close() +} + +type Attempt struct { + Time int64 + NodeID int64 + FarmID int64 + WorkloadType string + Status string + DeployDurationMs *int + StartDurationMs *int + TotalDurationMs *int + ErrorCode *string +} + diff --git a/pkg/grid/client.go b/pkg/grid/client.go new file mode 100644 index 00000000..a85651b4 --- /dev/null +++ b/pkg/grid/client.go @@ -0,0 +1,135 @@ +package grid + +import ( + "context" + "fmt" + "time" + + "github.com/rs/zerolog/log" + "github.com/threefoldtech/tfgrid-sdk-go/grid-client/deployer" + "github.com/threefoldtech/tfgrid-sdk-go/grid-client/workloads" + "github.com/threefoldtech/tfgrid-sdk-go/grid-client/zos" + proxy "github.com/threefoldtech/tfgrid-sdk-go/grid-proxy/pkg/client" +) + +type Client struct { + tfPlugin deployer.TFPluginClient +} + +func NewClient(network, mnemonic string, logLevel string) (*Client, error) { + opts := []deployer.PluginOpt{ + deployer.WithNetwork(network), + } + + if logLevel == "debug" { + opts = append(opts, deployer.WithLogs()) + } + + tfPlugin, err := deployer.NewTFPluginClient(mnemonic, opts...) 
+ if err != nil { + return nil, fmt.Errorf("failed to create grid client: %w", err) + } + + return &Client{tfPlugin: tfPlugin}, nil +} + +func (c *Client) GetProxyClient() proxy.Client { + return c.tfPlugin.GridProxyClient +} + +type DeploymentResult struct { + Success bool + DeployDurationMs int + StartDurationMs int + TotalDurationMs int + ErrorCode string +} + +func (c *Client) DeployVM(ctx context.Context, nodeID uint32, cpu uint8, memoryMB uint64, diskMB uint64) (*DeploymentResult, error) { + startTime := time.Now() + + networkName := fmt.Sprintf("probe-net-%d", time.Now().Unix()) + ipRange, err := zos.ParseIPNet("10.1.0.0/16") + if err != nil { + return &DeploymentResult{ + Success: false, + ErrorCode: "ip_parse_failed", + }, fmt.Errorf("failed to parse IP range: %w", err) + } + network := workloads.ZNet{ + Name: networkName, + Description: "Probe network", + Nodes: []uint32{nodeID}, + IPRange: ipRange, + } + + deployStart := time.Now() + err = c.tfPlugin.NetworkDeployer.Deploy(ctx, &network) + if err != nil { + return &DeploymentResult{ + Success: false, + DeployDurationMs: int(time.Since(deployStart).Milliseconds()), + TotalDurationMs: int(time.Since(startTime).Milliseconds()), + ErrorCode: "network_deploy_failed", + }, fmt.Errorf("failed to deploy network: %w", err) + } + + vm := workloads.VM{ + Name: fmt.Sprintf("probe-%d", time.Now().Unix()), + Flist: "https://hub.grid.tf/tf-official-apps/base:latest.flist", + CPU: cpu, + MemoryMB: memoryMB, + RootfsSizeMB: diskMB, + Entrypoint: "/sbin/zinit init", + NetworkName: networkName, + IP: "10.1.0.5", + } + + dl := workloads.NewDeployment("probe", nodeID, "", nil, networkName, nil, nil, []workloads.VM{vm}, nil, nil, nil) + + vmDeployStart := time.Now() + err = c.tfPlugin.DeploymentDeployer.Deploy(ctx, &dl) + if err != nil { + c.tfPlugin.NetworkDeployer.Cancel(ctx, &network) + return &DeploymentResult{ + Success: false, + DeployDurationMs: int(time.Since(vmDeployStart).Milliseconds()), + TotalDurationMs: 
int(time.Since(startTime).Milliseconds()), + ErrorCode: "deploy_failed", + }, fmt.Errorf("failed to deploy VM: %w", err) + } + + vmDeployDuration := time.Since(vmDeployStart) + + startStart := time.Now() + _, err = c.tfPlugin.State.LoadVMFromGrid(ctx, nodeID, vm.Name, dl.Name) + if err != nil { + c.tfPlugin.DeploymentDeployer.Cancel(ctx, &dl) + c.tfPlugin.NetworkDeployer.Cancel(ctx, &network) + return &DeploymentResult{ + Success: false, + DeployDurationMs: int(vmDeployDuration.Milliseconds()), + StartDurationMs: int(time.Since(startStart).Milliseconds()), + TotalDurationMs: int(time.Since(startTime).Milliseconds()), + ErrorCode: "start_failed", + }, fmt.Errorf("failed to start VM: %w", err) + } + + startDuration := time.Since(startStart) + totalDuration := time.Since(startTime) + + log.Info(). + Int("deploy_ms", int(vmDeployDuration.Milliseconds())). + Int("start_ms", int(startDuration.Milliseconds())). + Msg("VM deployed successfully") + + c.tfPlugin.DeploymentDeployer.Cancel(ctx, &dl) + c.tfPlugin.NetworkDeployer.Cancel(ctx, &network) + + return &DeploymentResult{ + Success: true, + DeployDurationMs: int(vmDeployDuration.Milliseconds()), + StartDurationMs: int(startDuration.Milliseconds()), + TotalDurationMs: int(totalDuration.Milliseconds()), + }, nil +} diff --git a/pkg/grid/proxy.go b/pkg/grid/proxy.go new file mode 100644 index 00000000..9b290a6b --- /dev/null +++ b/pkg/grid/proxy.go @@ -0,0 +1,61 @@ +package grid + +import ( + "context" + "fmt" + + "github.com/rs/zerolog/log" + "github.com/threefoldtech/tfgrid-sdk-go/grid-proxy/pkg/client" + "github.com/threefoldtech/tfgrid-sdk-go/grid-proxy/pkg/types" +) + +func GetNodes(ctx context.Context, proxyClient client.Client, filters NodeFilters) ([]types.Node, error) { + limit := types.Limit{ + Size: 100, + Page: 1, + } + var allNodes []types.Node + + var excludeIDs []uint64 + for _, id := range filters.Exclude { + excludeIDs = append(excludeIDs, uint64(id)) + } + + for { + var status []string + if filters.Status 
!= nil { + status = []string{*filters.Status} + } + + filter := types.NodeFilter{ + Status: status, + FarmIDs: filters.FarmIDs, + NodeIDs: filters.NodeIDs, + Excluded: excludeIDs, + } + + nodes, total, err := proxyClient.Nodes(ctx, filter, limit) + if err != nil { + return nil, fmt.Errorf("failed to query nodes: %w", err) + } + + allNodes = append(allNodes, nodes...) + + if len(allNodes) >= total || len(nodes) == 0 { + break + } + + limit.Page++ + } + + log.Info().Int("count", len(allNodes)).Msg("Found eligible nodes") + + return allNodes, nil +} + +type NodeFilters struct { + Status *string + FarmIDs []uint64 + NodeIDs []uint64 + Exclude []int +} diff --git a/pkg/logger/logger.go b/pkg/logger/logger.go new file mode 100644 index 00000000..79f11f1f --- /dev/null +++ b/pkg/logger/logger.go @@ -0,0 +1,29 @@ +package logger + +import ( + "os" + + "github.com/rs/zerolog" + "github.com/rs/zerolog/log" +) + +func Init(level string) { + zerolog.TimeFieldFormat = zerolog.TimeFormatUnix + zerolog.SetGlobalLevel(parseLevel(level)) + log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr}) +} + +func parseLevel(level string) zerolog.Level { + switch level { + case "debug": + return zerolog.DebugLevel + case "info": + return zerolog.InfoLevel + case "warn": + return zerolog.WarnLevel + case "error": + return zerolog.ErrorLevel + default: + return zerolog.InfoLevel + } +} From 8bf4ee9dfe3d56693317601e33f74df93a942a3e Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Mon, 1 Dec 2025 13:28:37 +0200 Subject: [PATCH 2/6] support concurrent deployments with semaphore --- .gitignore | 1 + configs/config.example.yaml | 2 +- go.mod | 2 +- pkg/app/app.go | 119 +++++++++++++++++++++++++----------- pkg/config/config.go | 22 +++---- 5 files changed, 99 insertions(+), 47 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..a5394700 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +config.yaml \ No newline at end of file 
diff --git a/configs/config.example.yaml b/configs/config.example.yaml index 6c83f003..6ad75a55 100644 --- a/configs/config.example.yaml +++ b/configs/config.example.yaml @@ -1,5 +1,5 @@ interval: "6h" -workers: 10 +concurrency_limit: 10 timeout: "10m" log_level: "info" workload: "light" diff --git a/go.mod b/go.mod index 25f3486d..d8042186 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( require ( github.com/threefoldtech/tfgrid-sdk-go/grid-client v0.17.4 github.com/threefoldtech/tfgrid-sdk-go/grid-proxy v0.17.4 + golang.org/x/sync v0.13.0 ) require ( @@ -67,7 +68,6 @@ require ( go.uber.org/multierr v1.9.0 // indirect golang.org/x/crypto v0.37.0 // indirect golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c // indirect - golang.org/x/sync v0.13.0 // indirect golang.org/x/sys v0.32.0 // indirect golang.org/x/text v0.24.0 // indirect golang.zx2c4.com/wireguard/wgctrl v0.0.0-20200609130330-bd2cb7843e1b // indirect diff --git a/pkg/app/app.go b/pkg/app/app.go index 10e9f671..7adef819 100644 --- a/pkg/app/app.go +++ b/pkg/app/app.go @@ -3,12 +3,15 @@ package app import ( "context" "fmt" + "sync" "time" "github.com/rs/zerolog/log" "github.com/threefoldtech/provision-probe/pkg/config" "github.com/threefoldtech/provision-probe/pkg/db" "github.com/threefoldtech/provision-probe/pkg/grid" + "github.com/threefoldtech/tfgrid-sdk-go/grid-proxy/pkg/types" + "golang.org/x/sync/semaphore" ) type App struct { @@ -45,6 +48,10 @@ func (a *App) Run(ctx context.Context) error { Str("network", a.cfg.Grid.Network). Msg("Starting provision probe service") + if err := a.runCycle(ctx); err != nil { + log.Error().Err(err).Msg("Initial cycle failed") + } + for { select { case <-ctx.Done(): @@ -76,49 +83,93 @@ func (a *App) runCycle(ctx context.Context) error { return nil } - node := nodes[0] - cpu, memoryMB, diskMB := a.cfg.GetWorkload() - log.Info(). - Int("node_id", node.NodeID). - Int("farm_id", node.FarmID). + Int("nodes", len(nodes)). 
+ Int("max_concurrent", a.cfg.ConcurrencyLimit). Str("workload", a.cfg.Workload). - Msg("Deploying VM to node") + Msg("Starting deployment cycle") - timeoutCtx, cancel := context.WithTimeout(ctx, a.cfg.Timeout()) - defer cancel() + sem := semaphore.NewWeighted(int64(a.cfg.ConcurrencyLimit)) + var wg sync.WaitGroup + var mu sync.Mutex + var cycleErrors []error - result, err := a.gridClient.DeployVM(timeoutCtx, uint32(int(node.NodeID)), cpu, memoryMB, diskMB) + cpu, memoryMB, diskMB := a.cfg.GetWorkload() - attempt := db.Attempt{ - Time: time.Now().Unix(), - NodeID: int64(int(node.NodeID)), - FarmID: int64(int(node.FarmID)), - WorkloadType: a.cfg.Workload, - Status: "failed", - } + for i, node := range nodes { + wg.Add(1) + go func(idx int, n types.Node) { + defer wg.Done() - if err != nil { - errorCode := "unknown_error" - if result != nil { - errorCode = result.ErrorCode - attempt.DeployDurationMs = &result.DeployDurationMs - attempt.TotalDurationMs = &result.TotalDurationMs - } - attempt.ErrorCode = &errorCode - log.Error().Err(err).Str("error_code", errorCode).Msg("Deployment failed") - } else if result != nil { - attempt.Status = "success" - attempt.DeployDurationMs = &result.DeployDurationMs - attempt.StartDurationMs = &result.StartDurationMs - attempt.TotalDurationMs = &result.TotalDurationMs - log.Info().Msg("Deployment succeeded") - } + if err := sem.Acquire(ctx, 1); err != nil { // blocks if at capacity + mu.Lock() + cycleErrors = append(cycleErrors, fmt.Errorf("failed to acquire semaphore for node %d: %w", n.NodeID, err)) + mu.Unlock() + return + } + defer sem.Release(1) + + log.Info(). + Int("node_index", idx+1). + Int("total_nodes", len(nodes)). + Int("node_id", n.NodeID). + Int("farm_id", n.FarmID). + Str("workload", a.cfg.Workload). 
+ Msg("Deploying VM to node") + + timeoutCtx, cancel := context.WithTimeout(ctx, a.cfg.Timeout()) + result, err := a.gridClient.DeployVM(timeoutCtx, uint32(int(n.NodeID)), cpu, memoryMB, diskMB) + cancel() + + attempt := db.Attempt{ + Time: time.Now().Unix(), + NodeID: int64(int(n.NodeID)), + FarmID: int64(int(n.FarmID)), + WorkloadType: a.cfg.Workload, + Status: "failed", + } + + if err != nil { + errorCode := "unknown_error" + if result != nil && result.ErrorCode != "" { + errorCode = result.ErrorCode + attempt.DeployDurationMs = &result.DeployDurationMs + attempt.TotalDurationMs = &result.TotalDurationMs + } + attempt.ErrorCode = &errorCode + log.Error(). + Err(err). + Int("node_id", n.NodeID). + Str("error_code", errorCode). + Msg("Deployment failed") + } else if result != nil { + attempt.Status = "success" + attempt.DeployDurationMs = &result.DeployDurationMs + attempt.StartDurationMs = &result.StartDurationMs + attempt.TotalDurationMs = &result.TotalDurationMs + log.Info(). + Int("node_id", n.NodeID). + Msg("Deployment succeeded") + } - if err := a.database.RecordAttempt(ctx, attempt); err != nil { - return fmt.Errorf("failed to record attempt: %w", err) + if err := a.database.RecordAttempt(ctx, attempt); err != nil { + mu.Lock() + cycleErrors = append(cycleErrors, fmt.Errorf("failed to record attempt for node %d: %w", n.NodeID, err)) + mu.Unlock() + log.Error(). + Err(err). + Int("node_id", n.NodeID). + Msg("Failed to record attempt") + } + }(i, node) } + wg.Wait() + + log.Info(). + Int("total_nodes", len(nodes)). + Int("errors", len(cycleErrors)). 
+ Msg("Deployment cycle completed") return nil } diff --git a/pkg/config/config.go b/pkg/config/config.go index 29b39f64..d2861518 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -24,15 +24,15 @@ const ( ) type Config struct { - IntervalStr string `mapstructure:"interval"` - Workers int `mapstructure:"workers"` - TimeoutStr string `mapstructure:"timeout"` - LogLevel string `mapstructure:"log_level"` - Grid GridConfig `mapstructure:"grid"` - Nodes NodesConfig `mapstructure:"nodes"` - Workload string `mapstructure:"workload"` - ScoreWindowStr string `mapstructure:"score_window"` - TimescaleDB TimescaleDB `mapstructure:"timescaledb"` + IntervalStr string `mapstructure:"interval"` + ConcurrencyLimit int `mapstructure:"concurrency_limit"` + TimeoutStr string `mapstructure:"timeout"` + LogLevel string `mapstructure:"log_level"` + Grid GridConfig `mapstructure:"grid"` + Nodes NodesConfig `mapstructure:"nodes"` + Workload string `mapstructure:"workload"` + ScoreWindowStr string `mapstructure:"score_window"` + TimescaleDB TimescaleDB `mapstructure:"timescaledb"` interval time.Duration timeout time.Duration @@ -146,8 +146,8 @@ func (c *Config) validate() error { if c.interval <= 0 { return fmt.Errorf("interval must be positive") } - if c.Workers <= 0 { - return fmt.Errorf("workers must be positive") + if c.ConcurrencyLimit <= 0 { + return fmt.Errorf("concurrency_limit must be positive") } if c.timeout <= 0 { return fmt.Errorf("timeout must be positive") From c5c5b835f4bf9dd27972eebc888d5c16ba8e9cad Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Mon, 1 Dec 2025 15:11:46 +0200 Subject: [PATCH 3/6] improve the deployment logic --- pkg/app/app.go | 8 ++-- pkg/db/db.go | 22 ++++----- pkg/grid/client.go | 113 ++++++++++++++++++++++++++++++++------------- 3 files changed, 97 insertions(+), 46 deletions(-) diff --git a/pkg/app/app.go b/pkg/app/app.go index 7adef819..bd838aa6 100644 --- a/pkg/app/app.go +++ b/pkg/app/app.go @@ -87,7 +87,7 @@ func (a *App) 
runCycle(ctx context.Context) error { Int("nodes", len(nodes)). Int("max_concurrent", a.cfg.ConcurrencyLimit). Str("workload", a.cfg.Workload). - Msg("Starting deployment cycle") + Msg("Deployment cycle started") sem := semaphore.NewWeighted(int64(a.cfg.ConcurrencyLimit)) var wg sync.WaitGroup @@ -109,7 +109,7 @@ func (a *App) runCycle(ctx context.Context) error { } defer sem.Release(1) - log.Info(). + log.Debug(). Int("node_index", idx+1). Int("total_nodes", len(nodes)). Int("node_id", n.NodeID). @@ -137,7 +137,7 @@ func (a *App) runCycle(ctx context.Context) error { attempt.TotalDurationMs = &result.TotalDurationMs } attempt.ErrorCode = &errorCode - log.Error(). + log.Debug(). Err(err). Int("node_id", n.NodeID). Str("error_code", errorCode). @@ -147,7 +147,7 @@ func (a *App) runCycle(ctx context.Context) error { attempt.DeployDurationMs = &result.DeployDurationMs attempt.StartDurationMs = &result.StartDurationMs attempt.TotalDurationMs = &result.TotalDurationMs - log.Info(). + log.Debug(). Int("node_id", n.NodeID). 
Msg("Deployment succeeded") } diff --git a/pkg/db/db.go b/pkg/db/db.go index 3ee9a825..c77b5630 100644 --- a/pkg/db/db.go +++ b/pkg/db/db.go @@ -3,6 +3,7 @@ package db import ( "context" "fmt" + "time" "github.com/jackc/pgx/v5/pgxpool" ) @@ -62,7 +63,7 @@ func (d *DB) RecordAttempt(ctx context.Context, attempt Attempt) error { VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)` _, err := d.pool.Exec(ctx, query, - attempt.Time, + time.Unix(attempt.Time, 0), attempt.NodeID, attempt.FarmID, attempt.WorkloadType, @@ -81,14 +82,13 @@ func (d *DB) Close() { } type Attempt struct { - Time int64 - NodeID int64 - FarmID int64 - WorkloadType string - Status string - DeployDurationMs *int - StartDurationMs *int - TotalDurationMs *int - ErrorCode *string + Time int64 + NodeID int64 + FarmID int64 + WorkloadType string + Status string + DeployDurationMs *int + StartDurationMs *int + TotalDurationMs *int + ErrorCode *string } - diff --git a/pkg/grid/client.go b/pkg/grid/client.go index a85651b4..5a393837 100644 --- a/pkg/grid/client.go +++ b/pkg/grid/client.go @@ -3,6 +3,7 @@ package grid import ( "context" "fmt" + "net" "time" "github.com/rs/zerolog/log" @@ -19,6 +20,7 @@ type Client struct { func NewClient(network, mnemonic string, logLevel string) (*Client, error) { opts := []deployer.PluginOpt{ deployer.WithNetwork(network), + deployer.WithDisableSentry(), } if logLevel == "debug" { @@ -47,23 +49,32 @@ type DeploymentResult struct { func (c *Client) DeployVM(ctx context.Context, nodeID uint32, cpu uint8, memoryMB uint64, diskMB uint64) (*DeploymentResult, error) { startTime := time.Now() + timestamp := time.Now().Unix() - networkName := fmt.Sprintf("probe-net-%d", time.Now().Unix()) - ipRange, err := zos.ParseIPNet("10.1.0.0/16") + vmName := fmt.Sprintf("probe_%d", timestamp) + networkName := fmt.Sprintf("%s_net", vmName) + projectName := fmt.Sprintf("%s_project", vmName) + + network, err := buildNetwork(networkName, projectName, []uint32{nodeID}) if err != nil { return 
&DeploymentResult{ Success: false, - ErrorCode: "ip_parse_failed", - }, fmt.Errorf("failed to parse IP range: %w", err) + ErrorCode: "network_build_failed", + }, fmt.Errorf("failed to build network: %w", err) } - network := workloads.ZNet{ - Name: networkName, - Description: "Probe network", - Nodes: []uint32{nodeID}, - IPRange: ipRange, + + vm, err := buildVM(vmName, nodeID, networkName, cpu, memoryMB, diskMB) + if err != nil { + return &DeploymentResult{ + Success: false, + ErrorCode: "vm_build_failed", + }, fmt.Errorf("failed to build VM: %w", err) } + dl := workloads.NewDeployment(vmName, nodeID, projectName, nil, networkName, nil, nil, []workloads.VM{vm}, nil, nil, nil) + deployStart := time.Now() + log.Debug().Str("network", networkName).Uint32("node_id", nodeID).Msg("Deploying network") err = c.tfPlugin.NetworkDeployer.Deploy(ctx, &network) if err != nil { return &DeploymentResult{ @@ -71,32 +82,20 @@ func (c *Client) DeployVM(ctx context.Context, nodeID uint32, cpu uint8, memoryM DeployDurationMs: int(time.Since(deployStart).Milliseconds()), TotalDurationMs: int(time.Since(startTime).Milliseconds()), ErrorCode: "network_deploy_failed", - }, fmt.Errorf("failed to deploy network: %w", err) + }, fmt.Errorf("failed to deploy network on node %d: %w", nodeID, err) } - vm := workloads.VM{ - Name: fmt.Sprintf("probe-%d", time.Now().Unix()), - Flist: "https://hub.grid.tf/tf-official-apps/base:latest.flist", - CPU: cpu, - MemoryMB: memoryMB, - RootfsSizeMB: diskMB, - Entrypoint: "/sbin/zinit init", - NetworkName: networkName, - IP: "10.1.0.5", - } - - dl := workloads.NewDeployment("probe", nodeID, "", nil, networkName, nil, nil, []workloads.VM{vm}, nil, nil, nil) - vmDeployStart := time.Now() + log.Debug().Str("vm", vmName).Uint32("node_id", nodeID).Msg("Deploying VM") err = c.tfPlugin.DeploymentDeployer.Deploy(ctx, &dl) if err != nil { - c.tfPlugin.NetworkDeployer.Cancel(ctx, &network) + revertDeployment(ctx, c.tfPlugin, &dl, &network, false) return 
&DeploymentResult{ Success: false, DeployDurationMs: int(time.Since(vmDeployStart).Milliseconds()), TotalDurationMs: int(time.Since(startTime).Milliseconds()), ErrorCode: "deploy_failed", - }, fmt.Errorf("failed to deploy VM: %w", err) + }, fmt.Errorf("failed to deploy VM on node %d: %w", nodeID, err) } vmDeployDuration := time.Since(vmDeployStart) @@ -104,27 +103,26 @@ func (c *Client) DeployVM(ctx context.Context, nodeID uint32, cpu uint8, memoryM startStart := time.Now() _, err = c.tfPlugin.State.LoadVMFromGrid(ctx, nodeID, vm.Name, dl.Name) if err != nil { - c.tfPlugin.DeploymentDeployer.Cancel(ctx, &dl) - c.tfPlugin.NetworkDeployer.Cancel(ctx, &network) + revertDeployment(ctx, c.tfPlugin, &dl, &network, true) return &DeploymentResult{ Success: false, DeployDurationMs: int(vmDeployDuration.Milliseconds()), StartDurationMs: int(time.Since(startStart).Milliseconds()), TotalDurationMs: int(time.Since(startTime).Milliseconds()), ErrorCode: "start_failed", - }, fmt.Errorf("failed to start VM: %w", err) + }, fmt.Errorf("failed to load VM from node %d: %w", nodeID, err) } startDuration := time.Since(startStart) totalDuration := time.Since(startTime) - log.Info(). + log.Debug(). Int("deploy_ms", int(vmDeployDuration.Milliseconds())). Int("start_ms", int(startDuration.Milliseconds())). + Uint32("node_id", nodeID). 
Msg("VM deployed successfully") - c.tfPlugin.DeploymentDeployer.Cancel(ctx, &dl) - c.tfPlugin.NetworkDeployer.Cancel(ctx, &network) + revertDeployment(ctx, c.tfPlugin, &dl, &network, true) return &DeploymentResult{ Success: true, @@ -133,3 +131,56 @@ func (c *Client) DeployVM(ctx context.Context, nodeID uint32, cpu uint8, memoryM TotalDurationMs: int(totalDuration.Milliseconds()), }, nil } + +func buildNetwork(name, projectName string, nodes []uint32) (workloads.ZNet, error) { + keys := make(map[uint32][]byte) + for _, node := range nodes { + key, err := workloads.RandomMyceliumKey() + if err != nil { + return workloads.ZNet{}, fmt.Errorf("failed to generate mycelium key for node %d: %w", node, err) + } + keys[node] = key + } + + return workloads.ZNet{ + Name: name, + Nodes: nodes, + IPRange: zos.IPNet{IPNet: net.IPNet{ + IP: net.IPv4(10, 20, 0, 0), + Mask: net.CIDRMask(16, 32), + }}, + MyceliumKeys: keys, + SolutionType: projectName, + Description: "Probe network", + }, nil +} + +func buildVM(name string, nodeID uint32, networkName string, cpu uint8, memoryMB uint64, diskMB uint64) (workloads.VM, error) { + ipSeed, err := workloads.RandomMyceliumIPSeed() + if err != nil { + return workloads.VM{}, fmt.Errorf("failed to generate mycelium IP seed: %w", err) + } + return workloads.VM{ + Name: name, + NodeID: nodeID, + Flist: "https://hub.threefold.me/tf-official-apps/threefoldtech-ubuntu-22.04.flist", + CPU: cpu, + MemoryMB: memoryMB, + RootfsSizeMB: diskMB, + Entrypoint: "/sbin/zinit init", + NetworkName: networkName, + MyceliumIPSeed: ipSeed, + }, nil +} + +func revertDeployment(ctx context.Context, tfPlugin deployer.TFPluginClient, dl *workloads.Deployment, network *workloads.ZNet, deleteVM bool) { + if deleteVM { + log.Debug().Msg("Cleaning up deployment") + if err := tfPlugin.DeploymentDeployer.Cancel(ctx, dl); err != nil { + log.Error().Err(err).Msg("Failed to cancel deployment") + } + } + if err := tfPlugin.NetworkDeployer.Cancel(ctx, network); err != nil { + 
log.Error().Err(err).Msg("Failed to cancel network") + } +} From 51f3521b0767f0926c6cf6cc560dadef95da3932 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Mon, 1 Dec 2025 18:22:22 +0200 Subject: [PATCH 4/6] Add API server with scoring endpoints and update configuration structure - Introduced a new API server with endpoints for retrieving top scores and individual node scores. - Updated configuration structure to include API settings and probe parameters. - Added new dependencies for routing and logging. - Refactored database interaction to support scoring logic. - Enhanced example configuration file to reflect new settings. --- configs/config.example.yaml | 39 +++++--- go.mod | 1 + go.sum | 2 + pkg/api/handlers.go | 180 ++++++++++++++++++++++++++++++++++++ pkg/api/scoring.go | 50 ++++++++++ pkg/api/server.go | 60 ++++++++++++ pkg/app/app.go | 69 ++++++-------- pkg/config/config.go | 120 +++++++++++++++--------- pkg/db/db.go | 126 ++++++++++++++++++++----- pkg/grid/client.go | 72 ++++++--------- pkg/grid/proxy.go | 43 ++++----- 11 files changed, 575 insertions(+), 187 deletions(-) create mode 100644 pkg/api/handlers.go create mode 100644 pkg/api/scoring.go create mode 100644 pkg/api/server.go diff --git a/configs/config.example.yaml b/configs/config.example.yaml index 6ad75a55..f97b7117 100644 --- a/configs/config.example.yaml +++ b/configs/config.example.yaml @@ -1,20 +1,29 @@ -interval: "6h" -concurrency_limit: 10 -timeout: "10m" -log_level: "info" -workload: "light" +api: + host: "0.0.0.0" # host for the API server + port: 8080 # port for the API server + +timescaledb: + url: "postgresql://postgres:postgres@localhost:5432/provision_probe" # url for the timescaledb database + + +probe: + interval: "6h" # interval for the deployment cycle + concurrency_limit: 10 # concurrency limit for the deployment cycle + timeout: "10m" # timeout for the deployment cycle + workload_size: "light" # light, medium, heavy + +scoring: + window: "90d" # default window for the 
scoring (overwritten by the window query parameter) + min_attempts: 1 # default minimum attempts for the scoring (overwritten by the min_attempts query parameter) grid: - network: "dev" - mnemonic: "" + network: "dev" # dev, qa, test, main + mnemonic: "" # mnemonic for the grid nodes: - status: ["up"] - farms: [1] - nodes: [11] - exclude: [] - -score_window: "90d" + status: "up" # up, healthy + farms: [] # farm IDs to include (empty means all farms) + nodes: [] # node IDs to include (empty means all nodes) + exclude: [] # node IDs to exclude -timescaledb: - url: "postgresql://postgres:postgres@localhost:5432/provision_probe" +log_level: "info" \ No newline at end of file diff --git a/go.mod b/go.mod index d8042186..5ecb1639 100644 --- a/go.mod +++ b/go.mod @@ -28,6 +28,7 @@ require ( github.com/ethereum/go-ethereum v1.11.6 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/getsentry/sentry-go v0.29.1 // indirect + github.com/go-chi/chi/v5 v5.2.3 // indirect github.com/go-stack/stack v1.8.1 // indirect github.com/golang-jwt/jwt v3.2.2+incompatible // indirect github.com/gomodule/redigo v2.0.0+incompatible // indirect diff --git a/go.sum b/go.sum index 0a585e31..0c6e71b1 100644 --- a/go.sum +++ b/go.sum @@ -38,6 +38,8 @@ github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nos github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= github.com/getsentry/sentry-go v0.29.1 h1:DyZuChN8Hz3ARxGVV8ePaNXh1dQ7d76AiB117xcREwA= github.com/getsentry/sentry-go v0.29.1/go.mod h1:x3AtIzN01d6SiWkderzaH28Tm0lgkafpJ5Bm3li39O0= +github.com/go-chi/chi/v5 v5.2.3 h1:WQIt9uxdsAbgIYgid+BpYc+liqQZGMHRaUwp0JUcvdE= +github.com/go-chi/chi/v5 v5.2.3/go.mod h1:L2yAIGWB3H+phAw1NxKwWM+7eUH/lU8pOMm5hHcoops= github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= github.com/go-ole/go-ole v1.2.6 
h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= diff --git a/pkg/api/handlers.go b/pkg/api/handlers.go new file mode 100644 index 00000000..5c8c4a2f --- /dev/null +++ b/pkg/api/handlers.go @@ -0,0 +1,180 @@ +package api + +import ( + "encoding/json" + "fmt" + "net/http" + "strconv" + "time" + + "github.com/go-chi/chi/v5" + "github.com/rs/zerolog/log" + "github.com/threefoldtech/provision-probe/pkg/config" + "github.com/threefoldtech/provision-probe/pkg/db" +) + +type Handlers struct { + database *db.DB + cfg *config.Config + defaultWindow time.Duration +} + +func NewHandlers(database *db.DB, cfg *config.Config) *Handlers { + return &Handlers{ + database: database, + cfg: cfg, + defaultWindow: cfg.ScoreWindow(), + } +} + +// GetTopScores handles GET /api/v1/scores +// Query params: window (duration, defaults to config value), limit (int, default 10), min_attempts (int, default 1) +func (h *Handlers) GetTopScores(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + + windowStr := r.URL.Query().Get("window") + window := h.defaultWindow + if windowStr != "" { + parsedWindow, err := config.ParseDuration(windowStr) + if err != nil { + respondError(w, http.StatusBadRequest, "invalid window parameter: "+err.Error()) + return + } + window = parsedWindow + } + + limitStr := r.URL.Query().Get("limit") + limit := 10 + if limitStr != "" { + parsedLimit, err := strconv.Atoi(limitStr) + if err != nil || parsedLimit <= 0 { + respondError(w, http.StatusBadRequest, "invalid limit parameter: must be a positive integer") + return + } + limit = parsedLimit + } + + minAttemptsStr := r.URL.Query().Get("min_attempts") + minAttempts := 1 + if minAttemptsStr != "" { + parsedMinAttempts, err := strconv.Atoi(minAttemptsStr) + if err != nil || parsedMinAttempts < 1 { + respondError(w, http.StatusBadRequest, "invalid min_attempts parameter: must be a positive integer") + return + } + minAttempts = parsedMinAttempts + } + + scoreData, err := h.database.GetTopNodeScores(ctx, window, 
limit, minAttempts) + if err != nil { + log.Error().Err(err).Msg("Failed to get top node scores") + respondError(w, http.StatusInternalServerError, "failed to retrieve scores") + return + } + + scores := make([]NodeScore, 0, len(scoreData)) + for _, data := range scoreData { + score := CalculateScore(&data) + scores = append(scores, *score) + } + + respondJSON(w, http.StatusOK, map[string]interface{}{ + "window": window.String(), + "limit": limit, + "min_attempts": minAttempts, + "scores": scores, + }) +} + +// GetNodeScore handles GET /api/v1/scores/node/:node_id +// Query params: window (duration string, defaults to config value), min_attempts (int, default 1) +func (h *Handlers) GetNodeScore(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + + nodeIDStr := chi.URLParam(r, "node_id") + nodeID, err := strconv.ParseInt(nodeIDStr, 10, 64) + if err != nil { + respondError(w, http.StatusBadRequest, "invalid node_id parameter") + return + } + + windowStr := r.URL.Query().Get("window") + window := h.defaultWindow + if windowStr != "" { + parsedWindow, err := config.ParseDuration(windowStr) + if err != nil { + respondError(w, http.StatusBadRequest, "invalid window parameter: "+err.Error()) + return + } + window = parsedWindow + } + + minAttemptsStr := r.URL.Query().Get("min_attempts") + minAttempts := 1 + if minAttemptsStr != "" { + parsedMinAttempts, err := strconv.Atoi(minAttemptsStr) + if err != nil || parsedMinAttempts < 1 { + respondError(w, http.StatusBadRequest, "invalid min_attempts parameter: must be a positive integer") + return + } + minAttempts = parsedMinAttempts + } + + scoreData, err := h.database.GetNodeScoreData(ctx, nodeID, window) + if err != nil { + if err.Error() == "failed to get node score data: no rows in result set" || + err.Error() == "no rows in result set" { + respondError(w, http.StatusNotFound, "node not found or no data available") + return + } + log.Error().Err(err).Int64("node_id", nodeID).Msg("Failed to get node score") + 
respondError(w, http.StatusInternalServerError, "failed to retrieve node score") + return + } + + if scoreData.TotalAttempts < int64(minAttempts) { + respondError(w, http.StatusNotFound, fmt.Sprintf("node does not meet minimum attempts requirement: %d attempts (required: %d)", scoreData.TotalAttempts, minAttempts)) + return + } + + score := CalculateScore(scoreData) + + respondJSON(w, http.StatusOK, map[string]interface{}{ + "window": window.String(), + "min_attempts": minAttempts, + "score": score, + }) +} + +// GetHealth handles GET /api/v1/health +func (h *Handlers) GetHealth(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + + if err := h.database.Ping(ctx); err != nil { + respondJSON(w, http.StatusServiceUnavailable, map[string]interface{}{ + "status": "unhealthy", + "message": "database connection failed", + }) + return + } + + respondJSON(w, http.StatusOK, map[string]interface{}{ + "status": "healthy", + }) +} + +// Helper functions + +func respondJSON(w http.ResponseWriter, status int, data interface{}) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + if err := json.NewEncoder(w).Encode(data); err != nil { + log.Error().Err(err).Msg("Failed to encode JSON response") + } +} + +func respondError(w http.ResponseWriter, status int, message string) { + respondJSON(w, status, map[string]interface{}{ + "error": message, + }) +} diff --git a/pkg/api/scoring.go b/pkg/api/scoring.go new file mode 100644 index 00000000..75ab3801 --- /dev/null +++ b/pkg/api/scoring.go @@ -0,0 +1,50 @@ +package api + +import ( + "github.com/threefoldtech/provision-probe/pkg/db" +) + +const ( + minAttemptsThreshold = 1 +) + +type NodeScore struct { + NodeID int64 `json:"node_id"` + FarmID int64 `json:"farm_id"` + SuccessRate float64 `json:"success_rate"` + TotalAttempts int64 `json:"total_attempts"` + AvgDurationMs float64 `json:"avg_duration_ms,omitempty"` + Score float64 `json:"score"` +} + +// TODO: add score to the duration ratio +func 
CalculateScore(data *db.NodeScoreData) *NodeScore { + if data.TotalAttempts < minAttemptsThreshold { + result := &NodeScore{ + NodeID: data.NodeID, + FarmID: data.FarmID, + TotalAttempts: data.TotalAttempts, + Score: 0.0, + } + if data.AvgDurationMs != nil { + result.AvgDurationMs = *data.AvgDurationMs + } + return result + } + + successRate := float64(data.SuccessCount) / float64(data.TotalAttempts) + + result := &NodeScore{ + NodeID: data.NodeID, + FarmID: data.FarmID, + SuccessRate: successRate, + TotalAttempts: data.TotalAttempts, + Score: successRate, + } + + if data.AvgDurationMs != nil { + result.AvgDurationMs = *data.AvgDurationMs + } + + return result +} diff --git a/pkg/api/server.go b/pkg/api/server.go new file mode 100644 index 00000000..a6beafb0 --- /dev/null +++ b/pkg/api/server.go @@ -0,0 +1,60 @@ +package api + +import ( + "context" + "fmt" + "net/http" + "time" + + "github.com/go-chi/chi/v5" + "github.com/go-chi/chi/v5/middleware" + "github.com/rs/zerolog/log" + "github.com/threefoldtech/provision-probe/pkg/config" + "github.com/threefoldtech/provision-probe/pkg/db" +) + +type Server struct { + server *http.Server + handlers *Handlers +} + +func NewServer(database *db.DB, cfg *config.Config) *Server { + handlers := NewHandlers(database, cfg) + + router := chi.NewRouter() + router.Use(middleware.RealIP) + router.Use(middleware.Logger) + router.Use(middleware.Recoverer) + router.Use(middleware.Timeout(60 * time.Second)) + + router.Route("/api/v1", func(r chi.Router) { + r.Get("/scores", handlers.GetTopScores) + r.Route("/scores/node", func(r chi.Router) { + r.Get("/{node_id}", handlers.GetNodeScore) + }) + r.Get("/health", handlers.GetHealth) + }) + + addr := fmt.Sprintf("%s:%d", cfg.API.Host, cfg.API.Port) + server := &http.Server{ + Addr: addr, + Handler: router, + } + + return &Server{ + server: server, + handlers: handlers, + } +} + +func (s *Server) Start() error { + log.Info(). + Str("address", s.server.Addr). 
+ Msg("Starting API server") + return s.server.ListenAndServe() +} + +func (s *Server) Shutdown(ctx context.Context) error { + log.Info().Msg("Shutting down API server") + return s.server.Shutdown(ctx) +} diff --git a/pkg/app/app.go b/pkg/app/app.go index bd838aa6..33b9067a 100644 --- a/pkg/app/app.go +++ b/pkg/app/app.go @@ -3,10 +3,12 @@ package app import ( "context" "fmt" + "net/http" "sync" "time" "github.com/rs/zerolog/log" + "github.com/threefoldtech/provision-probe/pkg/api" "github.com/threefoldtech/provision-probe/pkg/config" "github.com/threefoldtech/provision-probe/pkg/db" "github.com/threefoldtech/provision-probe/pkg/grid" @@ -18,6 +20,7 @@ type App struct { cfg *config.Config database *db.DB gridClient *grid.Client + apiServer *api.Server } func New(cfg *config.Config) (*App, error) { @@ -32,10 +35,13 @@ func New(cfg *config.Config) (*App, error) { return nil, fmt.Errorf("failed to create grid client: %w", err) } + apiServer := api.NewServer(database, cfg) + return &App{ cfg: cfg, database: database, gridClient: gridClient, + apiServer: apiServer, }, nil } @@ -48,6 +54,13 @@ func (a *App) Run(ctx context.Context) error { Str("network", a.cfg.Grid.Network). 
Msg("Starting provision probe service") + apiErrChan := make(chan error, 1) + go func() { + if err := a.apiServer.Start(); err != nil && err != http.ErrServerClosed { + apiErrChan <- err + } + }() + if err := a.runCycle(ctx); err != nil { log.Error().Err(err).Msg("Initial cycle failed") } @@ -55,7 +68,14 @@ func (a *App) Run(ctx context.Context) error { for { select { case <-ctx.Done(): + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := a.apiServer.Shutdown(shutdownCtx); err != nil { + log.Error().Err(err).Msg("Error shutting down API server") + } return nil + case err := <-apiErrChan: + return fmt.Errorf("API server error: %w", err) case <-ticker.C: if err := a.runCycle(ctx); err != nil { log.Error().Err(err).Msg("Cycle failed") @@ -66,14 +86,14 @@ func (a *App) Run(ctx context.Context) error { func (a *App) Close() { a.database.Close() + a.gridClient.Close() } func (a *App) runCycle(ctx context.Context) error { log.Info().Msg("Starting deployment cycle") - filters := a.buildFilters() proxyClient := a.gridClient.GetProxyClient() - nodes, err := grid.GetNodes(ctx, proxyClient, filters) + nodes, err := grid.GetNodes(ctx, proxyClient, a.cfg.Nodes) if err != nil { return fmt.Errorf("failed to get nodes: %w", err) } @@ -85,11 +105,11 @@ func (a *App) runCycle(ctx context.Context) error { log.Info(). Int("nodes", len(nodes)). - Int("max_concurrent", a.cfg.ConcurrencyLimit). - Str("workload", a.cfg.Workload). + Int("max_concurrent", a.cfg.Probe.ConcurrencyLimit). + Str("workload", a.cfg.Probe.WorkloadSize). Msg("Deployment cycle started") - sem := semaphore.NewWeighted(int64(a.cfg.ConcurrencyLimit)) + sem := semaphore.NewWeighted(int64(a.cfg.Probe.ConcurrencyLimit)) var wg sync.WaitGroup var mu sync.Mutex var cycleErrors []error @@ -114,7 +134,7 @@ func (a *App) runCycle(ctx context.Context) error { Int("total_nodes", len(nodes)). Int("node_id", n.NodeID). Int("farm_id", n.FarmID). 
- Str("workload", a.cfg.Workload). + Str("workload", a.cfg.Probe.WorkloadSize). Msg("Deploying VM to node") timeoutCtx, cancel := context.WithTimeout(ctx, a.cfg.Timeout()) @@ -122,18 +142,16 @@ func (a *App) runCycle(ctx context.Context) error { cancel() attempt := db.Attempt{ - Time: time.Now().Unix(), - NodeID: int64(int(n.NodeID)), - FarmID: int64(int(n.FarmID)), - WorkloadType: a.cfg.Workload, - Status: "failed", + Time: time.Now().Unix(), + NodeID: int64(int(n.NodeID)), + FarmID: int64(int(n.FarmID)), + Status: "failed", } if err != nil { errorCode := "unknown_error" if result != nil && result.ErrorCode != "" { errorCode = result.ErrorCode - attempt.DeployDurationMs = &result.DeployDurationMs attempt.TotalDurationMs = &result.TotalDurationMs } attempt.ErrorCode = &errorCode @@ -144,8 +162,6 @@ func (a *App) runCycle(ctx context.Context) error { Msg("Deployment failed") } else if result != nil { attempt.Status = "success" - attempt.DeployDurationMs = &result.DeployDurationMs - attempt.StartDurationMs = &result.StartDurationMs attempt.TotalDurationMs = &result.TotalDurationMs log.Debug(). Int("node_id", n.NodeID). 
@@ -172,28 +188,3 @@ func (a *App) runCycle(ctx context.Context) error { Msg("Deployment cycle completed") return nil } - -func (a *App) buildFilters() grid.NodeFilters { - var status *string - if len(a.cfg.Nodes.Status) > 0 { - s := a.cfg.Nodes.Status[0] - status = &s - } - - var farmIDs []uint64 - for _, id := range a.cfg.Nodes.Farms { - farmIDs = append(farmIDs, uint64(id)) - } - - var nodeIDs []uint64 - for _, id := range a.cfg.Nodes.Nodes { - nodeIDs = append(nodeIDs, uint64(id)) - } - - return grid.NodeFilters{ - Status: status, - FarmIDs: farmIDs, - NodeIDs: nodeIDs, - Exclude: a.cfg.Nodes.Exclude, - } -} diff --git a/pkg/config/config.go b/pkg/config/config.go index d2861518..7a237239 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -2,8 +2,8 @@ package config import ( "fmt" - "regexp" "strconv" + "strings" "time" "github.com/spf13/viper" @@ -24,37 +24,52 @@ const ( ) type Config struct { - IntervalStr string `mapstructure:"interval"` - ConcurrencyLimit int `mapstructure:"concurrency_limit"` - TimeoutStr string `mapstructure:"timeout"` - LogLevel string `mapstructure:"log_level"` - Grid GridConfig `mapstructure:"grid"` - Nodes NodesConfig `mapstructure:"nodes"` - Workload string `mapstructure:"workload"` - ScoreWindowStr string `mapstructure:"score_window"` - TimescaleDB TimescaleDB `mapstructure:"timescaledb"` - + LogLevel string `mapstructure:"log_level"` + Probe ProbeConfig `mapstructure:"probe"` + Scoring ScoringConfig `mapstructure:"scoring"` + Grid GridConfig `mapstructure:"grid"` + Nodes NodesConfig `mapstructure:"nodes"` + TimescaleDB TimescaleDB `mapstructure:"timescaledb"` + API APIConfig `mapstructure:"api"` + + // viper does not parse duration directly interval time.Duration timeout time.Duration scoreWindow time.Duration } +type ProbeConfig struct { + IntervalStr string `mapstructure:"interval"` + ConcurrencyLimit int `mapstructure:"concurrency_limit"` + TimeoutStr string `mapstructure:"timeout"` + WorkloadSize string 
`mapstructure:"workload_size"` +} + +type ScoringConfig struct { + WindowStr string `mapstructure:"window"` +} + type GridConfig struct { Network string `mapstructure:"network"` Mnemonic string `mapstructure:"mnemonic"` } type NodesConfig struct { - Status []string `mapstructure:"status"` - Farms []int `mapstructure:"farms"` - Nodes []int `mapstructure:"nodes"` - Exclude []int `mapstructure:"exclude"` + Status string `mapstructure:"status"` + Farms []uint64 `mapstructure:"farms"` + Nodes []uint64 `mapstructure:"nodes"` + Exclude []uint64 `mapstructure:"exclude"` } type TimescaleDB struct { URL string `mapstructure:"url"` } +type APIConfig struct { + Host string `mapstructure:"host"` + Port int `mapstructure:"port"` +} + func Load(configPath string) (*Config, error) { viper.SetConfigFile(configPath) viper.SetConfigType("yaml") @@ -68,32 +83,44 @@ func Load(configPath string) (*Config, error) { return nil, fmt.Errorf("failed to unmarshal config: %w", err) } - if cfg.IntervalStr != "" { - d, err := parseDuration(cfg.IntervalStr) + // manually parse durations + + if cfg.Probe.IntervalStr != "" { + d, err := ParseDuration(cfg.Probe.IntervalStr) if err != nil { return nil, fmt.Errorf("invalid interval format: %w", err) } cfg.interval = d } - if cfg.TimeoutStr != "" { - d, err := parseDuration(cfg.TimeoutStr) + if cfg.Probe.TimeoutStr != "" { + d, err := ParseDuration(cfg.Probe.TimeoutStr) if err != nil { return nil, fmt.Errorf("invalid timeout format: %w", err) } cfg.timeout = d } - if cfg.ScoreWindowStr != "" { - d, err := parseDuration(cfg.ScoreWindowStr) + if cfg.Scoring.WindowStr != "" { + d, err := ParseDuration(cfg.Scoring.WindowStr) if err != nil { - return nil, fmt.Errorf("invalid score_window format: %w", err) + return nil, fmt.Errorf("invalid scoring window format: %w", err) } cfg.scoreWindow = d } - if cfg.Workload == "" { - cfg.Workload = "light" + // add default values + + if cfg.Probe.WorkloadSize == "" { + cfg.Probe.WorkloadSize = "light" + } + + if 
cfg.API.Host == "" { + cfg.API.Host = "0.0.0.0" + } + + if cfg.API.Port == 0 { + cfg.API.Port = 8080 } if err := cfg.validate(); err != nil { @@ -103,24 +130,20 @@ func Load(configPath string) (*Config, error) { return &cfg, nil } -func parseDuration(s string) (time.Duration, error) { - re := regexp.MustCompile(`(\d+)d`) - if re.MatchString(s) { - matches := re.FindStringSubmatch(s) - if len(matches) == 2 { - days, err := strconv.Atoi(matches[1]) - if err != nil { - return 0, fmt.Errorf("invalid days value: %s", matches[1]) - } - hours := days * 24 - s = re.ReplaceAllString(s, fmt.Sprintf("%dh", hours)) +func ParseDuration(s string) (time.Duration, error) { + if strings.HasSuffix(s, "d") { + days, err := strconv.Atoi(strings.TrimSuffix(s, "d")) + if err != nil { + return 0, fmt.Errorf("invalid days value: %s", strings.TrimSuffix(s, "d")) } + d := time.Duration(days) * 24 * time.Hour + return d, nil } return time.ParseDuration(s) } func (c *Config) GetWorkload() (cpu uint8, memoryMB uint64, diskMB uint64) { - switch c.Workload { + switch c.Probe.WorkloadSize { case "medium": return WorkloadMediumCPU, uint64(WorkloadMediumMemory * 1024), uint64(WorkloadMediumDisk * 1024) case "heavy": @@ -144,25 +167,34 @@ func (c *Config) ScoreWindow() time.Duration { func (c *Config) validate() error { if c.interval <= 0 { - return fmt.Errorf("interval must be positive") + return fmt.Errorf("probe.interval must be positive") } - if c.ConcurrencyLimit <= 0 { - return fmt.Errorf("concurrency_limit must be positive") + if c.Probe.ConcurrencyLimit <= 0 { + return fmt.Errorf("probe.concurrency_limit must be positive") } if c.timeout <= 0 { - return fmt.Errorf("timeout must be positive") + return fmt.Errorf("probe.timeout must be positive") } - if c.Grid.Network == "" { - return fmt.Errorf("grid.network is required") + + validNetworks := map[string]struct{}{"dev": {}, "qa": {}, "test": {}, "main": {}} + if _, ok := validNetworks[c.Grid.Network]; !ok { + return fmt.Errorf("grid.network 
must be dev, qa, test, or main") } + if c.Grid.Mnemonic == "" { return fmt.Errorf("grid.mnemonic is required") } if c.TimescaleDB.URL == "" { return fmt.Errorf("timescaledb.url is required") } - if c.Workload != "light" && c.Workload != "medium" && c.Workload != "heavy" { - return fmt.Errorf("workload must be light, medium, or heavy") + + validWorkloads := map[string]struct{}{"light": {}, "medium": {}, "heavy": {}} + if _, ok := validWorkloads[c.Probe.WorkloadSize]; !ok { + return fmt.Errorf("probe.workload_size must be light, medium, or heavy") + } + validStatuses := map[string]struct{}{"up": {}, "healthy": {}} + if _, ok := validStatuses[c.Nodes.Status]; !ok { + return fmt.Errorf("nodes.status must be up or healthy") } return nil } diff --git a/pkg/db/db.go b/pkg/db/db.go index c77b5630..b8619a15 100644 --- a/pkg/db/db.go +++ b/pkg/db/db.go @@ -8,6 +8,23 @@ import ( "github.com/jackc/pgx/v5/pgxpool" ) +type Attempt struct { + Time int64 + NodeID int64 + FarmID int64 + Status string + TotalDurationMs *int + ErrorCode *string +} + +type NodeScoreData struct { + NodeID int64 + FarmID int64 + TotalAttempts int64 + SuccessCount int64 + AvgDurationMs *float64 +} + type DB struct { pool *pgxpool.Pool } @@ -30,16 +47,21 @@ func New(ctx context.Context, url string) (*DB, error) { return db, nil } +func (d *DB) Close() { + d.pool.Close() +} + +func (d *DB) Ping(ctx context.Context) error { + return d.pool.Ping(ctx) +} + func (d *DB) initSchema(ctx context.Context) error { queries := []string{ `CREATE TABLE IF NOT EXISTS provision_attempts ( - time TIMESTAMPTZ NOT NULL, + time BIGINT NOT NULL, node_id BIGINT NOT NULL, farm_id BIGINT NOT NULL, - workload_type VARCHAR(20) NOT NULL, status VARCHAR(20) NOT NULL, - deploy_duration_ms INTEGER, - start_duration_ms INTEGER, total_duration_ms INTEGER, error_code VARCHAR(100) )`, @@ -59,17 +81,14 @@ func (d *DB) initSchema(ctx context.Context) error { func (d *DB) RecordAttempt(ctx context.Context, attempt Attempt) error { query := 
`INSERT INTO provision_attempts - (time, node_id, farm_id, workload_type, status, deploy_duration_ms, start_duration_ms, total_duration_ms, error_code) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)` + (time, node_id, farm_id, status, total_duration_ms, error_code) + VALUES ($1, $2, $3, $4, $5, $6)` _, err := d.pool.Exec(ctx, query, - time.Unix(attempt.Time, 0), + attempt.Time, attempt.NodeID, attempt.FarmID, - attempt.WorkloadType, attempt.Status, - attempt.DeployDurationMs, - attempt.StartDurationMs, attempt.TotalDurationMs, attempt.ErrorCode, ) @@ -77,18 +96,81 @@ func (d *DB) RecordAttempt(ctx context.Context, attempt Attempt) error { return err } -func (d *DB) Close() { - d.pool.Close() +func (d *DB) GetNodeScoreData(ctx context.Context, nodeID int64, window time.Duration) (*NodeScoreData, error) { + windowStart := time.Now().Add(-window).Unix() + + query := ` + SELECT + node_id, + farm_id, + COUNT(*) as total_attempts, + COUNT(*) FILTER (WHERE status = 'success') as success_count, + AVG(total_duration_ms) FILTER (WHERE total_duration_ms IS NOT NULL) as avg_duration_ms + FROM provision_attempts + WHERE node_id = $1 AND time >= $2 + GROUP BY node_id, farm_id + ` + + var data NodeScoreData + err := d.pool.QueryRow(ctx, query, nodeID, windowStart).Scan( + &data.NodeID, + &data.FarmID, + &data.TotalAttempts, + &data.SuccessCount, + &data.AvgDurationMs, + ) + + if err != nil { + return nil, fmt.Errorf("failed to get node score data: %w", err) + } + + return &data, nil } -type Attempt struct { - Time int64 - NodeID int64 - FarmID int64 - WorkloadType string - Status string - DeployDurationMs *int - StartDurationMs *int - TotalDurationMs *int - ErrorCode *string +func (d *DB) GetTopNodeScores(ctx context.Context, window time.Duration, limit int, minAttempts int) ([]NodeScoreData, error) { + windowStart := time.Now().Add(-window).Unix() + + query := ` + SELECT + node_id, + farm_id, + COUNT(*) as total_attempts, + COUNT(*) FILTER (WHERE status = 'success') as 
success_count, + AVG(total_duration_ms) FILTER (WHERE total_duration_ms IS NOT NULL) as avg_duration_ms + FROM provision_attempts + WHERE time >= $1 + GROUP BY node_id, farm_id + HAVING COUNT(*) >= $3 + ORDER BY + (COUNT(*) FILTER (WHERE status = 'success')::float / COUNT(*)) DESC + LIMIT $2 + ` + + rows, err := d.pool.Query(ctx, query, windowStart, limit, minAttempts) + if err != nil { + return nil, fmt.Errorf("failed to get top node scores: %w", err) + } + defer rows.Close() + + var results []NodeScoreData + for rows.Next() { + var data NodeScoreData + err := rows.Scan( + &data.NodeID, + &data.FarmID, + &data.TotalAttempts, + &data.SuccessCount, + &data.AvgDurationMs, + ) + if err != nil { + return nil, fmt.Errorf("failed to scan node score data: %w", err) + } + results = append(results, data) + } + + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("error iterating rows: %w", err) + } + + return results, nil } diff --git a/pkg/grid/client.go b/pkg/grid/client.go index 5a393837..3f5d6ea0 100644 --- a/pkg/grid/client.go +++ b/pkg/grid/client.go @@ -35,27 +35,30 @@ func NewClient(network, mnemonic string, logLevel string) (*Client, error) { return &Client{tfPlugin: tfPlugin}, nil } +func (c *Client) Close() { + c.tfPlugin.Close() +} + func (c *Client) GetProxyClient() proxy.Client { return c.tfPlugin.GridProxyClient } type DeploymentResult struct { - Success bool - DeployDurationMs int - StartDurationMs int - TotalDurationMs int - ErrorCode string + Success bool + ErrorCode string + // TotalDurationMs is the total end-to-end time from deployment start to completion (in milliseconds) + // This includes network build, network deployment, VM deployment, and VM verification + TotalDurationMs int } func (c *Client) DeployVM(ctx context.Context, nodeID uint32, cpu uint8, memoryMB uint64, diskMB uint64) (*DeploymentResult, error) { startTime := time.Now() - timestamp := time.Now().Unix() - vmName := fmt.Sprintf("probe_%d", timestamp) + vmName := 
fmt.Sprintf("probe_%d", startTime.Unix()) networkName := fmt.Sprintf("%s_net", vmName) projectName := fmt.Sprintf("%s_project", vmName) - network, err := buildNetwork(networkName, projectName, []uint32{nodeID}) + network, err := buildNetwork(networkName, projectName, nodeID) if err != nil { return &DeploymentResult{ Success: false, @@ -73,83 +76,66 @@ func (c *Client) DeployVM(ctx context.Context, nodeID uint32, cpu uint8, memoryM dl := workloads.NewDeployment(vmName, nodeID, projectName, nil, networkName, nil, nil, []workloads.VM{vm}, nil, nil, nil) - deployStart := time.Now() log.Debug().Str("network", networkName).Uint32("node_id", nodeID).Msg("Deploying network") err = c.tfPlugin.NetworkDeployer.Deploy(ctx, &network) if err != nil { return &DeploymentResult{ - Success: false, - DeployDurationMs: int(time.Since(deployStart).Milliseconds()), - TotalDurationMs: int(time.Since(startTime).Milliseconds()), - ErrorCode: "network_deploy_failed", + Success: false, + TotalDurationMs: int(time.Since(startTime).Milliseconds()), + ErrorCode: "network_deploy_failed", }, fmt.Errorf("failed to deploy network on node %d: %w", nodeID, err) } - vmDeployStart := time.Now() log.Debug().Str("vm", vmName).Uint32("node_id", nodeID).Msg("Deploying VM") err = c.tfPlugin.DeploymentDeployer.Deploy(ctx, &dl) if err != nil { revertDeployment(ctx, c.tfPlugin, &dl, &network, false) return &DeploymentResult{ - Success: false, - DeployDurationMs: int(time.Since(vmDeployStart).Milliseconds()), - TotalDurationMs: int(time.Since(startTime).Milliseconds()), - ErrorCode: "deploy_failed", + Success: false, + TotalDurationMs: int(time.Since(startTime).Milliseconds()), + ErrorCode: "deploy_failed", }, fmt.Errorf("failed to deploy VM on node %d: %w", nodeID, err) } - vmDeployDuration := time.Since(vmDeployStart) - - startStart := time.Now() _, err = c.tfPlugin.State.LoadVMFromGrid(ctx, nodeID, vm.Name, dl.Name) if err != nil { revertDeployment(ctx, c.tfPlugin, &dl, &network, true) return 
&DeploymentResult{ - Success: false, - DeployDurationMs: int(vmDeployDuration.Milliseconds()), - StartDurationMs: int(time.Since(startStart).Milliseconds()), - TotalDurationMs: int(time.Since(startTime).Milliseconds()), - ErrorCode: "start_failed", + Success: false, + TotalDurationMs: int(time.Since(startTime).Milliseconds()), + ErrorCode: "start_failed", }, fmt.Errorf("failed to load VM from node %d: %w", nodeID, err) } - startDuration := time.Since(startStart) totalDuration := time.Since(startTime) log.Debug(). - Int("deploy_ms", int(vmDeployDuration.Milliseconds())). - Int("start_ms", int(startDuration.Milliseconds())). + Int("total_ms", int(totalDuration.Milliseconds())). Uint32("node_id", nodeID). Msg("VM deployed successfully") revertDeployment(ctx, c.tfPlugin, &dl, &network, true) return &DeploymentResult{ - Success: true, - DeployDurationMs: int(vmDeployDuration.Milliseconds()), - StartDurationMs: int(startDuration.Milliseconds()), - TotalDurationMs: int(totalDuration.Milliseconds()), + Success: true, + TotalDurationMs: int(totalDuration.Milliseconds()), }, nil } -func buildNetwork(name, projectName string, nodes []uint32) (workloads.ZNet, error) { - keys := make(map[uint32][]byte) - for _, node := range nodes { - key, err := workloads.RandomMyceliumKey() - if err != nil { - return workloads.ZNet{}, fmt.Errorf("failed to generate mycelium key for node %d: %w", node, err) - } - keys[node] = key +func buildNetwork(name, projectName string, nodeID uint32) (workloads.ZNet, error) { + key, err := workloads.RandomMyceliumKey() + if err != nil { + return workloads.ZNet{}, fmt.Errorf("failed to generate mycelium key for node %d: %w", nodeID, err) } return workloads.ZNet{ Name: name, - Nodes: nodes, + Nodes: []uint32{nodeID}, IPRange: zos.IPNet{IPNet: net.IPNet{ IP: net.IPv4(10, 20, 0, 0), Mask: net.CIDRMask(16, 32), }}, - MyceliumKeys: keys, + MyceliumKeys: map[uint32][]byte{nodeID: key}, SolutionType: projectName, Description: "Probe network", }, nil diff --git 
a/pkg/grid/proxy.go b/pkg/grid/proxy.go index 9b290a6b..904b35e3 100644 --- a/pkg/grid/proxy.go +++ b/pkg/grid/proxy.go @@ -5,35 +5,20 @@ import ( "fmt" "github.com/rs/zerolog/log" + "github.com/threefoldtech/provision-probe/pkg/config" "github.com/threefoldtech/tfgrid-sdk-go/grid-proxy/pkg/client" "github.com/threefoldtech/tfgrid-sdk-go/grid-proxy/pkg/types" ) -func GetNodes(ctx context.Context, proxyClient client.Client, filters NodeFilters) ([]types.Node, error) { +func GetNodes(ctx context.Context, proxyClient client.Client, filters config.NodesConfig) ([]types.Node, error) { limit := types.Limit{ Size: 100, Page: 1, } - var allNodes []types.Node - - var excludeIDs []uint64 - for _, id := range filters.Exclude { - excludeIDs = append(excludeIDs, uint64(id)) - } + filter := buildFilters(filters) + var allNodes []types.Node for { - var status []string - if filters.Status != nil { - status = []string{*filters.Status} - } - - filter := types.NodeFilter{ - Status: status, - FarmIDs: filters.FarmIDs, - NodeIDs: filters.NodeIDs, - Excluded: excludeIDs, - } - nodes, total, err := proxyClient.Nodes(ctx, filter, limit) if err != nil { return nil, fmt.Errorf("failed to query nodes: %w", err) @@ -53,9 +38,19 @@ func GetNodes(ctx context.Context, proxyClient client.Client, filters NodeFilter return allNodes, nil } -type NodeFilters struct { - Status *string - FarmIDs []uint64 - NodeIDs []uint64 - Exclude []int +func buildFilters(filters config.NodesConfig) types.NodeFilter { + filter := types.NodeFilter{ + FarmIDs: filters.Farms, + NodeIDs: filters.Nodes, + Excluded: filters.Exclude, + } + + switch filters.Status { + case "up": + filter.Status = []string{"up"} + case "healthy": + filter.Healthy = &[]bool{true}[0] + } + + return filter } From 7b72b94d8051089aeae7bd972730e3148405ebd7 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Tue, 2 Dec 2025 10:48:12 +0200 Subject: [PATCH 5/6] Enhance configuration and deployment logic with retry mechanism and graceful shutdown --- 
cmd/provision-probe/main.go | 6 ++- configs/config.example.yaml | 8 +++- docker-compose.yml | 15 +++++-- pkg/app/app.go | 77 ++++++++++++++++++++++++++------- pkg/config/config.go | 86 ++++++++++++++++++++++++++++++++++--- pkg/grid/client.go | 36 ++++++++++++---- pkg/grid/proxy.go | 37 ++++++++++++++-- pkg/retry/retry.go | 72 +++++++++++++++++++++++++++++++ 8 files changed, 297 insertions(+), 40 deletions(-) create mode 100644 pkg/retry/retry.go diff --git a/cmd/provision-probe/main.go b/cmd/provision-probe/main.go index 8acf26b6..4d71d9f0 100644 --- a/cmd/provision-probe/main.go +++ b/cmd/provision-probe/main.go @@ -40,8 +40,12 @@ func main() { go func() { <-sigChan - log.Info().Msg("Shutting down...") + log.Info().Msg("Shutdown signal received, finishing current work...") cancel() + + <-sigChan + log.Warn().Msg("Second shutdown signal received, forcing exit") + os.Exit(1) }() if err := application.Run(ctx); err != nil { diff --git a/configs/config.example.yaml b/configs/config.example.yaml index f97b7117..8cb77111 100644 --- a/configs/config.example.yaml +++ b/configs/config.example.yaml @@ -10,7 +10,13 @@ probe: interval: "6h" # interval for the deployment cycle concurrency_limit: 10 # concurrency limit for the deployment cycle timeout: "10m" # timeout for the deployment cycle - workload_size: "light" # light, medium, heavy + workload_size: "light" # light, medium, heavy + shutdown_timeout: "30s" # timeout for graceful shutdown + retry: + max_retries: 3 # maximum number of retries for transient failures + initial_backoff: "1s" # initial backoff duration between retries + max_backoff: "30s" # maximum backoff duration + multiplier: 2.0 # exponential backoff multiplier scoring: window: "90d" # default window for the scoring (overwritten by the window query parameter) diff --git a/docker-compose.yml b/docker-compose.yml index dc4f3dcd..8dd61e2a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3.8' - services: timescaledb: image: 
timescale/timescaledb:latest-pg16 @@ -9,7 +7,7 @@ services: POSTGRES_PASSWORD: postgres POSTGRES_DB: provision_probe ports: - - "5432:5432" + - "${PG_PORT:-5432}:5432" volumes: - timescaledb-data:/var/lib/postgresql/data healthcheck: @@ -17,7 +15,16 @@ services: interval: 10s timeout: 5s retries: 5 + + pgweb: + image: sosedoff/pgweb:latest + container_name: provision-probe-pgweb + ports: + - "${VIEW_PORT:-8081}:8081" + environment: + - PGWEB_DATABASE_URL=postgres://postgres:postgres@timescaledb:5432/provision_probe?sslmode=disable + depends_on: + - timescaledb volumes: timescaledb-data: - diff --git a/pkg/app/app.go b/pkg/app/app.go index 33b9067a..f190fb24 100644 --- a/pkg/app/app.go +++ b/pkg/app/app.go @@ -5,6 +5,7 @@ import ( "fmt" "net/http" "sync" + "sync/atomic" "time" "github.com/rs/zerolog/log" @@ -17,10 +18,12 @@ import ( ) type App struct { - cfg *config.Config - database *db.DB - gridClient *grid.Client - apiServer *api.Server + cfg *config.Config + database *db.DB + gridClient *grid.Client + apiServer *api.Server + cycleWg sync.WaitGroup + shuttingDown atomic.Bool } func New(cfg *config.Config) (*App, error) { @@ -29,7 +32,7 @@ func New(cfg *config.Config) (*App, error) { return nil, fmt.Errorf("failed to connect to database: %w", err) } - gridClient, err := grid.NewClient(cfg.Grid.Network, cfg.Grid.Mnemonic, cfg.LogLevel) + gridClient, err := grid.NewClient(cfg) if err != nil { database.Close() return nil, fmt.Errorf("failed to create grid client: %w", err) @@ -68,15 +71,14 @@ func (a *App) Run(ctx context.Context) error { for { select { case <-ctx.Done(): - shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - if err := a.apiServer.Shutdown(shutdownCtx); err != nil { - log.Error().Err(err).Msg("Error shutting down API server") - } - return nil + return a.shutdown() case err := <-apiErrChan: return fmt.Errorf("API server error: %w", err) case <-ticker.C: + if a.shuttingDown.Load() { + continue // do not 
start new cycle if shutting down + } + if err := a.runCycle(ctx); err != nil { log.Error().Err(err).Msg("Cycle failed") } @@ -84,9 +86,43 @@ func (a *App) Run(ctx context.Context) error { } } +func (a *App) shutdown() error { + log.Info().Msg("Initiating graceful shutdown") + + a.shuttingDown.Store(true) // do not start new cycle/routine if shutting down + + log.Info().Msg("Waiting for current deployments to finish...") + done := make(chan struct{}) + go func() { + a.cycleWg.Wait() + close(done) + }() + + select { + case <-done: + log.Info().Msg("All deployments completed") // shutdown went well + case <-time.After(a.cfg.ShutdownTimeout()): + log.Warn(). + Dur("timeout", a.cfg.ShutdownTimeout()). + Msg("Shutdown timeout reached, some deployments may not have completed") + // continue shutdown anyway + } + + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + if err := a.apiServer.Shutdown(shutdownCtx); err != nil { + log.Error().Err(err).Msg("Error shutting down API server") + } else { + log.Info().Msg("API server shut down successfully") + } + + return nil +} + func (a *App) Close() { a.database.Close() - a.gridClient.Close() + // a.gridClient.Close() } func (a *App) runCycle(ctx context.Context) error { @@ -110,16 +146,16 @@ func (a *App) runCycle(ctx context.Context) error { Msg("Deployment cycle started") sem := semaphore.NewWeighted(int64(a.cfg.Probe.ConcurrencyLimit)) - var wg sync.WaitGroup var mu sync.Mutex var cycleErrors []error cpu, memoryMB, diskMB := a.cfg.GetWorkload() for i, node := range nodes { - wg.Add(1) + + a.cycleWg.Add(1) go func(idx int, n types.Node) { - defer wg.Done() + defer a.cycleWg.Done() if err := sem.Acquire(ctx, 1); err != nil { // blocks if at capacity mu.Lock() @@ -129,6 +165,13 @@ func (a *App) runCycle(ctx context.Context) error { } defer sem.Release(1) + if a.shuttingDown.Load() { + log.Debug(). + Int("node_id", n.NodeID). 
+ Msg("Shutdown requested, skipping deployment") + return // do not start new job if shutting down + } + log.Debug(). Int("node_index", idx+1). Int("total_nodes", len(nodes)). @@ -180,7 +223,9 @@ func (a *App) runCycle(ctx context.Context) error { }(i, node) } - wg.Wait() + a.cycleWg.Wait() + + metrics.GetMetrics().SetActiveDeployments(0) log.Info(). Int("total_nodes", len(nodes)). diff --git a/pkg/config/config.go b/pkg/config/config.go index 7a237239..7c43a33f 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -33,16 +33,21 @@ type Config struct { API APIConfig `mapstructure:"api"` // viper does not parse duration directly - interval time.Duration - timeout time.Duration - scoreWindow time.Duration + interval time.Duration + timeout time.Duration + scoreWindow time.Duration + shutdownTimeout time.Duration + initialBackoff time.Duration + maxBackoff time.Duration } type ProbeConfig struct { - IntervalStr string `mapstructure:"interval"` - ConcurrencyLimit int `mapstructure:"concurrency_limit"` - TimeoutStr string `mapstructure:"timeout"` - WorkloadSize string `mapstructure:"workload_size"` + IntervalStr string `mapstructure:"interval"` + ConcurrencyLimit int `mapstructure:"concurrency_limit"` + TimeoutStr string `mapstructure:"timeout"` + WorkloadSize string `mapstructure:"workload_size"` + Retry RetryConfig `mapstructure:"retry"` + ShutdownTimeoutStr string `mapstructure:"shutdown_timeout"` } type ScoringConfig struct { @@ -70,6 +75,13 @@ type APIConfig struct { Port int `mapstructure:"port"` } +type RetryConfig struct { + MaxRetries int `mapstructure:"max_retries"` + InitialBackoffStr string `mapstructure:"initial_backoff"` + MaxBackoffStr string `mapstructure:"max_backoff"` + Multiplier float64 `mapstructure:"multiplier"` +} + func Load(configPath string) (*Config, error) { viper.SetConfigFile(configPath) viper.SetConfigType("yaml") @@ -109,6 +121,30 @@ func Load(configPath string) (*Config, error) { cfg.scoreWindow = d } + if 
cfg.Probe.ShutdownTimeoutStr != "" { + d, err := ParseDuration(cfg.Probe.ShutdownTimeoutStr) + if err != nil { + return nil, fmt.Errorf("invalid shutdown_timeout format: %w", err) + } + cfg.shutdownTimeout = d + } + + if cfg.Probe.Retry.InitialBackoffStr != "" { + d, err := ParseDuration(cfg.Probe.Retry.InitialBackoffStr) + if err != nil { + return nil, fmt.Errorf("invalid retry.initial_backoff format: %w", err) + } + cfg.initialBackoff = d + } + + if cfg.Probe.Retry.MaxBackoffStr != "" { + d, err := ParseDuration(cfg.Probe.Retry.MaxBackoffStr) + if err != nil { + return nil, fmt.Errorf("invalid retry.max_backoff format: %w", err) + } + cfg.maxBackoff = d + } + // add default values if cfg.Probe.WorkloadSize == "" { @@ -123,6 +159,26 @@ func Load(configPath string) (*Config, error) { cfg.API.Port = 8080 } + if cfg.Probe.Retry.MaxRetries == 0 { + cfg.Probe.Retry.MaxRetries = 3 + } + + if cfg.Probe.Retry.Multiplier == 0 { + cfg.Probe.Retry.Multiplier = 2.0 + } + + if cfg.initialBackoff == 0 { + cfg.initialBackoff = 1 * time.Second + } + + if cfg.maxBackoff == 0 { + cfg.maxBackoff = 30 * time.Second + } + + if cfg.shutdownTimeout == 0 { + cfg.shutdownTimeout = 30 * time.Second + } + if err := cfg.validate(); err != nil { return nil, fmt.Errorf("config validation failed: %w", err) } @@ -165,6 +221,22 @@ func (c *Config) ScoreWindow() time.Duration { return c.scoreWindow } +func (c *Config) ShutdownTimeout() time.Duration { + return c.shutdownTimeout +} + +func (c *Config) RetryConfig() RetryConfig { + return c.Probe.Retry +} + +func (c *Config) InitialBackoff() time.Duration { + return c.initialBackoff +} + +func (c *Config) MaxBackoff() time.Duration { + return c.maxBackoff +} + func (c *Config) validate() error { if c.interval <= 0 { return fmt.Errorf("probe.interval must be positive") diff --git a/pkg/grid/client.go b/pkg/grid/client.go index 3f5d6ea0..5619ba41 100644 --- a/pkg/grid/client.go +++ b/pkg/grid/client.go @@ -7,6 +7,8 @@ import ( "time" 
"github.com/rs/zerolog/log" + "github.com/threefoldtech/provision-probe/pkg/config" + "github.com/threefoldtech/provision-probe/pkg/retry" "github.com/threefoldtech/tfgrid-sdk-go/grid-client/deployer" "github.com/threefoldtech/tfgrid-sdk-go/grid-client/workloads" "github.com/threefoldtech/tfgrid-sdk-go/grid-client/zos" @@ -15,24 +17,35 @@ import ( type Client struct { tfPlugin deployer.TFPluginClient + retryCfg retry.BackoffConfig } -func NewClient(network, mnemonic string, logLevel string) (*Client, error) { +func NewClient(cfg *config.Config) (*Client, error) { opts := []deployer.PluginOpt{ - deployer.WithNetwork(network), + deployer.WithNetwork(cfg.Grid.Network), deployer.WithDisableSentry(), } - if logLevel == "debug" { + if cfg.LogLevel == "debug" { opts = append(opts, deployer.WithLogs()) } - tfPlugin, err := deployer.NewTFPluginClient(mnemonic, opts...) + tfPlugin, err := deployer.NewTFPluginClient(cfg.Grid.Mnemonic, opts...) if err != nil { return nil, fmt.Errorf("failed to create grid client: %w", err) } - return &Client{tfPlugin: tfPlugin}, nil + retryCfg := retry.BackoffConfig{ + MaxRetries: cfg.RetryConfig().MaxRetries, + InitialBackoff: cfg.InitialBackoff(), + MaxBackoff: cfg.MaxBackoff(), + Multiplier: cfg.RetryConfig().Multiplier, + } + + return &Client{ + tfPlugin: tfPlugin, + retryCfg: retryCfg, + }, nil } func (c *Client) Close() { @@ -77,7 +90,9 @@ func (c *Client) DeployVM(ctx context.Context, nodeID uint32, cpu uint8, memoryM dl := workloads.NewDeployment(vmName, nodeID, projectName, nil, networkName, nil, nil, []workloads.VM{vm}, nil, nil, nil) log.Debug().Str("network", networkName).Uint32("node_id", nodeID).Msg("Deploying network") - err = c.tfPlugin.NetworkDeployer.Deploy(ctx, &network) + err = retry.DoWithBackoff(ctx, c.retryCfg, func() error { + return c.tfPlugin.NetworkDeployer.Deploy(ctx, &network) + }) if err != nil { return &DeploymentResult{ Success: false, @@ -87,7 +102,9 @@ func (c *Client) DeployVM(ctx context.Context, nodeID 
uint32, cpu uint8, memoryM } log.Debug().Str("vm", vmName).Uint32("node_id", nodeID).Msg("Deploying VM") - err = c.tfPlugin.DeploymentDeployer.Deploy(ctx, &dl) + err = retry.DoWithBackoff(ctx, c.retryCfg, func() error { + return c.tfPlugin.DeploymentDeployer.Deploy(ctx, &dl) + }) if err != nil { revertDeployment(ctx, c.tfPlugin, &dl, &network, false) return &DeploymentResult{ @@ -97,7 +114,10 @@ func (c *Client) DeployVM(ctx context.Context, nodeID uint32, cpu uint8, memoryM }, fmt.Errorf("failed to deploy VM on node %d: %w", nodeID, err) } - _, err = c.tfPlugin.State.LoadVMFromGrid(ctx, nodeID, vm.Name, dl.Name) + err = retry.DoWithBackoff(ctx, c.retryCfg, func() error { + _, err := c.tfPlugin.State.LoadVMFromGrid(ctx, nodeID, vm.Name, dl.Name) + return err + }) if err != nil { revertDeployment(ctx, c.tfPlugin, &dl, &network, true) return &DeploymentResult{ diff --git a/pkg/grid/proxy.go b/pkg/grid/proxy.go index 904b35e3..a2a4dc55 100644 --- a/pkg/grid/proxy.go +++ b/pkg/grid/proxy.go @@ -3,13 +3,22 @@ package grid import ( "context" "fmt" + "time" "github.com/rs/zerolog/log" "github.com/threefoldtech/provision-probe/pkg/config" + "github.com/threefoldtech/provision-probe/pkg/retry" "github.com/threefoldtech/tfgrid-sdk-go/grid-proxy/pkg/client" "github.com/threefoldtech/tfgrid-sdk-go/grid-proxy/pkg/types" ) +const ( + proxyRetryMaxRetries = 3 + proxyRetryInitialBackoff = 1 * time.Second + proxyRetryMaxBackoff = 10 * time.Second + proxyRetryMultiplier = 2.0 +) + func GetNodes(ctx context.Context, proxyClient client.Client, filters config.NodesConfig) ([]types.Node, error) { limit := types.Limit{ Size: 100, @@ -17,11 +26,33 @@ func GetNodes(ctx context.Context, proxyClient client.Client, filters config.Nod } filter := buildFilters(filters) + retryCfg := retry.BackoffConfig{ + MaxRetries: proxyRetryMaxRetries, + InitialBackoff: proxyRetryInitialBackoff, + MaxBackoff: proxyRetryMaxBackoff, + Multiplier: proxyRetryMultiplier, + } + var allNodes []types.Node + var 
total int for { - nodes, total, err := proxyClient.Nodes(ctx, filter, limit) - if err != nil { - return nil, fmt.Errorf("failed to query nodes: %w", err) + var nodes []types.Node + + operation := func() error { + var pageTotal int + var err error + nodes, pageTotal, err = proxyClient.Nodes(ctx, filter, limit) + if err != nil { + return fmt.Errorf("failed to query nodes page %d: %w", limit.Page, err) + } + if total == 0 { + total = pageTotal + } + return nil + } + + if err := retry.DoWithBackoff(ctx, retryCfg, operation); err != nil { + return nil, err } allNodes = append(allNodes, nodes...) diff --git a/pkg/retry/retry.go b/pkg/retry/retry.go new file mode 100644 index 00000000..efae4b30 --- /dev/null +++ b/pkg/retry/retry.go @@ -0,0 +1,72 @@ +package retry + +import ( + "context" + "strings" + "time" + + "github.com/cenkalti/backoff/v4" + "github.com/rs/zerolog/log" +) + +func IsTransientError(err error) bool { + if err == nil { + return false + } + + errStr := err.Error() + transientPatterns := []string{ + "timeout", + "deadline exceeded", + "broken pipe", + } + + for _, pattern := range transientPatterns { + if strings.Contains(strings.ToLower(errStr), strings.ToLower(pattern)) { + return true + } + } + + return false +} + +type BackoffConfig struct { + MaxRetries int + InitialBackoff time.Duration + MaxBackoff time.Duration + Multiplier float64 +} + +func DoWithBackoff(ctx context.Context, cfg BackoffConfig, fn func() error) error { + expBackoff := backoff.NewExponentialBackOff() + expBackoff.InitialInterval = cfg.InitialBackoff + expBackoff.MaxInterval = cfg.MaxBackoff + expBackoff.Multiplier = cfg.Multiplier + expBackoff.Reset() + + operation := func() error { + err := fn() + if err == nil { + return nil + } + if !IsTransientError(err) { + return backoff.Permanent(err) + } + return err + } + + backoffWithCtx := backoff.WithContext( + backoff.WithMaxRetries(expBackoff, uint64(cfg.MaxRetries)), + ctx, + ) + + notify := func(err error, d time.Duration) { + 
log.Debug().Err(err).Dur("delay", d).Msg("retrying operation due to transient error") + } + + return backoff.RetryNotify( + operation, + backoffWithCtx, + notify, + ) +} From e13a3f710f4e706d400bf73274934ec80b870e78 Mon Sep 17 00:00:00 2001 From: Omar Abdulaziz Date: Tue, 2 Dec 2025 13:26:07 +0200 Subject: [PATCH 6/6] add Swagger documentation, and enhance API responses --- docs/swagger/docs.go | 323 ++++++++++++++++++++++++++++++++++++++ docs/swagger/swagger.json | 294 ++++++++++++++++++++++++++++++++++ docs/swagger/swagger.yaml | 206 ++++++++++++++++++++++++ go.mod | 32 +++- go.sum | 82 +++++++++- pkg/api/handlers.go | 133 +++++++++++++--- pkg/api/scoring.go | 14 +- pkg/api/server.go | 12 ++ pkg/app/app.go | 2 - 9 files changed, 1054 insertions(+), 44 deletions(-) create mode 100644 docs/swagger/docs.go create mode 100644 docs/swagger/swagger.json create mode 100644 docs/swagger/swagger.yaml diff --git a/docs/swagger/docs.go b/docs/swagger/docs.go new file mode 100644 index 00000000..77145730 --- /dev/null +++ b/docs/swagger/docs.go @@ -0,0 +1,323 @@ +// Package swagger Code generated by swaggo/swag. 
DO NOT EDIT +package swagger + +import "github.com/swaggo/swag" + +const docTemplate = `{ + "schemes": {{ marshal .Schemes }}, + "swagger": "2.0", + "info": { + "description": "{{escape .Description}}", + "title": "{{.Title}}", + "contact": {}, + "version": "{{.Version}}" + }, + "host": "{{.Host}}", + "basePath": "{{.BasePath}}", + "paths": { + "/api/v1/health": { + "get": { + "description": "Returns the health status of the service and its dependencies", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "health" + ], + "summary": "Health check endpoint", + "responses": { + "200": { + "description": "Service is healthy", + "schema": { + "$ref": "#/definitions/api.HealthResponse" + } + }, + "503": { + "description": "Service is unhealthy", + "schema": { + "$ref": "#/definitions/api.HealthResponse" + } + } + } + } + }, + "/api/v1/scores": { + "get": { + "description": "Returns the top performing nodes based on success rate within a time window", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "scores" + ], + "summary": "Get top node scores", + "parameters": [ + { + "type": "string", + "description": "Time window for scoring (e.g., '90d', '30d', '7d')", + "name": "window", + "in": "query" + }, + { + "minimum": 1, + "type": "integer", + "default": 10, + "description": "Maximum number of results to return", + "name": "limit", + "in": "query" + }, + { + "minimum": 1, + "type": "integer", + "default": 1, + "description": "Minimum number of attempts required", + "name": "min_attempts", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successfully retrieved top scores", + "schema": { + "$ref": "#/definitions/api.TopScoresResponse" + } + }, + "400": { + "description": "Invalid request parameters", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal server error", + "schema": { + "$ref": 
"#/definitions/api.ErrorResponse" + } + } + } + } + }, + "/api/v1/scores/node/{node_id}": { + "get": { + "description": "Returns the score and performance metrics for a specific node within a time window", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "scores" + ], + "summary": "Get score for a specific node", + "parameters": [ + { + "type": "integer", + "example": 123, + "description": "Node ID", + "name": "node_id", + "in": "path", + "required": true + }, + { + "type": "string", + "default": "\"90d\"", + "description": "Time window for scoring (e.g., '90d', '30d', '7d')", + "name": "window", + "in": "query" + }, + { + "minimum": 1, + "type": "integer", + "default": 1, + "description": "Minimum number of attempts required", + "name": "min_attempts", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successfully retrieved node score", + "schema": { + "$ref": "#/definitions/api.NodeScoreResponse" + } + }, + "400": { + "description": "Invalid request parameters", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Node not found or insufficient attempts", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal server error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + } + }, + "definitions": { + "api.ErrorResponse": { + "description": "Error response", + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "invalid node_id parameter" + } + } + }, + "api.HealthCheck": { + "type": "object", + "properties": { + "message": { + "type": "string", + "example": "connection failed" + }, + "status": { + "description": "healthy or unhealthy", + "type": "string", + "example": "healthy" + } + } + }, + "api.HealthResponse": { + "description": "Health check response", + "type": "object", + "properties": { + "checks": { + "description": "Individual health 
checks", + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/api.HealthCheck" + } + }, + "status": { + "description": "Overall health status: healthy or unhealthy", + "type": "string", + "example": "healthy" + } + } + }, + "api.NodeScore": { + "description": "Node performance score and metrics", + "type": "object", + "properties": { + "avg_duration_ms": { + "description": "Average deployment duration in milliseconds", + "type": "number", + "example": 2500 + }, + "farm_id": { + "description": "Farm ID", + "type": "integer", + "example": 1 + }, + "node_id": { + "description": "Node ID", + "type": "integer", + "example": 123 + }, + "score": { + "description": "Calculated score (currently same as success rate)", + "type": "number", + "example": 0.95 + }, + "success_rate": { + "description": "Success rate (0.0 to 1.0)", + "type": "number", + "example": 0.95 + }, + "total_attempts": { + "description": "Total number of deployment attempts", + "type": "integer", + "example": 100 + } + } + }, + "api.NodeScoreResponse": { + "description": "Single node score response", + "type": "object", + "properties": { + "min_attempts": { + "description": "Minimum attempts required", + "type": "integer", + "example": 1 + }, + "score": { + "description": "Node score details", + "allOf": [ + { + "$ref": "#/definitions/api.NodeScore" + } + ] + }, + "window": { + "description": "Time window used for scoring", + "type": "string", + "example": "90d" + } + } + }, + "api.TopScoresResponse": { + "description": "Top node scores response", + "type": "object", + "properties": { + "limit": { + "description": "Maximum number of results returned", + "type": "integer", + "example": 10 + }, + "min_attempts": { + "description": "Minimum attempts required", + "type": "integer", + "example": 1 + }, + "scores": { + "description": "List of node scores", + "type": "array", + "items": { + "$ref": "#/definitions/api.NodeScore" + } + }, + "window": { + "description": "Time window used for 
scoring", + "type": "string", + "example": "90d" + } + } + } + } +}` + +// SwaggerInfo holds exported Swagger Info so clients can modify it +var SwaggerInfo = &swag.Spec{ + Version: "", + Host: "", + BasePath: "", + Schemes: []string{}, + Title: "", + Description: "", + InfoInstanceName: "swagger", + SwaggerTemplate: docTemplate, + LeftDelim: "{{", + RightDelim: "}}", +} + +func init() { + swag.Register(SwaggerInfo.InstanceName(), SwaggerInfo) +} diff --git a/docs/swagger/swagger.json b/docs/swagger/swagger.json new file mode 100644 index 00000000..dce71853 --- /dev/null +++ b/docs/swagger/swagger.json @@ -0,0 +1,294 @@ +{ + "swagger": "2.0", + "info": { + "contact": {} + }, + "paths": { + "/api/v1/health": { + "get": { + "description": "Returns the health status of the service and its dependencies", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "health" + ], + "summary": "Health check endpoint", + "responses": { + "200": { + "description": "Service is healthy", + "schema": { + "$ref": "#/definitions/api.HealthResponse" + } + }, + "503": { + "description": "Service is unhealthy", + "schema": { + "$ref": "#/definitions/api.HealthResponse" + } + } + } + } + }, + "/api/v1/scores": { + "get": { + "description": "Returns the top performing nodes based on success rate within a time window", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "scores" + ], + "summary": "Get top node scores", + "parameters": [ + { + "type": "string", + "description": "Time window for scoring (e.g., '90d', '30d', '7d')", + "name": "window", + "in": "query" + }, + { + "minimum": 1, + "type": "integer", + "default": 10, + "description": "Maximum number of results to return", + "name": "limit", + "in": "query" + }, + { + "minimum": 1, + "type": "integer", + "default": 1, + "description": "Minimum number of attempts required", + "name": "min_attempts", + "in": "query" + } + ], + "responses": { + 
"200": { + "description": "Successfully retrieved top scores", + "schema": { + "$ref": "#/definitions/api.TopScoresResponse" + } + }, + "400": { + "description": "Invalid request parameters", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal server error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + }, + "/api/v1/scores/node/{node_id}": { + "get": { + "description": "Returns the score and performance metrics for a specific node within a time window", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "scores" + ], + "summary": "Get score for a specific node", + "parameters": [ + { + "type": "integer", + "example": 123, + "description": "Node ID", + "name": "node_id", + "in": "path", + "required": true + }, + { + "type": "string", + "default": "\"90d\"", + "description": "Time window for scoring (e.g., '90d', '30d', '7d')", + "name": "window", + "in": "query" + }, + { + "minimum": 1, + "type": "integer", + "default": 1, + "description": "Minimum number of attempts required", + "name": "min_attempts", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successfully retrieved node score", + "schema": { + "$ref": "#/definitions/api.NodeScoreResponse" + } + }, + "400": { + "description": "Invalid request parameters", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "404": { + "description": "Node not found or insufficient attempts", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + }, + "500": { + "description": "Internal server error", + "schema": { + "$ref": "#/definitions/api.ErrorResponse" + } + } + } + } + } + }, + "definitions": { + "api.ErrorResponse": { + "description": "Error response", + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "invalid node_id parameter" + } + } + }, + "api.HealthCheck": { + "type": "object", + "properties": { + 
"message": { + "type": "string", + "example": "connection failed" + }, + "status": { + "description": "healthy or unhealthy", + "type": "string", + "example": "healthy" + } + } + }, + "api.HealthResponse": { + "description": "Health check response", + "type": "object", + "properties": { + "checks": { + "description": "Individual health checks", + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/api.HealthCheck" + } + }, + "status": { + "description": "Overall health status: healthy or unhealthy", + "type": "string", + "example": "healthy" + } + } + }, + "api.NodeScore": { + "description": "Node performance score and metrics", + "type": "object", + "properties": { + "avg_duration_ms": { + "description": "Average deployment duration in milliseconds", + "type": "number", + "example": 2500 + }, + "farm_id": { + "description": "Farm ID", + "type": "integer", + "example": 1 + }, + "node_id": { + "description": "Node ID", + "type": "integer", + "example": 123 + }, + "score": { + "description": "Calculated score (currently same as success rate)", + "type": "number", + "example": 0.95 + }, + "success_rate": { + "description": "Success rate (0.0 to 1.0)", + "type": "number", + "example": 0.95 + }, + "total_attempts": { + "description": "Total number of deployment attempts", + "type": "integer", + "example": 100 + } + } + }, + "api.NodeScoreResponse": { + "description": "Single node score response", + "type": "object", + "properties": { + "min_attempts": { + "description": "Minimum attempts required", + "type": "integer", + "example": 1 + }, + "score": { + "description": "Node score details", + "allOf": [ + { + "$ref": "#/definitions/api.NodeScore" + } + ] + }, + "window": { + "description": "Time window used for scoring", + "type": "string", + "example": "90d" + } + } + }, + "api.TopScoresResponse": { + "description": "Top node scores response", + "type": "object", + "properties": { + "limit": { + "description": "Maximum number of results returned", + 
"type": "integer", + "example": 10 + }, + "min_attempts": { + "description": "Minimum attempts required", + "type": "integer", + "example": 1 + }, + "scores": { + "description": "List of node scores", + "type": "array", + "items": { + "$ref": "#/definitions/api.NodeScore" + } + }, + "window": { + "description": "Time window used for scoring", + "type": "string", + "example": "90d" + } + } + } + } +} \ No newline at end of file diff --git a/docs/swagger/swagger.yaml b/docs/swagger/swagger.yaml new file mode 100644 index 00000000..74b51fc5 --- /dev/null +++ b/docs/swagger/swagger.yaml @@ -0,0 +1,206 @@ +definitions: + api.ErrorResponse: + description: Error response + properties: + error: + example: invalid node_id parameter + type: string + type: object + api.HealthCheck: + properties: + message: + example: connection failed + type: string + status: + description: healthy or unhealthy + example: healthy + type: string + type: object + api.HealthResponse: + description: Health check response + properties: + checks: + additionalProperties: + $ref: '#/definitions/api.HealthCheck' + description: Individual health checks + type: object + status: + description: 'Overall health status: healthy or unhealthy' + example: healthy + type: string + type: object + api.NodeScore: + description: Node performance score and metrics + properties: + avg_duration_ms: + description: Average deployment duration in milliseconds + example: 2500 + type: number + farm_id: + description: Farm ID + example: 1 + type: integer + node_id: + description: Node ID + example: 123 + type: integer + score: + description: Calculated score (currently same as success rate) + example: 0.95 + type: number + success_rate: + description: Success rate (0.0 to 1.0) + example: 0.95 + type: number + total_attempts: + description: Total number of deployment attempts + example: 100 + type: integer + type: object + api.NodeScoreResponse: + description: Single node score response + properties: + min_attempts: + 
description: Minimum attempts required + example: 1 + type: integer + score: + allOf: + - $ref: '#/definitions/api.NodeScore' + description: Node score details + window: + description: Time window used for scoring + example: 90d + type: string + type: object + api.TopScoresResponse: + description: Top node scores response + properties: + limit: + description: Maximum number of results returned + example: 10 + type: integer + min_attempts: + description: Minimum attempts required + example: 1 + type: integer + scores: + description: List of node scores + items: + $ref: '#/definitions/api.NodeScore' + type: array + window: + description: Time window used for scoring + example: 90d + type: string + type: object +info: + contact: {} +paths: + /api/v1/health: + get: + consumes: + - application/json + description: Returns the health status of the service and its dependencies + produces: + - application/json + responses: + "200": + description: Service is healthy + schema: + $ref: '#/definitions/api.HealthResponse' + "503": + description: Service is unhealthy + schema: + $ref: '#/definitions/api.HealthResponse' + summary: Health check endpoint + tags: + - health + /api/v1/scores: + get: + consumes: + - application/json + description: Returns the top performing nodes based on success rate within a + time window + parameters: + - description: Time window for scoring (e.g., '90d', '30d', '7d') + in: query + name: window + type: string + - default: 10 + description: Maximum number of results to return + in: query + minimum: 1 + name: limit + type: integer + - default: 1 + description: Minimum number of attempts required + in: query + minimum: 1 + name: min_attempts + type: integer + produces: + - application/json + responses: + "200": + description: Successfully retrieved top scores + schema: + $ref: '#/definitions/api.TopScoresResponse' + "400": + description: Invalid request parameters + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal 
server error + schema: + $ref: '#/definitions/api.ErrorResponse' + summary: Get top node scores + tags: + - scores + /api/v1/scores/node/{node_id}: + get: + consumes: + - application/json + description: Returns the score and performance metrics for a specific node within + a time window + parameters: + - description: Node ID + example: 123 + in: path + name: node_id + required: true + type: integer + - default: '"90d"' + description: Time window for scoring (e.g., '90d', '30d', '7d') + in: query + name: window + type: string + - default: 1 + description: Minimum number of attempts required + in: query + minimum: 1 + name: min_attempts + type: integer + produces: + - application/json + responses: + "200": + description: Successfully retrieved node score + schema: + $ref: '#/definitions/api.NodeScoreResponse' + "400": + description: Invalid request parameters + schema: + $ref: '#/definitions/api.ErrorResponse' + "404": + description: Node not found or insufficient attempts + schema: + $ref: '#/definitions/api.ErrorResponse' + "500": + description: Internal server error + schema: + $ref: '#/definitions/api.ErrorResponse' + summary: Get score for a specific node + tags: + - scores +swagger: "2.0" diff --git a/go.mod b/go.mod index 5ecb1639..85577c2b 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/threefoldtech/provision-probe -go 1.23.0 +go 1.24.0 require ( github.com/jackc/pgx/v5 v5.5.0 @@ -9,16 +9,20 @@ require ( ) require ( + github.com/cenkalti/backoff/v4 v4.3.0 + github.com/go-chi/chi/v5 v5.2.3 + github.com/swaggo/http-swagger v1.3.4 + github.com/swaggo/swag v1.16.6 github.com/threefoldtech/tfgrid-sdk-go/grid-client v0.17.4 github.com/threefoldtech/tfgrid-sdk-go/grid-proxy v0.17.4 - golang.org/x/sync v0.13.0 + golang.org/x/sync v0.18.0 ) require ( github.com/ChainSafe/go-schnorrkel v1.1.0 // indirect + github.com/KyleBanks/depth v1.2.1 // indirect github.com/cenkalti/backoff v2.2.1+incompatible // indirect github.com/cenkalti/backoff/v3 v3.2.2 
// indirect - github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/centrifuge/go-substrate-rpc-client/v4 v4.0.12 // indirect github.com/cosmos/go-bip39 v1.0.0 // indirect github.com/deckarep/golang-set v1.8.0 // indirect @@ -28,7 +32,16 @@ require ( github.com/ethereum/go-ethereum v1.11.6 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/getsentry/sentry-go v0.29.1 // indirect - github.com/go-chi/chi/v5 v5.2.3 // indirect + github.com/go-openapi/jsonpointer v0.22.3 // indirect + github.com/go-openapi/jsonreference v0.21.3 // indirect + github.com/go-openapi/spec v0.22.1 // indirect + github.com/go-openapi/swag/conv v0.25.4 // indirect + github.com/go-openapi/swag/jsonname v0.25.4 // indirect + github.com/go-openapi/swag/jsonutils v0.25.4 // indirect + github.com/go-openapi/swag/loading v0.25.4 // indirect + github.com/go-openapi/swag/stringutils v0.25.4 // indirect + github.com/go-openapi/swag/typeutils v0.25.4 // indirect + github.com/go-openapi/swag/yamlutils v0.25.4 // indirect github.com/go-stack/stack v1.8.1 // indirect github.com/golang-jwt/jwt v3.2.2+incompatible // indirect github.com/gomodule/redigo v2.0.0+incompatible // indirect @@ -61,16 +74,21 @@ require ( github.com/spf13/cast v1.6.0 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/subosito/gotenv v1.6.0 // indirect + github.com/swaggo/files v1.0.1 // indirect github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20250929084418-b950278ead30 // indirect github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go v0.17.3 // indirect github.com/threefoldtech/zosbase v1.0.3 // indirect github.com/vedhavyas/go-subkey v1.0.3 // indirect go.uber.org/atomic v1.9.0 // indirect go.uber.org/multierr v1.9.0 // indirect - golang.org/x/crypto v0.37.0 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/crypto v0.44.0 // indirect golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c // indirect - golang.org/x/sys v0.32.0 // indirect - golang.org/x/text 
v0.24.0 // indirect + golang.org/x/mod v0.30.0 // indirect + golang.org/x/net v0.47.0 // indirect + golang.org/x/sys v0.38.0 // indirect + golang.org/x/text v0.31.0 // indirect + golang.org/x/tools v0.39.0 // indirect golang.zx2c4.com/wireguard/wgctrl v0.0.0-20200609130330-bd2cb7843e1b // indirect gonum.org/v1/gonum v0.16.0 // indirect google.golang.org/protobuf v1.36.6 // indirect diff --git a/go.sum b/go.sum index 0c6e71b1..536415b5 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,7 @@ github.com/ChainSafe/go-schnorrkel v1.1.0 h1:rZ6EU+CZFCjB4sHUE1jIu8VDoB/wRKZxoe1tkcO71Wk= github.com/ChainSafe/go-schnorrkel v1.1.0/go.mod h1:ABkENxiP+cvjFiByMIZ9LYbRoNNLeBLiakC1XeTFxfE= +github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= +github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= github.com/btcsuite/btcd v0.22.0-beta h1:LTDpDKUM5EeOFBPM8IXpinEcmZ6FWfNZbE3lfrfdnWo= github.com/btcsuite/btcd/btcec/v2 v2.2.0 h1:fzn1qaOt32TuLjFlkzYSsBC35Q3KUjT1SwPxiMSCF5k= github.com/btcsuite/btcd/btcec/v2 v2.2.0/go.mod h1:U7MHm051Al6XmscBQ0BoNydpOTsFAn707034b5nY8zU= @@ -44,6 +46,33 @@ github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxI github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= +github.com/go-openapi/jsonpointer v0.22.3 h1:dKMwfV4fmt6Ah90zloTbUKWMD+0he+12XYAsPotrkn8= +github.com/go-openapi/jsonpointer v0.22.3/go.mod h1:0lBbqeRsQ5lIanv3LHZBrmRGHLHcQoOXQnf88fHlGWo= +github.com/go-openapi/jsonreference v0.21.3 h1:96Dn+MRPa0nYAR8DR1E03SblB5FJvh7W6krPI0Z7qMc= +github.com/go-openapi/jsonreference v0.21.3/go.mod h1:RqkUP0MrLf37HqxZxrIAtTWW4ZJIK1VzduhXYBEeGc4= +github.com/go-openapi/spec v0.22.1 h1:beZMa5AVQzRspNjvhe5aG1/XyBSMeX1eEOs7dMoXh/k= +github.com/go-openapi/spec v0.22.1/go.mod 
h1:c7aeIQT175dVowfp7FeCvXXnjN/MrpaONStibD2WtDA= +github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g= +github.com/go-openapi/swag/conv v0.25.4 h1:/Dd7p0LZXczgUcC/Ikm1+YqVzkEeCc9LnOWjfkpkfe4= +github.com/go-openapi/swag/conv v0.25.4/go.mod h1:3LXfie/lwoAv0NHoEuY1hjoFAYkvlqI/Bn5EQDD3PPU= +github.com/go-openapi/swag/jsonname v0.25.4 h1:bZH0+MsS03MbnwBXYhuTttMOqk+5KcQ9869Vye1bNHI= +github.com/go-openapi/swag/jsonname v0.25.4/go.mod h1:GPVEk9CWVhNvWhZgrnvRA6utbAltopbKwDu8mXNUMag= +github.com/go-openapi/swag/jsonutils v0.25.4 h1:VSchfbGhD4UTf4vCdR2F4TLBdLwHyUDTd1/q4i+jGZA= +github.com/go-openapi/swag/jsonutils v0.25.4/go.mod h1:7OYGXpvVFPn4PpaSdPHJBtF0iGnbEaTk8AvBkoWnaAY= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4 h1:IACsSvBhiNJwlDix7wq39SS2Fh7lUOCJRmx/4SN4sVo= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4/go.mod h1:Mt0Ost9l3cUzVv4OEZG+WSeoHwjWLnarzMePNDAOBiM= +github.com/go-openapi/swag/loading v0.25.4 h1:jN4MvLj0X6yhCDduRsxDDw1aHe+ZWoLjW+9ZQWIKn2s= +github.com/go-openapi/swag/loading v0.25.4/go.mod h1:rpUM1ZiyEP9+mNLIQUdMiD7dCETXvkkC30z53i+ftTE= +github.com/go-openapi/swag/stringutils v0.25.4 h1:O6dU1Rd8bej4HPA3/CLPciNBBDwZj9HiEpdVsb8B5A8= +github.com/go-openapi/swag/stringutils v0.25.4/go.mod h1:GTsRvhJW5xM5gkgiFe0fV3PUlFm0dr8vki6/VSRaZK0= +github.com/go-openapi/swag/typeutils v0.25.4 h1:1/fbZOUN472NTc39zpa+YGHn3jzHWhv42wAJSN91wRw= +github.com/go-openapi/swag/typeutils v0.25.4/go.mod h1:Ou7g//Wx8tTLS9vG0UmzfCsjZjKhpjxayRKTHXf2pTE= +github.com/go-openapi/swag/yamlutils v0.25.4 h1:6jdaeSItEUb7ioS9lFoCZ65Cne1/RZtPBZ9A56h92Sw= +github.com/go-openapi/swag/yamlutils v0.25.4/go.mod h1:MNzq1ulQu+yd8Kl7wPOut/YHAAU/H6hL91fF+E2RFwc= +github.com/go-openapi/testify/enable/yaml/v2 v2.0.2 h1:0+Y41Pz1NkbTHz8NngxTuAXxEodtNSI1WG1c/m5Akw4= +github.com/go-openapi/testify/enable/yaml/v2 v2.0.2/go.mod h1:kme83333GCtJQHXQ8UKX3IBZu6z8T5Dvy5+CW3NLUUg= +github.com/go-openapi/testify/v2 v2.0.2 
h1:X999g3jeLcoY8qctY/c/Z8iBHTbwLz7R2WXd6Ub6wls= +github.com/go-openapi/testify/v2 v2.0.2/go.mod h1:HCPmvFFnheKK2BuwSA0TbbdxJ3I16pjwMkYkP4Ywn54= github.com/go-stack/stack v1.8.1 h1:ntEHSVwIt7PNXNpgPmVfMrNhLtgjlmnZha2kOpuRiDw= github.com/go-stack/stack v1.8.1/go.mod h1:dcoOX6HbPZSZptuspn9bctJ+N/CnF5gGygcUP3XYfe4= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= @@ -161,6 +190,12 @@ github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= +github.com/swaggo/files v1.0.1 h1:J1bVJ4XHZNq0I46UU90611i9/YzdrF7x92oX1ig5IdE= +github.com/swaggo/files v1.0.1/go.mod h1:0qXmMNH6sXNf+73t65aKeB+ApmgxdnkQzVTAj2uaMUg= +github.com/swaggo/http-swagger v1.3.4 h1:q7t/XLx0n15H1Q9/tk3Y9L4n210XzJF5WtnDX64a5ww= +github.com/swaggo/http-swagger v1.3.4/go.mod h1:9dAh0unqMBAlbp1uE2Uc2mQTxNMU/ha4UbucIg1MFkQ= +github.com/swaggo/swag v1.16.6 h1:qBNcx53ZaX+M5dxVyTrgQ0PJ/ACK+NzhwcbieTt+9yI= +github.com/swaggo/swag v1.16.6/go.mod h1:ngP2etMK5a0P3QBizic5MEwpRmluJZPHjXcMoj4Xesg= github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20250929084418-b950278ead30 h1:sH/hiHxCEpeIm2gJsmu4GxKskfQVPZMz9PAgDwk1BfY= github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20250929084418-b950278ead30/go.mod h1:cOL5YgHUmDG5SAXrsZxFjUECRQQuAqOoqvXhZG5sEUw= github.com/threefoldtech/tfgrid-sdk-go/grid-client v0.17.4 h1:eIKmYyFL+ta10ZL/bJhUaMsTJOEBeI++qBrlfdUUxUg= @@ -177,6 +212,7 @@ github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+F github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= github.com/vedhavyas/go-subkey v1.0.3 h1:iKR33BB/akKmcR2PMlXPBeeODjWLM90EL98OrOGs8CA= github.com/vedhavyas/go-subkey v1.0.3/go.mod 
h1:CloUaFQSSTdWnINfBRFjVMkWXZANW+nd8+TI5jYcl6Y= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE= @@ -185,22 +221,36 @@ go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI= go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191002192127-34f69633bfdc/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200204104054-c9f3fb736b72/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE= -golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.44.0 h1:A97SsFvM3AIwEEmTBiaxPPTYpDC47w720rdiiUvgoAU= +golang.org/x/crypto v0.44.0/go.mod h1:013i+Nw79BMiQiMsOPcVCB5ZIJbYkerPrGnOa00tvmc= golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c h1:7dEasQXItcW1xKJ2+gg5VOiBnqWrJc+rq0DPKyvvdbY= golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c/go.mod h1:NQtJDoLvd6faHhE7m4T/1IY708gDefGGjR/iUW8yQQ8= +golang.org/x/mod 
v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk= +golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20191003171128-d98b1b443823/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20191007182048-72f939374954/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= -golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= +golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= 
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190411185658-b44545bcd369/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -209,16 +259,32 @@ golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20191003212358-c178f38b412c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= -golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term 
v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= -golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0= -golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.39.0 h1:ik4ho21kwuQln40uelmciQPp9SipgNDdrafrYA4TmQQ= +golang.org/x/tools v0.39.0/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.zx2c4.com/wireguard v0.0.20200121/go.mod h1:P2HsVp8SKwZEufsnezXZA4GRX/T49/HlU7DGuelXsU4= golang.zx2c4.com/wireguard/wgctrl v0.0.0-20200609130330-bd2cb7843e1b h1:l4mBVCYinjzZuR5DtxHuBD6wyd4348TGiavJ5vLrhEc= diff --git a/pkg/api/handlers.go b/pkg/api/handlers.go index 5c8c4a2f..ce1ff252 100644 --- a/pkg/api/handlers.go +++ b/pkg/api/handlers.go @@ -1,3 +1,13 @@ +// Package api provides HTTP handlers for the 
provision-probe API +// @title Provision Probe API +// @version 1.0 +// @description API for querying node provision scores and health status +// @termsOfService http://swagger.io/terms/ +// @contact.name API Support +// @license.name Apache 2.0 +// @license.url http://www.apache.org/licenses/LICENSE-2.0.html +// @host localhost:8080 +// @BasePath /api/v1 package api import ( @@ -19,6 +29,44 @@ type Handlers struct { defaultWindow time.Duration } +// Response types + +// HealthCheck represents a single health check result +type HealthCheck struct { + Status string `json:"status" example:"healthy"` // healthy or unhealthy + Message string `json:"message,omitempty" example:"connection failed"` +} + +// HealthResponse represents the health endpoint response +// @Description Health check response +type HealthResponse struct { + Status string `json:"status" example:"healthy"` // Overall health status: healthy or unhealthy + Checks map[string]HealthCheck `json:"checks"` // Individual health checks +} + +// TopScoresResponse represents the response for GET /api/v1/scores +// @Description Top node scores response +type TopScoresResponse struct { + Window string `json:"window" example:"90d"` // Time window used for scoring + Limit int `json:"limit" example:"10"` // Maximum number of results returned + MinAttempts int `json:"min_attempts" example:"1"` // Minimum attempts required + Scores []NodeScore `json:"scores"` // List of node scores +} + +// NodeScoreResponse represents the response for GET /api/v1/scores/node/:node_id +// @Description Single node score response +type NodeScoreResponse struct { + Window string `json:"window" example:"90d"` // Time window used for scoring + MinAttempts int `json:"min_attempts" example:"1"` // Minimum attempts required + Score NodeScore `json:"score"` // Node score details +} + +// ErrorResponse represents an error response +// @Description Error response +type ErrorResponse struct { + Error string `json:"error" example:"invalid node_id 
parameter"` +} + func NewHandlers(database *db.DB, cfg *config.Config) *Handlers { return &Handlers{ database: database, @@ -28,7 +76,18 @@ func NewHandlers(database *db.DB, cfg *config.Config) *Handlers { } // GetTopScores handles GET /api/v1/scores -// Query params: window (duration, defaults to config value), limit (int, default 10), min_attempts (int, default 1) +// @Summary Get top node scores +// @Description Returns the top performing nodes based on success rate within a time window +// @Tags scores +// @Accept json +// @Produce json +// @Param window query string false "Time window for scoring (e.g., '90d', '30d', '7d')" default to the config value +// @Param limit query int false "Maximum number of results to return" default(10) minimum(1) +// @Param min_attempts query int false "Minimum number of attempts required" default(1) minimum(1) +// @Success 200 {object} TopScoresResponse "Successfully retrieved top scores" +// @Failure 400 {object} ErrorResponse "Invalid request parameters" +// @Failure 500 {object} ErrorResponse "Internal server error" +// @Router /api/v1/scores [get] func (h *Handlers) GetTopScores(w http.ResponseWriter, r *http.Request) { ctx := r.Context() @@ -78,16 +137,28 @@ func (h *Handlers) GetTopScores(w http.ResponseWriter, r *http.Request) { scores = append(scores, *score) } - respondJSON(w, http.StatusOK, map[string]interface{}{ - "window": window.String(), - "limit": limit, - "min_attempts": minAttempts, - "scores": scores, + respondJSON(w, http.StatusOK, TopScoresResponse{ + Window: window.String(), + Limit: limit, + MinAttempts: minAttempts, + Scores: scores, }) } // GetNodeScore handles GET /api/v1/scores/node/:node_id -// Query params: window (duration string, defaults to config value), min_attempts (int, default 1) +// @Summary Get score for a specific node +// @Description Returns the score and performance metrics for a specific node within a time window +// @Tags scores +// @Accept json +// @Produce json +// @Param node_id 
path int true "Node ID" example(123) +// @Param window query string false "Time window for scoring (e.g., '90d', '30d', '7d')" default("90d") +// @Param min_attempts query int false "Minimum number of attempts required" default(1) minimum(1) +// @Success 200 {object} NodeScoreResponse "Successfully retrieved node score" +// @Failure 400 {object} ErrorResponse "Invalid request parameters" +// @Failure 404 {object} ErrorResponse "Node not found or insufficient attempts" +// @Failure 500 {object} ErrorResponse "Internal server error" +// @Router /api/v1/scores/node/{node_id} [get] func (h *Handlers) GetNodeScore(w http.ResponseWriter, r *http.Request) { ctx := r.Context() @@ -139,28 +210,48 @@ func (h *Handlers) GetNodeScore(w http.ResponseWriter, r *http.Request) { score := CalculateScore(scoreData) - respondJSON(w, http.StatusOK, map[string]interface{}{ - "window": window.String(), - "min_attempts": minAttempts, - "score": score, + respondJSON(w, http.StatusOK, NodeScoreResponse{ + Window: window.String(), + MinAttempts: minAttempts, + Score: *score, }) } // GetHealth handles GET /api/v1/health +// @Summary Health check endpoint +// @Description Returns the health status of the service and its dependencies +// @Tags health +// @Accept json +// @Produce json +// @Success 200 {object} HealthResponse "Service is healthy" +// @Failure 503 {object} HealthResponse "Service is unhealthy" +// @Router /api/v1/health [get] func (h *Handlers) GetHealth(w http.ResponseWriter, r *http.Request) { ctx := r.Context() + health := HealthResponse{ + Status: "healthy", + Checks: make(map[string]HealthCheck), + } + if err := h.database.Ping(ctx); err != nil { - respondJSON(w, http.StatusServiceUnavailable, map[string]interface{}{ - "status": "unhealthy", - "message": "database connection failed", - }) - return + health.Checks["database"] = HealthCheck{ + Status: "unhealthy", + Message: err.Error(), + } + health.Status = "unhealthy" + } else { + health.Checks["database"] = HealthCheck{ +
Status: "healthy", + } } - respondJSON(w, http.StatusOK, map[string]interface{}{ - "status": "healthy", - }) + statusCode := http.StatusOK + if health.Status == "unhealthy" { + statusCode = http.StatusServiceUnavailable + } + + respondJSON(w, statusCode, health) } // Helper functions @@ -174,7 +265,7 @@ func respondJSON(w http.ResponseWriter, status int, data interface{}) { } func respondError(w http.ResponseWriter, status int, message string) { - respondJSON(w, status, map[string]interface{}{ - "error": message, + respondJSON(w, status, ErrorResponse{ + Error: message, }) } diff --git a/pkg/api/scoring.go b/pkg/api/scoring.go index 75ab3801..d46e9c07 100644 --- a/pkg/api/scoring.go +++ b/pkg/api/scoring.go @@ -8,13 +8,15 @@ const ( minAttemptsThreshold = 1 ) +// NodeScore represents a node's performance score +// @Description Node performance score and metrics type NodeScore struct { - NodeID int64 `json:"node_id"` - FarmID int64 `json:"farm_id"` - SuccessRate float64 `json:"success_rate"` - TotalAttempts int64 `json:"total_attempts"` - AvgDurationMs float64 `json:"avg_duration_ms,omitempty"` - Score float64 `json:"score"` + NodeID int64 `json:"node_id" example:"123"` // Node ID + FarmID int64 `json:"farm_id" example:"1"` // Farm ID + SuccessRate float64 `json:"success_rate" example:"0.95"` // Success rate (0.0 to 1.0) + TotalAttempts int64 `json:"total_attempts" example:"100"` // Total number of deployment attempts + AvgDurationMs float64 `json:"avg_duration_ms,omitempty" example:"2500"` // Average deployment duration in milliseconds + Score float64 `json:"score" example:"0.95"` // Calculated score (currently same as success rate) } // TODO: add score to the duration ratio diff --git a/pkg/api/server.go b/pkg/api/server.go index a6beafb0..3f32a728 100644 --- a/pkg/api/server.go +++ b/pkg/api/server.go @@ -9,6 +9,8 @@ import ( "github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5/middleware" "github.com/rs/zerolog/log" + httpSwagger "github.com/swaggo/http-swagger" 
+ "github.com/threefoldtech/provision-probe/docs/swagger" "github.com/threefoldtech/provision-probe/pkg/config" "github.com/threefoldtech/provision-probe/pkg/db" ) @@ -27,6 +29,11 @@ func NewServer(database *db.DB, cfg *config.Config) *Server { router.Use(middleware.Recoverer) router.Use(middleware.Timeout(60 * time.Second)) + // Swagger documentation routes + router.Get("/swagger/*", httpSwagger.Handler( + httpSwagger.URL("/swagger/doc.json"), // Relative URL works better + )) + router.Route("/api/v1", func(r chi.Router) { r.Get("/scores", handlers.GetTopScores) r.Route("/scores/node", func(r chi.Router) { @@ -35,6 +42,11 @@ func NewServer(database *db.DB, cfg *config.Config) *Server { r.Get("/health", handlers.GetHealth) }) + // Initialize swagger info + swagger.SwaggerInfo.Host = fmt.Sprintf("%s:%d", cfg.API.Host, cfg.API.Port) + swagger.SwaggerInfo.BasePath = "/api/v1" + swagger.SwaggerInfo.Schemes = []string{"http", "https"} + addr := fmt.Sprintf("%s:%d", cfg.API.Host, cfg.API.Port) server := &http.Server{ Addr: addr, diff --git a/pkg/app/app.go b/pkg/app/app.go index f190fb24..e969465a 100644 --- a/pkg/app/app.go +++ b/pkg/app/app.go @@ -225,8 +225,6 @@ func (a *App) runCycle(ctx context.Context) error { a.cycleWg.Wait() - metrics.GetMetrics().SetActiveDeployments(0) - log.Info(). Int("total_nodes", len(nodes)). Int("errors", len(cycleErrors)).