diff --git a/Dockerfile.dapper b/Dockerfile.dapper index 66764a9b..753e034d 100644 --- a/Dockerfile.dapper +++ b/Dockerfile.dapper @@ -14,6 +14,11 @@ ENV DAPPER_RUN_ARGS --privileged -v /dev:/host/dev -v /proc:/host/proc -v /sys:/ ENV DAPPER_SOURCE /go/src/github.com/longhorn/go-spdk-helper WORKDIR ${DAPPER_SOURCE} +ENV SPDK_COMMIT_ID a6478cde7e0cff2fb09992868308a7387aa5202a +ENV LIBJSONC_COMMIT_ID b4c371fa0cbc4dcbaccc359ce9e957a22988fb34 +# Build nvme-cli 2.9.1 +ENV NVME_CLI_COMMIT_ID b340fd7dcf1aef76f8d46ab28bef3c170d310887 + RUN zypper -n addrepo --refresh https://download.opensuse.org/repositories/system:/snappy/SLE_15/system:snappy.repo && \ zypper -n addrepo --refresh https://download.opensuse.org/repositories/network:/utilities/SLE_15/network:utilities.repo && \ zypper -n addrepo --refresh https://download.opensuse.org/repositories/devel:libraries:c_c++/15.5/devel:libraries:c_c++.repo && \ @@ -21,30 +26,33 @@ RUN zypper -n addrepo --refresh https://download.opensuse.org/repositories/syste zypper -n addrepo --refresh https://download.opensuse.org/repositories/devel:languages:python:backports/SLE_15/devel:languages:python:backports.repo && \ zypper --gpg-auto-import-keys ref -RUN zypper -n install cmake wget unzip xsltproc docbook-xsl-stylesheets python3 meson ninja python3-pip \ - e2fsprogs xfsprogs util-linux-systemd python3-pyelftools libcmocka-devel device-mapper +RUN zypper -n install cmake wget unzip xsltproc docbook-xsl-stylesheets python311 python311-pip fuse3 libfuse3-3 \ + e2fsprogs xfsprogs util-linux-systemd device-mapper # Install Go & tools -ENV GOLANG_ARCH_amd64=amd64 GOLANG_ARCH_arm64=arm64 GOLANG_ARCH_s390x=s390x GOLANG_ARCH=GOLANG_ARCH_${ARCH} \ +ENV GOLANG_ARCH_amd64=amd64 GOLANG_ARCH_arm64=arm64 GOLANG_ARCH=GOLANG_ARCH_${ARCH} \ GOPATH=/go PATH=/go/bin:/usr/local/go/bin:${PATH} SHELL=/bin/bash RUN curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.55.2 +RUN ln -sf /usr/bin/python3.11 /usr/bin/python3 && \ + ln -sf /usr/bin/pip3.11 /usr/bin/pip3 + # Build SPDK -ENV HUGEMEM=1024 -RUN echo "vm.nr_hugepages=$((HUGEMEM/2))" >> /etc/sysctl.conf ENV SPDK_DIR /usr/src/spdk -ENV SPDK_COMMIT_ID a6478cde7e0cff2fb09992868308a7387aa5202a RUN git clone https://github.com/longhorn/spdk.git ${SPDK_DIR} --recursive && \ cd ${SPDK_DIR} && \ git checkout ${SPDK_COMMIT_ID} && \ git submodule update --init && \ + sed -i '/python3-pyelftools/d' ./scripts/pkgdep/sles.sh && \ + sed -i 's/python3-/python311-/g' ./scripts/pkgdep/sles.sh && \ ./scripts/pkgdep.sh && \ + pip3 install -r ./scripts/pkgdep/requirements.txt && \ if [ ${ARCH} = "amd64" ]; then \ - ./configure --target-arch=nehalem --disable-tests --disable-unit-tests --disable-examples && \ + ./configure --target-arch=nehalem --disable-tests --disable-unit-tests --disable-examples --without-nvme-cuse && \ make -j$(nproc) && \ make install; \ elif [ ${ARCH} = "arm64" ]; then \ - ./configure --target-arch=native --disable-tests --disable-unit-tests --disable-examples && \ + ./configure --target-arch=native --disable-tests --disable-unit-tests --disable-examples --without-nvme-cuse && \ DPDKBUILD_FLAGS="-Dplatform=generic" make -j$(nproc) && \ make install; \ else \ @@ -53,7 +61,6 @@ RUN git clone https://github.com/longhorn/spdk.git ${SPDK_DIR} --recursive && \ fi # Build libjson-c-devel -ENV LIBJSONC_COMMIT_ID b4c371fa0cbc4dcbaccc359ce9e957a22988fb34 RUN cd /usr/src && \ git clone https://github.com/json-c/json-c.git && \ cd json-c && \ @@ -64,9 +71,8 @@ RUN cd 
/usr/src && \ make && \ make install -# Build nvme-cli 2.9.1 +# Build nvme-cli ENV NVME_CLI_DIR /usr/src/nvme-cli -ENV NVME_CLI_COMMIT_ID b340fd7dcf1aef76f8d46ab28bef3c170d310887 RUN git clone https://github.com/linux-nvme/nvme-cli.git ${NVME_CLI_DIR} && \ cd ${NVME_CLI_DIR} && \ git checkout ${NVME_CLI_COMMIT_ID} && \ diff --git a/app/cmd/dmsetup/dmsetup.go b/app/cmd/dmsetup/dmsetup.go new file mode 100644 index 00000000..53e39c92 --- /dev/null +++ b/app/cmd/dmsetup/dmsetup.go @@ -0,0 +1,245 @@ +package dmsetup + +import ( + "fmt" + + "github.com/sirupsen/logrus" + "github.com/urfave/cli" + + commonTypes "github.com/longhorn/go-common-libs/types" + "github.com/longhorn/go-spdk-helper/pkg/util" +) + +func Cmd() cli.Command { + return cli.Command{ + Name: "dmsetup", + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "host-proc", + Usage: fmt.Sprintf("The host proc path of namespace executor. By default %v", commonTypes.ProcDirectory), + Value: commonTypes.ProcDirectory, + }, + }, + Subcommands: []cli.Command{ + CreateCmd(), + RemoveCmd(), + SuspendCmd(), + ResumeCmd(), + ReloadCmd(), + DepsCmd(), + }, + } +} + +func CreateCmd() cli.Command { + return cli.Command{ + Name: "create", + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "table", + Usage: "One-line table directly on the command line.", + Required: true, + }, + }, + Usage: "Create a device mapper device with the given name and table: create --table ", + Action: func(c *cli.Context) { + if err := create(c); err != nil { + logrus.WithError(err).Fatalf("Failed to create device %v with table %v", c.Args().First(), c.String("table")) + } + }, + } +} + +func create(c *cli.Context) error { + executor, err := util.NewExecutor(c.GlobalString("host-proc")) + if err != nil { + return err + } + + deviceName := c.Args().First() + if deviceName == "" { + return fmt.Errorf("device name is required") + } + + logrus.Infof("Creating device %v with table %v", deviceName, c.String("table")) + + return util.DmsetupCreate(deviceName, c.String("table"), executor) +} + +func SuspendCmd() cli.Command { + return cli.Command{ + Name: "suspend", + Flags: []cli.Flag{ + cli.BoolFlag{ + Name: "noflush", + Usage: "Do not flush outstanding I/O when suspending a device. Default: false.", + Required: false, + }, + cli.BoolFlag{ + Name: "nolockfs", + Usage: "Do not attempt to synchronize filesystem. 
Default: false.", + Required: false, + }, + }, + Usage: "Suspend the device mapper device with the given name: suspend --noflush --nolockfs ", + Action: func(c *cli.Context) { + if err := suspend(c); err != nil { + logrus.WithError(err).Fatalf("Failed to suspend device %v", c.Args().First()) + } + }, + } +} + +func suspend(c *cli.Context) error { + executor, err := util.NewExecutor(c.GlobalString("host-proc")) + if err != nil { + return err + } + + deviceName := c.Args().First() + if deviceName == "" { + return fmt.Errorf("device name is required") + } + + logrus.Infof("Suspending device %v with noflush %v and nolockfs %v", deviceName, c.Bool("noflush"), c.Bool("nolockfs")) + + return util.DmsetupSuspend(deviceName, c.Bool("noflush"), c.Bool("nolockfs"), executor) +} + +func ResumeCmd() cli.Command { + return cli.Command{ + Name: "resume", + Flags: []cli.Flag{}, + Usage: "Resume the device mapper device with the given name: resume ", + Action: func(c *cli.Context) { + if err := resume(c); err != nil { + logrus.WithError(err).Fatalf("Failed to resume device %v", c.Args().First()) + } + }, + } +} + +func resume(c *cli.Context) error { + executor, err := util.NewExecutor(c.GlobalString("host-proc")) + if err != nil { + return err + } + + deviceName := c.Args().First() + if deviceName == "" { + return fmt.Errorf("device name is required") + } + + logrus.Infof("Resuming device %v", deviceName) + + return util.DmsetupResume(deviceName, executor) +} + +func ReloadCmd() cli.Command { + return cli.Command{ + Name: "reload", + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "table", + Usage: "One-line table directly on the command line.", + Required: true, + }, + }, + Usage: "Reload the table of the device mapper device with the given name and table: reload --table ", + Action: func(c *cli.Context) { + if err := reload(c); err != nil { + logrus.WithError(err).Fatalf("Failed to reload device %v with table %v", c.Args().First(), c.String("table")) + } + }, + } +} + +func reload(c *cli.Context) error { + executor, err := util.NewExecutor(c.GlobalString("host-proc")) + if err != nil { + return err + } + + deviceName := c.Args().First() + if deviceName == "" { + return fmt.Errorf("device name is required") + } + + logrus.Infof("Reloading device %v with table %v", deviceName, c.String("table")) + + return util.DmsetupReload(deviceName, c.String("table"), executor) +} + +func RemoveCmd() cli.Command { + return cli.Command{ + Name: "remove", + Flags: []cli.Flag{ + cli.BoolFlag{ + Name: "force", + Usage: "Try harder to complete operation. Default: false.", + Required: false, + }, + cli.BoolFlag{ + Name: "deferred", + Usage: "Enable deferred removal of open devices. The device will be removed when the last user closes it. 
Default: false.", + Required: false, + }, + }, + Usage: "Remove the device mapper device with the given name: remove --force --deferred ", + Action: func(c *cli.Context) { + if err := remove(c); err != nil { + logrus.WithError(err).Fatalf("Failed to remove device %v", c.Args().First()) + } + }, + } +} + +func remove(c *cli.Context) error { + executor, err := util.NewExecutor(c.GlobalString("host-proc")) + if err != nil { + return err + } + + deviceName := c.Args().First() + if deviceName == "" { + return fmt.Errorf("device name is required") + } + + logrus.Infof("Removing device %v with force %v and deferred %v", deviceName, c.Bool("force"), c.Bool("deferred")) + + return util.DmsetupRemove(deviceName, c.Bool("force"), c.Bool("deferred"), executor) +} + +func DepsCmd() cli.Command { + return cli.Command{ + Name: "deps", + Flags: []cli.Flag{}, + Usage: "Output a list of devices referenced by the live table for the specified device: deps ", + Action: func(c *cli.Context) { + if err := deps(c); err != nil { + logrus.WithError(err).Fatalf("Failed to output a list of devices referenced by the live table for the specified device %v", c.Args().First()) + } + }, + } +} + +func deps(c *cli.Context) error { + executor, err := util.NewExecutor(c.GlobalString("host-proc")) + if err != nil { + return err + } + + deviceName := c.Args().First() + if deviceName == "" { + return fmt.Errorf("device name is required") + } + + logrus.Infof("Outputting a list of devices referenced by the live table for the specified device %v", deviceName) + + output, err := util.DmsetupDeps(deviceName, executor) + if err != nil { + return err + } + fmt.Printf("Dependent devices: %v\n", output) + return nil +} diff --git a/app/cmd/nvmecli/nvmecli.go b/app/cmd/nvmecli/nvmecli.go index 5adb9589..d1521402 100644 --- a/app/cmd/nvmecli/nvmecli.go +++ b/app/cmd/nvmecli/nvmecli.go @@ -30,6 +30,7 @@ func Cmd() cli.Command { GetCmd(), StartCmd(), StopCmd(), + FlushCmd(), }, } } @@ -267,3 +268,41 @@ func stop(c *cli.Context) error { return nil } + +func FlushCmd() cli.Command { + return cli.Command{ + Name: "flush", + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "namespace-id", + Usage: "Specify the optional namespace id for this command. 
Defaults to 0xffffffff, indicating flush for all namespaces.", + Required: false, + }, + }, + Usage: "Commit data and metadata associated with the specified namespace(s) to nonvolatile media.", + Action: func(c *cli.Context) { + if err := flush(c); err != nil { + logrus.WithError(err).Fatalf("Failed to run nvme-cli flush command") + } + }, + } +} + +func flush(c *cli.Context) error { + executor, err := util.NewExecutor(c.String("host-proc")) + if err != nil { + return err + } + + devicePath := c.Args().First() + if devicePath == "" { + return fmt.Errorf("device path is required") + } + + resp, err := nvme.Flush(devicePath, c.String("namespace-id"), executor) + if err != nil { + return err + } + + return util.PrintObject(resp) +} diff --git a/main.go b/main.go index 7cfe5d6e..a1e90ecd 100644 --- a/main.go +++ b/main.go @@ -8,6 +8,7 @@ import ( "github.com/longhorn/go-spdk-helper/app/cmd/advanced" "github.com/longhorn/go-spdk-helper/app/cmd/basic" + "github.com/longhorn/go-spdk-helper/app/cmd/dmsetup" "github.com/longhorn/go-spdk-helper/app/cmd/nvmecli" "github.com/longhorn/go-spdk-helper/app/cmd/spdksetup" "github.com/longhorn/go-spdk-helper/app/cmd/spdktgt" @@ -43,6 +44,8 @@ func main() { nvmecli.Cmd(), + dmsetup.Cmd(), + spdktgt.Cmd(), spdksetup.Cmd(), } diff --git a/pkg/nvme/initiator.go b/pkg/nvme/initiator.go index f387e93c..75ab7d13 100644 --- a/pkg/nvme/initiator.go +++ b/pkg/nvme/initiator.go @@ -22,10 +22,11 @@ const ( LockFile = "/var/run/longhorn-spdk.lock" LockTimeout = 120 * time.Second - RetryCounts = 5 - RetryInterval = 3 * time.Second + maxNumRetries = 15 + retryInterval = 1 * time.Second - waitDeviceTimeout = 60 * time.Second + maxNumWaitDeviceRetries = 60 + waitDeviceInterval = 1 * time.Second HostProc = "/host/proc" @@ -42,7 +43,7 @@ type Initiator struct { Endpoint string ControllerName string NamespaceName string - dev *util.KernelDevice + dev *util.LonghornBlockDevice isUp bool hostProc string @@ -51,6 +52,7 @@ type Initiator struct { logger logrus.FieldLogger } +// NewInitiator creates a new NVMe initiator func NewInitiator(name, subsystemNQN, hostProc string) (*Initiator, error) { if name == "" || subsystemNQN == "" { return nil, fmt.Errorf("empty name or subsystem for initiator creation") @@ -78,6 +80,87 @@ func NewInitiator(name, subsystemNQN, hostProc string) (*Initiator, error) { }, nil } +// DiscoverTarget discovers a target +func (i *Initiator) DiscoverTarget(ip, port string) (string, error) { + if i.hostProc != "" { + lock := nsfilelock.NewLockWithTimeout(util.GetHostNamespacePath(i.hostProc), LockFile, LockTimeout) + if err := lock.Lock(); err != nil { + return "", errors.Wrapf(err, "failed to get file lock for initiator %s", i.Name) + } + defer lock.Unlock() + } + + return DiscoverTarget(ip, port, i.executor) +} + +// ConnectTarget connects to a target +func (i *Initiator) ConnectTarget(ip, port, nqn string) (string, error) { + if i.hostProc != "" { + lock := nsfilelock.NewLockWithTimeout(util.GetHostNamespacePath(i.hostProc), LockFile, LockTimeout) + if err := lock.Lock(); err != nil { + return "", errors.Wrapf(err, "failed to get file lock for initiator %s", i.Name) + } + defer lock.Unlock() + } + + return ConnectTarget(ip, port, nqn, i.executor) +} + +// DisconnectTarget disconnects a target +func (i *Initiator) DisconnectTarget() error { + if i.hostProc != "" { + lock := nsfilelock.NewLockWithTimeout(util.GetHostNamespacePath(i.hostProc), LockFile, LockTimeout) + if err := lock.Lock(); err != nil { + return errors.Wrapf(err, "failed to get file lock for 
initiator %s", i.Name) + } + defer lock.Unlock() + } + + return DisconnectTarget(i.SubsystemNQN, i.executor) +} + +// WaitForConnect waits for the NVMe initiator to connect +func (i *Initiator) WaitForConnect(maxNumRetries int, retryInterval time.Duration) (err error) { + if i.hostProc != "" { + lock := nsfilelock.NewLockWithTimeout(util.GetHostNamespacePath(i.hostProc), LockFile, LockTimeout) + if err := lock.Lock(); err != nil { + return errors.Wrapf(err, "failed to get file lock for initiator %s", i.Name) + } + defer lock.Unlock() + } + + for r := 0; r < maxNumRetries; r++ { + err = i.loadNVMeDeviceInfoWithoutLock(i.TransportAddress, i.TransportServiceID, i.SubsystemNQN) + if err == nil { + return nil + } + time.Sleep(retryInterval) + } + + return err +} + +// WaitForDisconnect waits for the NVMe initiator to disconnect +func (i *Initiator) WaitForDisconnect(maxNumRetries int, retryInterval time.Duration) (err error) { + if i.hostProc != "" { + lock := nsfilelock.NewLockWithTimeout(util.GetHostNamespacePath(i.hostProc), LockFile, LockTimeout) + if err := lock.Lock(); err != nil { + return errors.Wrapf(err, "failed to get file lock for initiator %s", i.Name) + } + defer lock.Unlock() + } + + for r := 0; r < maxNumRetries; r++ { + err = i.loadNVMeDeviceInfoWithoutLock(i.TransportAddress, i.TransportServiceID, i.SubsystemNQN) + if IsValidNvmeDeviceNotFound(err) { + return nil + } + time.Sleep(retryInterval) + } + + return err +} + // Suspend suspends the device mapper device for the NVMe initiator func (i *Initiator) Suspend(noflush, nolockfs bool) error { if i.hostProc != "" { @@ -88,16 +171,53 @@ func (i *Initiator) Suspend(noflush, nolockfs bool) error { defer lock.Unlock() } - if err := i.suspendLinearDmDevice(noflush, nolockfs); err != nil { - return errors.Wrapf(err, "failed to suspend device mapper device for NVMe initiator %s", i.Name) + suspended, err := i.IsSuspended() + if err != nil { + return errors.Wrapf(err, "failed to check if linear dm device is suspended for NVMe initiator %s", i.Name) + } + + if !suspended { + if err := i.suspendLinearDmDevice(noflush, nolockfs); err != nil { + return errors.Wrapf(err, "failed to suspend device mapper device for NVMe initiator %s", i.Name) + } + } + + return nil +} + +// Resume resumes the device mapper device for the NVMe initiator +func (i *Initiator) Resume() error { + if i.hostProc != "" { + lock := nsfilelock.NewLockWithTimeout(util.GetHostNamespacePath(i.hostProc), LockFile, LockTimeout) + if err := lock.Lock(); err != nil { + return errors.Wrapf(err, "failed to get file lock for initiator %s", i.Name) + } + defer lock.Unlock() + } + + if err := i.resumeLinearDmDevice(); err != nil { + return errors.Wrapf(err, "failed to resume device mapper device for NVMe initiator %s", i.Name) } return nil } +func (i *Initiator) resumeLinearDmDevice() error { + logrus.Infof("Resuming linear dm device %s", i.Name) + + return util.DmsetupResume(i.Name, i.executor) +} + func (i *Initiator) replaceDmDeviceTarget() error { - if err := i.suspendLinearDmDevice(true, false); err != nil { - return errors.Wrapf(err, "failed to suspend linear dm device for NVMe initiator %s", i.Name) + suspended, err := i.IsSuspended() + if err != nil { + return errors.Wrapf(err, "failed to check if linear dm device is suspended for NVMe initiator %s", i.Name) + } + + if !suspended { + if err := i.suspendLinearDmDevice(true, false); err != nil { + return errors.Wrapf(err, "failed to suspend linear dm device for NVMe initiator %s", i.Name) + } } if err := 
i.reloadLinearDmDevice(); err != nil { @@ -111,13 +231,21 @@ func (i *Initiator) replaceDmDeviceTarget() error { } // Start starts the NVMe initiator with the given transportAddress and transportServiceID -func (i *Initiator) Start(transportAddress, transportServiceID string, needDmDeviceCleanup bool) (dmDeviceBusy bool, err error) { +func (i *Initiator) Start(transportAddress, transportServiceID string, dmDeviceCleanupRequired bool) (dmDeviceBusy bool, err error) { defer func() { if err != nil { err = errors.Wrapf(err, "failed to start NVMe initiator %s", i.Name) } }() + i.logger.WithFields(logrus.Fields{ + "transportAddress": transportAddress, + "transportServiceID": transportServiceID, + "dmDeviceCleanupRequired": dmDeviceCleanupRequired, + }) + + i.logger.Info("Starting initiator") + if transportAddress == "" || transportServiceID == "" { return false, fmt.Errorf("invalid TransportAddress %s and TransportServiceID %s for initiator %s start", transportAddress, transportServiceID, i.Name) } @@ -131,12 +259,8 @@ func (i *Initiator) Start(transportAddress, transportServiceID string, needDmDev } // Check if the initiator/NVMe device is already launched and matches the params - if err := i.loadNVMeDeviceInfoWithoutLock(); err == nil { + if err := i.loadNVMeDeviceInfoWithoutLock(i.TransportAddress, i.TransportServiceID, i.SubsystemNQN); err == nil { if i.TransportAddress == transportAddress && i.TransportServiceID == transportServiceID { - i.logger.WithFields(logrus.Fields{ - "transportAddress": transportAddress, - "transportServiceID": transportServiceID, - }) if err = i.LoadEndpoint(false); err == nil { i.logger.Info("NVMe initiator is already launched with correct params") return false, nil @@ -149,7 +273,7 @@ func (i *Initiator) Start(transportAddress, transportServiceID string, needDmDev } i.logger.Infof("Stopping NVMe initiator blindly before starting") - dmDeviceBusy, err = i.stopWithoutLock(needDmDeviceCleanup, false, false) + dmDeviceBusy, err = i.stopWithoutLock(dmDeviceCleanupRequired, false, false) if err != nil { return dmDeviceBusy, errors.Wrapf(err, "failed to stop the mismatching NVMe initiator %s before starting", i.Name) } @@ -161,42 +285,48 @@ func (i *Initiator) Start(transportAddress, transportServiceID string, needDmDev i.logger.Infof("Launching NVMe initiator") // Setup initiator - for counter := 0; counter < RetryCounts; counter++ { + for r := 0; r < maxNumRetries; r++ { // Rerun this API for a discovered target should be fine - if i.SubsystemNQN, err = DiscoverTarget(transportAddress, transportServiceID, i.executor); err != nil { - i.logger.WithError(err).Warnf("Failed to discover") - time.Sleep(RetryInterval) + subsystemNQN, err := DiscoverTarget(transportAddress, transportServiceID, i.executor) + if err != nil { + i.logger.WithError(err).Warn("Failed to discover target") + time.Sleep(retryInterval) continue } - if i.ControllerName, err = ConnectTarget(transportAddress, transportServiceID, i.SubsystemNQN, i.executor); err != nil { - i.logger.WithError(err).Warnf("Failed to connect target") - time.Sleep(RetryInterval) + controllerName, err := ConnectTarget(transportAddress, transportServiceID, subsystemNQN, i.executor) + if err != nil { + i.logger.WithError(err).Warn("Failed to connect target") + time.Sleep(retryInterval) continue } + + i.SubsystemNQN = subsystemNQN + i.ControllerName = controllerName break } if i.ControllerName == "" { - return dmDeviceBusy, fmt.Errorf("failed to start NVMe initiator %s within %d * %v sec retries", i.Name, RetryCounts, 
RetryInterval.Seconds()) + return dmDeviceBusy, fmt.Errorf("failed to start NVMe initiator %s within %d * %v sec retries", i.Name, maxNumRetries, retryInterval.Seconds()) } - for t := 0; t < int(waitDeviceTimeout.Seconds()); t++ { - if err = i.loadNVMeDeviceInfoWithoutLock(); err == nil { + for r := 0; r < maxNumWaitDeviceRetries; r++ { + err = i.loadNVMeDeviceInfoWithoutLock(i.TransportAddress, i.TransportServiceID, i.SubsystemNQN) + if err == nil { break } - time.Sleep(time.Second) + time.Sleep(waitDeviceInterval) } if err != nil { return dmDeviceBusy, errors.Wrapf(err, "failed to load device info after starting NVMe initiator %s", i.Name) } needMakeEndpoint := true - if needDmDeviceCleanup { + if dmDeviceCleanupRequired { if dmDeviceBusy { // Endpoint is already created, just replace the target device needMakeEndpoint = false - i.logger.Infof("Linear dm device %s is busy, trying the best to replace the target device for NVMe initiator %s", i.Name, i.Name) + i.logger.Infof("Linear dm device is busy, trying the best to replace the target device for NVMe initiator %s", i.Name) if err := i.replaceDmDeviceTarget(); err != nil { i.logger.WithError(err).Warnf("Failed to replace the target device for NVMe initiator %s", i.Name) } else { @@ -206,7 +336,7 @@ func (i *Initiator) Start(transportAddress, transportServiceID string, needDmDev } else { i.logger.Infof("Creating linear dm device for NVMe initiator %s", i.Name) if err := i.createLinearDmDevice(); err != nil { - return dmDeviceBusy, errors.Wrapf(err, "failed to create linear dm device for NVMe initiator %s", i.Name) + return false, errors.Wrapf(err, "failed to create linear dm device for NVMe initiator %s", i.Name) } } } else { @@ -226,7 +356,7 @@ func (i *Initiator) Start(transportAddress, transportServiceID string, needDmDev return dmDeviceBusy, nil } -func (i *Initiator) Stop(needDmDeviceCleanup, deferDmDeviceCleanup, errOnBusyDmDevice bool) (bool, error) { +func (i *Initiator) Stop(dmDeviceCleanupRequired, deferDmDeviceCleanup, errOnBusyDmDevice bool) (bool, error) { if i.hostProc != "" { lock := nsfilelock.NewLockWithTimeout(util.GetHostNamespacePath(i.hostProc), LockFile, LockTimeout) if err := lock.Lock(); err != nil { @@ -235,7 +365,7 @@ func (i *Initiator) Stop(needDmDeviceCleanup, deferDmDeviceCleanup, errOnBusyDmD defer lock.Unlock() } - return i.stopWithoutLock(needDmDeviceCleanup, deferDmDeviceCleanup, errOnBusyDmDevice) + return i.stopWithoutLock(dmDeviceCleanupRequired, deferDmDeviceCleanup, errOnBusyDmDevice) } func (i *Initiator) removeDmDeviceAndEndpoint(deferDmDeviceCleanup, errOnBusyDmDevice bool) (bool, error) { @@ -254,9 +384,9 @@ func (i *Initiator) removeDmDeviceAndEndpoint(deferDmDeviceCleanup, errOnBusyDmD return false, nil } -func (i *Initiator) stopWithoutLock(needDmDeviceCleanup, deferDmDeviceCleanup, errOnBusyDmDevice bool) (bool, error) { +func (i *Initiator) stopWithoutLock(dmDeviceCleanupRequired, deferDmDeviceCleanup, errOnBusyDmDevice bool) (bool, error) { dmDeviceBusy := false - if needDmDeviceCleanup { + if dmDeviceCleanupRequired { var err error dmDeviceBusy, err = i.removeDmDeviceAndEndpoint(deferDmDeviceCleanup, errOnBusyDmDevice) if err != nil { @@ -299,7 +429,7 @@ func (i *Initiator) GetEndpoint() string { return "" } -func (i *Initiator) LoadNVMeDeviceInfo() (err error) { +func (i *Initiator) LoadNVMeDeviceInfo(transportAddress, transportServiceID, subsystemNQN string) (err error) { if i.hostProc != "" { lock := nsfilelock.NewLockWithTimeout(util.GetHostNamespacePath(i.hostProc), LockFile, 
LockTimeout) if err := lock.Lock(); err != nil { @@ -308,11 +438,11 @@ func (i *Initiator) LoadNVMeDeviceInfo() (err error) { defer lock.Unlock() } - return i.loadNVMeDeviceInfoWithoutLock() + return i.loadNVMeDeviceInfoWithoutLock(transportAddress, transportServiceID, subsystemNQN) } -func (i *Initiator) loadNVMeDeviceInfoWithoutLock() error { - nvmeDevices, err := GetDevices(i.TransportAddress, i.TransportServiceID, i.SubsystemNQN, i.executor) +func (i *Initiator) loadNVMeDeviceInfoWithoutLock(transportAddress, transportServiceID, subsystemNQN string) error { + nvmeDevices, err := GetDevices(transportAddress, transportServiceID, subsystemNQN, i.executor) if err != nil { return err } @@ -341,7 +471,9 @@ func (i *Initiator) loadNVMeDeviceInfoWithoutLock() error { return errors.Wrapf(err, "cannot find the device for NVMe initiator %s with namespace name %s", i.Name, i.NamespaceName) } - i.dev = dev + i.dev = &util.LonghornBlockDevice{ + Nvme: *dev, + } return nil } @@ -368,20 +500,22 @@ func (i *Initiator) LoadEndpoint(dmDeviceBusy bool) error { return err } - depDevices, err := i.findDependentDevices(dev.Nvme.Name) + depDevices, err := i.findDependentDevices(dev.Name) if err != nil { return err } if dmDeviceBusy { - i.logger.Debugf("Skipping endpoint %v loading for NVMe initiator %s due to device busy", i.Endpoint, i.Name) + i.logger.Debugf("Skipping endpoint %v loading for NVMe initiator %v due to device busy", i.Endpoint, i.Name) } else { if i.NamespaceName != "" && !i.isNamespaceExist(depDevices) { - return fmt.Errorf("detected device %s name mismatching from endpoint %v for NVMe initiator %s", dev.Nvme.Name, i.Endpoint, i.Name) + return fmt.Errorf("detected device %s name mismatching from endpoint %v for NVMe initiator %s", dev.Name, i.Endpoint, i.Name) } } - i.dev = dev + i.dev = &util.LonghornBlockDevice{ + Export: *dev, + } i.isUp = true return nil @@ -472,10 +606,32 @@ func (i *Initiator) suspendLinearDmDevice(noflush, nolockfs bool) error { return util.DmsetupSuspend(i.Name, noflush, nolockfs, i.executor) } -func (i *Initiator) resumeLinearDmDevice() error { - logrus.Infof("Resuming linear dm device %s", i.Name) +// ReloadDmDevice reloads the linear dm device +func (i *Initiator) ReloadDmDevice() (err error) { + if i.hostProc != "" { + lock := nsfilelock.NewLockWithTimeout(util.GetHostNamespacePath(i.hostProc), LockFile, LockTimeout) + if err := lock.Lock(); err != nil { + return errors.Wrapf(err, "failed to get file lock for NVMe initiator %s", i.Name) + } + defer lock.Unlock() + } - return util.DmsetupResume(i.Name, i.executor) + return i.reloadLinearDmDevice() +} + +// IsSuspended checks if the linear dm device is suspended +func (i *Initiator) IsSuspended() (bool, error) { + devices, err := util.DmsetupInfo(i.Name, i.executor) + if err != nil { + return false, err + } + + for _, device := range devices { + if device.Name == i.Name { + return device.Suspended, nil + } + } + return false, fmt.Errorf("failed to find linear dm device %s", i.Name) } func (i *Initiator) reloadLinearDmDevice() error { @@ -496,7 +652,7 @@ func (i *Initiator) reloadLinearDmDevice() error { table := fmt.Sprintf("0 %v linear %v 0", sectors, devPath) - logrus.Infof("Reloading linear dm device %s with table %s", i.Name, table) + logrus.Infof("Reloading linear dm device %s with table '%s'", i.Name, table) return util.DmsetupReload(i.Name, table, i.executor) } @@ -504,3 +660,7 @@ func (i *Initiator) reloadLinearDmDevice() error { func getDmDevicePath(name string) string { return fmt.Sprintf("/dev/mapper/%s", 
name) } + +func IsValidNvmeDeviceNotFound(err error) bool { + return strings.Contains(err.Error(), ErrorMessageCannotFindValidNvmeDevice) +} diff --git a/pkg/nvme/nvme.go b/pkg/nvme/nvme.go index 6f9fd068..88ed9c7e 100644 --- a/pkg/nvme/nvme.go +++ b/pkg/nvme/nvme.go @@ -10,6 +10,10 @@ import ( commonNs "github.com/longhorn/go-common-libs/ns" ) +const ( + ErrorMessageCannotFindValidNvmeDevice = "cannot find a valid nvme device" +) + // DiscoverTarget discovers a target func DiscoverTarget(ip, port string, executor *commonNs.Executor) (subnqn string, err error) { hostID, err := getHostID(executor) @@ -162,7 +166,7 @@ func GetDevices(ip, port, nqn string, executor *commonNs.Executor) (devices []De } } - return nil, fmt.Errorf("cannot find a valid nvme device with subsystem NQN %s and address %s:%s", nqn, ip, port) + return nil, fmt.Errorf(ErrorMessageCannotFindValidNvmeDevice+" with subsystem NQN %s and address %s:%s", nqn, ip, port) } return res, nil } @@ -171,3 +175,8 @@ func GetDevices(ip, port, nqn string, executor *commonNs.Executor) (devices []De func GetSubsystems(executor *commonNs.Executor) (subsystems []Subsystem, err error) { return listSubsystems("", executor) } + +// Flush commits data and metadata associated with the specified namespace(s) to nonvolatile media. +func Flush(device, namespaceID string, executor *commonNs.Executor) (output string, err error) { + return flush(device, namespaceID, executor) +} diff --git a/pkg/nvme/nvmecli.go b/pkg/nvme/nvmecli.go index 2477a0d2..91a6300d 100644 --- a/pkg/nvme/nvmecli.go +++ b/pkg/nvme/nvmecli.go @@ -17,7 +17,9 @@ const ( DefaultTransportType = "tcp" // Set short ctrlLossTimeoutSec for quick response to the controller loss. - defaultCtrlLossTimeoutSec = 30 + defaultCtrlLossTmo = 30 + defaultKeepAliveTmo = 5 + defaultReconnectDelay = 10 ) type Device struct { @@ -306,7 +308,9 @@ func connect(hostID, hostNQN, nqn, transpotType, ip, port string, executor *comm "connect", "-t", transpotType, "--nqn", nqn, - "--ctrl-loss-tmo", strconv.Itoa(defaultCtrlLossTimeoutSec), + "--ctrl-loss-tmo", strconv.Itoa(defaultCtrlLossTmo), + "--keep-alive-tmo", strconv.Itoa(defaultKeepAliveTmo), + "--reconnect-delay", strconv.Itoa(defaultReconnectDelay), "-o", "json", } @@ -409,3 +413,18 @@ func GetIPAndPortFromControllerAddress(address string) (string, string) { return traddr, trsvcid } + +func flush(devicePath, namespaceID string, executor *commonNs.Executor) (string, error) { + + opts := []string{ + "flush", + devicePath, + "-o", "json", + } + + if namespaceID != "" { + opts = append(opts, "-n", namespaceID) + } + + return executor.Execute(nil, nvmeBinary, opts, types.ExecuteTimeout) +} diff --git a/pkg/spdk/spdk_test.go b/pkg/spdk/spdk_test.go index 97ccc5ed..376eb0a9 100644 --- a/pkg/spdk/spdk_test.go +++ b/pkg/spdk/spdk_test.go @@ -470,3 +470,116 @@ func (s *TestSuite) TestSPDKClientMultiThread(c *C) { wg.Wait() } + +func (s *TestSuite) TestSPDKEngineSuspend(c *C) { + fmt.Println("Testing Engine Suspend") + + ne, err := util.NewExecutor(commonTypes.ProcDirectory) + c.Assert(err, IsNil) + + LaunchTestSPDKTarget(c, ne.Execute) + PrepareDeviceFile(c) + defer func() { + os.RemoveAll(defaultDevicePath) + }() + + spdkCli, err := client.NewClient(context.Background()) + c.Assert(err, IsNil) + + // Do blindly cleanup + err = spdkCli.DeleteDevice(defaultDeviceName, defaultDeviceName) + if err != nil { + c.Assert(jsonrpc.IsJSONRPCRespErrorNoSuchDevice(err), Equals, true) + } + + bdevAioName, lvsName, lvsUUID, err := spdkCli.AddDevice(defaultDevicePath, 
defaultDeviceName, types.MiB) + c.Assert(err, IsNil) + defer func() { + err := spdkCli.DeleteDevice(bdevAioName, lvsName) + c.Assert(err, IsNil) + }() + c.Assert(bdevAioName, Equals, defaultDeviceName) + c.Assert(lvsName, Equals, defaultDeviceName) + c.Assert(lvsUUID, Not(Equals), "") + + bdevAioInfoList, err := spdkCli.BdevAioGet(bdevAioName, 0) + c.Assert(err, IsNil) + c.Assert(len(bdevAioInfoList), Equals, 1) + bdevAio := bdevAioInfoList[0] + c.Assert(bdevAio.Name, Equals, bdevAioName) + c.Assert(uint64(bdevAio.BlockSize)*bdevAio.NumBlocks, Equals, defaultDeviceSize) + + lvsList, err := spdkCli.BdevLvolGetLvstore(lvsName, "") + c.Assert(err, IsNil) + c.Assert(len(lvsList), Equals, 1) + lvs := lvsList[0] + c.Assert(lvs.UUID, Equals, lvsUUID) + c.Assert(lvs.BaseBdev, Equals, bdevAio.Name) + c.Assert(int(lvs.BlockSize), Equals, int(bdevAio.BlockSize)) + // The meta info may take space + c.Assert(lvs.ClusterSize*(lvs.TotalDataClusters+4), Equals, defaultDeviceSize) + + lvolName1, lvolName2 := "test-lvol1", "test-lvol2" + lvolUUID1, err := spdkCli.BdevLvolCreate(lvsName, "", lvolName1, defaultLvolSizeInMiB, "", true) + c.Assert(err, IsNil) + defer func() { + deleted, err := spdkCli.BdevLvolDelete(lvolUUID1) + c.Assert(err, IsNil) + c.Assert(deleted, Equals, true) + }() + lvolUUID2, err := spdkCli.BdevLvolCreate("", lvsUUID, lvolName2, defaultLvolSizeInMiB, "", true) + c.Assert(err, IsNil) + defer func() { + deleted, err := spdkCli.BdevLvolDelete(lvolUUID2) + c.Assert(err, IsNil) + c.Assert(deleted, Equals, true) + }() + + lvolList, err := spdkCli.BdevLvolGet("", 0) + c.Assert(err, IsNil) + c.Assert(len(lvolList), Equals, 2) + + raidName := "test-raid" + created, err := spdkCli.BdevRaidCreate(raidName, spdktypes.BdevRaidLevelRaid1, 0, []string{lvolUUID1, lvolUUID2}) + c.Assert(err, IsNil) + c.Assert(created, Equals, true) + defer func() { + deleted, err := spdkCli.BdevRaidDelete(raidName) + c.Assert(err, IsNil) + c.Assert(deleted, Equals, true) + }() + + nqn := types.GetNQN(raidName) + err = spdkCli.StartExposeBdev(nqn, raidName, "ABCDEF0123456789ABCDEF0123456789", types.LocalIP, defaultPort1) + c.Assert(err, IsNil) + defer func() { + err = spdkCli.StopExposeBdev(nqn) + c.Assert(err, IsNil) + }() + + initiator, err := nvme.NewInitiator(raidName, nqn, nvme.HostProc) + c.Assert(err, IsNil) + + dmDeviceBusy, err := initiator.Start(types.LocalIP, defaultPort1, true) + c.Assert(dmDeviceBusy, Equals, false) + c.Assert(err, IsNil) + defer func() { + dmDeviceBusy, err = initiator.Stop(true, true, true) + c.Assert(dmDeviceBusy, Equals, false) + c.Assert(err, IsNil) + }() + + err = initiator.Suspend(true, true) + c.Assert(err, IsNil) + + suspended, err := initiator.IsSuspended() + c.Assert(err, IsNil) + c.Assert(suspended, Equals, true) + + err = initiator.LoadEndpoint(dmDeviceBusy) + c.Assert(err, IsNil) + c.Assert(initiator.GetEndpoint(), Equals, "/dev/longhorn/test-raid") + + err = initiator.Resume() + c.Assert(err, IsNil) +} diff --git a/pkg/util/device.go b/pkg/util/device.go index c3beb0a6..f55f4fd8 100644 --- a/pkg/util/device.go +++ b/pkg/util/device.go @@ -35,7 +35,7 @@ type BlockDevices struct { Devices []BlockDevice `json:"blockdevices"` } -type KernelDevice struct { +type LonghornBlockDevice struct { Nvme BlockDevice Export BlockDevice } @@ -51,19 +51,18 @@ func RemoveDevice(dev string) error { } // GetKnownDevices returns the path of the device with the given major and minor numbers -func GetKnownDevices(executor *commonNs.Executor) (map[string]*KernelDevice, error) { - knownDevices := 
make(map[string]*KernelDevice) - - /* Example command output - $ lsblk -l -n -o NAME,MAJ:MIN - sda 8:0 - sdb 8:16 - sdc 8:32 - nvme0n1 259:0 - nvme0n1p1 259:1 - nvme0n1p128 259:2 - nvme1n1 259:3 - */ +func GetKnownDevices(executor *commonNs.Executor) (map[string]*LonghornBlockDevice, error) { + knownDevices := make(map[string]*LonghornBlockDevice) + + // Example command output + // $ lsblk -l -n -o NAME,MAJ:MIN + // sda 8:0 + // sdb 8:16 + // sdc 8:32 + // nvme0n1 259:0 + // nvme0n1p1 259:1 + // nvme0n1p128 259:2 + // nvme1n1 259:3 opts := []string{ "-l", "-n", "-o", "NAME,MAJ:MIN", @@ -79,7 +78,7 @@ func GetKnownDevices(executor *commonNs.Executor) (map[string]*KernelDevice, err line := scanner.Text() f := strings.Fields(line) if len(f) == 2 { - dev := &KernelDevice{ + dev := &LonghornBlockDevice{ Nvme: BlockDevice{ Name: f[0], }, @@ -95,11 +94,10 @@ func GetKnownDevices(executor *commonNs.Executor) (map[string]*KernelDevice, err } // DetectDevice detects the device with the given path -func DetectDevice(path string, executor *commonNs.Executor) (*KernelDevice, error) { - /* Example command output - $ lsblk -l -n -o NAME,MAJ:MIN - nvme1n1 259:3 - */ +func DetectDevice(path string, executor *commonNs.Executor) (*BlockDevice, error) { + // Example command output + // $ lsblk -l -n -o NAME,MAJ:MIN + // nvme1n1 259:3 opts := []string{ path, "-n", "-o", "NAME,MAJ:MIN", "--nodeps", @@ -110,19 +108,18 @@ func DetectDevice(path string, executor *commonNs.Executor) (*KernelDevice, erro return nil, err } - var dev *KernelDevice + var dev *BlockDevice scanner := bufio.NewScanner(strings.NewReader(output)) for scanner.Scan() { line := scanner.Text() f := strings.Fields(line) if len(f) == 2 { - dev = &KernelDevice{ - Nvme: BlockDevice{ - Name: f[0], - }, + dev = &BlockDevice{ + Name: f[0], } - if _, err := fmt.Sscanf(f[1], "%d:%d", &dev.Nvme.Major, &dev.Nvme.Minor); err != nil { - return nil, fmt.Errorf("invalid major:minor %s for device %s with path %s", dev.Nvme.Name, f[1], path) + _, err = fmt.Sscanf(f[1], "%d:%d", &dev.Major, &dev.Minor) + if err != nil { + return nil, fmt.Errorf("invalid major:minor %s for device %s with path %s", f[1], dev.Name, path) + } } break // nolint:staticcheck @@ -210,7 +207,7 @@ func GetDeviceNumbers(devPath string, executor *commonNs.Executor) (int, int, er } // DuplicateDevice creates a device node for the given device -func DuplicateDevice(dev *KernelDevice, dest string) error { +func DuplicateDevice(dev *LonghornBlockDevice, dest string) error { if dev == nil { return fmt.Errorf("found nil device for device duplication") } diff --git a/pkg/util/dmsetup.go b/pkg/util/dmsetup.go index c761ca0c..591012c8 100644 --- a/pkg/util/dmsetup.go +++ b/pkg/util/dmsetup.go @@ -1,7 +1,9 @@ package util import ( + "fmt" "regexp" + "strings" commonNs "github.com/longhorn/go-common-libs/ns" @@ -98,3 +100,76 @@ func parseDependentDevicesFromString(str string) []string { return devices } + +type DeviceInfo struct { + Name string + BlockDeviceName string + TableLive bool + TableInactive bool + Suspended bool + ReadOnly bool + Major uint32 + Minor uint32 + OpenCount uint32 // Open reference count + TargetCount uint32 // Number of targets in the live table + EventNumber uint32 // Last event sequence number (used by wait) +} + +// DmsetupInfo returns the information of the device mapper device with the given name +func DmsetupInfo(dmDeviceName string, executor *commonNs.Executor) ([]*DeviceInfo, error) { + opts := []string{ + "info", + "--columns", + "--noheadings", + "-o", + 
"name,blkdevname,attr,major,minor,open,segments,events", + "--separator", + " ", + dmDeviceName, + } + + outputStr, err := executor.Execute(nil, dmsetupBinary, opts, types.ExecuteTimeout) + if err != nil { + return nil, err + } + + var ( + lines = strings.Split(outputStr, "\n") + devices = []*DeviceInfo{} + ) + + for _, line := range lines { + var ( + attr = "" + info = &DeviceInfo{} + ) + + // Stop at the first empty line (end of output) + if line == "" { + break + } + + _, err := fmt.Sscan(line, + &info.Name, + &info.BlockDeviceName, + &attr, + &info.Major, + &info.Minor, + &info.OpenCount, + &info.TargetCount, + &info.EventNumber) + if err != nil { + continue + } + + // Parse attributes (see "man 8 dmsetup" for details) + info.Suspended = strings.Contains(attr, "s") + info.ReadOnly = strings.Contains(attr, "r") + info.TableLive = strings.Contains(attr, "L") + info.TableInactive = strings.Contains(attr, "I") + + devices = append(devices, info) + } + + return devices, nil +}