Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fsfreeze functionality to snapshot #1083

Merged
merged 17 commits into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions Dockerfile.dapper
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ ENV PATH /go/bin:$PATH
ENV DAPPER_DOCKER_SOCKET true
ENV DAPPER_ENV TAG REPO DRONE_REPO DRONE_PULL_REQUEST DRONE_COMMIT_REF SKIP_TASKS
ENV DAPPER_OUTPUT bin coverage.out
ENV DAPPER_RUN_ARGS --privileged --tmpfs /go/src/github.com/longhorn/longhorn-engine/integration/.venv:exec --tmpfs /go/src/github.com/longhorn/longhorn-engine/integration/.tox:exec -v /dev:/host/dev -v /proc:/host/proc
# For filesystem freeze tests, our container must be able to see the filesystem mounted in the host mount namespace.
# Usually, instance-manager runs with the equivalent of "-v /:/host". For integration testing, use "-v /tmp:/host/tmp"
# and mount filesystems to /tmp to simulate this without bind mounting everything.
ENV DAPPER_RUN_ARGS --privileged --tmpfs /go/src/github.com/longhorn/longhorn-engine/integration/.venv:exec --tmpfs /go/src/github.com/longhorn/longhorn-engine/integration/.tox:exec -v /dev:/host/dev -v /proc:/host/proc --mount type=bind,source=/tmp,destination=/host/tmp,bind-propagation=rslave
ENV DAPPER_SOURCE /go/src/github.com/longhorn/longhorn-engine
WORKDIR ${DAPPER_SOURCE}

Expand Down Expand Up @@ -115,7 +118,7 @@ RUN cd integration && \

# Build longhorn-instance-manager for integration testing
RUN cd /go/src/github.com/longhorn && \
git clone https://github.com/longhorn/longhorn-instance-manager.git && \
git clone https://github.com/ejweber/longhorn-instance-manager.git -b 2187-fsfreeze && \
ejweber marked this conversation as resolved.
Show resolved Hide resolved
cd longhorn-instance-manager && \
go build -o ./longhorn-instance-manager -tags netgo -ldflags "-linkmode external -extldflags -static" && \
install longhorn-instance-manager /usr/local/bin
Expand Down
8 changes: 8 additions & 0 deletions app/cmd/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,14 @@ func startController(c *cli.Context) error {
frontend = f
}

// If the engine failed during a snapshot, we may have left a frozen filesystem. We attempt to unfreeze here because
// we may not reach frontend startup (e.g. if there are no available backends). An unfreeze command may get stuck
// here for reasons unrelated to a frozen filesystem (e.g. there are outstanding I/Os to a crashed volume). Log a
// failure (e.g. a timeout), but continue.
if err := util.UnfreezeFilesystemForDevice(util.GetDevicePathFromVolumeName(volumeName)); err != nil {
logrus.WithError(err).Warn("Failed to unfreeze filesystem")
}

logrus.Infof("Creating volume %v controller with iSCSI target request timeout %v and engine to replica(s) timeout %v",
volumeName, iscsiTargetRequestTimeout, engineReplicaTimeout)
control := controller.NewController(volumeName, dynamic.New(factories), frontend, isUpgrade, disableRevCounter, salvageRequested,
Expand Down
10 changes: 8 additions & 2 deletions app/cmd/snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,11 @@ func SnapshotCreateCmd() cli.Command {
Flags: []cli.Flag{
cli.StringSliceFlag{
Name: "label",
Usage: "specify labels, in the format of `--label key1=value1 --label key2=value2`",
Usage: "Specify labels, in the format of `--label key1=value1 --label key2=value2`",
},
cli.BoolFlag{
Name: "freeze-fs",
Usage: "Freeze the filesystem on the root partition before taking the snapshot",
},
},
Action: func(c *cli.Context) {
Expand Down Expand Up @@ -245,13 +249,15 @@ func createSnapshot(c *cli.Context) error {
}
}

freezeFilesystem := c.Bool("freeze-fs")

controllerClient, err := getControllerClient(c)
if err != nil {
return err
}
defer controllerClient.Close()

id, err := controllerClient.VolumeSnapshot(name, labelMap)
id, err := controllerClient.VolumeSnapshot(name, labelMap, freezeFilesystem)
if err != nil {
return err
}
Expand Down
4 changes: 3 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ require (
google.golang.org/protobuf v1.34.0
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c
gopkg.in/cheggaaa/pb.v2 v2.0.7
k8s.io/mount-utils v0.30.0
)

require (
Expand Down Expand Up @@ -74,6 +75,7 @@ require (
gotest.tools/v3 v3.4.0 // indirect
k8s.io/apimachinery v0.27.1 // indirect
k8s.io/klog/v2 v2.120.1 // indirect
k8s.io/mount-utils v0.30.0 // indirect
k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect
)

replace github.com/longhorn/types => github.com/ejweber/types v0.0.0-20240429165354-56f775b2d017
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ github.com/dnaeon/go-vcr v1.2.0 h1:zHCHvJYTMh1N7xnV7zf1m1GPBF9Ad0Jk/whtQ1663qI=
github.com/dnaeon/go-vcr v1.2.0/go.mod h1:R4UdLID7HZT3taECzJs4YgbbH6PIGXB6W/sc5OLb6RQ=
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
github.com/ejweber/types v0.0.0-20240429165354-56f775b2d017 h1:dRPOOpT12986AL7+cw+3xm+m2eJjWZqT6BhIB6Rvw5Q=
github.com/ejweber/types v0.0.0-20240429165354-56f775b2d017/go.mod h1:1oEh1cnDDqNSuFh/dH/lvJ3Ssq83SOweTAAPLRY4PMI=
github.com/felixge/httpsnoop v1.0.3 h1:s/nj+GCswXYzN5v2DpNMuMQYe+0DDwt5WVCU6CWBdXk=
github.com/felixge/httpsnoop v1.0.3/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/gammazero/deque v0.2.1 h1:qSdsbG6pgp6nL7A0+K/B7s12mcCY/5l5SIUpMOl+dC0=
Expand Down Expand Up @@ -74,8 +76,6 @@ github.com/longhorn/go-iscsi-helper v0.0.0-20240427164656-e9439c0018ce h1:PxKniE
github.com/longhorn/go-iscsi-helper v0.0.0-20240427164656-e9439c0018ce/go.mod h1:d9t3gtE+UPjescbCFluXd4xBc8OQT/JrC2cdkk2IXWQ=
github.com/longhorn/sparse-tools v0.0.0-20240427164751-a7b9f1b2c8a8 h1:lwtmZEomiv8uchwo9JIyoo+lK8J3cLCm7/qzpn6wmzo=
github.com/longhorn/sparse-tools v0.0.0-20240427164751-a7b9f1b2c8a8/go.mod h1:pvlUkVwRGojXhcTkkzksOe4i7GVk59P2PbJjHIB2Yj0=
github.com/longhorn/types v0.0.0-20240427164854-38dbed8528d3 h1:7YDGJTwro/kCcMmnCWM1UIL4JgervR6MT++71PvcbGY=
github.com/longhorn/types v0.0.0-20240427164854-38dbed8528d3/go.mod h1:pqT+7B8T+nkyUZYe8tL3CwPDCHjkbe3mHUDY5ntJFyQ=
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
github.com/mattn/go-colorable v0.1.4 h1:snbPLB8fVfU9iwbbo30TPtbLRzwWu6aJS6Xh4eaaviA=
github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE=
Expand Down
5 changes: 3 additions & 2 deletions integration/common/cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@ def info_get(url):
return json.loads(subprocess.check_output(cmd, encoding='utf-8'))


def snapshot_create(url):
cmd = [_bin(), '--url', url, '--debug', 'snapshot', 'create']
def snapshot_create(url, freeze_fs = False):
cmd = [_bin(), '--url', url, '--debug', 'snapshot', 'create',
f'--freeze-fs={freeze_fs}']
return subprocess.check_output(cmd, encoding='utf-8').strip()


Expand Down
2 changes: 2 additions & 0 deletions integration/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,5 @@
EXPANSION_DISK_NAME = "volume-snap-expand-%d.img" % EXPANDED_SIZE

MESSAGE_TYPE_ERROR = "error"

LOGS_DIR = "/var/log/instances/"
8 changes: 8 additions & 0 deletions integration/common/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import hashlib

from common.constants import LOGS_DIR

def file(f):
return os.path.join(_base(), '../../{}'.format(f))
Expand Down Expand Up @@ -37,3 +38,10 @@ def read_file(file_path, offset, length):

def checksum_data(data):
return hashlib.sha512(data).hexdigest()

def get_process_log_lines(process_name):
file_path = LOGS_DIR + process_name + '.log'
assert os.path.exists(file_path)
with open(file_path, 'r') as file:
lines = file.readlines()
return lines
60 changes: 44 additions & 16 deletions integration/data/test_snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,14 @@
RETRY_COUNTS_SHORT, RETRY_INTERVAL_SHORT
)

from common.util import get_process_log_lines

from data.snapshot_tree import snapshot_tree_build, snapshot_tree_verify_node

from tempfile import mkdtemp

from os.path import relpath


def snapshot_revert_test(dev, address, engine_name): # NOQA
existings = {}
Expand Down Expand Up @@ -402,7 +408,7 @@
6. Take snapshot 2 (test file included)
7. Overwrite test file on the filesystem
8. Take snapshot 3 (test file changed)
9. Observe that logs of engine `Filesystem frozen / unfrozen`
9. Observe that logs of engine include "Freezing" and "Unfreezing"
10. Validate snapshots are created
11. Restore snapshot 1, validate test file missing
12. Restore snapshot 2, validate test file original content
Expand All @@ -415,35 +421,57 @@

def snapshot_mounted_filesystem_test(volume_name, dev, address, engine_name): # NOQA
dev_path = dev.dev
mnt_path = "/tmp/mnt-" + volume_name
test_file = mnt_path + "/test"
# /tmp is bind mounted into the container at /host/tmp. This simulates
# the typical running environment of instance-manager with "-v /:/host".
# Use mkdtemp to create a temporary directory that the container sees at
# /host/tmp/... and the host sees at /tmp/... (and avoid Codefactor
# warnings for the use of a hard coded /tmp directory name).
mnt_path_in_container = mkdtemp(dir="/host/tmp/", prefix=volume_name)
mnt_path_on_host = relpath(mnt_path_in_container, "/host")
test_file_on_host = mnt_path_on_host + "/test"
length = 128

print("dev_path: " + dev_path + "\n")
print("mnt_path: " + mnt_path + "\n")
print("mnt_path_in_container: " + mnt_path_in_container + "\n")
print("mnt_path_on_host: " + mnt_path_on_host + "\n")

# create & mount a ext4 filesystem on dev
nsenter_cmd = get_nsenter_cmd()
mount_cmd = nsenter_cmd + ["mount", "--make-shared", dev_path, mnt_path]
umount_cmd = nsenter_cmd + ["umount", mnt_path]
mount_cmd = nsenter_cmd + ["mount", "--make-shared", dev_path,
mnt_path_on_host]
umount_cmd = nsenter_cmd + ["umount", mnt_path_on_host]
findmnt_cmd = nsenter_cmd + ["findmnt", dev_path]
subprocess.check_call(nsenter_cmd + ["mkfs.ext4", dev_path])
subprocess.check_call(nsenter_cmd + ["mkdir", "-p", mnt_path])
subprocess.check_call(nsenter_cmd + ["mkdir", "-p", mnt_path_on_host])
subprocess.check_call(mount_cmd)
subprocess.check_call(findmnt_cmd)

# create snapshot1 with empty fs
# NOTE: we cannot use checksum_dev since it assumes
# asci data for device data instead of raw bytes
snap1 = cmd.snapshot_create(address)
snap1 = cmd.snapshot_create(address, True)

# create snapshot2 with a new test file
test2_checksum = write_filesystem_file(length, test_file)
snap2 = cmd.snapshot_create(address)
test2_checksum = write_filesystem_file(length, test_file_on_host)
snap2 = cmd.snapshot_create(address, True)

# create snapshot3 overwriting the test file
test3_checksum = write_filesystem_file(length, test_file)
snap3 = cmd.snapshot_create(address)
test3_checksum = write_filesystem_file(length, test_file_on_host)
snap3 = cmd.snapshot_create(address, True)

# Observe that logs of engine include "Freezing" and "Unfreezing"
freeze_lines_count = unfreeze_lines_count = 0
for line in get_process_log_lines(engine_name):
if 'Freezing filesystem' in line:
freeze_lines_count += 1
if 'Unfreezing filesystem' in line:
unfreeze_lines_count += 1
# We only log these if we have detected a mounted Longhorn filesystem,
# done a bind mount, and are immediately preparing to freeze. If we did
# this for all three snapshots (and the snapshots themselves didn't fail),
# it's a good guarantee that whole of the freezing logic is working.
assert freeze_lines_count == 3
assert unfreeze_lines_count == 3

# verify existence of the snapshots
snapshots = cmd.snapshot_ls(address)
Expand All @@ -462,25 +490,25 @@
# should error since the file does not exist in snapshot 1
with pytest.raises(subprocess.CalledProcessError):
print("is expected error, since the file does not exist.")
checksum_filesystem_file(test_file)
checksum_filesystem_file(test_file_on_host)
subprocess.check_call(umount_cmd)

print("\nsnapshot_revert_with_frontend snap2 begin")
snapshot_revert_with_frontend(address, engine_name, snap2)
print("snapshot_revert_with_frontend snap2 finish\n")
subprocess.check_call(mount_cmd)
assert checksum_filesystem_file(test_file) == test2_checksum
assert checksum_filesystem_file(test_file_on_host) == test2_checksum
subprocess.check_call(umount_cmd)

print("\nsnapshot_revert_with_frontend snap3 begin")
snapshot_revert_with_frontend(address, engine_name, snap3)
print("snapshot_revert_with_frontend snap3 finish\n")
subprocess.check_call(mount_cmd)
assert checksum_filesystem_file(test_file) == test3_checksum
assert checksum_filesystem_file(test_file_on_host) == test3_checksum
subprocess.check_call(umount_cmd)

# remove the created mount folder
subprocess.check_call(nsenter_cmd + ["rmdir", mnt_path])
subprocess.check_call(nsenter_cmd + ["rmdir", mnt_path_on_host])


def test_snapshot_prune(grpc_controller, grpc_replica1, grpc_replica2): # NOQA
Expand Down Expand Up @@ -686,7 +714,7 @@
"""
address = grpc_controller.address

# TODO: Increase the default volume size of the engine integration test to

Check notice on line 717 in integration/data/test_snapshot.py

View check run for this annotation

codefactor.io / CodeFactor

integration/data/test_snapshot.py#L717

unresolved comment '# TODO: Increase the default volume size of the engine integration test to' (C100)
# meet the min requirement of xfs. But increasing the volume size means
# we need to re-check all expansion related tests.
for fs_type in ["ext4"]:
Expand Down
7 changes: 4 additions & 3 deletions pkg/controller/client/controller_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,14 +149,15 @@ func (c *ControllerClient) VolumeStart(size, currentSize int64, replicas ...stri
return nil
}

func (c *ControllerClient) VolumeSnapshot(name string, labels map[string]string) (string, error) {
func (c *ControllerClient) VolumeSnapshot(name string, labels map[string]string, freezeFilesystem bool) (string, error) {
controllerServiceClient := c.getControllerServiceClient()
ctx, cancel := context.WithTimeout(context.Background(), GRPCServiceTimeout)
defer cancel()

reply, err := controllerServiceClient.VolumeSnapshot(ctx, &enginerpc.VolumeSnapshotRequest{
Name: name,
Labels: labels,
Name: name,
Labels: labels,
FreezeFilesystem: freezeFilesystem,
})
if err != nil {
return "", errors.Wrapf(err, "failed to create snapshot %v for volume %v", name, c.serviceURL)
Expand Down
Loading
Loading