Commit 0749895

Merge pull request #192 from t-mialve/t-mialve/fix-tests

Fix instability in long haul tests

2 parents: aac2d50 + 71b5655

File tree: 15 files changed (+42, -31 lines)

deploy/example/echodate/deployment.yaml
Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ spec:
     - ReadWriteMany
   resources:
     requests:
-      storage: 1Gi
+      storage: 4Ti
   storageClassName: sc.azurelustre.csi.azure.com
 ---
 apiVersion: apps/v1
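The example claim grows from 1Gi to 4Ti to match the driver's supported size range (the test driver manifest further down lists Min: 4Ti). A quick sanity check that the resized claim still provisions and binds, not part of this commit, could look like:

# Sanity check only, not part of the commit: apply the example and inspect the claim.
kubectl apply -f deploy/example/echodate/deployment.yaml
kubectl get pvc -o custom-columns=NAME:.metadata.name,PHASE:.status.phase,REQUESTED:.spec.resources.requests.storage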

docs/examples/pv.yaml
Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@ spec:
   capacity:
     # This field should be the true size of the Azure Lustre you want
     # to used. So that, k8s can allocate resources better.
-    storage: 4Ti
+    storage: 48Ti
   csi:
     driver: azurelustre.csi.azure.com
     volumeAttributes:

docs/examples/pv_subdir.yaml
Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@ spec:
   capacity:
     # This field should be the true size of the Azure Lustre you want
     # to used. So that, k8s can allocate resources better.
-    storage: 4Ti
+    storage: 48Ti
   csi:
     driver: azurelustre.csi.azure.com
     volumeAttributes:

docs/examples/pvc_storageclass.yaml
Lines changed: 1 addition & 1 deletion

@@ -10,6 +10,6 @@ spec:
   resources:
     requests:
       # The real storage capacity in the claim
-      storage: 1Gi
+      storage: 4Ti
   # This field must be the same as the storage class name in StorageClass
   storageClassName: sc.azurelustre.csi.azure.com

docs/examples/pvc_storageclass_subdir.yaml
Lines changed: 1 addition & 1 deletion

@@ -10,6 +10,6 @@ spec:
   resources:
     requests:
       # The real storage capacity in the claim
-      storage: 1Gi
+      storage: 4Ti
   # This field must be the same as the storage class name in StorageClass
   storageClassName: subdir.azurelustre.csi.azure.com

hack/verify-integration-test-aks.sh
Lines changed: 1 addition & 1 deletion

@@ -40,7 +40,7 @@ function catlog {
 trap catlog ERR EXIT
 
 ./kubectl wait --for=condition=Ready pod/aml-integration-test --timeout=60s
-./kubectl wait --for=condition=Ready=false pod/aml-integration-test --timeout=300s
+./kubectl wait --for=condition=Ready=false pod/aml-integration-test --timeout=600s
 
 exit_code=$(./kubectl get pod aml-integration-test -o=jsonpath='{.status.containerStatuses[*].state.*.exitCode}')
 
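The longer 600s wait gives the integration-test pod time to reach a finished (not-Ready) state before the script reads its container exit code via the jsonpath above. A sketch of how that exit code typically feeds the script's own result, an assumption about code outside this hunk:

# Sketch only; the real handling lives outside this hunk.
exit_code=$(./kubectl get pod aml-integration-test -o=jsonpath='{.status.containerStatuses[*].state.*.exitCode}')
if [[ "$exit_code" != "0" ]]; then
    echo "integration test container exited with code $exit_code"
    exit 1
fi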

test/external-e2e/run.sh
Lines changed: 3 additions & 3 deletions

@@ -60,15 +60,15 @@ echo "deploy test pvc"
 kubectl apply -f ${claim_file}
 echo "wait pvc to Bound status"
 # wait for json is supported in kubectl v1.24
-kubectl wait --for=jsonpath='{.status.phase}'=Bound -f ${claim_file} --timeout=300s
+kubectl wait --for=jsonpath='{.status.phase}'=Bound -f ${claim_file} --timeout=600s
 bounded_pv=$(kubectl get -f ${claim_file} -ojsonpath='{.spec.volumeName}')
 echo "bounded pv is ${bounded_pv}"
 echo "delete pvc"
 kubectl delete -f ${claim_file}
 echo "wait for the pvc to be deleted"
-kubectl wait --for=delete -f ${claim_file} --timeout=300s
+kubectl wait --for=delete -f ${claim_file} --timeout=600s
 echo "wait for pv ${bounded_pv} to be deleted"
-kubectl wait --for=delete pv/${bounded_pv} --timeout=300s
+kubectl wait --for=delete pv/${bounded_pv} --timeout=600s
 
 echo "delete test storageclass"
 kubectl delete -f ${sc_file}
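As the in-script comment notes, the jsonpath form of kubectl wait requires kubectl v1.24 or newer. On older clients a plain polling loop is an equivalent fallback; this sketch is not part of the commit and simply mirrors the same 600s budget:

# Fallback sketch for clients without `kubectl wait --for=jsonpath` (not part of the commit).
for _ in $(seq 1 120); do
    phase=$(kubectl get -f ${claim_file} -o jsonpath='{.status.phase}')
    [[ "$phase" == "Bound" ]] && break
    sleep 5   # 120 x 5s = 600s, matching the new timeout
done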

test/external-e2e/testdriver-azurelustre.yaml
Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@ StorageClass:
 DriverInfo:
   Name: azurelustre.csi.azure.com
   SupportedSizeRange:
-    Max: 8Ti
+    Max: 48Ti
     Min: 4Ti
   RequiredAccessModes:
     - ReadWriteMany
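SupportedSizeRange is what the Kubernetes external storage e2e suite consults when it sizes test volumes, so Max has to cover the 48Ti capacities used elsewhere in this change. For context, the usual upstream invocation that consumes this manifest looks roughly like the following; this is the assumed standard form, not copied from the repo's run.sh:

# Assumed standard external-storage e2e invocation; the repo's run.sh may differ.
./e2e.test -ginkgo.focus='External.Storage' \
    -storage.testdriver=test/external-e2e/testdriver-azurelustre.yaml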

test/long-haul/cleanup/cleanupjob.yaml
Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@ spec:
     - ReadWriteMany
   resources:
     requests:
-      storage: 1Gi
+      storage: 48Ti
   storageClassName: azurelustre-longhaulcleanup-sc
 
 ---

test/long-haul/fault-test.sh
Lines changed: 8 additions & 8 deletions

@@ -38,7 +38,7 @@ sleep $SleepInSecs
 verify_sample_workload_by_pod_status workloadPodNameNew workloadNodeNameNew
 if [[ "$workloadPodName" == "$workloadPodNameNew" ]] ; then
     print_logs_error "workload pod $workloadPodName should be killed and new workload should be started"
-    print_debug_on_ERR
+    print_debug
     fast_exit
 fi
 

@@ -92,20 +92,20 @@ print_logs_info "running 'kubectl delete po' by background task"
 sleep $SleepInSecs
 
 podState=$(get_pod_state $workloadPodName $workloadNodeName)
-if [[ -z $podState || "$podState" != "Terminating" ]]; then
-    print_logs_error "Workload pod $workloadPodName should be in Terminating state on node $workloadNodeName, but its actual state is $podState"
-    print_debug_on_ERR
+if [[ "$podState" != "Terminating" && "$podState" != "Error" ]]; then
+    print_logs_error "Workload pod $workloadPodName should be in Error/Terminating state on node $workloadNodeName, but its actual state is $podState"
+    print_debug
     fast_exit
 else
-    print_logs_info "Workload pod $workloadPodName is in Terminating state on node $workloadNodeName"
+    print_logs_info "Workload pod $workloadPodName is in Error state on node $workloadNodeName"
 fi
 
 
 print_logs_title "Verify the new workload pod in Running state on other nodes or ContainerCreating state on the same node"
 verify_sample_workload_by_pod_status workloadPodNameNew workloadNodeNameNew "Running\|ContainerCreating"
 if [[ "$workloadPodName" == "$workloadPodNameNew" ]] ; then
     print_logs_error "New workload pod should be started, but still find old running pod $workloadPodName"
-    print_debug_on_ERR
+    print_debug
     fast_exit
 else
     print_logs_info "new workload pod $workloadPodNameNew started on another node $workloadNodeNameNew"

@@ -119,7 +119,7 @@ sleep $SleepInSecs
 podState=$(get_pod_state $NodePodNameKeyword $workloadNodeName)
 if [[ -z "$podState" || "$podState" != "Running" ]]; then
     print_logs_error "Lustre CSI node pod can't be started on $nodeName, state=$podState"
-    print_debug_on_ERR
+    print_debug
     fast_exit
 else
     print_logs_info "Lustre CSI node pod started on $nodeName again"

@@ -132,7 +132,7 @@ sleep $SleepInSecs
 podState=$(get_pod_state $workloadPodName $workloadNodeName)
 if [[ ! -z $podState ]]; then
     print_logs_error "Still can find workload pod $workloadPodName in $podState state on node $workloadNodeName, it should be deleted successfully"
-    print_debug_on_ERR
+    print_debug
     fast_exit
 else
     print_logs_info "workload pod $workloadPodName has been deleted successfully from node $workloadNodeName"

test/long-haul/sample-workload/deployment_write_print_file.yaml
Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@ spec:
     - ReadWriteMany
   resources:
     requests:
-      storage: 1Gi
+      storage: 48Ti
   storageClassName: azurelustre-longhaulsample-sc
 ---
 apiVersion: apps/v1

test/long-haul/update-test.sh
Lines changed: 2 additions & 2 deletions

@@ -68,10 +68,10 @@ else
 fi
 
 print_logs_info "Upgrading node pool to the latest node image"
-az aks nodepool upgrade --resource-group $ResourceGroup --cluster-name $ClusterName --name $PoolName --node-image-only
+az aks nodepool upgrade --resource-group $ResourceGroup --cluster-name $ClusterName --name $PoolName --node-image-only -y
 
 print_logs_info "Upgrading node pool to the latest"
-az aks nodepool upgrade --resource-group $ResourceGroup --cluster-name $ClusterName --name $PoolName
+az aks nodepool upgrade --resource-group $ResourceGroup --cluster-name $ClusterName --name $PoolName -y
 
 print_logs_title "Print versions after"
 print_versions
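The added -y flag only suppresses the interactive confirmation prompt that previously stalled the unattended long-haul run; the upgrade itself is unchanged. If progress needs to be watched while it runs, a check along these lines works (not part of the commit):

# Optional progress check while the upgrade runs (not part of the commit).
az aks nodepool show --resource-group $ResourceGroup --cluster-name $ClusterName \
    --name $PoolName --query provisioningState -o tsv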

test/long-haul/utils.sh
Lines changed: 18 additions & 7 deletions

@@ -1,7 +1,10 @@
+set -x
 set -o errexit
 set -o pipefail
 set -o nounset
 
+trap print_debug EXIT
+
 REPO_ROOT_PATH=${REPO_ROOT_PATH:-$(git rev-parse --show-toplevel)}
 
 export REPO_ROOT_PATH=$REPO_ROOT_PATH

@@ -37,7 +40,8 @@ reset_csi_driver () {
     echo "Reset CSI driver"
     kubectl delete -f $REPO_ROOT_PATH/deploy/csi-azurelustre-controller.yaml --ignore-not-found
    kubectl delete -f $REPO_ROOT_PATH/deploy/csi-azurelustre-node.yaml --ignore-not-found
-    kubectl wait pod -n kube-system --for=delete --selector='app in (csi-azurelustre-controller,csi-azurelustre-node)' --timeout=300s
+    kubectl wait pod -n kube-system --for=delete --selector='app in (csi-azurelustre-controller,csi-azurelustre-node)' --timeout=600s
+
 
     echo "Reset node label"
     kubectl get nodes --no-headers | grep "$PoolName" | awk '{print $1}' |

@@ -51,7 +55,7 @@ reset_csi_driver () {
     kubectl apply -f $REPO_ROOT_PATH/deploy/csi-azurelustre-controller.yaml
     kubectl apply -f $REPO_ROOT_PATH/deploy/csi-azurelustre-node.yaml
 
-    kubectl wait pod -n kube-system --for=condition=Ready --selector='app in (csi-azurelustre-controller,csi-azurelustre-node)' --timeout=300s
+    kubectl wait pod -n kube-system --for=condition=Ready --selector='app in (csi-azurelustre-controller,csi-azurelustre-node)' --timeout=600s
 
     sleep 60
 }

@@ -144,13 +148,19 @@ verify_csi_driver () {
        print_logs_info "$nodePodsNum node pods running..."
     fi
 
-    kubectl wait pod -n kube-system --for=condition=Ready --selector='app in (csi-azurelustre-controller,csi-azurelustre-node)' --timeout=300s
+    kubectl wait pod -n kube-system --for=condition=Ready --selector='app in (csi-azurelustre-controller,csi-azurelustre-node)' --timeout=600s
+
 }
 
 start_sample_workload () {
     stop_sample_workload
-    kubectl apply -f ./sample-workload/deployment_write_print_file.yaml --timeout=300s
-    kubectl wait pod --for=condition=Ready --selector=app=azurelustre-longhaulsample-deployment --timeout=300s
+    kubectl apply -f ./sample-workload/deployment_write_print_file.yaml --timeout=600s
+    kubectl wait pod --for=condition=Ready --selector=app=azurelustre-longhaulsample-deployment --timeout=600s
+
+    if [[ $? -ne 0 ]]; then
+        print_logs_error "Failed to start sample workload"
+        print_debug
+    fi
     sleep 15
 }

@@ -160,8 +170,9 @@ stop_sample_workload () {
        kubectl patch pvc azurelustre-longhaulsample-pvc -p '{"metadata":{"finalizers":null}}'
     fi
 
-    kubectl delete -f ./sample-workload/deployment_write_print_file.yaml --ignore-not-found --timeout=300s --grace-period=0 --force --cascade
-    kubectl wait pod --for=delete --selector=app=azurelustre-longhaulsample-deployment --timeout=300s
+    kubectl delete -f ./sample-workload/deployment_write_print_file.yaml --ignore-not-found --timeout=600s --grace-period=0 --force --cascade
+    kubectl wait pod --for=delete --selector=app=azurelustre-longhaulsample-deployment --timeout=600s
+
 }
 
 verify_sample_workload_logs () {
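Two notes on the additions above. First, because the script enables errexit, a failing kubectl wait in start_sample_workload can exit the script before the $? check ever runs, so the new `trap print_debug EXIT` is what still produces a debug dump in that case. Second, print_debug itself is defined elsewhere in utils.sh and is not part of this diff; a dump of roughly this shape (an assumption, not the repo's implementation) is what the trap would invoke:

# Hypothetical print_debug-style dump; the real helper is defined elsewhere in utils.sh.
print_debug () {
    kubectl get pods -A -o wide || true
    kubectl get events -A --sort-by=.lastTimestamp | tail -n 50 || true
    kubectl logs -n kube-system -l app=csi-azurelustre-node --tail=100 --prefix || true
}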

test/scale/run_test.py
Lines changed: 1 addition & 1 deletion

@@ -251,7 +251,7 @@ def deploy_workload(self):
         self.run_command(
             "kubectl rollout status deployment"
             " scale-test-set"
-            " --timeout=300s"
+            " --timeout=600s"
         )
         logger.info("workload was ready")
 

test/scale/static_workload.yml.template
Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@ spec:
   accessModes:
     - ReadWriteMany
   capacity:
-    storage: 4Ti
+    storage: 48Ti
   csi:
     driver: ${csi_name}
     volumeAttributes:
