Skip to content

Commit

Permalink
feat: Add specific reasons for failed task submission (#218)
Browse files Browse the repository at this point in the history
* Add specific reasons for failed task submission

* change proto

* change proto errcode

* fix qos resource enum

* fix comments

* refactor account errcode name

* style: Change CraneErrCode::__ERR_CODE_COUNT

Signed-off-by: lijunlin <xiafeng.li@foxmail.com>

---------

Signed-off-by: lijunlin <xiafeng.li@foxmail.com>
Co-authored-by: huerni <47264950+huerni@users.noreply.github.com>
Co-authored-by: lijunlin <xiafeng.li@foxmail.com>
  • Loading branch information
3 people authored Feb 27, 2025
1 parent cd289b2 commit 3a650af
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 183 deletions.
38 changes: 19 additions & 19 deletions internal/cacctmgr/cacctmgr.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ func AddAccount(account *protos.AccountInfo) util.CraneCmdError {
fmt.Println("Account added successfully.")
return util.ErrorSuccess
} else {
fmt.Printf("Failed to add account: %s.\n", util.ErrMsg(reply.GetReason()))
fmt.Printf("Failed to add account: %s.\n", util.ErrMsg(reply.GetCode()))
return util.ErrorBackend
}
}
Expand Down Expand Up @@ -410,7 +410,7 @@ func AddUser(user *protos.UserInfo, partition []string, level string, coordinato
fmt.Println("User added successfully.")
return util.ErrorSuccess
} else {
fmt.Printf("Failed to add user: %s.\n", util.ErrMsg(reply.GetReason()))
fmt.Printf("Failed to add user: %s.\n", util.ErrMsg(reply.GetCode()))
return util.ErrorBackend
}
}
Expand Down Expand Up @@ -443,7 +443,7 @@ func AddQos(qos *protos.QosInfo) util.CraneCmdError {
fmt.Println("QoS added successfully.")
return util.ErrorSuccess
} else {
fmt.Printf("Failed to add QoS: %s.\n", util.ErrMsg(reply.GetReason()))
fmt.Printf("Failed to add QoS: %s.\n", util.ErrMsg(reply.GetCode()))
return util.ErrorBackend
}
}
Expand All @@ -466,10 +466,10 @@ func DeleteAccount(name string) util.CraneCmdError {
}
}
if reply.GetOk() {
fmt.Printf("Delete account %s succeeded.\n", name)
fmt.Printf("Successfully deleted account '%s'.\n", name)
return util.ErrorSuccess
} else {
fmt.Printf("Delete account %s failed: %s.\n", name, util.ErrMsg(reply.GetReason()))
fmt.Printf("Failed to delete account %s: %s.\n", name, util.ErrMsg(reply.GetCode()))
return util.ErrorBackend
}
}
Expand All @@ -492,10 +492,10 @@ func DeleteUser(name string, account string) util.CraneCmdError {
}
}
if reply.GetOk() {
fmt.Printf("Remove user %s succeeded.\n", name)
fmt.Printf("Successfully removed user '%s'.\n", name)
return util.ErrorSuccess
} else {
fmt.Printf("Remove user %s failed: %s.\n", name, util.ErrMsg(reply.GetReason()))
fmt.Printf("Failed to remove user '%s': %s.\n", name, util.ErrMsg(reply.GetCode()))
return util.ErrorBackend
}
}
Expand All @@ -518,10 +518,10 @@ func DeleteQos(name string) util.CraneCmdError {
}
}
if reply.GetOk() {
fmt.Printf("Delete QoS %s succeeded.\n", name)
fmt.Printf("Successfully deleted QoS '%s'.\n", name)
return util.ErrorSuccess
} else {
fmt.Printf("Delete QoS %s failed: %s.\n", name, util.ErrMsg(reply.GetReason()))
fmt.Printf("Failed to delete QoS '%s': %s.\n", name, util.ErrMsg(reply.GetCode()))
return util.ErrorBackend
}
}
Expand Down Expand Up @@ -551,10 +551,10 @@ func ModifyAccount(modify_field protos.ModifyField, new_value string, name strin
}
}
if reply.GetOk() {
fmt.Println("Modify information succeeded.")
fmt.Println("Information was successfully modified.")
return util.ErrorSuccess
} else {
fmt.Printf("Modify information failed: %s.\n", util.ErrMsg(reply.GetReason()))
fmt.Printf("Failed to modify information: %s.\n", util.ErrMsg(reply.GetCode()))
return util.ErrorBackend
}
}
Expand Down Expand Up @@ -596,7 +596,7 @@ func ModifyUser(modify_field protos.ModifyField, new_value string, name string,
fmt.Println("Modify information succeeded.")
return util.ErrorSuccess
} else {
fmt.Printf("Modify information failed: %s.\n", util.ErrMsg(reply.GetReason()))
fmt.Printf("Modify information failed: %s.\n", util.ErrMsg(reply.GetCode()))
return util.ErrorBackend
}
}
Expand Down Expand Up @@ -627,7 +627,7 @@ func ModifyQos(modify_field protos.ModifyField, new_value string, name string) u
fmt.Println("Modify information succeeded.")
return util.ErrorSuccess
} else {
fmt.Printf("Modify information failed: %s.\n", util.ErrMsg(reply.GetReason()))
fmt.Printf("Modify information failed: %s.\n", util.ErrMsg(reply.GetCode()))
return util.ErrorBackend
}
}
Expand All @@ -652,7 +652,7 @@ func ShowAccounts() util.CraneCmdError {
PrintAllAccount(reply.AccountList)
return util.ErrorSuccess
} else {
fmt.Println(util.ErrMsg(reply.Reason))
fmt.Println(util.ErrMsg(reply.GetCode()))
return util.ErrorBackend
}
}
Expand All @@ -677,7 +677,7 @@ func ShowUser(name string, account string) util.CraneCmdError {
PrintAllUsers(reply.UserList)
return util.ErrorSuccess
} else {
fmt.Println(util.ErrMsg(reply.Reason))
fmt.Println(util.ErrMsg(reply.GetCode()))
return util.ErrorBackend
}
}
Expand All @@ -703,7 +703,7 @@ func ShowQos(name string) util.CraneCmdError {
return util.ErrorSuccess
} else {
if name == "" {
fmt.Printf("Can't find any QoS. %s.\n", util.ErrMsg(reply.GetReason()))
fmt.Printf("Can't find any QoS. %s.\n", util.ErrMsg(reply.GetCode()))
} else {
fmt.Printf("Can't find QoS %s.\n", name)
}
Expand Down Expand Up @@ -731,7 +731,7 @@ func FindAccount(name string) util.CraneCmdError {
PrintAccountTable(reply.AccountList)
return util.ErrorSuccess
} else {
fmt.Println(util.ErrMsg(reply.Reason))
fmt.Println(util.ErrMsg(reply.GetCode()))
return util.ErrorBackend
}
}
Expand All @@ -756,7 +756,7 @@ func BlockAccountOrUser(name string, entityType protos.EntityType, account strin
fmt.Printf("Block %s succeeded.\n", name)
return util.ErrorSuccess
} else {
fmt.Println(util.ErrMsg(reply.Reason))
fmt.Println(util.ErrMsg(reply.GetCode()))
return util.ErrorBackend
}
}
Expand All @@ -781,7 +781,7 @@ func UnblockAccountOrUser(name string, entityType protos.EntityType, account str
fmt.Printf("Unblock %s succeeded.\n", name)
return util.ErrorSuccess
} else {
fmt.Println(util.ErrMsg(reply.Reason))
fmt.Println(util.ErrMsg(reply.GetCode()))
return util.ErrorBackend
}
}
11 changes: 7 additions & 4 deletions internal/cbatch/cbatch.go
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ func SendRequest(task *protos.TaskToCtld) util.CraneCmdError {
fmt.Printf("Job id allocated: %d.\n", reply.GetTaskId())
return util.ErrorSuccess
} else {
log.Errorf("Job allocation failed: %s.\n", reply.GetReason())
log.Errorf("Job allocation failed: %s.\n", util.ErrMsg(reply.GetCode()))
return util.ErrorBackend
}
}
Expand All @@ -330,7 +330,7 @@ func SendMultipleRequests(task *protos.TaskToCtld, count uint32) util.CraneCmdEr

if FlagJson {
fmt.Println(util.FmtJson.FormatReply(reply))
if len(reply.ReasonList) > 0 {
if len(reply.GetCodeList()) > 0 {
return util.ErrorBackend
} else {
return util.ErrorSuccess
Expand All @@ -345,8 +345,11 @@ func SendMultipleRequests(task *protos.TaskToCtld, count uint32) util.CraneCmdEr
fmt.Printf("Job id allocated: %s.\n", strings.Join(taskIdListString, ", "))
}

if len(reply.ReasonList) > 0 {
log.Errorf("Job allocation failed: %s.\n", strings.Join(reply.ReasonList, ", "))
if len(reply.GetCodeList()) > 0 {
for _, reason := range reply.GetCodeList() {
log.Errorf("Job allocation failed: %s.\n", util.ErrMsg(reason))
}

return util.ErrorBackend
}
return util.ErrorSuccess
Expand Down
153 changes: 68 additions & 85 deletions internal/util/err.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,95 +37,78 @@ const (
ErrorInvalidFormat CraneCmdError = 5
)

func ErrMsg(err_code protos.ErrCode) string {
switch err_code {
case protos.ErrCode_ERR_INVALID_UID:
return "The user UID being operated on does not exist in the system"
case protos.ErrCode_ERR_INVALID_OP_USER:
return "you are not a user of Crane"
case protos.ErrCode_ERR_INVALID_USER:
return "The entered user is not a user of Crane"
case protos.ErrCode_ERR_PERMISSION_USER:
return "Your permission is insufficient"
case protos.ErrCode_ERR_USER_DUPLICATE_ACCOUNT:
return "The user already exists in this account"
case protos.ErrCode_ERR_USER_ALLOWED_ACCOUNT:
return "The user is not allowed to access account"
case protos.ErrCode_ERR_INVALID_ADMIN_LEVEL:
return "Unknown admin level"
case protos.ErrCode_ERR_USER_ACCOUNT_MISMATCH:
return "The user does not belong to this account"
case protos.ErrCode_ERR_NO_ACCOUNT_SPECIFIED:
return "No account is specified for the user"
default:
break
}
var errMsgMap = map[protos.ErrCode]string{

switch err_code {
case protos.ErrCode_ERR_INVALID_ACCOUNT:
return "The entered account does not exist"
case protos.ErrCode_ERR_INVALID_PARENTACCOUNT:
return "The parent account of the entered account does not exist"
case protos.ErrCode_ERR_DUPLICATE_ACCOUNT:
return "The account already exists in the crane"
case protos.ErrCode_ERR_DELETE_ACCOUNT:
return "The account has child account or users, unable to delete."
}
// User-related errors
protos.ErrCode_ERR_INVALID_UID: "The user UID being operated on does not exist in the system",
protos.ErrCode_ERR_INVALID_OP_USER: "You are not a user of Crane",
protos.ErrCode_ERR_INVALID_USER: "The entered user is not a user of Crane",
protos.ErrCode_ERR_PERMISSION_USER: "Your permission is insufficient",
protos.ErrCode_ERR_BLOCKED_USER: "The user has been blocked",
protos.ErrCode_ERR_USER_ALREADY_EXISTS: "The user already exists in this account",
protos.ErrCode_ERR_USER_ACCESS_TO_ACCOUNT_DENIED: "The user is not allowed to access account",
protos.ErrCode_ERR_INVALID_ADMIN_LEVEL: "Unknown admin level",
protos.ErrCode_ERR_USER_ACCOUNT_MISMATCH: "The user does not belong to this account",
protos.ErrCode_ERR_NO_ACCOUNT_SPECIFIED: "No account is specified for the user",

switch err_code {
case protos.ErrCode_ERR_INVALID_PARTITION:
return "The entered partition does not exist"
case protos.ErrCode_ERR_ALLOWED_PARTITION:
return "The entered account or user does not include this partition"
case protos.ErrCode_ERR_PARENT_ALLOWED_PARTITION:
return "Parent account does not include the partition"
case protos.ErrCode_ERR_DUPLICATE_PARTITION:
return "The partition already exists in the account or user"
case protos.ErrCode_ERR_USER_EMPTY_PARTITION:
return "The user does not contain any partitions, operation cannot be performed."
case protos.ErrCode_ERR_CHILD_HAS_PARTITION:
return "The partition is currently being used by the child accounts or users of the account, operation cannot be performed. You can use a forced operation to ignore this constraint"
}
// Account-related errors
protos.ErrCode_ERR_INVALID_ACCOUNT: "The entered account does not exist",
protos.ErrCode_ERR_INVALID_PARENT_ACCOUNT: "The parent account of the entered account does not exist",
protos.ErrCode_ERR_ACCOUNT_ALREADY_EXISTS: "The account already exists in the crane",
protos.ErrCode_ERR_ACCOUNT_HAS_CHILDREN: "The account has child account or users, unable to delete.",
protos.ErrCode_ERR_BLOCKED_ACCOUNT: "The account has been blocked",

switch err_code {
case protos.ErrCode_ERR_INVALID_QOS:
return "The entered qos does not exist"
case protos.ErrCode_ERR_DB_DUPLICATE_QOS:
return "Qos already exists in the crane"
case protos.ErrCode_ERR_DELETE_QOS:
return "QoS is still being used by accounts or users, unable to delete"
case protos.ErrCode_ERR_CONVERT_TO_INTERGER:
return "Failed to convert value to integer"
case protos.ErrCode_ERR_TIME_LIMIT:
return "Invalid time limit value"
case protos.ErrCode_ERR_ALLOWED_QOS:
return "The entered account or user does not include this qos"
case protos.ErrCode_ERR_DUPLICATE_QOS:
return "The Qos already exists in the account or user"
case protos.ErrCode_ERR_PARENT_ALLOWED_QOS:
return "Parent account does not include the qos"
case protos.ErrCode_ERR_SET_ALLOWED_QOS:
return "The entered QoS list does not include the default QoS for this user. Ignoring this constraint with forced operation, the default QoS is randomly replaced with one of the items in the new QoS list"
case protos.ErrCode_ERR_ALLOWED_DEFAULT_QOS:
return "The entered default_qos is not allowed"
case protos.ErrCode_ERR_DUPLICATE_DEFAULT_QOS:
return "The QoS is already the default QoS for the account or specified partition of the user"
case protos.ErrCode_ERR_SET_ACCOUNT_QOS:
return "The entered QoS list does not include the default QoS for this account or some descendant node. You can use a forced operation to ignore this constraint"
case protos.ErrCode_ERR_CHILD_HAS_DEFAULT_QOS:
return "some child accounts or users is using the QoS as the default QoS. By ignoring this constraint with forced deletion, the deleted default QoS is randomly replaced with one of the remaining items in the QoS list"
case protos.ErrCode_ERR_SET_DEFAULT_QOS:
return "The Qos not allowed or is already the default qos"
case protos.ErrCode_ERR_IS_DEFAULT_QOS:
return "The QoS is the default QoS for the current user/Account and cannot be modified. Ignoring this constraint with forced operation, the default QoS is randomly replaced with one of the items in the new QoS list"
}
// Partition-related errors
protos.ErrCode_ERR_INVALID_PARTITION: "The entered partition does not exist",
protos.ErrCode_ERR_PARTITION_MISSING: "The entered account or user does not include this partition",
protos.ErrCode_ERR_PARENT_ACCOUNT_PARTITION_MISSING: "Parent account does not include the partition",
protos.ErrCode_ERR_PARTITION_ALREADY_EXISTS: "The partition already exists in the account or user",
protos.ErrCode_ERR_USER_EMPTY_PARTITION: "The user does not contain any partitions, operation cannot be performed.",
protos.ErrCode_ERR_CHILD_HAS_PARTITION: "The partition is currently being used by the child accounts or users of the account, operation cannot be performed. You can use a forced operation to ignore this constraint",
protos.ErrCode_ERR_HAS_NO_QOS_IN_PARTITION: "The user has no QoS available for this partition to be used",
protos.ErrCode_ERR_HAS_ALLOWED_QOS_IN_PARTITION: "The qos you set is not in partition's allowed qos list",

switch err_code {
case protos.ErrCode_ERR_UPDATE_DATABASE:
return "Fail to update data in database"
default:
break
}
// QoS-related errors
protos.ErrCode_ERR_INVALID_QOS: "The entered qos does not exist",
protos.ErrCode_ERR_DB_QOS_ALREADY_EXISTS: "Qos already exists in the crane",
protos.ErrCode_ERR_QOS_REFERENCES_EXIST: "QoS is still being used by accounts or users, unable to delete",
protos.ErrCode_ERR_CONVERT_TO_INTEGER: "Failed to convert value to integer",
protos.ErrCode_ERR_TIME_LIMIT: "Invalid time limit value",
protos.ErrCode_ERR_QOS_MISSING: "The entered account or user does not include this qos",
protos.ErrCode_ERR_QOS_ALREADY_EXISTS: "The Qos already exists in the account or user",
protos.ErrCode_ERR_PARENT_ACCOUNT_QOS_MISSING: "Parent account does not include the qos",
protos.ErrCode_ERR_SET_ALLOWED_QOS: "The entered QoS list does not include the default QoS for this user. Ignoring this constraint with forced operation, the default QoS is randomly replaced with one of the items in the new QoS list",
protos.ErrCode_ERR_DEFAULT_QOS_NOT_INHERITED: "The entered default_qos is not allowed",
protos.ErrCode_ERR_DUPLICATE_DEFAULT_QOS: "The QoS is already the default QoS for the account or specified partition of the user",
protos.ErrCode_ERR_SET_ACCOUNT_QOS: "The entered QoS list does not include the default QoS for this account or some descendant node. You can use a forced operation to ignore this constraint",
protos.ErrCode_ERR_CHILD_HAS_DEFAULT_QOS: "some child accounts or users is using the QoS as the default QoS. By ignoring this constraint with forced deletion, the deleted default QoS is randomly replaced with one of the remaining items in the QoS list",
protos.ErrCode_ERR_SET_DEFAULT_QOS: "The Qos not allowed or is already the default qos",
protos.ErrCode_ERR_DEFAULT_QOS_MODIFICATION_DENIED: "The QoS is the default QoS for the current user/Account and cannot be modified. Ignoring this constraint with forced operation, the default QoS is randomly replaced with one of the items in the new QoS list",

// System-related errors
protos.ErrCode_ERR_UPDATE_DATABASE: "Fail to update data in database",
protos.ErrCode_ERR_NO_RESOURCE: "Resource not enough for task",
protos.ErrCode_ERR_INVALID_NODE_NUM: "Nodes partition not enough for task",
protos.ErrCode_ERR_INVAILD_NODE_LIST: "Invalid node list",
protos.ErrCode_ERR_INVAILD_EX_NODE_LIST: "Invalid exclude node list",
protos.ErrCode_ERR_TIME_TIMIT_BEYOND: "Time-limit reached the user's limit",
protos.ErrCode_ERR_CPUS_PER_TASK_BEYOND: "cpus-per-task reached the user's limit",
protos.ErrCode_ERR_NO_ENOUGH_NODE: "Nodes num not enough for task",
protos.ErrCode_ERR_BEYOND_TASK_ID: "System error occurred or the number of pending tasks exceeded maximum value",
protos.ErrCode_ERR_CGROUP: "Error when manipulating cgroup",
protos.ErrCode_ERR_SYSTEM_ERR: "Linux Error",
protos.ErrCode_ERR_RPC_FAILURE: "RPC call failed",
protos.ErrCode_ERR_GENERIC_FAILURE: "Generic failure",
protos.ErrCode_ERR_NON_EXISTENT: "The object doesn't exist",
protos.ErrCode_ERR_INVALID_PARAM: "Invalid Parameter",
protos.ErrCode_ERR_PROTOBUF: "Error when using protobuf",
protos.ErrCode_ERR_MAX_JOB_COUNT_PER_USER: "job max count is empty or exceeds the limit",
protos.ErrCode_ERR_USER_NO_PRIVILEGE: "User has insufficient privilege",
}

// ErrMsg returns the human-readable message registered for err_code in
// errMsgMap. When the code has no entry in the map, it returns a generic
// "Unknown Error Occurred" message that includes the code formatted with %s.
func ErrMsg(err_code protos.ErrCode) string {
	text, found := errMsgMap[err_code]
	if !found {
		// No entry for this code: fall back to a generic message that still
		// carries the code so the caller's output stays diagnosable.
		return fmt.Sprintf("Unknown Error Occurred: %s", err_code)
	}
	return text
}
Loading

0 comments on commit 3a650af

Please sign in to comment.