Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
1daidai1 committed Dec 16, 2024
1 parent 047b895 commit 1c01ff4
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 145 deletions.
141 changes: 71 additions & 70 deletions protos/PublicDefs.proto
Original file line number Diff line number Diff line change
Expand Up @@ -319,76 +319,77 @@ message TrimmedPartitionInfo {
// Error codes shared by the account-management and task-submission paths.
//
// Values are numbered contiguously starting at 0. ERR_CODE_COUNT is the last
// entry and equals the number of codes before it; it is presumably a sentinel
// for sizing per-code lookup tables (TODO confirm against the users of
// CraneErrCodeStr), so new codes should be inserted before it and the
// numbering kept gap-free.
enum ErrCode {
  SUCCESS = 0; // Success

  // ---- User errors ----
  ERR_INVALID_UID = 1; // Invalid UID passed
  ERR_INVALID_OP_USER = 2; // Invalid operation user
  ERR_INVALID_USER = 3; // Invalid user
  ERR_PERMISSION_USER = 4; // User permissions too low, no permission to operate
  ERR_BLOCKED_USER = 5; // The user is blocked under the account being used
  ERR_USER_DUPLICATE_ACCOUNT = 6; // User duplicate account insertion
  ERR_USER_ALLOWED_ACCOUNT = 7; // User does not have permission for the account
  ERR_INVALID_ADMIN_LEVEL = 8; // Invalid permission level
  ERR_USER_ACCOUNT_MISMATCH = 9; // User does not belong to the account
  ERR_NO_ACCOUNT_SPECIFIED = 10;

  // ---- Account errors ----
  ERR_INVALID_ACCOUNT = 11; // Invalid account
  ERR_DUPLICATE_ACCOUNT = 12; // Duplicate account insertion
  ERR_INVALID_PARENTACCOUNT = 13; // Invalid parent account
  ERR_DELETE_ACCOUNT = 14; // Account has child nodes
  ERR_BLOCKED_ACCOUNT = 15; // The account or one of its ancestor accounts is blocked

  // ---- Partition errors ----
  ERR_INVALID_PARTITION = 16; // Invalid partition, partition does not exist
  ERR_ALLOWED_PARTITION = 17; // Account/user does not include this partition
  ERR_DUPLICATE_PARTITION = 18; // Account/user duplicate insertion
  ERR_PARENT_ALLOWED_PARTITION = 19; // Parent account does not include this partition
  ERR_USER_EMPTY_PARTITION = 20; // Cannot add QoS when user has no partition
  ERR_CHILD_HAS_PARTITION = 21; // Partition '{}' is used by some descendant node of the account '{}'. Ignoring this constraint with forced operation.
  ERR_HAS_NO_QOS_IN_PARTITION = 22;
  ERR_HAS_ALLOWED_QOS_IN_PARTITION = 23;

  // ---- QoS errors ----
  ERR_INVALID_QOS = 24; // Invalid QoS, QoS does not exist
  ERR_DB_DUPLICATE_QOS = 25; // Duplicate QoS insertion in the database.
  ERR_DELETE_QOS = 26; // QoS reference count is not zero.
  // NOTE(review): "INTERGER" is a misspelling of "INTEGER"; the name is kept
  // because renaming it would break existing callers.
  ERR_CONVERT_TO_INTERGER = 27; // String to integer conversion failed
  ERR_TIME_LIMIT = 28; // Invalid time value
  ERR_ALLOWED_QOS = 29; // Account/user does not include this QoS.
  ERR_DUPLICATE_QOS = 30; // Account/user duplicate insertion.
  ERR_PARENT_ALLOWED_QOS = 31; // Parent account does not include this QoS.
  ERR_SET_ALLOWED_QOS = 32; // QoS '{}' is the default QoS of partition '{}', but not found in the new QoS list.
  ERR_ALLOWED_DEFAULT_QOS = 33; // Default QoS is not in the allowed QoS list
  ERR_DUPLICATE_DEFAULT_QOS = 34; // Duplicate default QoS setting
  ERR_CHILD_HAS_DEFAULT_QOS = 35; // Someone is using QoS '{}' as default QoS. Ignoring this constraint with forced deletion, the deleted default QoS is randomly replaced with one of the remaining items in the QoS list.
  ERR_SET_ACCOUNT_QOS = 36; // QoS '{}' is used by some descendant node or itself of the account '{}'. Ignoring this constraint with forced operation.
  ERR_SET_DEFAULT_QOS = 37; // Qos '{}' not in allowed qos list or is already the default qos
  ERR_IS_DEFAULT_QOS = 38;

  // ---- Database errors ----
  ERR_UPDATE_DATABASE = 39; // Database update failed

  // ---- Task / scheduling errors ----
  ERR_GENERIC_FAILURE = 40;
  ERR_NO_RESOURCE = 41;
  ERR_NON_EXISTENT = 42;
  ERR_INVALID_NODE_NUM = 43; // Task requests more nodes than the partition contains
  // NOTE(review): "INVAILD" is a misspelling of "INVALID" in the next two
  // names; they are kept because renaming would break existing callers.
  ERR_INVAILD_NODE_LIST = 44;
  ERR_INVAILD_EX_NODE_LIST = 45;
  // NOTE(review): "TIMIT" is presumably a misspelling of "LIMIT"; name kept
  // for caller compatibility.
  ERR_TIME_TIMIT_BEYOND = 46;
  ERR_CPUS_PER_TASK_BEYOND = 47;
  ERR_NO_ENOUGH_NODE = 48; // Fewer nodes satisfy the task's requirement than it needs

  // ---- System / RPC errors ----
  ERR_SYSTEM_ERR = 49;
  ERR_EXISTING_TASK = 50;
  ERR_BEYOND_TASK_ID = 51;
  ERR_INVALID_PARAM = 52;
  ERR_STOP = 53;
  // 54 keeps the numbering contiguous; the previous value 555 was a typo that
  // left a gap at 54 and fell outside [0, ERR_CODE_COUNT).
  ERR_PERMISSION_DENIED = 54;
  ERR_CONNECTION_TIMEOUT = 55;
  ERR_CONNECTION_ABORTED = 56;
  ERR_RPC_FAILURE = 57;
  ERR_TOKEN_REQUEST_FAILURE = 58;
  ERR_STREAM_BROKEN = 59;
  ERR_INVALID_STUB = 60;
  ERR_CGROUP = 61;
  ERR_PROTOBUF = 62;
  ERR_LIB_EVENT = 63;
  ERR_NO_AVAIL_NODE = 64;

  // Must stay last. Not an error code itself.
  ERR_CODE_COUNT = 65;
}

enum EntityType {
Expand Down
3 changes: 3 additions & 0 deletions src/CraneCtld/AccountManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -907,13 +907,16 @@ CraneErrCodeExpected<void> AccountManager::CheckIfUserOfAccountIsEnabled(
do {
const Account* account_ptr = GetExistedAccountInfoNoLock_(account_name);
if (account_ptr->blocked) {
CRANE_ERROR("CheckIfUserOfAccountIsEnabled error: Ancestor account '{}' is blocked",
account_ptr->name);
return std::unexpected(crane::grpc::ErrCode::ERR_BLOCKED_ACCOUNT);
}
account_name = account_ptr->parent_account;
} while (!account_name.empty());

const User* user_ptr = GetExistedUserInfoNoLock_(user);
if (user_ptr->account_to_attrs_map.at(account).blocked) {
CRANE_ERROR("CheckIfUserOfAccountIsEnabled error: User '{}' is blocked", user_ptr->name);
return std::unexpected(crane::grpc::ErrCode::ERR_BLOCKED_USER);
}
return {};
Expand Down
7 changes: 6 additions & 1 deletion src/CraneCtld/CtldGrpcServer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -938,6 +938,7 @@ CraneErrCodeExpected <std::future<task_id_t>>
CtldServer::SubmitTaskToScheduler(std::unique_ptr<TaskInCtld> task) {

if (!task->password_entry->Valid()) {
CRANE_ERROR("Uid {} not found on the controller node", task->uid);
return std::unexpected(crane::grpc::ErrCode::ERR_INVALID_UID);
}
task->SetUsername(task->password_entry->Username());
Expand All @@ -946,6 +947,7 @@ CtldServer::SubmitTaskToScheduler(std::unique_ptr<TaskInCtld> task) {
auto user_scoped_ptr =
g_account_manager->GetExistedUserInfo(task->Username());
if (!user_scoped_ptr) {
CRANE_ERROR("User '{}' not found in the account database", task->Username());
return std::unexpected(crane::grpc::ErrCode::ERR_INVALID_USER);
}

Expand All @@ -954,14 +956,17 @@ CtldServer::SubmitTaskToScheduler(std::unique_ptr<TaskInCtld> task) {
task->MutableTaskToCtld()->set_account(user_scoped_ptr->default_account);
} else {
if (!user_scoped_ptr->account_to_attrs_map.contains(task->account)) {
CRANE_ERROR("Account '{}' is not in your account list", task->account);
return std::unexpected(crane::grpc::ErrCode::ERR_USER_ACCOUNT_MISMATCH);
}
}
}

if (!g_account_manager->CheckUserPermissionToPartition(
task->Username(), task->account, task->partition_id)) {
return std::unexpected(crane::grpc::ErrCode::ERR_ALLOWED_PARTITION);
CRANE_ERROR("User '{}' doesn't have permission to use partition '{}' when using account '{}'",
task->Username(), task->partition_id, task->account);
return std::unexpected(crane::grpc::ErrCode::ERR_ALLOWED_PARTITION);
}

auto enable_res = g_account_manager->CheckIfUserOfAccountIsEnabled(
Expand Down
17 changes: 12 additions & 5 deletions src/CraneCtld/TaskScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2720,8 +2720,11 @@ void TaskScheduler::PersistAndTransferTasksToMongodb_(

CraneErrCodeExpected<void> TaskScheduler::AcquireTaskAttributes(TaskInCtld* task) {
auto part_it = g_config.Partitions.find(task->partition_id);
if (part_it == g_config.Partitions.end())
if (part_it == g_config.Partitions.end()) {
CRANE_ERROR("Failed to call AcquireTaskAttributes: {}",
CraneErrCodeStr(crane::grpc::ErrCode::ERR_INVALID_PARTITION));
return std::unexpected(crane::grpc::ErrCode::ERR_INVALID_PARTITION);
}

task->partition_priority = part_it->second.priority;

Expand Down Expand Up @@ -2749,7 +2752,11 @@ CraneErrCodeExpected<void> TaskScheduler::AcquireTaskAttributes(TaskInCtld* task

auto check_qos_result = g_account_manager->CheckAndApplyQosLimitOnTask(
task->Username(), task->account, task);
if (!check_qos_result) return check_qos_result;
if (!check_qos_result) {
CRANE_ERROR("Failed to call CheckAndApplyQosLimitOnTask: {}",
CraneErrCodeStr(check_qos_result.error()));
return check_qos_result;
}

if (!task->TaskToCtld().nodelist().empty() && task->included_nodes.empty()) {
std::list<std::string> nodes;
Expand Down Expand Up @@ -2802,15 +2809,15 @@ CraneErrCodeExpected<void> TaskScheduler::CheckTaskValidity(TaskInCtld* task) {
.memory_sw_bytes),
util::ReadableTypedDeviceMap(
metas_ptr->partition_global_meta.res_total.GetDeviceMap()));
return std::unexpected(crane::grpc::ErrCode::ERR_NO_RESOURCE) ;
return std::unexpected(crane::grpc::ErrCode::ERR_NO_RESOURCE);
}

if (task->node_num > metas_ptr->craned_ids.size()) {
CRANE_TRACE(
"Nodes not enough for task #{}. "
"Partition total Nodes: {}",
task->TaskId(), metas_ptr->craned_ids.size());
return std::unexpected(crane::grpc::ErrCode::ERR_INVALID_NODE_NUM);
return std::unexpected(crane::grpc::ErrCode::ERR_INVALID_NODE_NUM);
}

auto craned_meta_map = g_meta_container->GetCranedMetaMapConstPtr();
Expand All @@ -2832,7 +2839,7 @@ CraneErrCodeExpected<void> TaskScheduler::CheckTaskValidity(TaskInCtld* task) {
"Resource not enough. Task #{} needs {} nodes, while only {} "
"nodes satisfy its requirement.",
task->TaskId(), task->node_num, avail_nodes.size());
return std::unexpected(crane::grpc::ErrCode::ERR_NO_ENOUGH_NODE);
return std::unexpected(crane::grpc::ErrCode::ERR_NO_ENOUGH_NODE);
}

return {};
Expand Down
Loading

0 comments on commit 1c01ff4

Please sign in to comment.