Skip to content

Commit

Permalink
Feat: Add Pending Reason Displaying (#278)
Browse files Browse the repository at this point in the history
* pending作业显示原因

* cacctmgr默认qos的priority为1000,判断qos的priority请求是否为0,是的话就改为1000

* refactor: Refactor

* fix: Remove "Resource" and Refactor task_priority_vec

---------

Co-authored-by: Nativu5 <44155313+Nativu5@users.noreply.github.com>
  • Loading branch information
June19980 and Nativu5 authored Jun 26, 2024
1 parent ec98142 commit e01f836
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 12 deletions.
11 changes: 7 additions & 4 deletions protos/PublicDefs.proto
Original file line number Diff line number Diff line change
Expand Up @@ -211,10 +211,13 @@ message TaskInfo {

// Dynamic task information
TaskStatus status = 31;
string craned_list = 32;
double alloc_cpu = 33;
uint32 exit_code = 34;
uint32 priority = 35;
double alloc_cpu = 32;
uint32 exit_code = 33;
uint32 priority = 34;
oneof pending_reason_or_craned_list {
string pending_reason = 35;
string craned_list = 36;
}
}

message PartitionInfo {
Expand Down
3 changes: 2 additions & 1 deletion src/CraneCtld/CtldGrpcServer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,8 @@ grpc::Status CraneCtldServiceImpl::AddQos(

qos.name = qos_info->name();
qos.description = qos_info->description();
qos.priority = qos_info->priority();
qos.priority =
qos_info->priority() == 0 ? kDefaultQosPriority : qos_info->priority();
qos.max_jobs_per_user = qos_info->max_jobs_per_user();
qos.max_cpus_per_user = qos_info->max_cpus_per_user();
qos.max_time_limit_per_task =
Expand Down
1 change: 1 addition & 0 deletions src/CraneCtld/CtldPublicDefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ struct TaskInCtld {
uint32_t nodes_alloc;
std::vector<CranedId> executing_craned_ids;
std::string allocated_craneds_regex;
std::string pending_reason;

double mandated_priority{0.0};
double cached_priority{0.0};
Expand Down
19 changes: 12 additions & 7 deletions src/CraneCtld/TaskScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1534,8 +1534,12 @@ void TaskScheduler::QueryTasksInRam(
task_it->set_priority(task.cached_priority);

task_it->set_status(task.RuntimeAttr().status());
task_it->set_craned_list(
util::HostNameListToStr(task.RuntimeAttr().craned_ids()));
if (task.RuntimeAttr().status() == crane::grpc::Pending) {
task_it->set_pending_reason(task.pending_reason);
} else {
task_it->set_craned_list(
util::HostNameListToStr(task.RuntimeAttr().craned_ids()));
}
};

auto task_rng_filter_time = [&](auto& it) {
Expand Down Expand Up @@ -2512,20 +2516,21 @@ std::vector<task_id_t> MultiFactorPriority::GetOrderedTaskIdList(
absl::Time now = absl::Now();
CalculateFactorBound_(pending_task_map, running_task_map, now);

std::vector<std::pair<task_id_t, double>> task_priority_vec;
std::vector<std::pair<TaskInCtld*, double>> task_priority_vec;
for (const auto& [task_id, task] : pending_task_map) {
// Admin may manually specify the priority of a task.
// In this case, MultiFactorPriority will not calculate the priority.
double priority = (task->mandated_priority == 0.0)
? CalculatePriority_(task.get(), now)
: task->mandated_priority;
task->cached_priority = priority;
task_priority_vec.emplace_back(task->TaskId(), priority);
task->pending_reason = "Priority";
task_priority_vec.emplace_back(task.get(), priority);
}

std::sort(task_priority_vec.begin(), task_priority_vec.end(),
[](const std::pair<task_id_t, double>& a,
const std::pair<task_id_t, double>& b) {
[](const std::pair<TaskInCtld*, double>& a,
const std::pair<TaskInCtld*, double>& b) {
return a.second > b.second;
});

Expand All @@ -2535,7 +2540,7 @@ std::vector<task_id_t> MultiFactorPriority::GetOrderedTaskIdList(
task_id_vec.reserve(id_vec_len);

for (int i = 0; i < id_vec_len; i++)
task_id_vec.emplace_back(task_priority_vec[i].first);
task_id_vec.emplace_back(task_priority_vec[i].first->TaskId());

return task_id_vec;
}
Expand Down
1 change: 1 addition & 0 deletions src/CraneCtld/TaskScheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class BasicPriority : public IPrioritySorter {

int i = 0;
for (auto it = pending_task_map.begin(); i < len; i++, it++) {
it->second->pending_reason = "Priority";
task_id_vec.emplace_back(it->first);
}

Expand Down
1 change: 1 addition & 0 deletions src/Utilities/PublicHeader/include/crane/PublicHeader.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ inline const char* kUnlimitedQosName = "UNLIMITED";
inline const char* kHostFilePath = "/etc/hosts";

inline constexpr size_t kDefaultQueryTaskNumLimit = 1000;
// Fallback QoS priority: substituted when an AddQos request carries a
// priority of 0.
inline constexpr uint32_t kDefaultQosPriority = 1000;
inline constexpr uint64_t kPriorityDefaultMaxAge = 7 * 24 * 3600;  // 7 days

inline const char* kDefaultCraneBaseDir = "/var/crane/";
Expand Down

0 comments on commit e01f836

Please sign in to comment.