-
Notifications
You must be signed in to change notification settings - Fork 103
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DRAFT : Register memory #1010
base: master
Are you sure you want to change the base?
DRAFT : Register memory #1010
Conversation
Can one of the admins verify this patch? |
src/components/ec/base/ucc_ec_base.h
Outdated
} ucc_ee_executor_task_t; | ||
|
||
typedef struct node_ucc_ee_executor_task node_ucc_ee_executor_task_t; | ||
typedef struct node_ucc_ee_executor_task { | ||
ucc_ee_executor_task_t *val; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
change val to etask
src/components/tl/ucp/tl_ucp_coll.h
Outdated
@@ -57,6 +57,23 @@ void ucc_tl_ucp_team_default_score_str_free( | |||
} \ | |||
} while(0) | |||
|
|||
// #define EXEC_TASK_TEST_2(_errmsg, _etask) do { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove
while (polls++ < task->n_polls) { | ||
node_ucc_ee_executor_task_t *current_node; | ||
current_node = task->allgather_kn.etask_linked_list_head; | ||
while(current_node != NULL) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ask Sergey should this be put inside / outside of loop
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what are you talking about ? I don't remember
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
performance wise, should we iterate the list of etasks inside the while loop, or outside (1 TIME OR N_POLL TIME)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
almost no performance difference according to Sergey
src/components/tl/ucp/tl_ucp_coll.h
Outdated
status = ucc_ee_executor_task_test(current_node->val); \ | ||
if (status > 0) { \ | ||
ucc_ee_executor_task_finalize(current_node->val); \ | ||
ucp_memcpy_device_complete(current_node->val->completion, status); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
have to come before finalize
src/components/tl/ucp/tl_ucp_coll.h
Outdated
node_ucc_ee_executor_task_t *current_node; | ||
current_node = task->allgather_kn.etask_linked_list_head; | ||
while(current_node != NULL) { | ||
if (current_node->val != NULL) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can be removed
task->allgather_kn.etask_linked_list_head = (node_ucc_ee_executor_task_t *)malloc(sizeof(node_ucc_ee_executor_task_t)); | ||
task->allgather_kn.etask_linked_list_head->val = NULL; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
task->allgather_kn.etask_linked_list_head = NULL
ucc_tl_ucp_context_t *ctx = UCC_TL_UCP_TEAM_CTX(team); | ||
ucp_mem_map_params_t mmap_params; | ||
ucp_mem_h mh; | ||
int size_of_list = 6; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe start with 1;
UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE; | ||
mmap_params.memory_type = ucc_memtype_to_ucs[mem_type]; | ||
|
||
// EXEC_TASK_TEST(UCC_KN_PHASE_INIT, "failed during ee task test", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
could be removed
|
||
mmap_params.address = task->allgather_kn.sbuf; | ||
mmap_params.length = local * dt_size; | ||
UCPCHECK_GOTO(ucs_status_to_ucc_status(ucp_mem_map(ctx->worker.ucp_context, &mmap_params, &mh)), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
dst is not important? ask Sergey
ucc_tl_ucp_send_nb(void *buffer, size_t msglen, ucc_memory_type_t mtype,
ucc_rank_t dest_group_rank, ucc_tl_ucp_team_t *team,
ucc_tl_ucp_task_t *task)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
dest isnt important
mmap_params.address = rbuf; | ||
mmap_params.length = data_size; | ||
UCPCHECK_GOTO(ucs_status_to_ucc_status(ucp_mem_map(ctx->worker.ucp_context, &mmap_params, &mh)), | ||
task, out); | ||
if (count_mh == size_of_list){ | ||
size_of_list *= 2; | ||
mh_list = (ucp_mem_h *)realloc(mh_list, size_of_list * sizeof(ucp_mem_h)); | ||
} | ||
mh_list[count_mh++] = mh; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
make sure all cases are perfectly identical logic (double list size etc), try to create a macro if possible
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what isnt identical, I'm not sure to understand
} | ||
mh_list[count_mh++] = mh; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
task->allgather_kn.etask = NULL; | ||
task->allgather_kn.etask_linked_list_head->etask); | ||
|
||
// task->allgather_kn.etask_linked_list_head = NULL; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove
@@ -59,38 +63,51 @@ void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task) | |||
size_t extra_count; | |||
|
|||
EXEC_TASK_TEST(UCC_KN_PHASE_INIT, "failed during ee task test", | |||
task->allgather_kn.etask); | |||
task->allgather_kn.etask = NULL; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
a BUG - you can't remove it. its needed
task, out); | ||
if (count_mh >= max_count){ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
there is a function ucc_assert for that look what its doing
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why u chose to out that check here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should I just raise an error maybe ? How ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if I use ucc_assert, I can't add my print right ?
task, out); | ||
if (count_mh >= max_count){ | ||
printf("count_mh is bigger than it should (%d >= %d).", count_mh, max_count); | ||
goto out; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
not good here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you mean even with ucc_assert ?
task, out); | ||
if (count_mh >= max_count){ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same as above
status = ucc_coll_task_get_executor(&task->super, &exec); | ||
if (ucc_unlikely(status != UCC_OK)) { | ||
task->super.status = status; | ||
return -1; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe u can simply return uuc_status? check its values, it's enum so it's int eventually
|
||
if (ucc_unlikely(status != UCC_OK)) { | ||
task->super.status = status; | ||
return -1; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same as above
task->allgather_kn.etask_linked_list_head = new_node; | ||
|
||
task->allgather_kn.etask_linked_list_head->etask->completion = completion; | ||
return 1; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same as above
status = ucc_ee_executor_task_test(etask); | ||
while (status>0) { | ||
status = ucc_ee_executor_task_test(etask); | ||
// if (ucc_unlikely(status < 0)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why not remove?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added it now
src/schedule/ucc_schedule.h
Outdated
@@ -7,6 +7,7 @@ | |||
#ifndef UCC_SCHEDULE_H_ | |||
#define UCC_SCHEDULE_H_ | |||
|
|||
#include <ucp/api/ucp.h> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is it really needed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think so for the ucp_mem_map
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
check again
src/components/tl/ucp/tl_ucp_coll.h
Outdated
ucp_mem_h *mh_list; | ||
int count_mh; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add both to allgather
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why ? Is it a bug ?
@@ -243,12 +371,18 @@ ucc_status_t ucc_tl_ucp_allgather_knomial_init_r( | |||
ucc_sbgp_t *sbgp; | |||
|
|||
task = ucc_tl_ucp_init_task(coll_args, team); | |||
ucc_mpool_init(&task->allgather_kn.etask_node_mpool, 0, sizeof(node_ucc_ee_executor_task_t), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
u didn't check for status
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what should I do if status<0 ?
if (tl_team->cfg.use_reordering && | ||
coll_args->args.coll_type == UCC_COLL_TYPE_ALLREDUCE) { | ||
sbgp = ucc_topo_get_sbgp(tl_team->topo, UCC_SBGP_FULL_HOST_ORDERED); | ||
task->subset.myrank = sbgp->group_rank; | ||
task->subset.map = sbgp->map; | ||
} | ||
register_memory(&task->super); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
u didn't check for status
src/components/tl/ucp/tl_ucp_coll.h
Outdated
#define MEM_MAP() do { \ | ||
status = ucp_mem_map(ctx->worker.ucp_context, &mmap_params, &mh); \ | ||
if (UCC_OK != status) { \ | ||
task->super.status = status; \ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no, just return status and error
ucc/src/components/tl/nccl/tl_nccl_context.c
Line 196 in 2359644
tl_error(self->super.super.lib, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I should keep this self->super.super.lib or it's something else for here ?
src/components/tl/ucp/tl_ucp_coll.h
Outdated
ucc_rank_t recv_dist; | ||
ucc_mpool_t etask_node_mpool; | ||
ucc_ee_executor_task_t *etask; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
move back to original place
} | ||
UCC_KN_PHASE_PROXY: | ||
if (UCC_INPROGRESS == ucc_tl_ucp_test(task)) { | ||
if (UCC_INPROGRESS == ucc_tl_ucp_test_with_etasks(task)) { | ||
SAVE_STATE(UCC_KN_PHASE_PROXY); | ||
return; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add in out
|ucc_assert(count_mh == max_count) or smth similar
@@ -50,6 +51,9 @@ void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task) | |||
args->root : 0; | |||
ucc_rank_t rank = VRANK(task->subset.myrank, broot, size); | |||
size_t local = GET_LOCAL_COUNT(args, size, rank); | |||
ucp_mem_h *mh_list = task->mh_list; | |||
int max_count = task->count_mh; | |||
int count_mh = 0; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BUG - this is a mistake, because u want to keep it when u go in and out from progress
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
right but so how do I initialize it ?
@@ -60,37 +64,42 @@ void ucc_tl_ucp_allgather_knomial_progress(ucc_coll_task_t *coll_task) | |||
|
|||
EXEC_TASK_TEST(UCC_KN_PHASE_INIT, "failed during ee task test", | |||
task->allgather_kn.etask); | |||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove
@@ -193,7 +205,6 @@ ucc_status_t ucc_tl_ucp_allgather_knomial_start(ucc_coll_task_t *coll_task) | |||
|
|||
UCC_TL_UCP_PROFILE_REQUEST_EVENT(coll_task, "ucp_allgather_kn_start", 0); | |||
ucc_tl_ucp_task_reset(task, UCC_INPROGRESS); | |||
task->allgather_kn.etask = NULL; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BUG - why removed?
|
||
if (KN_NODE_EXTRA == node_type) { | ||
if (p->type != KN_PATTERN_ALLGATHERX) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no need for all of the line spaces
6142008
to
3ca742c
Compare
What
add callback function to replace cuda copy
Why ?
There is a deadlock when executing collectives in AI workload