Skip to content

Commit

Permalink
clone3: Add syscall
Browse files Browse the repository at this point in the history
Introduce clone_internal similar to Linux's kernel_clone and
reimplement clone and implement clone3 in terms of it.

Most of Linux's clone3 functionality is irrelevant to Nanos. The main
thing we are concerned about is the stack. The stack size is now
passed to the kernel and the stack pointer points to the bottom of the
stack instead of the top.

It seems that at some point glibc started using clone3. This gets
programs linked against glibc-2.35 to run.
  • Loading branch information
richiejp authored and francescolavra committed Jul 13, 2022
1 parent 7cdb1d3 commit 20dd6d9
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 37 deletions.
20 changes: 20 additions & 0 deletions src/unix/system_structs.h
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,26 @@ struct tms {
#define CLONE_NEWNET 0x40000000 /* New network namespace */
#define CLONE_IO 0x80000000 /* Clone io context */

/*
 * Kernel-side description of a thread to create, consumed by
 * clone_internal(). Both clone() and clone3() translate their own
 * arguments into this form.
 */
struct clone_args_internal {
/* CLONE_* flag bits; CLONE_THREAD must be set or the request is rejected */
u64 flags;
/* written with the new tid if CLONE_CHILD_SETTID is set; saved as the
   thread's clear_tid if CLONE_CHILD_CLEARTID is set */
int *child_tid;
/* written with the new tid if CLONE_PARENT_SETTID is set */
int *parent_tid;
/* base of the child's stack; the child's initial stack pointer is
   stack + stack_size */
void *stack;
/* length of the stack region in bytes; zero is rejected with -EINVAL */
bytes stack_size;
/* value handed to set_tls() if CLONE_SETTLS is set */
u64 tls;
};

/*
 * Argument block passed by user space to the clone3 syscall. Pointers are
 * carried as u64 values and converted by clone3().
 * NOTE(review): field order and widths presumably mirror the leading
 * fields of Linux's struct clone_args -- confirm before changing the layout.
 */
struct clone_args {
/* CLONE_* flag bits */
u64 flags;
/* only logged by clone3(); not otherwise used */
u64 pidfd;
/* user address, cast to int * by clone3() */
u64 child_tid;
/* user address, cast to int * by clone3() */
u64 parent_tid;
/* only logged by clone3(); not otherwise used */
u64 exit_signal;
/* user address of the stack region, cast to void * by clone3() */
u64 stack;
/* size of the stack region in bytes */
u64 stack_size;
/* TLS value forwarded to clone_internal() */
u64 tls;
};

#define EPOLL_CTL_ADD 0x1
#define EPOLL_CTL_DEL 0x2
#define EPOLL_CTL_MOD 0x3
Expand Down
119 changes: 83 additions & 36 deletions src/unix/thread.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,53 @@ sysreturn arch_prctl(int code, unsigned long addr)
}
#endif

/*
 * Common back end for clone() and clone3(): create a new thread in the
 * current process as described by args and schedule it.
 *
 * args->stack is the base of the child's stack region and args->stack_size
 * its length in bytes; the child starts with its stack pointer set to
 * stack + stack_size.
 *
 * Returns the tid of the new thread on success, or a negative errno:
 *   -EINVAL  stack_size is zero
 *   -ENOSYS  CLONE_THREAD is not set (creating a new process is unsupported)
 *   -EFAULT  the stack region, or a tid pointer that the flags say will be
 *            used, fails validate_user_memory()
 */
static sysreturn clone_internal(struct clone_args_internal *args)
{
    u64 flags = args->flags;
    void *stack = args->stack;
    bytes stack_size = args->stack_size;

    if (!stack_size)
        return -EINVAL;

    if (!(flags & CLONE_THREAD)) {
        thread_log(current, "attempted to create new process, aborting.");
        return -ENOSYS;
    }

    if (!validate_user_memory(stack, stack_size, true))
        return -EFAULT;

    /* The tid slots are ints, so validate exactly the bytes that get written.
       child_tid is stored through for CLONE_CHILD_SETTID and remembered for
       CLONE_CHILD_CLEARTID, so either flag requires a valid pointer. */
    if (((flags & CLONE_PARENT_SETTID) &&
         !validate_user_memory(args->parent_tid, sizeof(int), true)) ||
        ((flags & (CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)) &&
         !validate_user_memory(args->child_tid, sizeof(int), true)))
        return -EFAULT;

    thread t = create_thread(current->p, INVALID_PHYSICAL);
    context_frame f = thread_frame(t);

    /* the child inherits the caller's processor state and signal mask */
    clone_frame_pstate(f, thread_frame(current));
    thread_clone_sigmask(t, current);

    /* like fork, the child sees a return value of 0 from the syscall */
    set_syscall_return(t, 0);
    f[SYSCALL_FRAME_SP] = (u64)stack + stack_size;
    if (flags & CLONE_SETTLS)
        set_tls(f, args->tls);
    if (flags & CLONE_PARENT_SETTID)
        *(args->parent_tid) = t->tid;
    if (flags & CLONE_CHILD_SETTID)
        *(args->child_tid) = t->tid;
    if (flags & CLONE_CHILD_CLEARTID)
        t->clear_tid = args->child_tid;
    t->blocked_on = 0;
    t->syscall = 0;
    f[FRAME_FULL] = true;
    thread_reserve(t);
    schedule_thread(t);
    return t->tid;
}

#if defined(__x86_64__)
sysreturn clone(unsigned long flags, void *child_stack, int *ptid, int *ctid, unsigned long newtls)
#elif defined(__aarch64__) || defined(__riscv)
Expand All @@ -60,51 +107,51 @@ sysreturn clone(unsigned long flags, void *child_stack, int *ptid, unsigned long
thread_log(current, "clone: flags %lx, child_stack %p, ptid %p, ctid %p, newtls %lx",
flags, child_stack, ptid, ctid, newtls);

if (!(flags & CLONE_THREAD)) {
thread_log(current, "attempted to create new process, aborting.");
return set_syscall_error(current, ENOSYS);
}

/* no stack size given, just validate the top word */
if (!validate_user_memory(child_stack, sizeof(u64), true))
return set_syscall_error(current, EFAULT);

if (((flags & CLONE_PARENT_SETTID) &&
!validate_user_memory(ptid, sizeof(int), true)) ||
((flags & CLONE_CHILD_CLEARTID) &&
!validate_user_memory(ctid, sizeof(int), true)))
return set_syscall_error(current, EFAULT);

thread t = create_thread(current->p, INVALID_PHYSICAL);
context_frame f = thread_frame(t);
/* clone frame processor state */
clone_frame_pstate(f, thread_frame(current));
thread_clone_sigmask(t, current);

/* clone behaves like fork at the syscall level, returning 0 to the child */
set_syscall_return(t, 0);
f[SYSCALL_FRAME_SP] = u64_from_pointer(child_stack);
if (flags & CLONE_SETTLS)
set_tls(f, newtls);
if (flags & CLONE_PARENT_SETTID)
*ptid = t->tid;
if (flags & CLONE_CHILD_CLEARTID)
t->clear_tid = ctid;
t->blocked_on = 0;
t->syscall = 0;
f[FRAME_FULL] = true;
thread_reserve(t);
schedule_thread(t);
return t->tid;
struct clone_args_internal args = {
.flags = flags,
.child_tid = ctid,
.parent_tid = ptid,
/* no stack size given, just validate the top word */
.stack = child_stack - sizeof(u64),
.stack_size = sizeof(u64),
.tls = newtls,
};

return clone_internal(&args);
}

/*
 * clone3 syscall: create a thread from a user-supplied struct clone_args.
 *
 * args is a user pointer and size is the number of bytes the caller says
 * it spans. Only flags, child_tid, parent_tid, stack, stack_size and tls
 * are forwarded to clone_internal(); pidfd and exit_signal are logged only.
 *
 * Returns the new thread's tid, -EINVAL if size is smaller than
 * struct clone_args, -EFAULT if args fails validate_user_memory(), or
 * whatever error clone_internal() reports.
 */
sysreturn clone3(struct clone_args *args, bytes size)
{
    if (size < sizeof(*args))
        return -EINVAL;

    if (!validate_user_memory(args, size, false))
        return -EFAULT;

    /* args comes from user space: it must not be dereferenced (even just
       for logging) until the checks above have passed */
    thread_log(current,
               "clone3: args_size: %ld, pidfd: %p, child_tid: %p, parent_tid: %p, exit_signal: %ld, stack: %p, stack_size: 0x%lx, tls: %p",
               size, args->pidfd, args->child_tid, args->parent_tid, args->exit_signal,
               args->stack, args->stack_size, args->tls);

    struct clone_args_internal argsi = {
        .flags = args->flags,
        .child_tid = (int *)args->child_tid,
        .parent_tid = (int *)args->parent_tid,
        .stack = (void *)args->stack,
        .stack_size = args->stack_size,
        .tls = args->tls
    };

    return clone_internal(&argsi);
}

void register_thread_syscalls(struct syscall *map)
{
register_syscall(map, futex, futex, 0);
register_syscall(map, set_robust_list, set_robust_list, 0);
register_syscall(map, get_robust_list, get_robust_list, 0);
register_syscall(map, clone, clone, SYSCALL_F_SET_PROC);
register_syscall(map, clone3, clone3, SYSCALL_F_SET_PROC);
#ifdef __x86_64__
register_syscall(map, arch_prctl, arch_prctl, 0);
#endif
Expand Down
3 changes: 2 additions & 1 deletion src/x86_64/unix_syscalls.h
Original file line number Diff line number Diff line change
Expand Up @@ -333,5 +333,6 @@
#define SYS_io_uring_setup 425
#define SYS_io_uring_enter 426
#define SYS_io_uring_register 427
/* syscall number for clone3; note the gap after 427 */
#define SYS_clone3 435

/* NOTE(review): SYS_MAX looks like one past the highest syscall number
   defined above -- confirm against its users when adding syscalls */
#define SYS_MAX 428
#define SYS_MAX 436

0 comments on commit 20dd6d9

Please sign in to comment.