From e0a97fe7592036c96b743ffc0eb0a054f52df69c Mon Sep 17 00:00:00 2001
From: tombl <git@tombl.dev>
Date: Thu, 21 Mar 2024 00:11:14 +0800
Subject: [PATCH] smp is back again

spawn one worker per kernel task, but only one task per virtual cpu can be running at a time
---
 README.md                            |   3 +
 arch/wasm/Kconfig                    |   8 +-
 arch/wasm/include/asm/globals.h      |  20 +++
 arch/wasm/include/asm/sigcontext.h   |   1 -
 arch/wasm/include/asm/sysmem.h       |   4 +-
 arch/wasm/include/asm/thread_info.h  |   9 +-
 arch/wasm/include/asm/wasm_imports.h |   8 +-
 arch/wasm/kernel/Makefile            |   1 -
 arch/wasm/kernel/process.c           | 154 +++++++++--------
 arch/wasm/kernel/setjmp.c            | 103 -----------
 arch/wasm/kernel/setup.c             |  12 +-
 arch/wasm/kernel/smp.c               |  51 ++++--
 arch/wasm/mm/Makefile                |   2 +-
 arch/wasm/mm/tls.c                   |  11 +-
 kernel/kthread.c                     |   1 -
 kernel/sched/fair.c                  |   4 +-
 mm/percpu-tls.c                      |   4 -
 tools/wasm/index.html                |   8 +-
 tools/wasm/index.js                  |   9 +-
 tools/wasm/run.ts                    |   9 +-
 tools/wasm/src/index.ts              |  99 +++++------
 tools/wasm/src/worker.ts             | 249 ++++++++++++++-------------
 22 files changed, 375 insertions(+), 395 deletions(-)
 create mode 100644 arch/wasm/include/asm/globals.h
 delete mode 100644 arch/wasm/kernel/setjmp.c

diff --git a/README.md b/README.md
index 39494210b0f3cb..6c6ffb57213955 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,10 @@
 
 - [x] builds
 - [ ] boots
+  - [ ] port musl
+  - [ ] port busybox
 - [ ] smp
+  - [ ] futex
 - [ ] mmu
 - [ ] highmem
 - [ ] virtio
diff --git a/arch/wasm/Kconfig b/arch/wasm/Kconfig
index e32cda73555f45..9000e35f767789 100644
--- a/arch/wasm/Kconfig
+++ b/arch/wasm/Kconfig
@@ -1,6 +1,6 @@
 config WASM
 	def_bool y
-	depends on !SMP && !BINFMT_ELF && !MMU && !MODULES && !COREDUMP && !SECCOMP && !UPROBES && !COMPAT
+	depends on !BINFMT_ELF && !MMU && !MODULES && !COREDUMP && !SECCOMP && !UPROBES && !COMPAT
 	select ARCH_HAS_BINFMT_WASM
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_NO_PREEMPT
@@ -9,14 +9,14 @@ config WASM
 	select ARCH_USE_QUEUED_SPINLOCKS
 	select BUG
 	select FLATMEM
-	select FORCE_NR_CPUS if SMP
 	select GENERIC_ATOMIC64
 	select GENERIC_CSUM
 	select GENERIC_HWEIGHT
+	select GENERIC_SMP_IDLE_THREAD
 	select OF
 	select OF_EARLY_FLATTREE
 	select PAGE_SIZE_64KB
-	# select SMP
+	select SMP
 	select THREAD_INFO_IN_TASK
 	select UACCESS_MEMCPY if !MMU
 	select USE_PER_CPU_TLS
@@ -32,7 +32,7 @@ config EARLY_PRINTK
 	def_bool y
 
 config SMP
-	def_bool n
+	def_bool y
 
 config NR_CPUS
 	int
diff --git a/arch/wasm/include/asm/globals.h b/arch/wasm/include/asm/globals.h
new file mode 100644
index 00000000000000..dea148c1f098d6
--- /dev/null
+++ b/arch/wasm/include/asm/globals.h
@@ -0,0 +1,20 @@
+#ifndef _WASM_GLOBALS
+#define _WASM_GLOBALS
+
+#include <linux/compiler_attributes.h>
+
+__asm__(".globaltype __stack_pointer, i32\n");
+static void __always_inline set_stack_pointer(void *ptr)
+{
+	__asm__ volatile("local.get %0\n"
+			 "global.set __stack_pointer" ::"r"(ptr));
+}
+
+__asm__(".globaltype __tls_base, i32\n");
+static void __always_inline set_tls_base(void *ptr)
+{
+	__asm__ volatile("local.get %0\n"
+			 "global.set __tls_base" ::"r"(ptr));
+}
+
+#endif
\ No newline at end of file
diff --git a/arch/wasm/include/asm/sigcontext.h b/arch/wasm/include/asm/sigcontext.h
index f7b758a1b9694c..53abc6041328bc 100644
--- a/arch/wasm/include/asm/sigcontext.h
+++ b/arch/wasm/include/asm/sigcontext.h
@@ -2,7 +2,6 @@
 #define _WASM_SIGCONTEXT_H
 
 struct pt_regs {
-	void* current_stack;
 	int (*fn)(void*);
 	void* fn_arg;
 };
diff --git a/arch/wasm/include/asm/sysmem.h b/arch/wasm/include/asm/sysmem.h
index 83baa9438d51bd..32ae0b66a4f591 100644
--- a/arch/wasm/include/asm/sysmem.h
+++ b/arch/wasm/include/asm/sysmem.h
@@ -1,9 +1,11 @@
 #ifndef _WASM_SYSMEM_H
 #define _WASM_SYSMEM_H
 
+#include <linux/types.h>
+
 void zones_init(void);
 void early_tls_init(void);
 void smp_tls_prepare(void);
-void smp_tls_init(int cpu);
+void smp_tls_init(int cpu, bool init);
 
 #endif
diff --git a/arch/wasm/include/asm/thread_info.h b/arch/wasm/include/asm/thread_info.h
index 4fc4e6b8e7f33b..f228ef11fcef13 100644
--- a/arch/wasm/include/asm/thread_info.h
+++ b/arch/wasm/include/asm/thread_info.h
@@ -1,6 +1,8 @@
 #ifndef _WASM_THREAD_INFO_H
 #define _WASM_THREAD_INFO_H
 
+#include <linux/types.h>
+
 /* THREAD_SIZE is the size of the task_struct + kernel stack
  * This is asserted in setup, but the stack should be 1 page,
  * and a task_struct should be *way* less than a page big. */
@@ -11,15 +13,14 @@
 struct thread_info {
 	unsigned long flags;
 	int preempt_count;
-	struct task_struct *from_sched;
-	// unsigned int cpu;
-	unsigned int instance_id;
-	void* jmpbuf;
+	int cpu; // this is for the kernel
+	atomic_t running_cpu; // negative means unscheduled
 };
 
 #define INIT_THREAD_INFO(tsk)                                    \
 	{                                                        \
 		.flags = 0, .preempt_count = INIT_PREEMPT_COUNT, \
+		.running_cpu = ATOMIC_INIT(0),                   \
 	}
 
 #define TIF_SYSCALL_TRACE 0 /* syscall trace active */
diff --git a/arch/wasm/include/asm/wasm_imports.h b/arch/wasm/include/asm/wasm_imports.h
index 6109bf39b23275..0b68efd1eab08c 100644
--- a/arch/wasm/include/asm/wasm_imports.h
+++ b/arch/wasm/include/asm/wasm_imports.h
@@ -25,9 +25,11 @@ import("get_now_nsec") unsigned long long wasm_get_now_nsec(void);
 
 import("get_stacktrace") void wasm_get_stacktrace(char *buf, size_t size);
 
-// import("new_kernel_instance") unsigned int wasm_new_kernel_instance(void);
-// import("poll_kernel_instance") void wasm_poll_kernel_instance(unsigned int id);
-// import("poll_yield") void wasm_poll_yield(void);
+struct task_struct;
+import("new_worker") void wasm_new_worker(struct task_struct *task, char *comm,
+					  size_t comm_len);
+
+import("bringup_secondary") void wasm_bringup_secondary(int cpu, struct task_struct *idle);
 
 #undef import
 
diff --git a/arch/wasm/kernel/Makefile b/arch/wasm/kernel/Makefile
index 89d64d6219ec4e..737d8c6bcdcdad 100644
--- a/arch/wasm/kernel/Makefile
+++ b/arch/wasm/kernel/Makefile
@@ -6,7 +6,6 @@ obj-y += \
 	irq.o \
 	process.o \
 	ptrace.o \
-	setjmp.o \
 	setup.o \
 	stacktrace.o \
 	time.o
diff --git a/arch/wasm/kernel/process.c b/arch/wasm/kernel/process.c
index 32b380efbcc1f7..ba2b6cedcedd4d 100644
--- a/arch/wasm/kernel/process.c
+++ b/arch/wasm/kernel/process.c
@@ -1,87 +1,56 @@
+#include <asm/delay.h>
+#include <asm/globals.h>
+#include <asm/sysmem.h>
 #include <asm/wasm_imports.h>
-#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/task.h>
 
-__asm__(".globaltype __stack_pointer, i32\n");
-
-int setjmp(void *buf) __attribute__((returns_twice));
-void longjmp(void *buf, int val) __attribute__((noreturn));
-
-static void *get_stack_pointer(void)
-{
-	void *ptr;
-	__asm__ volatile("global.get __stack_pointer\n"
-			 "local.set %0"
-			 : "=r"(ptr));
-	return ptr;
-}
-
-static void __always_inline set_stack_pointer(void *ptr)
-{
-	__asm__ volatile("local.get %0\n"
-			 "global.set __stack_pointer" ::"r"(ptr));
-}
-
-static struct task_struct *prev = &init_task;
-
-inline static struct task_struct *__switch_to_inner(struct task_struct *from,
-						    struct task_struct *to)
-{
-	struct pt_regs *from_regs = task_pt_regs(from);
-	struct pt_regs *to_regs = task_pt_regs(to);
-	struct thread_info *from_info = task_thread_info(from);
-	struct thread_info *to_info = task_thread_info(to);
-
-	if (setjmp(from_info->jmpbuf) == 0) {
-		set_stack_pointer(to_regs->current_stack);
-
-		if (to_info->from_sched)
-			schedule_tail(to_info->from_sched);
-		to_info->from_sched = NULL;
-
-		if (to_regs->fn) {
-			int (*fn)(void *) = to_regs->fn;
-			int result;
-
-			to_regs->fn = NULL;
-			pr_info("call %p(%p)\n", fn, to_regs->fn_arg);
-
-			// callback returns if the kernel thread execs a process?
-			result = fn(to_regs->fn_arg);
-			pr_info("call %p(%p) = %u\n", fn, to_regs->fn_arg,
-				result);
-		} else {
-			pr_info("longjmp %p to %u\n", to_info->jmpbuf, to->pid);
-			longjmp(to_info->jmpbuf, 1);
-		}
-	} else {
-		pr_info("free %p\n", from_info->jmpbuf);
-		kfree(from_info->jmpbuf);
-	}
-
-	return prev;
-}
+// TODO(wasm): replace __builtin_wasm_memory_atomic with completion?
 
 struct task_struct *__switch_to(struct task_struct *from,
 				struct task_struct *to)
 {
-	struct pt_regs *from_regs = task_pt_regs(from);
-	struct pt_regs *to_regs = task_pt_regs(to);
 	struct thread_info *from_info = task_thread_info(from);
 	struct thread_info *to_info = task_thread_info(to);
-
-	from_regs->current_stack = get_stack_pointer();
-	from_info->jmpbuf = kmalloc(16, 0);
-
-	pr_info("alloc %p for %u\n", from_info->jmpbuf, from->pid);
-
-	current = to;
-	to_info->from_sched = prev;
-	prev = from;
-
-	return __switch_to_inner(from, to);
+	int cpu, other_cpu;
+
+	cpu = atomic_xchg_release(&from_info->running_cpu, -1);
+	BUG_ON(cpu < 0); // current process must be scheduled to a cpu
+
+	// give the current cpu to the new worker
+	other_cpu = atomic_xchg_acquire(&to_info->running_cpu, cpu);
+	BUG_ON(other_cpu >= 0); // new process should not have had a cpu
+
+	// wake the other worker:
+	// pr_info("wake cpu=%i task=%p\n", cpu, to);
+	// memory.atomic.notify returns how many waiters were notified
+	// 0 is fine, because it means the worker isn't running yet
+	// 1 is great, because it means someone is waiting for this number
+	// 2+ means there's an issue, because I asked for only 1
+	BUG_ON(__builtin_wasm_memory_atomic_notify(
+		       &to_info->running_cpu.counter, 1) > 1);
+
+	// pr_info("waiting cpu=%i task=%p in switch\n", cpu, from);
+
+	// sleep this worker:
+	/* memory.atomic.wait32 returns:
+	 * 0 -> the thread blocked and was woken
+		= we slept and were woken
+	 * 1 -> the value at the pointer didn't match the passed value
+	 	= somebody gave us their cpu straight await
+	 * 2 -> the thread blocked but timed out
+	 	= not possible because we pass an infinite timeout
+	*/
+	__builtin_wasm_memory_atomic_wait32(&from_info->running_cpu.counter,
+					    /* block if the value is: */ -1,
+					    /* timeout: */ -1);
+
+	// pr_info("woke up cpu=%i task=%p in switch\n", cpu, from);
+
+	BUG_ON(cpu < 0); // we should be given a new cpu
+
+	return from;
 }
 
 int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
@@ -90,7 +59,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 
 	memset(childregs, 0, sizeof(struct pt_regs));
 
-	childregs->current_stack = childregs - 1;
+	atomic_set(&task_thread_info(p)->running_cpu, -1);
 
 	if (!args->fn)
 		panic("can't copy userspace thread"); // yet
@@ -98,5 +67,42 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	childregs->fn = args->fn;
 	childregs->fn_arg = args->fn_arg;
 
+	pr_info("spawning task=%p\n", p);
+	wasm_new_worker(p, p->comm, strnlen(p->comm, TASK_COMM_LEN));
+
 	return 0;
 }
+
+__attribute__((export_name("task"))) void _start_task(struct task_struct *task)
+{
+	struct thread_info *info = task_thread_info(task);
+	struct pt_regs *regs = task_pt_regs(task);
+	int cpu;
+
+	set_stack_pointer(task_pt_regs(task) - 1);
+
+	early_printk("                       waiting cpu=%i task=%p in entry\n",
+		     atomic_read(&info->running_cpu), task);
+
+	// if we don't currently have a cpu, wait for one
+	__builtin_wasm_memory_atomic_wait32(&info->running_cpu.counter,
+					    /* block if the value is: */ -1,
+					    /* timeout: */ -1);
+
+	cpu = atomic_read(&info->running_cpu);
+
+	early_printk("                       woke up cpu=%i task=%p in entry\n",
+		     cpu, task);
+
+	smp_tls_init(cpu, false);
+
+	schedule_tail(current);
+
+	current = task;
+
+	// callback returns only if the kernel thread execs a process
+	regs->fn(regs->fn_arg);
+
+	// call into userspace?
+	panic("can't call userspace\n");
+}
diff --git a/arch/wasm/kernel/setjmp.c b/arch/wasm/kernel/setjmp.c
deleted file mode 100644
index 31d0589a49fded..00000000000000
--- a/arch/wasm/kernel/setjmp.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * once https://github.com/llvm/llvm-project/pull/84137 is merged,
- * this file can become significantly less cursed
- */
-
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-
-struct entry {
-	uint32_t id;
-	uint32_t label;
-};
-
-// TODO(wasm): use pcpu_read
-static DEFINE_PER_CPU(
-	struct state {
-		uint32_t id;
-		uint32_t size;
-		struct arg {
-			void *env;
-			int val;
-		} arg;
-	},
-	g_state);
-
-/*
- * table is allocated at the entry of functions which call setjmp.
- *
- *   table = malloc(40);
- *   size = 4;
- *   *(int *)table = 0;
- */
-_Static_assert(sizeof(struct entry) * (4 + 1) <= 40, "entry size");
-
-void *saveSetjmp(void *env, uint32_t label, void *table, uint32_t size)
-{
-	struct state *state = &g_state;
-	struct entry *e = table;
-	uint32_t i;
-	void *p;
-
-	for (i = 0; i < size; i++) {
-		if (e[i].id == 0) {
-			uint32_t id = ++state->id;
-			*(uint32_t *)env = id;
-			e[i].id = id;
-			e[i].label = label;
-			/*
-                         * note: only the first word is zero-initialized
-                         * by the caller.
-                         */
-			e[i + 1].id = 0;
-			goto done;
-		}
-	}
-	size *= 2;
-	p = krealloc(table, sizeof(*e) * (size + 1), 0);
-	if (!p)
-		panic("uh oh\n");
-	table = p;
-done:
-	state->size = size;
-	return table;
-}
-
-uint32_t testSetjmp(unsigned int id, void *table, uint32_t size)
-{
-	struct entry *e = table;
-	uint32_t i;
-	for (i = 0; i < size; i++) {
-		if (e[i].id == id) {
-			return e[i].label;
-		}
-	}
-	return 0;
-}
-
-uint32_t getTempRet0(void)
-{
-	struct state *state = &g_state;
-	return state->size;
-}
-
-void __wasm_longjmp(void *env, int val)
-{
-	struct state *state = &g_state;
-	struct arg *arg = &state->arg;
-	if (val == 0)
-		val = 1;
-	arg->env = env;
-	arg->val = val;
-	__builtin_wasm_throw(1, arg);
-}
-
-void *malloc(size_t size)
-{
-	return kmalloc(size, 0);
-}
-void free(void *ptr)
-{
-	kfree(ptr);
-}
diff --git a/arch/wasm/kernel/setup.c b/arch/wasm/kernel/setup.c
index 9d408db78faafb..1472f2fece3619 100644
--- a/arch/wasm/kernel/setup.c
+++ b/arch/wasm/kernel/setup.c
@@ -35,7 +35,7 @@ extern void __heap_end;
 int __init setup_early_printk(char *buf);
 size_t __per_cpu_size;
 
-__attribute__((export_name("start"))) void __init _start(void)
+__attribute__((export_name("boot"))) void __init _start(void)
 {
 	static char wasm_dt[1024];
 	
@@ -52,6 +52,11 @@ __attribute__((export_name("start"))) void __init _start(void)
 
 	memblock_reserve(0, (phys_addr_t)&__heap_base);
 
+	for (int i = 0; i < NR_CPUS; i++) {
+		set_cpu_possible(i, true);
+		set_cpu_present(i, true);
+	}
+
 	start_kernel();
 }
 
@@ -76,11 +81,6 @@ void __init setup_arch(char **cmdline_p)
 	memblock_dump_all();
 
 	zones_init();
-
-	for (int i = 0; i < NR_CPUS; i++) {
-		set_cpu_possible(i, true);
-		set_cpu_present(i, true);
-	}
 }
 
 void machine_restart(char *cmd)
diff --git a/arch/wasm/kernel/smp.c b/arch/wasm/kernel/smp.c
index 21f69559194570..2501b604872436 100644
--- a/arch/wasm/kernel/smp.c
+++ b/arch/wasm/kernel/smp.c
@@ -1,11 +1,39 @@
+#include <asm/globals.h>
+#include <asm/sysmem.h>
 #include <asm/wasm_imports.h>
-#include <linux/cpumask.h>
+#include <linux/cpu.h>
 #include <linux/of_fdt.h>
 #include <linux/sched.h>
 
+int __cpu_up(unsigned int cpu, struct task_struct *idle)
+{
+	BUG();
+	wasm_bringup_secondary(cpu, idle);
+}
+
+__attribute__((export_name("secondary"))) void
+_start_secondary(int cpu, struct task_struct *idle)
+{
+	set_stack_pointer(task_pt_regs(idle) - 1);
+	smp_tls_init(cpu, true);
+
+	BUG_ON(cpu_online(cpu));
+	set_cpu_online(cpu, true);
+
+	current = idle;
+	mmgrab(&init_mm);
+	current->active_mm = &init_mm;
+
+	local_irq_enable();
+
+	notify_cpu_starting(cpu);
+
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
+}
+
 static struct {
 	unsigned long bits ____cacheline_aligned;
-} ipi_data[NR_CPUS] __cacheline_aligned;
+} ipi_data[NR_CPUS] __cacheline_aligned = { 0 };
 
 enum ipi_message_type {
 	IPI_RESCHEDULE,
@@ -20,20 +48,6 @@ static void send_ipi_message(const struct cpumask *to_whom,
 	mb();
 	for_each_cpu(i, to_whom)
 		set_bit(operation, &ipi_data[i].bits);
-
-	// mb();
-	// for_each_cpu(i, to_whom)
-	// 	BUG(); // TODO(wasm): postMessage interrupt to other worker
-}
-
-int __cpu_up(unsigned int cpu, struct task_struct *idle)
-{
-	BUG();
-}
-
-void __init smp_init_cpus(void)
-{
-	BUG();
 }
 
 /* Called early in main to prepare the boot cpu */
@@ -47,12 +61,13 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 {
 	memset(ipi_data, 0, sizeof(ipi_data));
 
-	pr_info("SMP bringup with %i parallel processes\n", NR_CPUS);
+	early_printk("SMP bringup with %i parallel processes\n", NR_CPUS);
 }
 
 void smp_send_stop(void)
 {
 	cpumask_t to_whom;
+	BUG();
 	cpumask_copy(&to_whom, cpu_online_mask);
 	cpumask_clear_cpu(smp_processor_id(), &to_whom);
 	send_ipi_message(&to_whom, IPI_CPU_STOP);
@@ -106,7 +121,7 @@ void handle_ipi(struct pt_regs *regs)
 			switch (which) {
 			case IPI_CPU_STOP:
 				wasm_halt();
-
+				__builtin_trap();
 			default:
 				printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n",
 				       this_cpu, which);
diff --git a/arch/wasm/mm/Makefile b/arch/wasm/mm/Makefile
index 5e924adc876372..d5a15e1efa960c 100644
--- a/arch/wasm/mm/Makefile
+++ b/arch/wasm/mm/Makefile
@@ -1,2 +1,2 @@
 obj-y := init.o page.o
-obj-$(CONFIG_SMP) := tls.o
+obj-$(CONFIG_SMP) += tls.o
diff --git a/arch/wasm/mm/tls.c b/arch/wasm/mm/tls.c
index 10daf9d992d8d2..6574a04ab00c0b 100644
--- a/arch/wasm/mm/tls.c
+++ b/arch/wasm/mm/tls.c
@@ -1,3 +1,4 @@
+#include <asm/globals.h>
 #include <asm/sysmem.h>
 #include <linux/cpumask.h>
 #include <linux/log2.h>
@@ -44,8 +45,14 @@ void smp_tls_prepare(void)
 #endif
 }
 
-void smp_tls_init(int cpu)
+
+
+void smp_tls_init(int cpu, bool init)
 {
 	BUG_ON(__per_cpu_offset[cpu] == (void *)-1);
-	__wasm_init_tls((void *)__per_cpu_offset[cpu]);
+	if (init) {
+		set_tls_base(__per_cpu_offset[cpu]);
+	} else {
+		__wasm_init_tls((void *)__per_cpu_offset[cpu]);
+	}
 }
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b7839729b80e6a..f97fd01a29325f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -429,7 +429,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 	create->done = &done;
 
 	spin_lock(&kthread_create_lock);
-	early_printk("adding kthread %p %s\n", threadfn, namefmt);
 	list_add_tail(&create->list, &kthread_create_list);
 	spin_unlock(&kthread_create_lock);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 35c8eb8dcffef1..355162bc38152c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6076,7 +6076,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int idle_h_nr_running = task_has_idle_policy(p);
 	int task_new = !(flags & ENQUEUE_WAKEUP);
 
-	early_printk("enqueue %u %s\n", p->pid, p->comm);
+	pr_info("enqueue %u %s\n", p->pid, p->comm);
 	
 	/*
 	 * The code below (indirectly) updates schedutil which looks at
@@ -6172,7 +6172,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int idle_h_nr_running = task_has_idle_policy(p);
 	bool was_sched_idle = sched_idle_rq(rq);
 
-	early_printk("dequeue %u %s\n", p->pid, p->comm);
+	pr_info("dequeue %u %s\n", p->pid, p->comm);
 
 	util_est_dequeue(&rq->cfs, p);
 
diff --git a/mm/percpu-tls.c b/mm/percpu-tls.c
index 1349d435752b58..3a4abb962969fb 100644
--- a/mm/percpu-tls.c
+++ b/mm/percpu-tls.c
@@ -19,12 +19,9 @@ bool __percpu_is_static(void *ptr)
 		void *base = __per_cpu_offset[i];
 		if (base != (void *)-1 && ptr >= base &&
 		    ptr < base + __per_cpu_size) {
-			// early_printk("S");
 			return true;
 		}
 	}
-
-	// early_printk("D");
 	return false;
 }
 
@@ -53,7 +50,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 		kfree(ptr);
 		return NULL;
 	}
-	// early_printk("A");
 	return ptr;
 }
 
diff --git a/tools/wasm/index.html b/tools/wasm/index.html
index 0c3271468ef35f..589f04767dd9a3 100644
--- a/tools/wasm/index.html
+++ b/tools/wasm/index.html
@@ -94,8 +94,12 @@
         })
       );
 
-      machine.addEventListener("error", ({ detail: { error, workerName } }) => {
-        term.write(`Error in ${workerName}: ${error}\n`);
+      machine.addEventListener("restart", () => {
+        location.reload();
+      });
+
+      machine.addEventListener("error", ({ detail: { error, threadName } }) => {
+        term.write(`${error.name} in ${threadName}: ${error.message}\n`);
       });
     </script>
   </body>
diff --git a/tools/wasm/index.js b/tools/wasm/index.js
index f33f11e776bf3d..98053105c17ebb 100755
--- a/tools/wasm/index.js
+++ b/tools/wasm/index.js
@@ -15,6 +15,11 @@ const machine = start({
 
 machine.bootConsole.pipeTo(Writable.toWeb(process.stdout));
 
-machine.addEventListener("error", ({ detail: { error, workerName } }) => {
-  console.log(`Error in ${workerName}:`, error);
+machine.addEventListener("restart", () => {
+  console.log("reboot requested. halting...");
+  process.exit(0);
+});
+
+machine.addEventListener("error", ({ detail: { error, threadName } }) => {
+  console.log(`Error in ${threadName}:`, error);
 });
diff --git a/tools/wasm/run.ts b/tools/wasm/run.ts
index f2809df4df3dca..9b270e4d2a520f 100755
--- a/tools/wasm/run.ts
+++ b/tools/wasm/run.ts
@@ -10,6 +10,11 @@ const machine = start({
 
 machine.bootConsole.pipeTo(Deno.stdout.writable);
 
-machine.addEventListener("error", ({ detail: { error, workerName } }) => {
-  console.log(`Error in ${workerName}:`, error);
+machine.addEventListener("restart", () => {
+  console.log("reboot requested. halting...");
+  Deno.exit(0);
+});
+
+machine.addEventListener("error", ({ detail: { error, threadName } }) => {
+  console.log(`Error in ${threadName}:`, error);
 });
diff --git a/tools/wasm/src/index.ts b/tools/wasm/src/index.ts
index 4a35e5d1705112..8e871e4a2cd582 100644
--- a/tools/wasm/src/index.ts
+++ b/tools/wasm/src/index.ts
@@ -5,7 +5,8 @@ import { unreachable } from "./util.ts";
 const PAGE_SIZE = 1 << 16; // 64KiB
 
 interface MachineEventMap {
-  error: CustomEvent<{ error: Error; workerName: string }>;
+  restart: CustomEvent<void>;
+  error: CustomEvent<{ error: Error; threadName: string }>;
 }
 
 interface Machine {
@@ -43,7 +44,11 @@ export function start({
     eventTarget.dispatchEvent(new CustomEvent(type, { detail }));
   }
 
-  function newWorker(name: string) {
+  function newWorker(
+    name: string,
+    initMessage: ToWorkerMessage,
+    initTransfer: Transferable[],
+  ) {
     const worker = new Worker(
       new URL(
         (() => {
@@ -53,49 +58,51 @@ export function start({
       ),
       { type: "module", name },
     );
-    function postMessage(
-      message: ToWorkerMessage,
-      transfer: Transferable[] = [],
-    ) {
-      worker.postMessage(message, transfer);
-    }
 
-    worker.addEventListener(
-      "message",
-      async ({ data }: MessageEvent<FromWorkerMessage>) => {
-        switch (data.type) {
-          case "boot-console-write":
-            bootConsoleWriter.write(data.message);
-            break;
-          case "boot-console-close":
-            await bootConsoleWriter.close();
-            await bootConsole.writable.close();
-            break;
-          case "restart":
-            // @ts-ignore: reloads browsers, inert on deno/node
-            globalThis?.location?.reload?.();
-            break;
-          default:
-            unreachable(
-              data,
-              `invalid worker message type: ${(data as { type: string }).type}`,
-            );
-        }
-      },
-    );
-    worker.addEventListener(
-      "error",
-      (event) =>
-        emit("error", {
-          error: event.error ?? event.message,
-          workerName: name,
-        }),
-    );
+    worker.onmessage = async ({ data }: MessageEvent<FromWorkerMessage>) => {
+      switch (data.type) {
+        case "boot-console-write":
+          bootConsoleWriter.write(data.message);
+          break;
+        case "boot-console-close":
+          await bootConsoleWriter.close();
+          await bootConsole.writable.close();
+          break;
+        case "restart":
+          emit("restart", undefined);
+          break;
+        case "spawn":
+          newWorker(data.name, {
+            type: "task",
+            task: data.task,
+            vmlinux,
+            memory,
+          }, []);
+          break;
+        case "error":
+          emit("error", { error: data.error, threadName: name });
+          break;
+        case "bringup-secondary":
+          newWorker(`entry${data.cpu}`, {
+            type: "secondary",
+            cpu: data.cpu,
+            idle: data.idle,
+            vmlinux,
+            memory,
+          }, []);
+          break;
+        default:
+          unreachable(
+            data,
+            `invalid worker message type: ${(data as { type: string }).type}`,
+          );
+      }
+    };
 
-    return { postMessage };
+    worker.postMessage(initMessage, initTransfer);
   }
 
-  const memory: WebAssembly.Memory = new WebAssembly.Memory({
+  const memory = new WebAssembly.Memory({
     initial: memoryPages,
     maximum: memoryPages,
     shared: true,
@@ -113,13 +120,9 @@ export function start({
     },
   });
 
-  newWorker("entry").postMessage(
-    {
-      type: "boot",
-      vmlinux,
-      memory,
-      devicetree,
-    },
+  newWorker(
+    "entry",
+    { type: "boot", devicetree, vmlinux, memory },
     [devicetree.buffer],
   );
 
diff --git a/tools/wasm/src/worker.ts b/tools/wasm/src/worker.ts
index 1e658cc9eff1a8..e176003f4d5175 100644
--- a/tools/wasm/src/worker.ts
+++ b/tools/wasm/src/worker.ts
@@ -3,28 +3,30 @@
 
 import { assert, unreachable } from "./util.ts";
 
-export type ToWorkerMessage = {
-  type: "boot";
-  vmlinux: WebAssembly.Module;
-  memory: WebAssembly.Memory;
-  devicetree: Uint8Array;
-};
+export type ToWorkerMessage =
+  & {
+    vmlinux: WebAssembly.Module;
+    memory: WebAssembly.Memory;
+  }
+  & (
+    | { type: "boot"; devicetree: Uint8Array }
+    | { type: "task"; task: number }
+    | { type: "secondary"; cpu: number; idle: number }
+  );
 
 export type FromWorkerMessage =
   | { type: "boot-console-write"; message: Uint8Array }
   | { type: "boot-console-close" }
   | { type: "restart" }
-
-function postMessage(
-  message: FromWorkerMessage,
-  transfer: Transferable[] = [],
-) {
-  self.postMessage(message, transfer);
-}
+  | { type: "spawn"; task: number; name: string }
+  | { type: "error"; error: Error }
+  | { type: "bringup-secondary"; cpu: number; idle: number };
 
 interface Instance {
   exports: {
-    start(): void;
+    boot(): void;
+    task(task: number): void;
+    secondary(cpu: number, idle: number): void;
   };
 }
 
@@ -35,115 +37,130 @@ interface Instance {
 //   return value;
 // }
 
-function boot(
-  module: WebAssembly.Module,
-  wasmMemory: WebAssembly.Memory,
-  options: { type: "boot"; devicetree: Uint8Array },
+function postMessage(
+  message: FromWorkerMessage,
+  transfer: Transferable[] = [],
 ) {
-  const memory = new Uint8Array(wasmMemory.buffer);
-
-  let irqflags = 0;
-  const imports = {
-    env: { memory: wasmMemory },
-    kernel: {
-      breakpoint() {
-        // deno-lint-ignore no-debugger
-        debugger;
-      },
-      halt() {
-        self.close();
-      },
-      restart() {
-        postMessage({ type: "restart" });
-      },
-
-      boot_console_write(msg: number, len: number) {
-        const message = memory.slice(msg, msg + len);
-        postMessage({ type: "boot-console-write", message }, [message.buffer]);
-      },
-      boot_console_close() {
-        postMessage({ type: "boot-console-close" }, []);
-      },
+  self.postMessage(message, transfer);
+}
 
-      return_address() {
-        return 0;
-      },
-      // return_address(n: number) {
-      //   const matches = new Error().stack?.matchAll(BACKTRACE_ADDRESS_RE);
-      //   if (!matches) return -1;
-      //   const match = iteratorNth(matches, n + 1);
-      //   return parseInt(match?.[1] ?? "-1");
-      // },
-
-      set_irq_enabled(flags: number) {
-        irqflags = flags;
-      },
-      get_irq_enabled() {
-        return irqflags;
-      },        
-      get_dt(buf: number, size: number) {
-        assert(options.type === "boot", "get_dt called on non-boot thread");
-        assert(size >= options.devicetree.byteLength, "Device tree truncated");
-        memory.set(options.devicetree.slice(0, size), buf);
+self.onmessage = ({ data }: MessageEvent<ToWorkerMessage>) => {
+  try {
+    const memory = new Uint8Array(data.memory.buffer);
+
+    let irqflags = 0;
+    const imports = {
+      env: { memory: data.memory },
+      kernel: {
+        breakpoint() {
+          // deno-lint-ignore no-debugger
+          debugger;
+        },
+        halt() {
+          self.close();
+        },
+        restart() {
+          postMessage({ type: "restart" });
+        },
+
+        boot_console_write(msg: number, len: number) {
+          const message = memory.slice(msg, msg + len);
+          postMessage({ type: "boot-console-write", message }, [
+            message.buffer,
+          ]);
+        },
+        boot_console_close() {
+          postMessage({ type: "boot-console-close" });
+        },
+
+        return_address() {
+          return 0;
+        },
+        // return_address(n: number) {
+        //   const matches = new Error().stack?.matchAll(BACKTRACE_ADDRESS_RE);
+        //   if (!matches) return -1;
+        //   const match = iteratorNth(matches, n + 1);
+        //   return parseInt(match?.[1] ?? "-1");
+        // },
+
+        set_irq_enabled(flags: number) {
+          irqflags = flags;
+        },
+        get_irq_enabled() {
+          return irqflags;
+        },
+        get_dt(buf: number, size: number) {
+          assert(data.type === "boot", "get_dt called on non-boot thread");
+          assert(
+            size >= data.devicetree.byteLength,
+            "Device tree truncated",
+          );
+          memory.set(data.devicetree.slice(0, size), buf);
+        },
+
+        get_now_nsec() {
+          /*
+              The more straightforward way to do this is
+              `BigInt(Math.round(performance.now() * 1_000_000))`.
+              Below is semantically identical but has less floating point
+              inaccuracy.
+              `performance.now()` has 5μs precision in the browser.
+              In server runtimes it has full nanosecond precision, but this code
+              rounds to the same 5μs precision.
+            */
+          return BigInt(Math.round(performance.now() * 200)) * 5000n;
+        },
+
+        get_stacktrace(buf: number, size: number) {
+          // 5 lines: strip Error, strip 4 common lines of stack
+          const trace = new TextEncoder().encode(
+            new Error().stack?.split("\n").slice(5).join("\n"),
+          );
+          if (size >= trace.byteLength) {
+            /// 46 = "."
+            trace[size - 1] = 46;
+            trace[size - 2] = 46;
+            trace[size - 3] = 46;
+          }
+          memory.set(trace.slice(0, size), buf);
+        },
+
+        new_worker(task: number, comm: number, commLen: number) {
+          const name = new TextDecoder().decode(
+            memory.slice(comm, comm + commLen),
+          );
+          postMessage({ type: "spawn", task, name });
+        },
+
+        bringup_secondary(cpu: number, idle: number) {
+          postMessage({ type: "bringup-secondary", cpu, idle });
+        },
       },
+    } satisfies WebAssembly.Imports;
 
-      get_now_nsec() {
-        /*
-          The more straightforward way to do this is
-          `BigInt(Math.round(performance.now() * 1_000_000))`.
-          Below is semantically identical but has less floating point
-          inaccuracy.
-          `performance.now()` has 5μs precision in the browser.
-          In server runtimes it has full nanosecond precision, but this code
-          rounds to the same 5μs precision.
-        */
-        return BigInt(Math.round(performance.now() * 200)) * 5000n;
-      },
+    const instance = new WebAssembly.Instance(
+      data.vmlinux,
+      imports,
+    ) as Instance;
 
-      get_stacktrace(buf: number, size: number) {
-        // 5 lines: strip Error, strip 4 common lines of stack
-        const trace = new TextEncoder().encode(
-          new Error().stack?.split("\n").slice(5).join("\n"),
-        );
-        if (size >= trace.byteLength) {
-          /// 46 = "."
-          trace[size - 1] = 46;
-          trace[size - 2] = 46;
-          trace[size - 3] = 46;
-        }
-        memory.set(trace.slice(0, size), buf);
-      },
-    },
-  } satisfies WebAssembly.Imports;
-
-  const instances = [
-    new WebAssembly.Instance(module, imports) as Instance
-  ];
-
-  switch (options.type) {
-    case "boot":
-      instances[0].exports.start();
-      break;
-    default:
-      unreachable(options.type);
-  }
-}
-
-self.addEventListener(
-  "message",
-  ({ data }: MessageEvent<ToWorkerMessage>) => {
     switch (data.type) {
       case "boot":
-        boot(data.vmlinux, data.memory, {
-          type: "boot",
-          devicetree: data.devicetree,
-        });
+        instance.exports.boot();
+        break;
+      case "task":
+        instance.exports.task(data.task);
+        break;
+      case "secondary":
+        instance.exports.secondary(data.cpu, data.idle);
         break;
       default:
-        unreachable(
-          data.type,
-          `invalid worker message type: ${(data as { type: string }).type}`,
-        );
+        unreachable(data);
     }
-  },
-);
+  } catch (error) {
+    console.error(error);
+    postMessage({
+      type: "error",
+      error: error instanceof Error ? error : new Error(error),
+    });
+  }
+};