The full multicore kernel

We have everything we need. Linear continuations, effects as the dispatch mechanism, NoC packets as the cross-core transport, and the discipline that the kernel code is invariant to whether it’s running on one core or many. Let me write the full multicore kernel, then walk through what’s new and what isn’t.

The structure: I’ll show the complete kernel first, then dissect it. Pay attention to how much of it you’ve already seen.

The full kernel

(* === Core identification === *)

type core_id = int    (* 0..3 for our 4-core toy *)

let this_core () : core_id / {Unsafe} =
  unsafe { __read_core_id_register () }

(* === Process representation === *)

type process = {
  id        : int;
  home_core : core_id;       (* where this process was spawned *)
  k         : unit continuation;
}

(* The 'k' field is linear; therefore process is linear. *)

(* === Effect interfaces === *)

effect Ready {
  push : process -> ()
  pop  : () -> option process
  steal_from : core_id -> option process    (* for work-stealing *)
}

effect Migration {
  to_core : core_id -> ()
}

effect ProcLifecycle {
  spawn : (() -> ()) -> ()
  exit  : () -> empty
  yield : () -> ()
}

effect IO {
  print     : string -> ()
  read_byte : () -> byte
}

effect Mem {
  alloc : { layout : layout; size : int } -> ptr
  free  : ptr -> ()
}

effect Hardware {
  tick           : core_id -> ()
  noc_arrival    : { src : core_id; op : int; payload : int } -> ()
  key_arrival    : byte -> ()
}

(* === Core-local scheduler state === *)

type sched_state = {
  mutable ready    : process queue;
  mutable waiters  : process queue;
  mutable inbox    : byte queue;
  mutable next_id  : int;
  this_core        : core_id;
}

let sched : sched_state = init_sched_state (this_core ())

(* === The scheduler === *)

let rec scheduler () : () / {Ready, Mem, IO, ProcLifecycle, Hardware} =
  match perform Ready.pop () with
  | None -> idle_or_steal ()
  | Some p ->
      with proc_handlers p in {
        invoke p.k ()
      }

and proc_handlers p = {
  | ProcLifecycle.yield (), k ->
      perform Ready.push { id = p.id; home_core = p.home_core; k };
      scheduler ()

  | ProcLifecycle.exit (), _k ->
      (* k is dropped; the process is gone *)
      scheduler ()

  | ProcLifecycle.spawn body, k ->
      let new_p = make_process body (this_core ()) in
      perform Ready.push new_p;
      resume k ()

  | Migration.to_core target, k ->
      if target == this_core () then begin
        perform Ready.push { id = p.id; home_core = p.home_core; k };
        scheduler ()
      end else begin
        perform NoC.send target (Wrap_RemoteProcess
          { id = p.id; home_core = p.home_core; k });
        scheduler ()
      end

  | Hardware.tick _, k ->
      perform Ready.push { id = p.id; home_core = p.home_core; k };
      scheduler ()

  | IO.read_byte (), k ->
      (match Queue.pop sched.inbox with
       | Some b -> resume k b
       | None ->
           Queue.push sched.waiters { id = p.id; home_core = p.home_core; k };
           scheduler ())

  | IO.print msg, k ->
      perform UartTx msg;
      resume k ()

  | Hardware.key_arrival b, k ->
      (match Queue.pop sched.waiters with
       | Some waiter ->
           perform Ready.push { id = p.id; home_core = p.home_core; k };
           invoke waiter.k b
       | None ->
           Queue.push sched.inbox b;
           resume k ())

  | Hardware.noc_arrival { src; op; payload }, k ->
      handle_noc_message src op payload;
      resume k ()
}

and idle_or_steal () : () / {Ready, Mem, IO, ProcLifecycle, Hardware} =
  let victim = pick_victim_core () in
  match perform Ready.steal_from victim with
  | Some p ->
      perform Ready.push p;
      scheduler ()
  | None ->
      unsafe { __halt () };
      scheduler ()

and handle_noc_message src op payload : () / {Ready, Mem} =
  match op with
  | NOC_OP_MIGRATE ->
      let p = decode_process payload in
      perform Ready.push p
  | NOC_OP_STEAL_REQUEST ->
      (match Queue.pop sched.ready with
       | Some stolen ->
           perform NoC.send src (Wrap_StealReply (Some stolen))
       | None ->
           perform NoC.send src (Wrap_StealReply None))
  | NOC_OP_STEAL_REPLY ->
      (* the reply itself is consumed by wait_for_steal_reply; see "What got waved at" below *)
      ()
  | _ -> ()

Bottom handlers — the part that differs per deployment

(* === Single-core bottom handler === *)

handler local_ready_only = {
  | Ready.push p, k         -> Queue.push sched.ready p; resume k ()
  | Ready.pop (), k         -> resume k (Queue.pop sched.ready)
  | Ready.steal_from _, k   -> resume k None
  | return v                -> v
}

(* === Multicore bottom handler with work-stealing === *)

handler distributed_ready = {
  | Ready.push p, k ->
      if Queue.length sched.ready > LOAD_THRESHOLD then
        let target = least_loaded_neighbor () in
        perform NoC.send target (Wrap_Migrate p)
      else
        Queue.push sched.ready p;
      resume k ()

  | Ready.pop (), k ->
      resume k (Queue.pop sched.ready)

  | Ready.steal_from victim, k ->
      perform NoC.send victim Wrap_StealRequest;
      let reply = wait_for_steal_reply () in
      resume k reply

  | return v -> v
}

(* === Core-local memory handler === *)

handler local_arena_mem = {
  | Mem.alloc { layout; size }, k ->
      let p = bump_allocate_local size in
      let _ = write_header p layout size in
      resume k p
  | Mem.free p, k ->
      (* bump-arena scheme: an individual free is a no-op *)
      resume k ()
  | return v -> v
}

Boot

let boot () : () =
  unsafe { configure_effect_table () };

  with local_arena_mem in {
    with distributed_ready in {       (* or local_ready_only on single-core *)
      if this_core () == 0 then begin
        perform ProcLifecycle.spawn proc_a;
        perform ProcLifecycle.spawn proc_b;
        perform ProcLifecycle.spawn proc_echo
      end;
      scheduler ()
    }
  }

That is the entire kernel. About 150 lines including the bottom handlers. Let’s walk through what’s new.

What changed from single-core: very little

Compare against the single-core version. The handler clauses for Yield, Tick, ReadByte, Print, and KeyArrival are almost identical. The only differences:

  1. Process records now carry home_core. Useful for debugging and affinity-based scheduling.

  2. Queue operations are performs, not direct calls. Instead of Queue.push sched.ready p, we write perform Ready.push p. The handler-resolved version might be the same direct call (single-core) or might involve a NoC send (multicore). The kernel doesn’t know.

  3. New Migration effect. Two clauses to handle migrating to self (trivial) and migrating to a remote core (ship the continuation as a NoC payload). See the sketch after this list.

  4. New idle_or_steal for the idle case. When the ready queue is empty, the kernel tries to steal work from another core before halting.

  5. A NoC message dispatch handler. When a packet arrives from another core, dispatch based on the op code: migration, steal request, steal reply.
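
To make item 3 concrete, here is a hedged sketch of a process that requests migration from user code; proc_pinned is a made-up name and core 2 an arbitrary choice:

(* Sketch only: a process that asks to continue on core 2. *)
let proc_pinned () : () / {Migration, IO} =
  perform IO.print "hello from my home core\n";
  perform Migration.to_core 2;    (* the handler either re-queues locally or ships the continuation over the NoC *)
  perform IO.print "hello from core 2\n"

Everything after the Migration.to_core perform (the rest of proc_pinned) is exactly the continuation k that the clause wraps and sends.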

That’s the entire delta. The structure — handler-per-effect, scheduler-as-tail-loop, continuation-as-process-state — is preserved exactly.

How the same kernel runs on one or many cores

This is the architectural payoff. Look at the boot sequence: with local_arena_mem in { with distributed_ready in { ... scheduler () } }. The choice between local_ready_only and distributed_ready is one line of code. Swap them and the kernel is single-core or multicore. The scheduler, handler clauses, process types — all unchanged.
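
For comparison, here is a sketch of the single-core variant of the same boot; boot_single is a made-up name, and the this_core () == 0 guard is dropped because core 0 is the only core:

(* Sketch: identical boot, with only the bottom Ready handler swapped. *)
let boot_single () : () =
  unsafe { configure_effect_table () };

  with local_arena_mem in {
    with local_ready_only in {        (* the one line that differs *)
      perform ProcLifecycle.spawn proc_a;
      perform ProcLifecycle.spawn proc_b;
      perform ProcLifecycle.spawn proc_echo;
      scheduler ()
    }
  }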

Three reasons it works:

The kernel writes against effect interfaces, not concrete implementations. perform Ready.push is an abstract operation. The bottom handler defines what it does. By varying the bottom handler, we vary the deployment without varying the code.

Linearity makes ownership semantics deployment-invariant. When the kernel writes perform Ready.push p, it’s saying “I hand over ownership of p.” The type system enforces this regardless of whether the handler ships p to another core or stores it locally. The single-use guarantee holds in both cases. Use-after-migration is a compile error before we pick a bottom handler.
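
To make that concrete, here is the shape of code the checker rejects, written as a variant of the Migration.to_core clause; the error wording is invented for illustration:

(* Sketch: a clause that tries to keep the process after migrating it. *)
| Migration.to_core target, k ->
    let moved = { id = p.id; home_core = p.home_core; k } in
    perform NoC.send target (Wrap_RemoteProcess moved);   (* ownership of 'moved' leaves this core *)
    perform Ready.push moved;    (* ERROR: linear value 'moved' already consumed *)
    scheduler ()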

Continuations are first-class values that can travel. A process is a continuation. A continuation is a heap object with a well-defined layout. It can be serialized into a NoC packet, sent, deserialized, and invoked on another core. The receiving core’s CONT_REC is functionally identical to what the sending core had.

Work-stealing

and idle_or_steal () =
  let victim = pick_victim_core () in
  match perform Ready.steal_from victim with
  | Some p -> perform Ready.push p; scheduler ()
  | None   -> unsafe { __halt () }; scheduler ()

The Ready.steal_from effect is what makes this clean. The kernel doesn’t write “send a packet to core 2, wait for a response, deserialize the reply.” It writes “steal from core 2,” and the bottom handler implements the protocol.

The steal request is itself an effect-handler conversation. The victim core’s NoC handler receives STEAL_REQUEST, pops from its local ready queue if non-empty, and sends back STEAL_REPLY with the process. The thief’s handler receives the reply and resumes the waiting Ready.steal_from perform.

Correctness point: When core 1 steals a process from core 0, that process’s continuation transfers ownership. Core 0 popped it (consuming local ownership), sent it via NoC packet (the linear-type transfer), core 1 received it and pushed to its local ready queue (taking ownership). The single-use invariant holds. If core 0 tried to keep a copy, compile error. If core 1 tried to send it on to core 2 and invoke it locally, compile error. Linearity makes work-stealing safe by construction.

What got waved at

A few places where I cheated and left the work to the compiler team:

Wrap_RemoteProcess, Wrap_StealRequest, Wrap_Migrate, Wrap_StealReply — I used these as if they were values, but they’re actually serialized forms. Sending a process over the NoC requires writing its data into a packet payload and reconstructing it on receipt. The compiler needs serialization machinery (probably auto-derived for linear types).

For the toy, you can punt: ship pointers and assume shared memory across cores. The semantics work, the linearity is preserved, and you lose only the option of cores having genuinely separate address spaces. For the four-core mesh design, this is fine.
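
Under that punt, the serialization pair collapses to pointer casts. A sketch of what encode_process and decode_process could look like; __ptr_to_word, __word_to_ptr, and addr_of are assumed primitives, not part of the kernel above:

(* Sketch of the shared-memory punt: "serialization" ships the record's address. *)
let encode_process (p : process) : int / {Unsafe} =
  unsafe { __ptr_to_word (addr_of p) }    (* the linear process now lives only behind this word *)

let decode_process (payload : int) : process / {Unsafe} =
  unsafe { __word_to_ptr payload }        (* reinterpret the word as the process record on the receiving core *)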

__read_core_id_register — Assumes an unsafe primitive that returns the current core’s ID. The ISA needs this; it’s a special-purpose read of a hardware register that the boot ROM hardwires.

wait_for_steal_reply — Inside the Ready.steal_from clause of distributed_ready, I waved at a call that “waits until the reply packet arrives.” The mechanism: register a one-shot continuation in a “pending replies” table keyed by request ID, then yield to the scheduler. When the matching reply arrives, the NoC handler looks up the continuation and resumes it.
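
A sketch of that mechanism, written as an alternative steal_from clause that parks the continuation instead of blocking; the pending_replies table (and its Table module), fresh_request_id, and decode_steal_reply are made-up names, and the request ID would have to be threaded through the steal packets, which the code above does not yet do:

(* Sketch: park the performer's continuation, keyed by request ID. *)
| Ready.steal_from victim, k ->
    let req_id = fresh_request_id () in
    Table.add sched.pending_replies req_id k;     (* one-shot: the table now owns k *)
    perform NoC.send victim (Wrap_StealRequest req_id);
    scheduler ()                                  (* run something else while the reply is in flight *)

(* The NOC_OP_STEAL_REPLY arm of handle_noc_message then resumes it: *)
| NOC_OP_STEAL_REPLY ->
    let (req_id, stolen) = decode_steal_reply payload in
    (match Table.remove sched.pending_replies req_id with
     | Some k -> invoke k stolen                  (* the parked Ready.steal_from perform returns 'stolen' *)
     | None   -> ())                              (* stale or duplicate reply: drop it *)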

Cross-core synchronization for shared structures. The sched.ready queue is core-local in this design, so no synchronization needed for normal scheduling. Work-stealing is implemented as request-reply over the NoC (so the request is processed by the victim core in its own handler context, not by the thief directly touching the queue). For genuinely shared structures, we’d need atomics (the CAS instruction flagged for later in the ISA spec).
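
For contrast, here is a sketch of what touching a genuinely shared structure would take, using that deferred CAS; __load, __store, __cas, and NEXT_OFFSET are all illustrative, not part of the kernel above:

(* Sketch: lock-free push of a node onto a shared list head. *)
let rec shared_push (head : ptr) (node : ptr) : () / {Unsafe} =
  unsafe {
    let old = __load head in                  (* current head of the shared list *)
    __store (node + NEXT_OFFSET) old;         (* link the new node in front of it *)
    if __cas head old node then ()            (* publish; succeeds only if head is still 'old' *)
    else shared_push head node                (* another core raced us: retry *)
  }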

Boot, end to end

Each core runs boot(). The boot ROM is the same for all cores; the behavior differs because this_core () == 0 is only true on core 0. Core 0 spawns the initial processes; all four cores enter the scheduler loop.

Once running, work distributes naturally:

  • Core 0 starts with processes; it runs one.
  • When core 0’s process yields or ticks, the scheduler pushes it back to ready.
  • If core 0’s ready queue gets long, distributed_ready rebalances by migrating a process to a less-loaded neighbor.
  • Cores 1, 2, 3 wake up from idle_or_steal, steal a process from core 0, run it.
  • As processes block on I/O, they move to waiters; when events occur, they’re pushed back to ready.
  • The system runs.

A nice property: processes don’t know they’re on a multicore system. The user code in proc_loop or echo_proc performs effects exactly as before. The same processes run on single-core or multicore deployments. The kernel mediates everything.

What this chapter committed to

The full multicore kernel, in approximately 150 lines. The single-line deployment-time choice between single and multicore. The work-stealing protocol implemented as effect handlers over the NoC. The realization that the only differences between deployments are in the bottom handlers, not in the kernel proper.

Part III is complete. The book has built up, from algebraic principles, an ISA, a language, and a kernel that runs on that hardware on one core or many. Part IV applies the design to surprising places.