runtime: use MSP/PSP registers for scheduling on Cortex-M

The Cortex-M architecture contains two stack pointers, designed to be used by RTOSes: MSP and PSP (where MSP is the default at reset). In fact, the ARM documentation recommends using the PSP for tasks in an RTOS. This commit switches to using the PSP for goroutine stacks. Aside from being the recommended operation, this has the big advantage that the NVIC automatically switches to the MSP when handling interrupts. This avoids having to make every goroutine stack big enough that interrupts can be handled on it. Additionally, I've optimized the assembly code to save/restore registers (made possible by this change). For Cortex-M3 and up, saving all registers is just a single push instruction and restoring+branching is a single pop instruction. For Cortex-M0 it's a bit more work because the push/pop instructions there don't support most high registers. Sidenote: the fact that you can pop a number of registers and branch at the same time makes ARM not exactly a true RISC system. However, it's very useful in this case.
2019-11-19 21:39:43 +01:00 · 2019-11-19 21:39:43 +01:00 · 3d3e48179e
--- a/src/runtime/scheduler_cortexm.S
+++ b/src/runtime/scheduler_cortexm.S
@ -19,76 +19,96 @@ tinygo_startTask:
    // After return, exit this goroutine. This is a tail call.
    bl    runtime.yield

+.section .text.tinygo_getSystemStackPointer
+.global  tinygo_getSystemStackPointer
+.type    tinygo_getSystemStackPointer, %function
+tinygo_getSystemStackPointer:
+    // The system stack pointer is always stored in the MSP register.
+    mrs r0, MSP
+    bx lr
+
+
+// switchToScheduler and switchToTask are also in the same section, to make sure
+// relative branches work.
 .section .text.tinygo_swapTask
+
+.global  tinygo_switchToScheduler
+.type    tinygo_switchToScheduler, %function
+tinygo_switchToScheduler:
+    // r0 = oldTask *task
+
+    // Currently on the task stack (SP=PSP). We need to store the position on
+    // the stack where the in-use registers will be stored.
+    mov r1, sp
+    subs r1, #36
+    str r1, [r0, #36]
+
+    b tinygo_swapTask
+
+.global  tinygo_switchToTask
+.type    tinygo_switchToTask, %function
+tinygo_switchToTask:
+    // r0 = newTask *task
+
+    // Currently on the scheduler stack (SP=MSP). We'll have to update the PSP,
+    // and then we can invoke swapTask.
+    ldr r0, [r0, #36]
+    msr PSP, r0
+
+    // Continue executing in the swapTask function, which swaps the stack
+    // pointer.
+
 .global  tinygo_swapTask
 .type    tinygo_swapTask, %function
 tinygo_swapTask:
-    // r0 = oldTask *task
-    // r1 = newTask *task
-
-    // This function stores the current register state to a task struct and
-    // loads the state of another task to replace the current state. Apart from
-    // saving and restoring all relevant callee-saved registers, it also ends
-    // with branching to the last program counter (saved as the lr register, to
-    // follow the ARM calling convention).
+    // This function stores the current register state to the stack, switches to
+    // the other stack (MSP/PSP), and loads the register state from the other
+    // stack. Apart from saving and restoring all relevant callee-saved
+    // registers, it also ends with branching to the last program counter (saved
+    // as the lr register, to follow the ARM calling convention).

    // On pre-Thumb2 CPUs (Cortex-M0 in particular), registers r8-r15 cannot be
    // used directly. Only very few operations work on them, such as mov. That's
    // why the higher register values are first stored in the temporary register
    // r3 when loading/storing them.
+    // It is possible to reduce the swapTask by two instructions (~2 cycles) on
+    // Cortex-M0 by reordering the layout of the pushed registers from {r4-r11,
+    // lr} to {r8-r11, r4-r7, lr}. However, that also requires a change on the
+    // Go side (depending on thumb1/thumb2!) and so is not really worth the
+    // complexity.

    // Store state to old task. It saves the lr instead of the pc, because that
    // will be the pc after returning back to the old task (in a different
    // invocation of swapTask).
-    str r4, [r0, #0]
-    str r5, [r0, #4]
-    str r6, [r0, #8]
-    str r7, [r0, #12]
    #if defined(__thumb2__)
-    str r8, [r0, #16]
-    str r9, [r0, #20]
-    str r10, [r0, #24]
-    str r11, [r0, #28]
-    str sp, [r0, #32]
-    str lr, [r0, #36]
+    push {r4-r11, lr}
    #else
-    mov r3, r8
-    str r3, [r0, #16]
-    mov r3, r9
-    str r3, [r0, #20]
-    mov r3, r10
-    str r3, [r0, #24]
+    mov r0, r8
+    mov r1, r9
+    mov r2, r10
    mov r3, r11
-    str r3, [r0, #28]
-    mov r3, sp
-    str r3, [r0, #32]
-    mov r3, lr
-    str r3, [r0, #36]
+    push {r0-r3, lr}
+    push {r4-r7}
    #endif

+    // Switch the stack. This could either switch from PSP to MSP, or from MSP
+    // to PSP. By using an XOR (eor), it will just switch to the other stack.
+    mrs  r0, CONTROL // load CONTROL register
+    movs r3, #2
+    eors r0, r0, r3  // flip the SPSEL (active stack pointer) bit
+    msr  CONTROL, r0 // store CONTROL register
+    isb              // required to flush the pipeline
+
    // Load state from new task and branch to the previous position in the
    // program.
-    ldr r4, [r1, #0]
-    ldr r5, [r1, #4]
-    ldr r6, [r1, #8]
-    ldr r7, [r1, #12]
    #if defined(__thumb2__)
-    ldr r8, [r1, #16]
-    ldr r9, [r1, #20]
-    ldr r10, [r1, #24]
-    ldr r11, [r1, #28]
-    ldr sp, [r1, #32]
+    pop {r4-r11, pc}
    #else
-    ldr r3, [r1, #16]
-    mov r8, r3
-    ldr r3, [r1, #20]
-    mov r9, r3
-    ldr r3, [r1, #24]
-    mov r10, r3
-    ldr r3, [r1, #28]
+    pop {r4-r7}
+    pop {r0-r3}
+    mov r8, r0
+    mov r9, r1
+    mov r10, r2
    mov r11, r3
-    ldr r3, [r1, #32]
-    mov sp, r3
+    pop {pc}
    #endif
-    ldr r3, [r1, #36]
-    bx r3
--- a/src/runtime/scheduler_tasks.go
+++ b/src/runtime/scheduler_tasks.go
@ -12,8 +12,7 @@ const stackSize = 1024
 const stackCanary = uintptr(uint64(0x670c1333b83bf575) & uint64(^uintptr(0)))

 var (
-	schedulerState = task{canary: stackCanary}
-	currentTask    *task // currently running goroutine, or nil
+	currentTask *task // currently running goroutine, or nil
 )

 // This type points to the bottom of the goroutine stack and contains some state
@ -22,10 +21,10 @@ var (
 type task struct {
 	// The order of fields in this structs must be kept in sync with assembly!
 	calleeSavedRegs
-	sp uintptr
 	pc uintptr
+	sp uintptr
 	taskState
-	canary uintptr // used to detect stack overflows
+	canaryPtr *uintptr // used to detect stack overflows
 }

 // getCoroutine returns the currently executing goroutine. It is used as an
@ -47,26 +46,24 @@ func (t *task) state() *taskState {
 // to the scheduler.
 func (t *task) resume() {
 	currentTask = t
-	swapTask(&schedulerState, t)
+	switchToTask(t)
 	currentTask = nil
 }

-// swapTask saves the current state to oldTask (which must contain the current
-// task state) and switches to newTask. Note that this function usually does
-// return, when another task (perhaps newTask) switches back to the current
-// task.
-//
-// As an additional protection, before switching tasks, it checks whether this
-// goroutine has overflowed the stack.
-func swapTask(oldTask, newTask *task) {
-	if oldTask.canary != stackCanary {
-		runtimePanic("goroutine stack overflow")
-	}
-	swapTaskLower(oldTask, newTask)
-}
+// switchToScheduler saves the current state on the stack, saves the current
+// stack pointer in the task, and switches to the scheduler. It must only be
+// called when actually running on this task.
+// When it returns, the scheduler has switched back to this task (for example,
+// after a blocking operation completed).
+//export tinygo_switchToScheduler
+func switchToScheduler(t *task)

-//go:linkname swapTaskLower tinygo_swapTask
-func swapTaskLower(oldTask, newTask *task)
+// switchToTask switches from the scheduler to the task. It must only be called
+// from the scheduler.
+// When this function returns, the task just yielded control back to the
+// scheduler.
+//export tinygo_switchToTask
+func switchToTask(t *task)

 // startTask is a small wrapper function that sets up the first (and only)
 // argument to the new goroutine and makes sure it is exited when the goroutine
@ -79,11 +76,20 @@ var startTask [0]uint8
 // adds it to the runqueue.
 func startGoroutine(fn, args uintptr) {
 	stack := alloc(stackSize)
-	t := (*task)(stack)
-	t.sp = uintptr(stack) + stackSize
+	t := (*task)(unsafe.Pointer(uintptr(stack) + stackSize - unsafe.Sizeof(task{})))
+
+	// Set up the stack canary, a random number that should be checked when
+	// switching from the task back to the scheduler. The stack canary pointer
+	// points to the first word of the stack. If it has changed between now and
+	// the next stack switch, there was a stack overflow.
+	t.canaryPtr = (*uintptr)(unsafe.Pointer(stack))
+	*t.canaryPtr = stackCanary
+
+	// Store the initial sp/pc for the startTask function (implemented in
+	// assembly).
+	t.sp = uintptr(stack) + stackSize - unsafe.Sizeof(task{})
 	t.pc = uintptr(unsafe.Pointer(&startTask))
 	t.prepareStartTask(fn, args)
-	t.canary = stackCanary
 	scheduleLogTask("  start goroutine:", t)
 	runqueuePushBack(t)
 }
@ -92,17 +98,15 @@ func startGoroutine(fn, args uintptr) {
 // any wakeups must be configured before calling yield
 //export runtime.yield
 func yield() {
-	swapTask(currentTask, &schedulerState)
+	// Check whether the canary (the lowest address of the stack) is still
+	// valid. If it is not, a stack overflow has occurred.
+	if *currentTask.canaryPtr != stackCanary {
+		runtimePanic("goroutine stack overflow")
+	}
+	switchToScheduler(currentTask)
 }

 // getSystemStackPointer returns the current stack pointer of the system stack.
 // This is not necessarily the same as the current stack pointer.
-func getSystemStackPointer() uintptr {
-	if currentTask == nil {
-		// Currently on the system stack.
-		return getCurrentStackPointer()
-	} else {
-		// Currently in a goroutine.
-		return schedulerState.sp
-	}
-}
+//export tinygo_getSystemStackPointer
+func getSystemStackPointer() uintptr