diff --git a/src/runtime/scheduler_cortexm.S b/src/runtime/scheduler_cortexm.S
index 9652d94e..6d83ba4f 100644
--- a/src/runtime/scheduler_cortexm.S
+++ b/src/runtime/scheduler_cortexm.S
@@ -19,76 +19,96 @@ tinygo_startTask:
     // After return, exit this goroutine. This is a tail call.
     bl    runtime.yield
 
+.section .text.tinygo_getSystemStackPointer
+.global  tinygo_getSystemStackPointer
+.type    tinygo_getSystemStackPointer, %function
+tinygo_getSystemStackPointer:
+    // The system stack pointer is always stored in the MSP register.
+    mrs r0, MSP
+    bx lr
+
+
+// switchToScheduler and switchToTask are also in the same section, to make sure
+// relative branches work.
 .section .text.tinygo_swapTask
+
+.global  tinygo_switchToScheduler
+.type    tinygo_switchToScheduler, %function
+tinygo_switchToScheduler:
+    // r0 = oldTask *task
+
+    // Currently on the task stack (SP=PSP). We need to store the position on
+    // the stack where the in-use registers will be stored.
+    mov r1, sp
+    subs r1, #36
+    str r1, [r0, #36]
+
+    b tinygo_swapTask
+
+.global  tinygo_switchToTask
+.type    tinygo_switchToTask, %function
+tinygo_switchToTask:
+    // r0 = newTask *task
+
+    // Currently on the scheduler stack (SP=MSP). We'll have to update the PSP,
+    // and then we can invoke swapTask.
+    ldr r0, [r0, #36]
+    msr PSP, r0
+
+    // Continue executing in the swapTask function, which swaps the stack
+    // pointer.
+
 .global  tinygo_swapTask
 .type    tinygo_swapTask, %function
 tinygo_swapTask:
-    // r0 = oldTask *task
-    // r1 = newTask *task
-
-    // This function stores the current register state to a task struct and
-    // loads the state of another task to replace the current state. Apart from
-    // saving and restoring all relevant callee-saved registers, it also ends
-    // with branching to the last program counter (saved as the lr register, to
-    // follow the ARM calling convention).
+    // This function stores the current register state to the stack, switches to
+    // the other stack (MSP/PSP), and loads the register state from the other
+    // stack. Apart from saving and restoring all relevant callee-saved
+    // registers, it also ends with branching to the last program counter (saved
+    // as the lr register, to follow the ARM calling convention).
 
     // On pre-Thumb2 CPUs (Cortex-M0 in particular), registers r8-r15 cannot be
     // used directly. Only very few operations work on them, such as mov. That's
     // why the higher register values are first stored in the temporary register
     // r3 when loading/storing them.
+    // It is possible to shorten swapTask by two instructions (~2 cycles) on
+    // Cortex-M0 by reordering the layout of the pushed registers from {r4-r11,
+    // lr} to {r8-r11, r4-r7, lr}. However, that also requires a change on the
+    // Go side (depending on thumb1/thumb2!) and so is not really worth the
+    // complexity.
 
     // Store state to old task. It saves the lr instead of the pc, because that
     // will be the pc after returning back to the old task (in a different
     // invocation of swapTask).
-    str r4, [r0, #0]
-    str r5, [r0, #4]
-    str r6, [r0, #8]
-    str r7, [r0, #12]
     #if defined(__thumb2__)
-    str r8, [r0, #16]
-    str r9, [r0, #20]
-    str r10, [r0, #24]
-    str r11, [r0, #28]
-    str sp, [r0, #32]
-    str lr, [r0, #36]
+    push {r4-r11, lr}
     #else
-    mov r3, r8
-    str r3, [r0, #16]
-    mov r3, r9
-    str r3, [r0, #20]
-    mov r3, r10
-    str r3, [r0, #24]
+    mov r0, r8
+    mov r1, r9
+    mov r2, r10
     mov r3, r11
-    str r3, [r0, #28]
-    mov r3, sp
-    str r3, [r0, #32]
-    mov r3, lr
-    str r3, [r0, #36]
+    push {r0-r3, lr}
+    push {r4-r7}
     #endif
 
+    // Switch the stack. This could either switch from PSP to MSP, or from MSP
+    // to PSP. By using an XOR (eor), it will just switch to the other stack.
+    mrs  r0, CONTROL // load CONTROL register
+    movs r3, #2
+    eors r0, r0, r3  // flip the SPSEL (active stack pointer) bit
+    msr  CONTROL, r0 // store CONTROL register
+    isb              // required to flush the pipeline
+
     // Load state from new task and branch to the previous position in the
     // program.
-    ldr r4, [r1, #0]
-    ldr r5, [r1, #4]
-    ldr r6, [r1, #8]
-    ldr r7, [r1, #12]
     #if defined(__thumb2__)
-    ldr r8, [r1, #16]
-    ldr r9, [r1, #20]
-    ldr r10, [r1, #24]
-    ldr r11, [r1, #28]
-    ldr sp, [r1, #32]
+    pop {r4-r11, pc}
     #else
-    ldr r3, [r1, #16]
-    mov r8, r3
-    ldr r3, [r1, #20]
-    mov r9, r3
-    ldr r3, [r1, #24]
-    mov r10, r3
-    ldr r3, [r1, #28]
+    pop {r4-r7}
+    pop {r0-r3}
+    mov r8, r0
+    mov r9, r1
+    mov r10, r2
     mov r11, r3
-    ldr r3, [r1, #32]
-    mov sp, r3
+    pop {pc}
     #endif
-    ldr r3, [r1, #36]
-    bx r3
diff --git a/src/runtime/scheduler_tasks.go b/src/runtime/scheduler_tasks.go
index b6f5ce3c..b2142892 100644
--- a/src/runtime/scheduler_tasks.go
+++ b/src/runtime/scheduler_tasks.go
@@ -12,8 +12,7 @@ const stackSize = 1024
 const stackCanary = uintptr(uint64(0x670c1333b83bf575) & uint64(^uintptr(0)))
 
 var (
-	schedulerState = task{canary: stackCanary}
-	currentTask    *task // currently running goroutine, or nil
+	currentTask *task // currently running goroutine, or nil
 )
 
 // This type points to the bottom of the goroutine stack and contains some state
@@ -22,10 +21,10 @@ var (
 type task struct {
 	// The order of fields in this structs must be kept in sync with assembly!
 	calleeSavedRegs
-	sp uintptr
 	pc uintptr
+	sp uintptr
 	taskState
-	canary uintptr // used to detect stack overflows
+	canaryPtr *uintptr // used to detect stack overflows
 }
 
 // getCoroutine returns the currently executing goroutine. It is used as an
@@ -47,26 +46,24 @@ func (t *task) state() *taskState {
 // to the scheduler.
 func (t *task) resume() {
 	currentTask = t
-	swapTask(&schedulerState, t)
+	switchToTask(t)
 	currentTask = nil
 }
 
-// swapTask saves the current state to oldTask (which must contain the current
-// task state) and switches to newTask. Note that this function usually does
-// return, when another task (perhaps newTask) switches back to the current
-// task.
-//
-// As an additional protection, before switching tasks, it checks whether this
-// goroutine has overflowed the stack.
-func swapTask(oldTask, newTask *task) {
-	if oldTask.canary != stackCanary {
-		runtimePanic("goroutine stack overflow")
-	}
-	swapTaskLower(oldTask, newTask)
-}
+// switchToScheduler saves the current state on the stack, saves the current
+// stack pointer in the task, and switches to the scheduler. It must only be
+// called when actually running on this task.
+// When it returns, the scheduler has switched back to this task (for example,
+// after a blocking operation completed).
+//export tinygo_switchToScheduler
+func switchToScheduler(t *task)
 
-//go:linkname swapTaskLower tinygo_swapTask
-func swapTaskLower(oldTask, newTask *task)
+// switchToTask switches from the scheduler to the task. It must only be called
+// from the scheduler.
+// When this function returns, the task just yielded control back to the
+// scheduler.
+//export tinygo_switchToTask
+func switchToTask(t *task)
 
 // startTask is a small wrapper function that sets up the first (and only)
 // argument to the new goroutine and makes sure it is exited when the goroutine
@@ -79,11 +76,20 @@ var startTask [0]uint8
 // adds it to the runqueue.
 func startGoroutine(fn, args uintptr) {
 	stack := alloc(stackSize)
-	t := (*task)(stack)
-	t.sp = uintptr(stack) + stackSize
+	t := (*task)(unsafe.Pointer(uintptr(stack) + stackSize - unsafe.Sizeof(task{})))
+
+	// Set up the stack canary, a fixed magic number that is checked when
+	// switching from the task back to the scheduler. The stack canary pointer
+	// points to the first word of the stack. If it has changed between now and
+	// the next stack switch, there was a stack overflow.
+	t.canaryPtr = (*uintptr)(unsafe.Pointer(stack))
+	*t.canaryPtr = stackCanary
+
+	// Store the initial sp/pc for the startTask function (implemented in
+	// assembly).
+	t.sp = uintptr(stack) + stackSize - unsafe.Sizeof(task{})
 	t.pc = uintptr(unsafe.Pointer(&startTask))
 	t.prepareStartTask(fn, args)
-	t.canary = stackCanary
 	scheduleLogTask("  start goroutine:", t)
 	runqueuePushBack(t)
 }
@@ -92,17 +98,15 @@ func startGoroutine(fn, args uintptr) {
 // any wakeups must be configured before calling yield
 //export runtime.yield
 func yield() {
-	swapTask(currentTask, &schedulerState)
+	// Check whether the canary (the lowest address of the stack) is still
+	// valid. If it is not, a stack overflow has occurred.
+	if *currentTask.canaryPtr != stackCanary {
+		runtimePanic("goroutine stack overflow")
+	}
+	switchToScheduler(currentTask)
 }
 
 // getSystemStackPointer returns the current stack pointer of the system stack.
 // This is not necessarily the same as the current stack pointer.
-func getSystemStackPointer() uintptr {
-	if currentTask == nil {
-		// Currently on the system stack.
-		return getCurrentStackPointer()
-	} else {
-		// Currently in a goroutine.
-		return schedulerState.sp
-	}
-}
+//export tinygo_getSystemStackPointer
+func getSystemStackPointer() uintptr