core: major improvements to blocking, including support for buffered channels.
This commit is contained in:
Jaden Weiss 2019-09-22 11:58:00 -04:00 committed by Ron Evans
parent d17f500c8b
commit d843ebfe40
12 changed files with 1069 additions and 486 deletions

View file

@ -1,6 +1,7 @@
package compiler
import (
"fmt"
"golang.org/x/tools/go/ssa"
"tinygo.org/x/go-llvm"
)
@ -20,6 +21,9 @@ func (c *Compiler) createRuntimeCall(fnName string, args []llvm.Value, name stri
panic("trying to call runtime." + fnName)
}
fn := c.ir.GetFunction(member.(*ssa.Function))
if fn.LLVMFn.IsNil() {
panic(fmt.Errorf("function %s does not appear in LLVM IR", fnName))
}
if !fn.IsExported() {
args = append(args, llvm.Undef(c.i8ptrType)) // unused context parameter
args = append(args, llvm.ConstPointerNull(c.i8ptrType)) // coroutine handle

View file

@ -4,32 +4,17 @@ package compiler
// or pseudo-operations that are lowered during goroutine lowering.
import (
"fmt"
"go/types"
"golang.org/x/tools/go/ssa"
"tinygo.org/x/go-llvm"
)
// emitMakeChan returns a new channel value for the given channel type.
func (c *Compiler) emitMakeChan(expr *ssa.MakeChan) (llvm.Value, error) {
chanType := c.getLLVMType(expr.Type())
size := c.targetData.TypeAllocSize(chanType.ElementType())
sizeValue := llvm.ConstInt(c.uintptrType, size, false)
ptr := c.createRuntimeCall("alloc", []llvm.Value{sizeValue}, "chan.alloc")
ptr = c.builder.CreateBitCast(ptr, chanType, "chan")
// Set the elementSize field
elementSizePtr := c.builder.CreateGEP(ptr, []llvm.Value{
llvm.ConstInt(c.ctx.Int32Type(), 0, false),
llvm.ConstInt(c.ctx.Int32Type(), 0, false),
}, "")
func (c *Compiler) emitMakeChan(frame *Frame, expr *ssa.MakeChan) llvm.Value {
elementSize := c.targetData.TypeAllocSize(c.getLLVMType(expr.Type().(*types.Chan).Elem()))
if elementSize > 0xffff {
return ptr, c.makeError(expr.Pos(), fmt.Sprintf("element size is %d bytes, which is bigger than the maximum of %d bytes", elementSize, 0xffff))
}
elementSizeValue := llvm.ConstInt(c.ctx.Int16Type(), elementSize, false)
c.builder.CreateStore(elementSizeValue, elementSizePtr)
return ptr, nil
elementSizeValue := llvm.ConstInt(c.uintptrType, elementSize, false)
bufSize := c.getValue(frame, expr.Size)
return c.createRuntimeCall("chanMake", []llvm.Value{elementSizeValue, bufSize}, "")
}
// emitChanSend emits a pseudo chan send operation. It is lowered to the actual
@ -44,8 +29,7 @@ func (c *Compiler) emitChanSend(frame *Frame, instr *ssa.Send) {
c.builder.CreateStore(chanValue, valueAlloca)
// Do the send.
coroutine := c.createRuntimeCall("getCoroutine", nil, "")
c.createRuntimeCall("chanSend", []llvm.Value{coroutine, ch, valueAllocaCast}, "")
c.createRuntimeCall("chanSend", []llvm.Value{ch, valueAllocaCast}, "")
// End the lifetime of the alloca.
// This also works around a bug in CoroSplit, at least in LLVM 8:
@ -63,14 +47,11 @@ func (c *Compiler) emitChanRecv(frame *Frame, unop *ssa.UnOp) llvm.Value {
valueAlloca, valueAllocaCast, valueAllocaSize := c.createTemporaryAlloca(valueType, "chan.value")
// Do the receive.
coroutine := c.createRuntimeCall("getCoroutine", nil, "")
c.createRuntimeCall("chanRecv", []llvm.Value{coroutine, ch, valueAllocaCast}, "")
commaOk := c.createRuntimeCall("chanRecv", []llvm.Value{ch, valueAllocaCast}, "")
received := c.builder.CreateLoad(valueAlloca, "chan.received")
c.emitLifetimeEnd(valueAllocaCast, valueAllocaSize)
if unop.CommaOk {
commaOk := c.createRuntimeCall("getTaskStateData", []llvm.Value{coroutine}, "chan.commaOk.wide")
commaOk = c.builder.CreateTrunc(commaOk, c.ctx.Int1Type(), "chan.commaOk")
tuple := llvm.Undef(c.ctx.StructType([]llvm.Type{valueType, c.ctx.Int1Type()}, false))
tuple = c.builder.CreateInsertValue(tuple, received, 0, "")
tuple = c.builder.CreateInsertValue(tuple, commaOk, 1, "")
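For orientation, a minimal Go-level sketch of the statements these intrinsics lower; the runtime function names match the createRuntimeCall sites above, everything else is illustrative:

package main

func main() {
    ch := make(chan int, 4) // emitMakeChan -> runtime.chanMake(elementSize, bufSize)
    ch <- 1                 // emitChanSend -> runtime.chanSend(ch, &value)
    v, ok := <-ch           // emitChanRecv -> runtime.chanRecv(ch, &value); ok comes from the call's return value
    println(v, ok)
}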

View file

@ -36,13 +36,23 @@ const tinygoPath = "github.com/tinygo-org/tinygo"
var functionsUsedInTransforms = []string{
"runtime.alloc",
"runtime.free",
"runtime.sleepTask",
"runtime.sleepCurrentTask",
"runtime.scheduler",
}
var taskFunctionsUsedInTransforms = []string{
"runtime.startGoroutine",
}
var coroFunctionsUsedInTransforms = []string{
"runtime.avrSleep",
"runtime.getFakeCoroutine",
"runtime.setTaskStatePtr",
"runtime.getTaskStatePtr",
"runtime.activateTask",
"runtime.scheduler",
"runtime.startGoroutine",
"runtime.noret",
"runtime.getParentHandle",
"runtime.getCoroutine",
"runtime.llvmCoroRefHolder",
}
// Configure the compiler.
@ -201,6 +211,20 @@ func (c *Compiler) selectScheduler() string {
return "coroutines"
}
// getFunctionsUsedInTransforms gets a list of all special functions that should be preserved during transforms and optimization.
func (c *Compiler) getFunctionsUsedInTransforms() []string {
fnused := functionsUsedInTransforms
switch c.selectScheduler() {
case "coroutines":
fnused = append(append([]string{}, fnused...), coroFunctionsUsedInTransforms...)
case "tasks":
fnused = append(append([]string{}, fnused...), taskFunctionsUsedInTransforms...)
default:
panic(fmt.Errorf("invalid scheduler %q", c.selectScheduler()))
}
return fnused
}
// Compile the given package path or .go file path. Return an error when this
// fails (in any stage).
func (c *Compiler) Compile(mainPath string) []error {
@ -366,10 +390,10 @@ func (c *Compiler) Compile(mainPath string) []error {
realMain.SetLinkage(llvm.ExternalLinkage) // keep alive until goroutine lowering
// Make sure these functions are kept intact during TinyGo transformation passes.
for _, name := range functionsUsedInTransforms {
for _, name := range c.getFunctionsUsedInTransforms() {
fn := c.mod.NamedFunction(name)
if fn.IsNil() {
continue
panic(fmt.Errorf("missing core function %q", name))
}
fn.SetLinkage(llvm.ExternalLinkage)
}
@ -1618,7 +1642,7 @@ func (c *Compiler) parseExpr(frame *Frame, expr ssa.Value) (llvm.Value, error) {
panic("unknown lookup type: " + expr.String())
}
case *ssa.MakeChan:
return c.emitMakeChan(expr)
return c.emitMakeChan(frame, expr), nil
case *ssa.MakeClosure:
return c.parseMakeClosure(frame, expr)
case *ssa.MakeInterface:

View file

@ -105,16 +105,20 @@ package compiler
import (
"errors"
"fmt"
"strings"
"tinygo.org/x/go-llvm"
)
// setting this to true will cause the compiler to spew tons of information about coroutine transformations
// this can be useful when debugging coroutine lowering or looking for potential missed optimizations
const coroDebug = false
type asyncFunc struct {
taskHandle llvm.Value
cleanupBlock llvm.BasicBlock
suspendBlock llvm.BasicBlock
unreachableBlock llvm.BasicBlock
}
// LowerGoroutines performs some IR transformations necessary to support
@ -142,7 +146,7 @@ func (c *Compiler) lowerTasks() error {
mainCall := uses[0]
realMain := c.mod.NamedFunction(c.ir.MainPkg().Pkg.Path() + ".main")
if len(getUses(c.mod.NamedFunction("runtime.startGoroutine"))) != 0 {
if len(getUses(c.mod.NamedFunction("runtime.startGoroutine"))) != 0 || len(getUses(c.mod.NamedFunction("runtime.yield"))) != 0 {
// Program needs a scheduler. Start main.main as a goroutine and start
// the scheduler.
realMainWrapper := c.createGoroutineStartWrapper(realMain)
@ -150,10 +154,6 @@ func (c *Compiler) lowerTasks() error {
zero := llvm.ConstInt(c.uintptrType, 0, false)
c.createRuntimeCall("startGoroutine", []llvm.Value{realMainWrapper, zero}, "")
c.createRuntimeCall("scheduler", nil, "")
sleep := c.mod.NamedFunction("time.Sleep")
if !sleep.IsNil() {
sleep.ReplaceAllUsesWith(c.mod.NamedFunction("runtime.sleepCurrentTask"))
}
} else {
// Program doesn't need a scheduler. Call main.main directly.
c.builder.SetInsertPointBefore(mainCall)
@ -162,9 +162,6 @@ func (c *Compiler) lowerTasks() error {
llvm.Undef(c.i8ptrType), // unused coroutine handle
}
c.createCall(realMain, params, "")
// runtime.Goexit isn't needed so let it be optimized away by
// globalopt.
c.mod.NamedFunction("runtime.Goexit").SetLinkage(llvm.InternalLinkage)
}
mainCall.EraseFromParentAsInstruction()
@ -195,7 +192,13 @@ func (c *Compiler) lowerCoroutines() error {
// optionally followed by a call to runtime.scheduler().
c.builder.SetInsertPointBefore(mainCall)
realMain := c.mod.NamedFunction(c.ir.MainPkg().Pkg.Path() + ".main")
c.builder.CreateCall(realMain, []llvm.Value{llvm.Undef(c.i8ptrType), llvm.ConstPointerNull(c.i8ptrType)}, "")
var ph llvm.Value
if needsScheduler {
ph = c.createRuntimeCall("getFakeCoroutine", []llvm.Value{}, "")
} else {
ph = llvm.Undef(c.i8ptrType)
}
c.builder.CreateCall(realMain, []llvm.Value{llvm.Undef(c.i8ptrType), ph}, "")
if needsScheduler {
c.createRuntimeCall("scheduler", nil, "")
}
@ -218,6 +221,12 @@ func (c *Compiler) lowerCoroutines() error {
return nil
}
func coroDebugPrintln(s ...interface{}) {
if coroDebug {
fmt.Println(s...)
}
}
// markAsyncFunctions does the bulk of the work of lowering goroutines. It
// determines whether a scheduler is needed, and if it is, it transforms
// blocking operations into goroutines and blocking calls into await calls.
@ -233,26 +242,14 @@ func (c *Compiler) lowerCoroutines() error {
func (c *Compiler) markAsyncFunctions() (needsScheduler bool, err error) {
var worklist []llvm.Value
sleep := c.mod.NamedFunction("time.Sleep")
if !sleep.IsNil() {
worklist = append(worklist, sleep)
}
deadlock := c.mod.NamedFunction("runtime.deadlock")
if !deadlock.IsNil() {
worklist = append(worklist, deadlock)
}
chanSend := c.mod.NamedFunction("runtime.chanSend")
if !chanSend.IsNil() {
worklist = append(worklist, chanSend)
}
chanRecv := c.mod.NamedFunction("runtime.chanRecv")
if !chanRecv.IsNil() {
worklist = append(worklist, chanRecv)
yield := c.mod.NamedFunction("runtime.yield")
if !yield.IsNil() {
worklist = append(worklist, yield)
}
if len(worklist) == 0 {
// There are no blocking operations, so no need to transform anything.
return false, c.lowerMakeGoroutineCalls()
return false, c.lowerMakeGoroutineCalls(false)
}
// Find all async functions.
@ -269,6 +266,9 @@ func (c *Compiler) markAsyncFunctions() (needsScheduler bool, err error) {
if _, ok := asyncFuncs[f]; ok {
continue // already processed
}
if f.Name() == "resume" {
continue
}
// Add to set of async functions.
asyncFuncs[f] = &asyncFunc{}
asyncList = append(asyncList, f)
@ -312,11 +312,23 @@ func (c *Compiler) markAsyncFunctions() (needsScheduler bool, err error) {
// Check whether a scheduler is needed.
makeGoroutine := c.mod.NamedFunction("runtime.makeGoroutine")
if c.GOOS == "js" && strings.HasPrefix(c.Triple, "wasm") {
// JavaScript always needs a scheduler, as in general no blocking
// operations are possible. Blocking operations block the browser UI,
// which is very bad.
needsScheduler = true
if strings.HasPrefix(c.Triple, "avr") {
needsScheduler = false
getCoroutine := c.mod.NamedFunction("runtime.getCoroutine")
for _, inst := range getUses(getCoroutine) {
inst.ReplaceAllUsesWith(llvm.Undef(inst.Type()))
inst.EraseFromParentAsInstruction()
}
yield := c.mod.NamedFunction("runtime.yield")
for _, inst := range getUses(yield) {
inst.EraseFromParentAsInstruction()
}
sleep := c.mod.NamedFunction("time.Sleep")
for _, inst := range getUses(sleep) {
c.builder.SetInsertPointBefore(inst)
c.createRuntimeCall("avrSleep", []llvm.Value{inst.Operand(0)}, "")
inst.EraseFromParentAsInstruction()
}
} else {
// Only use a scheduler when an async goroutine is started. When the
// goroutine is not async (does not do any blocking operation), no
@ -328,18 +340,353 @@ func (c *Compiler) markAsyncFunctions() (needsScheduler bool, err error) {
panic("expected const ptrtoint operand of runtime.makeGoroutine")
}
goroutine := ptrtoint.Operand(0)
if goroutine.Name() == "runtime.fakeCoroutine" {
continue
}
if _, ok := asyncFuncs[goroutine]; ok {
needsScheduler = true
break
}
}
if _, ok := asyncFuncs[c.mod.NamedFunction(c.ir.MainPkg().Pkg.Path()+".main")]; ok {
needsScheduler = true
}
}
if !needsScheduler {
// on wasm, we may still have calls to deadlock
// replace these with an abort
abort := c.mod.NamedFunction("runtime.abort")
if deadlock := c.mod.NamedFunction("runtime.deadlock"); !deadlock.IsNil() {
deadlock.ReplaceAllUsesWith(abort)
}
// No scheduler is needed. Do not transform all functions here.
// However, make sure that all go calls (which are all non-async) are
// transformed into regular calls.
return false, c.lowerMakeGoroutineCalls()
return false, c.lowerMakeGoroutineCalls(false)
}
if noret := c.mod.NamedFunction("runtime.noret"); noret.IsNil() {
panic("missing noret")
}
// replace indefinitely blocking yields
getCoroutine := c.mod.NamedFunction("runtime.getCoroutine")
coroDebugPrintln("replace indefinitely blocking yields")
nonReturning := map[llvm.Value]bool{}
for _, f := range asyncList {
if f == yield {
continue
}
coroDebugPrintln("scanning", f.Name())
var callsAsyncNotYield bool
var callsYield bool
var getsCoroutine bool
for bb := f.EntryBasicBlock(); !bb.IsNil(); bb = llvm.NextBasicBlock(bb) {
for inst := bb.FirstInstruction(); !inst.IsNil(); inst = llvm.NextInstruction(inst) {
if !inst.IsACallInst().IsNil() {
callee := inst.CalledValue()
if callee == yield {
callsYield = true
} else if callee == getCoroutine {
getsCoroutine = true
} else if _, ok := asyncFuncs[callee]; ok {
callsAsyncNotYield = true
}
}
}
}
coroDebugPrintln("result", f.Name(), callsYield, getsCoroutine, callsAsyncNotYield)
if callsYield && !getsCoroutine && !callsAsyncNotYield {
coroDebugPrintln("optimizing", f.Name())
// calls yield without registering for a wakeup
// it could in principle still be woken up, but only as a result of badly broken undefined behavior
// so everything after the yield is unreachable and we can simply inject a fake return
delQueue := []llvm.Value{}
for bb := f.EntryBasicBlock(); !bb.IsNil(); bb = llvm.NextBasicBlock(bb) {
var broken bool
for inst := bb.FirstInstruction(); !inst.IsNil(); inst = llvm.NextInstruction(inst) {
if !broken && !inst.IsACallInst().IsNil() && inst.CalledValue() == yield {
coroDebugPrintln("broke", f.Name(), bb.AsValue().Name())
broken = true
c.builder.SetInsertPointBefore(inst)
c.createRuntimeCall("noret", []llvm.Value{}, "")
if f.Type().ElementType().ReturnType().TypeKind() == llvm.VoidTypeKind {
c.builder.CreateRetVoid()
} else {
c.builder.CreateRet(llvm.Undef(f.Type().ElementType().ReturnType()))
}
}
if broken {
if inst.Type().TypeKind() != llvm.VoidTypeKind {
inst.ReplaceAllUsesWith(llvm.Undef(inst.Type()))
}
delQueue = append(delQueue, inst)
}
}
if !broken {
coroDebugPrintln("did not break", f.Name(), bb.AsValue().Name())
}
}
for _, v := range delQueue {
v.EraseFromParentAsInstruction()
}
nonReturning[f] = true
}
}
// convert direct calls into an async call followed by a yield operation
coroDebugPrintln("convert direct calls into an async call followed by a yield operation")
for _, f := range asyncList {
if f == yield {
continue
}
coroDebugPrintln("scanning", f.Name())
var retAlloc llvm.Value
// Rewrite async calls
for bb := f.EntryBasicBlock(); !bb.IsNil(); bb = llvm.NextBasicBlock(bb) {
for inst := bb.FirstInstruction(); !inst.IsNil(); inst = llvm.NextInstruction(inst) {
if !inst.IsACallInst().IsNil() {
callee := inst.CalledValue()
if _, ok := asyncFuncs[callee]; !ok || callee == yield {
continue
}
uses := getUses(inst)
next := llvm.NextInstruction(inst)
switch {
case nonReturning[callee]:
// callee blocks forever
coroDebugPrintln("optimizing indefinitely blocking call", f.Name(), callee.Name())
// never calls getCoroutine - coroutine handle is irrelevant
inst.SetOperand(inst.OperandsCount()-2, llvm.Undef(c.i8ptrType))
// insert return
c.builder.SetInsertPointBefore(next)
c.createRuntimeCall("noret", []llvm.Value{}, "")
var retInst llvm.Value
if f.Type().ElementType().ReturnType().TypeKind() == llvm.VoidTypeKind {
retInst = c.builder.CreateRetVoid()
} else {
retInst = c.builder.CreateRet(llvm.Undef(f.Type().ElementType().ReturnType()))
}
// delete everything after return
for next := llvm.NextInstruction(retInst); !next.IsNil(); next = llvm.NextInstruction(retInst) {
next.ReplaceAllUsesWith(llvm.Undef(retInst.Type()))
next.EraseFromParentAsInstruction()
}
continue
case next.IsAReturnInst().IsNil():
// not a return instruction
coroDebugPrintln("not a return instruction", f.Name(), callee.Name())
case callee.Type().ElementType().ReturnType() != f.Type().ElementType().ReturnType():
// return types do not match
coroDebugPrintln("return types do not match", f.Name(), callee.Name())
case callee.Type().ElementType().ReturnType().TypeKind() == llvm.VoidTypeKind:
fallthrough
case next.Operand(0) == inst:
// async tail call optimization - just pass parent handle
coroDebugPrintln("doing async tail call opt", f.Name())
// insert before call
c.builder.SetInsertPointBefore(inst)
// get parent handle
parentHandle := c.createRuntimeCall("getParentHandle", []llvm.Value{}, "")
// pass parent handle directly into function
inst.SetOperand(inst.OperandsCount()-2, parentHandle)
if inst.Type().TypeKind() != llvm.VoidTypeKind {
// delete return value
uses[0].SetOperand(0, llvm.Undef(inst.Type()))
}
c.builder.SetInsertPointBefore(next)
c.createRuntimeCall("yield", []llvm.Value{}, "")
c.createRuntimeCall("noret", []llvm.Value{}, "")
continue
}
coroDebugPrintln("inserting regular call", f.Name(), callee.Name())
c.builder.SetInsertPointBefore(inst)
// insert call to getCoroutine, this will be lowered later
coro := c.createRuntimeCall("getCoroutine", []llvm.Value{}, "")
// provide coroutine handle to function
inst.SetOperand(inst.OperandsCount()-2, coro)
// Allocate space for the return value.
var retvalAlloca llvm.Value
if inst.Type().TypeKind() != llvm.VoidTypeKind {
if retAlloc.IsNil() {
// insert at start of function
c.builder.SetInsertPointBefore(f.EntryBasicBlock().FirstInstruction())
// allocate return value buffer
retAlloc = c.builder.CreateAlloca(inst.Type(), "coro.retvalAlloca")
}
retvalAlloca = retAlloc
// call before function
c.builder.SetInsertPointBefore(inst)
// cast buffer pointer to *i8
data := c.builder.CreateBitCast(retvalAlloca, c.i8ptrType, "")
// set state pointer to return value buffer so it can be written back
c.createRuntimeCall("setTaskStatePtr", []llvm.Value{coro, data}, "")
}
// insert yield after starting function
c.builder.SetInsertPointBefore(llvm.NextInstruction(inst))
yieldCall := c.createRuntimeCall("yield", []llvm.Value{}, "")
if !retvalAlloca.IsNil() && !inst.FirstUse().IsNil() {
// Load the return value from the alloca.
// The callee has written the return value to it.
c.builder.SetInsertPointBefore(llvm.NextInstruction(yieldCall))
retval := c.builder.CreateLoad(retvalAlloca, "coro.retval")
inst.ReplaceAllUsesWith(retval)
}
}
}
}
}
// ditch unnecessary tail yields
coroDebugPrintln("ditch unnecessary tail yields")
noret := c.mod.NamedFunction("runtime.noret")
for _, f := range asyncList {
if f == yield {
continue
}
coroDebugPrintln("scanning", f.Name())
// we can only ditch a yield if we can ditch all yields
var yields []llvm.Value
var canDitch bool
scanYields:
for bb := f.EntryBasicBlock(); !bb.IsNil(); bb = llvm.NextBasicBlock(bb) {
for inst := bb.FirstInstruction(); !inst.IsNil(); inst = llvm.NextInstruction(inst) {
if inst.IsACallInst().IsNil() || inst.CalledValue() != yield {
continue
}
yields = append(yields, inst)
// we can only ditch the yield if the next instruction is a void return *or* noret
next := llvm.NextInstruction(inst)
ditchable := false
switch {
case !next.IsACallInst().IsNil() && next.CalledValue() == noret:
coroDebugPrintln("ditching yield with noret", f.Name())
ditchable = true
case !next.IsAReturnInst().IsNil() && f.Type().ElementType().ReturnType().TypeKind() == llvm.VoidTypeKind:
coroDebugPrintln("ditching yield with void return", f.Name())
ditchable = true
case !next.IsAReturnInst().IsNil():
coroDebugPrintln("not ditching because return is not void", f.Name(), f.Type().ElementType().ReturnType().String())
default:
coroDebugPrintln("not ditching", f.Name())
}
if !ditchable {
// unditchable yield
canDitch = false
break scanYields
}
// ditchable yield
canDitch = true
}
}
if canDitch {
coroDebugPrintln("ditching all in", f.Name())
for _, inst := range yields {
if !llvm.NextInstruction(inst).IsAReturnInst().IsNil() {
// insert noret
coroDebugPrintln("insering noret", f.Name())
c.builder.SetInsertPointBefore(inst)
c.createRuntimeCall("noret", []llvm.Value{}, "")
}
// delete original yield
inst.EraseFromParentAsInstruction()
}
}
}
// generate return reactivations
coroDebugPrintln("generate return reactivations")
for _, f := range asyncList {
if f == yield {
continue
}
coroDebugPrintln("scanning", f.Name())
var retPtr llvm.Value
for bb := f.EntryBasicBlock(); !bb.IsNil(); bb = llvm.NextBasicBlock(bb) {
block:
for inst := bb.FirstInstruction(); !inst.IsNil(); inst = llvm.NextInstruction(inst) {
switch {
case !inst.IsACallInst().IsNil() && inst.CalledValue() == noret:
// does not return normally - skip this basic block
coroDebugPrintln("noret found - skipping", f.Name(), bb.AsValue().Name())
break block
case !inst.IsAReturnInst().IsNil():
// return instruction - rewrite to reactivation
coroDebugPrintln("adding return reactivation", f.Name(), bb.AsValue().Name())
if f.Type().ElementType().ReturnType().TypeKind() != llvm.VoidTypeKind {
// returns something
if retPtr.IsNil() {
coroDebugPrintln("adding return pointer get", f.Name())
// get return pointer in entry block
c.builder.SetInsertPointBefore(f.EntryBasicBlock().FirstInstruction())
parentHandle := c.createRuntimeCall("getParentHandle", []llvm.Value{}, "")
ptr := c.createRuntimeCall("getTaskStatePtr", []llvm.Value{parentHandle}, "")
retPtr = c.builder.CreateBitCast(ptr, llvm.PointerType(f.Type().ElementType().ReturnType(), 0), "retPtr")
}
coroDebugPrintln("adding return store", f.Name(), bb.AsValue().Name())
// store result into return pointer
c.builder.SetInsertPointBefore(inst)
c.builder.CreateStore(inst.Operand(0), retPtr)
// delete return value
inst.SetOperand(0, llvm.Undef(inst.Type()))
}
// insert reactivation call
c.builder.SetInsertPointBefore(inst)
parentHandle := c.createRuntimeCall("getParentHandle", []llvm.Value{}, "")
c.createRuntimeCall("activateTask", []llvm.Value{parentHandle}, "")
// mark as noret
c.builder.SetInsertPointBefore(inst)
c.createRuntimeCall("noret", []llvm.Value{}, "")
break block
// DO NOT ERASE THE RETURN!!!!!!!
}
}
}
}
// Create a few LLVM intrinsics for coroutine support.
@ -362,45 +709,62 @@ func (c *Compiler) markAsyncFunctions() (needsScheduler bool, err error) {
coroFreeType := llvm.FunctionType(c.i8ptrType, []llvm.Type{c.ctx.TokenType(), c.i8ptrType}, false)
coroFreeFunc := llvm.AddFunction(c.mod, "llvm.coro.free", coroFreeType)
// Transform all async functions into coroutines.
// split blocks and add LLVM coroutine intrinsics
coroDebugPrintln("split blocks and add LLVM coroutine intrinsics")
for _, f := range asyncList {
if f == sleep || f == deadlock || f == chanSend || f == chanRecv {
if f == yield {
continue
}
frame := asyncFuncs[f]
frame.cleanupBlock = c.ctx.AddBasicBlock(f, "task.cleanup")
frame.suspendBlock = c.ctx.AddBasicBlock(f, "task.suspend")
frame.unreachableBlock = c.ctx.AddBasicBlock(f, "task.unreachable")
// Scan for async calls and return instructions that need to have
// suspend points inserted.
var asyncCalls []llvm.Value
var returns []llvm.Value
// find calls to yield
var yieldCalls []llvm.Value
for bb := f.EntryBasicBlock(); !bb.IsNil(); bb = llvm.NextBasicBlock(bb) {
for inst := bb.FirstInstruction(); !inst.IsNil(); inst = llvm.NextInstruction(inst) {
if !inst.IsACallInst().IsNil() {
callee := inst.CalledValue()
if _, ok := asyncFuncs[callee]; !ok || callee == sleep || callee == deadlock || callee == chanSend || callee == chanRecv {
continue
}
asyncCalls = append(asyncCalls, inst)
} else if !inst.IsAReturnInst().IsNil() {
returns = append(returns, inst)
if !inst.IsACallInst().IsNil() && inst.CalledValue() == yield {
yieldCalls = append(yieldCalls, inst)
}
}
}
// Coroutine setup.
if len(yieldCalls) == 0 {
// no yields - we do not have to LLVM-ify this
coroDebugPrintln("skipping", f.Name())
for bb := f.EntryBasicBlock(); !bb.IsNil(); bb = llvm.NextBasicBlock(bb) {
for inst := bb.FirstInstruction(); !inst.IsNil(); inst = llvm.NextInstruction(inst) {
if !inst.IsACallInst().IsNil() && inst.CalledValue() == getCoroutine {
// no separate local task - replace getCoroutine with getParentHandle
c.builder.SetInsertPointBefore(inst)
inst.ReplaceAllUsesWith(c.createRuntimeCall("getParentHandle", []llvm.Value{}, ""))
inst.EraseFromParentAsInstruction()
}
}
}
continue
}
coroDebugPrintln("converting", f.Name())
// get frame data to mess with
frame := asyncFuncs[f]
// add basic blocks to put cleanup and suspend code
frame.cleanupBlock = c.ctx.AddBasicBlock(f, "task.cleanup")
frame.suspendBlock = c.ctx.AddBasicBlock(f, "task.suspend")
// at start of function
c.builder.SetInsertPointBefore(f.EntryBasicBlock().FirstInstruction())
taskState := c.builder.CreateAlloca(c.getLLVMRuntimeType("taskState"), "task.state")
stateI8 := c.builder.CreateBitCast(taskState, c.i8ptrType, "task.state.i8")
// get LLVM-assigned coroutine ID
id := c.builder.CreateCall(coroIdFunc, []llvm.Value{
llvm.ConstInt(c.ctx.Int32Type(), 0, false),
stateI8,
llvm.ConstNull(c.i8ptrType),
llvm.ConstNull(c.i8ptrType),
}, "task.token")
// allocate buffer for task struct
size := c.builder.CreateCall(coroSizeFunc, nil, "task.size")
if c.targetData.TypeAllocSize(size.Type()) > c.targetData.TypeAllocSize(c.uintptrType) {
size = c.builder.CreateTrunc(size, c.uintptrType, "task.size.uintptr")
@ -411,108 +775,10 @@ func (c *Compiler) markAsyncFunctions() (needsScheduler bool, err error) {
if c.needsStackObjects() {
c.trackPointer(data)
}
// invoke llvm.coro.begin intrinsic and save task pointer
frame.taskHandle = c.builder.CreateCall(coroBeginFunc, []llvm.Value{id, data}, "task.handle")
// Modify async calls so this function suspends right after the child
// returns, because the child is probably not finished yet. Wait until
// the child reactivates the parent.
for _, inst := range asyncCalls {
inst.SetOperand(inst.OperandsCount()-2, frame.taskHandle)
// Split this basic block.
await := c.splitBasicBlock(inst, llvm.NextBasicBlock(c.builder.GetInsertBlock()), "task.await")
// Allocate space for the return value.
var retvalAlloca llvm.Value
if inst.Type().TypeKind() != llvm.VoidTypeKind {
c.builder.SetInsertPointBefore(inst.InstructionParent().Parent().EntryBasicBlock().FirstInstruction())
retvalAlloca = c.builder.CreateAlloca(inst.Type(), "coro.retvalAlloca")
c.builder.SetInsertPointBefore(inst)
data := c.builder.CreateBitCast(retvalAlloca, c.i8ptrType, "")
c.createRuntimeCall("setTaskStatePtr", []llvm.Value{frame.taskHandle, data}, "")
}
// Suspend.
c.builder.SetInsertPointAtEnd(inst.InstructionParent())
continuePoint := c.builder.CreateCall(coroSuspendFunc, []llvm.Value{
llvm.ConstNull(c.ctx.TokenType()),
llvm.ConstInt(c.ctx.Int1Type(), 0, false),
}, "")
sw := c.builder.CreateSwitch(continuePoint, frame.suspendBlock, 2)
sw.AddCase(llvm.ConstInt(c.ctx.Int8Type(), 0, false), await)
sw.AddCase(llvm.ConstInt(c.ctx.Int8Type(), 1, false), frame.cleanupBlock)
if inst.Type().TypeKind() != llvm.VoidTypeKind {
// Load the return value from the alloca. The callee has
// written the return value to it.
c.builder.SetInsertPointBefore(await.FirstInstruction())
retval := c.builder.CreateLoad(retvalAlloca, "coro.retval")
inst.ReplaceAllUsesWith(retval)
}
}
// Replace return instructions with suspend points that should
// reactivate the parent coroutine.
for _, inst := range returns {
// These properties were added by the functionattrs pass. Remove
// them, because now we start using the parameter.
// https://llvm.org/docs/Passes.html#functionattrs-deduce-function-attributes
for _, kind := range []string{"nocapture", "readnone"} {
kindID := llvm.AttributeKindID(kind)
f.RemoveEnumAttributeAtIndex(f.ParamsCount(), kindID)
}
c.builder.SetInsertPointBefore(inst)
var parentHandle llvm.Value
if f.Linkage() == llvm.ExternalLinkage {
// Exported function.
// Note that getTaskStatePtr will panic if it is called with
// a nil pointer, so blocking exported functions that try to
// return anything will not work.
parentHandle = llvm.ConstPointerNull(c.i8ptrType)
} else {
parentHandle = f.LastParam()
if parentHandle.IsNil() || parentHandle.Name() != "parentHandle" {
// sanity check
panic("trying to make exported function async: " + f.Name())
}
}
// Store return values.
switch inst.OperandsCount() {
case 0:
// Nothing to return.
case 1:
// Return this value by writing to the pointer stored in the
// parent handle. The parent coroutine has made an alloca that
// we can write to to store our return value.
returnValuePtr := c.createRuntimeCall("getTaskStatePtr", []llvm.Value{parentHandle}, "coro.parentData")
alloca := c.builder.CreateBitCast(returnValuePtr, llvm.PointerType(inst.Operand(0).Type(), 0), "coro.parentAlloca")
c.builder.CreateStore(inst.Operand(0), alloca)
default:
panic("unreachable")
}
// Reactivate the parent coroutine. This adds it back to the run
// queue, so it is started again by the scheduler when possible
// (possibly right after the following suspend).
c.createRuntimeCall("activateTask", []llvm.Value{parentHandle}, "")
// Suspend this coroutine.
// It would look like this is unnecessary, but if this
// suspend point is left out, it leads to undefined
// behavior somehow (with the unreachable instruction).
continuePoint := c.builder.CreateCall(coroSuspendFunc, []llvm.Value{
llvm.ConstNull(c.ctx.TokenType()),
llvm.ConstInt(c.ctx.Int1Type(), 0, false),
}, "ret")
sw := c.builder.CreateSwitch(continuePoint, frame.suspendBlock, 2)
sw.AddCase(llvm.ConstInt(c.ctx.Int8Type(), 0, false), frame.unreachableBlock)
sw.AddCase(llvm.ConstInt(c.ctx.Int8Type(), 1, false), frame.cleanupBlock)
inst.EraseFromParentAsInstruction()
}
// Coroutine cleanup. Free resources associated with this coroutine.
c.builder.SetInsertPointAtEnd(frame.cleanupBlock)
mem := c.builder.CreateCall(coroFreeFunc, []llvm.Value{id, frame.taskHandle}, "task.data.free")
@ -529,106 +795,96 @@ func (c *Compiler) markAsyncFunctions() (needsScheduler bool, err error) {
c.builder.CreateRet(llvm.Undef(returnType))
}
// Coroutine exit. All final suspends (return instructions) will branch
// here.
c.builder.SetInsertPointAtEnd(frame.unreachableBlock)
c.builder.CreateUnreachable()
}
// Replace calls to runtime.getCoroutineCall with the coroutine of this
// frame.
for _, getCoroutineCall := range getUses(c.mod.NamedFunction("runtime.getCoroutine")) {
frame := asyncFuncs[getCoroutineCall.InstructionParent().Parent()]
getCoroutineCall.ReplaceAllUsesWith(frame.taskHandle)
getCoroutineCall.EraseFromParentAsInstruction()
}
// Transform calls to time.Sleep() into coroutine suspend points.
for _, sleepCall := range getUses(sleep) {
// sleepCall must be a call instruction.
frame := asyncFuncs[sleepCall.InstructionParent().Parent()]
duration := sleepCall.Operand(0)
// Set task state to TASK_STATE_SLEEP and set the duration.
c.builder.SetInsertPointBefore(sleepCall)
c.createRuntimeCall("sleepTask", []llvm.Value{frame.taskHandle, duration}, "")
// Yield to scheduler.
for _, inst := range yieldCalls {
// Replace call to yield with a suspension of the coroutine.
c.builder.SetInsertPointBefore(inst)
continuePoint := c.builder.CreateCall(coroSuspendFunc, []llvm.Value{
llvm.ConstNull(c.ctx.TokenType()),
llvm.ConstInt(c.ctx.Int1Type(), 0, false),
}, "")
wakeup := c.splitBasicBlock(sleepCall, llvm.NextBasicBlock(c.builder.GetInsertBlock()), "task.wakeup")
c.builder.SetInsertPointBefore(sleepCall)
wakeup := c.splitBasicBlock(inst, llvm.NextBasicBlock(c.builder.GetInsertBlock()), "task.wakeup")
c.builder.SetInsertPointBefore(inst)
sw := c.builder.CreateSwitch(continuePoint, frame.suspendBlock, 2)
sw.AddCase(llvm.ConstInt(c.ctx.Int8Type(), 0, false), wakeup)
sw.AddCase(llvm.ConstInt(c.ctx.Int8Type(), 1, false), frame.cleanupBlock)
sleepCall.EraseFromParentAsInstruction()
inst.EraseFromParentAsInstruction()
}
ditchQueue := []llvm.Value{}
for bb := f.EntryBasicBlock(); !bb.IsNil(); bb = llvm.NextBasicBlock(bb) {
for inst := bb.FirstInstruction(); !inst.IsNil(); inst = llvm.NextInstruction(inst) {
if !inst.IsACallInst().IsNil() && inst.CalledValue() == getCoroutine {
// replace getCoroutine calls with the task handle
inst.ReplaceAllUsesWith(frame.taskHandle)
ditchQueue = append(ditchQueue, inst)
}
if !inst.IsACallInst().IsNil() && inst.CalledValue() == noret {
// replace tail yield with jump to cleanup, otherwise we end up with undefined behavior
c.builder.SetInsertPointBefore(inst)
c.builder.CreateBr(frame.cleanupBlock)
ditchQueue = append(ditchQueue, inst, llvm.NextInstruction(inst))
}
}
}
for _, v := range ditchQueue {
v.EraseFromParentAsInstruction()
}
}
// Transform calls to runtime.deadlock into coroutine suspends (without
// resume).
for _, deadlockCall := range getUses(deadlock) {
// deadlockCall must be a call instruction.
frame := asyncFuncs[deadlockCall.InstructionParent().Parent()]
// Exit coroutine.
c.builder.SetInsertPointBefore(deadlockCall)
continuePoint := c.builder.CreateCall(coroSuspendFunc, []llvm.Value{
llvm.ConstNull(c.ctx.TokenType()),
llvm.ConstInt(c.ctx.Int1Type(), 0, false),
}, "")
c.splitBasicBlock(deadlockCall, llvm.NextBasicBlock(c.builder.GetInsertBlock()), "task.wakeup.dead")
c.builder.SetInsertPointBefore(deadlockCall)
sw := c.builder.CreateSwitch(continuePoint, frame.suspendBlock, 2)
sw.AddCase(llvm.ConstInt(c.ctx.Int8Type(), 0, false), frame.unreachableBlock)
sw.AddCase(llvm.ConstInt(c.ctx.Int8Type(), 1, false), frame.cleanupBlock)
deadlockCall.EraseFromParentAsInstruction()
// check for leftover calls to getCoroutine
if uses := getUses(getCoroutine); len(uses) > 0 {
useNames := make([]string, 0, len(uses))
for _, u := range uses {
if u.InstructionParent().Parent().Name() == "runtime.llvmCoroRefHolder" {
continue
}
useNames = append(useNames, u.InstructionParent().Parent().Name())
}
if len(useNames) > 0 {
panic("bad use of getCoroutine: " + strings.Join(useNames, ","))
}
}
// Transform calls to runtime.chanSend into channel send operations.
for _, sendOp := range getUses(chanSend) {
// sendOp must be a call instruction.
frame := asyncFuncs[sendOp.InstructionParent().Parent()]
// Yield to scheduler.
c.builder.SetInsertPointBefore(llvm.NextInstruction(sendOp))
continuePoint := c.builder.CreateCall(coroSuspendFunc, []llvm.Value{
llvm.ConstNull(c.ctx.TokenType()),
llvm.ConstInt(c.ctx.Int1Type(), 0, false),
}, "")
sw := c.builder.CreateSwitch(continuePoint, frame.suspendBlock, 2)
wakeup := c.splitBasicBlock(sw, llvm.NextBasicBlock(c.builder.GetInsertBlock()), "task.sent")
sw.AddCase(llvm.ConstInt(c.ctx.Int8Type(), 0, false), wakeup)
sw.AddCase(llvm.ConstInt(c.ctx.Int8Type(), 1, false), frame.cleanupBlock)
// rewrite calls to getParentHandle
for _, inst := range getUses(c.mod.NamedFunction("runtime.getParentHandle")) {
f := inst.InstructionParent().Parent()
var parentHandle llvm.Value
parentHandle = f.LastParam()
if parentHandle.IsNil() || parentHandle.Name() != "parentHandle" {
// sanity check
panic("trying to make exported function async: " + f.Name())
}
inst.ReplaceAllUsesWith(parentHandle)
inst.EraseFromParentAsInstruction()
}
// Transform calls to runtime.chanRecv into channel receive operations.
for _, recvOp := range getUses(chanRecv) {
// recvOp must be a call instruction.
frame := asyncFuncs[recvOp.InstructionParent().Parent()]
// Yield to scheduler.
c.builder.SetInsertPointBefore(llvm.NextInstruction(recvOp))
continuePoint := c.builder.CreateCall(coroSuspendFunc, []llvm.Value{
llvm.ConstNull(c.ctx.TokenType()),
llvm.ConstInt(c.ctx.Int1Type(), 0, false),
}, "")
sw := c.builder.CreateSwitch(continuePoint, frame.suspendBlock, 2)
wakeup := c.splitBasicBlock(sw, llvm.NextBasicBlock(c.builder.GetInsertBlock()), "task.received")
c.builder.SetInsertPointAtEnd(recvOp.InstructionParent())
sw.AddCase(llvm.ConstInt(c.ctx.Int8Type(), 0, false), wakeup)
sw.AddCase(llvm.ConstInt(c.ctx.Int8Type(), 1, false), frame.cleanupBlock)
// ditch invalid function attributes
bads := []llvm.Value{c.mod.NamedFunction("runtime.setTaskStatePtr")}
for _, f := range append(bads, asyncList...) {
// These properties were added by the functionattrs pass. Remove
// them, because now we start using the parameter.
// https://llvm.org/docs/Passes.html#functionattrs-deduce-function-attributes
for _, kind := range []string{"nocapture", "readnone"} {
kindID := llvm.AttributeKindID(kind)
n := f.ParamsCount()
for i := 0; i <= n; i++ {
f.RemoveEnumAttributeAtIndex(i, kindID)
}
}
}
return true, c.lowerMakeGoroutineCalls()
// eliminate noret
for _, inst := range getUses(noret) {
inst.EraseFromParentAsInstruction()
}
return true, c.lowerMakeGoroutineCalls(true)
}
// Lower runtime.makeGoroutine calls to regular call instructions. This is done
// after the regular goroutine transformations. The started goroutines are
// either non-blocking (in which case they can be called directly) or blocking,
// in which case they will ask the scheduler themselves to be rescheduled.
func (c *Compiler) lowerMakeGoroutineCalls() error {
func (c *Compiler) lowerMakeGoroutineCalls(sched bool) error {
// The following Go code:
// go startedGoroutine()
//
@ -661,13 +917,21 @@ func (c *Compiler) lowerMakeGoroutineCalls() error {
for i := 0; i < realCall.OperandsCount()-1; i++ {
params = append(params, realCall.Operand(i))
}
params[len(params)-1] = llvm.ConstPointerNull(c.i8ptrType) // parent coroutine handle (must be nil)
c.builder.SetInsertPointBefore(realCall)
if !sched || goroutine.InstructionParent().Parent() == c.mod.NamedFunction("runtime.getFakeCoroutine") {
params[len(params)-1] = llvm.Undef(c.i8ptrType)
} else {
params[len(params)-1] = c.createRuntimeCall("getFakeCoroutine", []llvm.Value{}, "") // parent coroutine handle (must not be nil)
}
c.builder.CreateCall(origFunc, params, "")
realCall.EraseFromParentAsInstruction()
inttoptrOut.EraseFromParentAsInstruction()
goroutine.EraseFromParentAsInstruction()
}
if !sched && len(getUses(c.mod.NamedFunction("runtime.getFakeCoroutine"))) > 0 {
panic("getFakeCoroutine used without scheduler")
}
return nil
}

View file

@ -108,7 +108,7 @@ func (c *Compiler) Optimize(optLevel, sizeLevel int, inlinerThreshold uint) erro
}
// After TinyGo-specific transforms have finished, undo exporting these functions.
for _, name := range functionsUsedInTransforms {
for _, name := range c.getFunctionsUsedInTransforms() {
fn := c.mod.NamedFunction(name)
if fn.IsNil() {
continue

View file

@ -27,21 +27,245 @@ import (
"unsafe"
)
func chanDebug(ch *channel) {
if schedulerDebug {
if ch.bufSize > 0 {
println("--- channel update:", ch, ch.state.String(), ch.bufSize, ch.bufUsed)
} else {
println("--- channel update:", ch, ch.state.String())
}
}
}
type channel struct {
elementSize uint16 // the size of one value in this channel
elementSize uintptr // the size of one value in this channel
bufSize uintptr // size of buffer (in elements)
state chanState
blocked *task
bufHead uintptr // head index of buffer (next push index)
bufTail uintptr // tail index of buffer (next pop index)
bufUsed uintptr // number of elements currently in buffer
buf unsafe.Pointer // pointer to first element of buffer
}
// chanMake creates a new channel with the given element size and buffer length in number of elements.
// This is a compiler intrinsic.
func chanMake(elementSize uintptr, bufSize uintptr) *channel {
return &channel{
elementSize: elementSize,
bufSize: bufSize,
buf: alloc(elementSize * bufSize),
}
}
// push value to end of channel if space is available
// returns whether there was space for the value in the buffer
func (ch *channel) push(value unsafe.Pointer) bool {
// immediately return false if the channel is not buffered
if ch.bufSize == 0 {
return false
}
// ensure space is available
if ch.bufUsed == ch.bufSize {
return false
}
// copy value to buffer
memcpy(
unsafe.Pointer( // pointer to the base of the buffer + offset = pointer to destination element
uintptr(ch.buf)+
uintptr( // element size * equivalent slice index = offset
ch.elementSize* // element size (bytes)
ch.bufHead, // index of first available buffer entry
),
),
value,
ch.elementSize,
)
// update buffer state
ch.bufUsed++
ch.bufHead++
if ch.bufHead == ch.bufSize {
ch.bufHead = 0
}
return true
}
// pop value from channel buffer if one is available
// returns whether a value was popped or not
// result is stored into value pointer
func (ch *channel) pop(value unsafe.Pointer) bool {
// channel is empty
if ch.bufUsed == 0 {
return false
}
// compute address of source
addr := unsafe.Pointer(uintptr(ch.buf) + (ch.elementSize * ch.bufTail))
// copy value from buffer
memcpy(
value,
addr,
ch.elementSize,
)
// zero buffer element to allow garbage collection of value
memzero(
addr,
ch.elementSize,
)
// update buffer state
ch.bufUsed--
// move tail up
ch.bufTail++
if ch.bufTail == ch.bufSize {
ch.bufTail = 0
}
return true
}
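The buffer is a fixed-size ring: bufHead is the next write index, bufTail the next read index, and both wrap to zero at bufSize. A minimal, self-contained sketch of the same indexing on a typed slice (illustrative only; the real code stores raw bytes and uses memcpy/memzero):

type ring struct {
    buf              []int
    head, tail, used uintptr
}

// push stores v if there is space, mirroring channel.push above.
func (r *ring) push(v int) bool {
    if r.used == uintptr(len(r.buf)) {
        return false // full
    }
    r.buf[r.head] = v
    r.used++
    r.head++
    if r.head == uintptr(len(r.buf)) {
        r.head = 0 // wrap around
    }
    return true
}

// pop removes the oldest value if there is one, mirroring channel.pop above.
func (r *ring) pop() (int, bool) {
    if r.used == 0 {
        return 0, false // empty
    }
    v := r.buf[r.tail]
    r.used--
    r.tail++
    if r.tail == uintptr(len(r.buf)) {
        r.tail = 0 // wrap around
    }
    return v, true
}

func ringDemo() {
    r := &ring{buf: make([]int, 2)}
    r.push(1)
    r.push(2)       // buffer full, head wrapped back to 0
    v, _ := r.pop() // v == 1, tail advances to 1
    r.push(3)       // reuses slot 0
    println(v)
}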
// try to send a value to a channel, without actually blocking
// returns whether the value was sent
// will panic if channel is closed
func (ch *channel) trySend(value unsafe.Pointer) bool {
if ch == nil {
// send to nil channel blocks forever
// this is non-blocking, so just say no
return false
}
switch ch.state {
case chanStateEmpty, chanStateBuf:
// try to dump the value directly into the buffer
if ch.push(value) {
ch.state = chanStateBuf
return true
}
return false
case chanStateRecv:
// unblock receiver
receiver := unblockChain(&ch.blocked, nil)
// copy value to receiver
receiverState := receiver.state()
memcpy(receiverState.ptr, value, ch.elementSize)
receiverState.data = 1 // commaOk = true
// change state to empty if there are no more receivers
if ch.blocked == nil {
ch.state = chanStateEmpty
}
return true
case chanStateSend:
// something else is already waiting to send
return false
case chanStateClosed:
runtimePanic("send on closed channel")
default:
runtimePanic("invalid channel state")
}
return false
}
// try to receive a value from a channel, without really blocking
// returns whether a value was received
// second return is the comma-ok value
func (ch *channel) tryRecv(value unsafe.Pointer) (bool, bool) {
if ch == nil {
// receive from nil channel blocks forever
// this is non-blocking, so just say no
return false, false
}
switch ch.state {
case chanStateBuf, chanStateSend:
// try to pop the value directly from the buffer
if ch.pop(value) {
// unblock next sender if applicable
if sender := unblockChain(&ch.blocked, nil); sender != nil {
// push sender's value into buffer
ch.push(sender.state().ptr)
if ch.blocked == nil {
// last sender unblocked - update state
ch.state = chanStateBuf
}
}
if ch.bufUsed == 0 {
// channel empty - update state
ch.state = chanStateEmpty
}
return true, true
} else if sender := unblockChain(&ch.blocked, nil); sender != nil {
// unblock next sender if applicable
// copy sender's value
memcpy(value, sender.state().ptr, ch.elementSize)
if ch.blocked == nil {
// last sender unblocked - update state
ch.state = chanStateEmpty
}
return true, true
}
return false, false
case chanStateRecv, chanStateEmpty:
// something else is already waiting to receive
return false, false
case chanStateClosed:
if ch.pop(value) {
return true, true
}
// channel closed - nothing to receive
memzero(value, ch.elementSize)
return true, false
default:
runtimePanic("invalid channel state")
}
runtimePanic("unreachable")
return false, false
}
type chanState uint8
const (
chanStateEmpty chanState = iota
chanStateRecv
chanStateSend
chanStateClosed
chanStateEmpty chanState = iota // nothing in channel, no senders/receivers
chanStateRecv // nothing in channel, receivers waiting
chanStateSend // senders waiting, buffer full if present
chanStateBuf // buffer not empty, no senders waiting
chanStateClosed // channel closed
)
func (s chanState) String() string {
switch s {
case chanStateEmpty:
return "empty"
case chanStateRecv:
return "recv"
case chanStateSend:
return "send"
case chanStateBuf:
return "buffered"
case chanStateClosed:
return "closed"
default:
return "invalid"
}
}
// chanSelectState is a single channel operation (send/recv) in a select
// statement. The value pointer is either nil (for receives) or points to the
// value to send (for sends).
@ -50,89 +274,59 @@ type chanSelectState struct {
value unsafe.Pointer
}
// chanSend sends a single value over the channel. If this operation can
// complete immediately (there is a goroutine waiting for a value), it sends the
// value and re-activates both goroutines. If not, it sets itself as waiting on
// a value.
func chanSend(sender *task, ch *channel, value unsafe.Pointer) {
if ch == nil {
// A nil channel blocks forever. Do not scheduler this goroutine again.
chanYield()
// chanSend sends a single value over the channel.
// This operation will block unless the value can be sent immediately (to a waiting receiver or into free buffer space).
// May panic if the channel is closed.
func chanSend(ch *channel, value unsafe.Pointer) {
if ch.trySend(value) {
// value immediately sent
chanDebug(ch)
return
}
switch ch.state {
case chanStateEmpty:
scheduleLogChan(" send: chan is empty ", ch, sender)
sender.state().ptr = value
if ch == nil {
// A nil channel blocks forever. Do not schedule this goroutine again.
deadlock()
}
// wait for receiver
sender := getCoroutine()
ch.state = chanStateSend
ch.blocked = sender
chanYield()
case chanStateRecv:
scheduleLogChan(" send: chan in recv mode", ch, sender)
receiver := ch.blocked
receiverState := receiver.state()
memcpy(receiverState.ptr, value, uintptr(ch.elementSize))
receiverState.data = 1 // commaOk = true
ch.blocked = receiverState.next
receiverState.next = nil
activateTask(receiver)
reactivateParent(sender)
if ch.blocked == nil {
ch.state = chanStateEmpty
}
case chanStateClosed:
runtimePanic("send on closed channel")
case chanStateSend:
scheduleLogChan(" send: chan in send mode", ch, sender)
sender.state().ptr = value
sender.state().next = ch.blocked
ch.blocked = sender
chanYield()
}
senderState := sender.state()
senderState.ptr = value
ch.blocked, senderState.next = sender, ch.blocked
chanDebug(ch)
yield()
senderState.ptr = nil
}
// chanRecv receives a single value over a channel. If there is an available
// sender, it receives the value immediately and re-activates both coroutines.
// If not, it sets itself as available for receiving. If the channel is closed,
// it immediately activates itself with a zero value as the result.
func chanRecv(receiver *task, ch *channel, value unsafe.Pointer) {
// chanRecv receives a single value over a channel.
// It blocks if there is no available value to receive.
// The received value is copied into the value pointer.
// Returns the comma-ok value.
func chanRecv(ch *channel, value unsafe.Pointer) bool {
if rx, ok := ch.tryRecv(value); rx {
// value immediately available
chanDebug(ch)
return ok
}
if ch == nil {
// A nil channel blocks forever. Do not scheduler this goroutine again.
chanYield()
return
// A nil channel blocks forever. Do not schedule this goroutine again.
deadlock()
}
switch ch.state {
case chanStateSend:
scheduleLogChan(" recv: chan in send mode", ch, receiver)
sender := ch.blocked
senderState := sender.state()
memcpy(value, senderState.ptr, uintptr(ch.elementSize))
receiver.state().data = 1 // commaOk = true
ch.blocked = senderState.next
senderState.next = nil
reactivateParent(receiver)
activateTask(sender)
if ch.blocked == nil {
ch.state = chanStateEmpty
}
case chanStateEmpty:
scheduleLogChan(" recv: chan is empty ", ch, receiver)
receiver.state().ptr = value
// wait for a value
receiver := getCoroutine()
ch.state = chanStateRecv
ch.blocked = receiver
chanYield()
case chanStateClosed:
scheduleLogChan(" recv: chan is closed ", ch, receiver)
memzero(value, uintptr(ch.elementSize))
receiver.state().data = 0 // commaOk = false
reactivateParent(receiver)
case chanStateRecv:
scheduleLogChan(" recv: chan in recv mode", ch, receiver)
receiver.state().ptr = value
receiver.state().next = ch.blocked
ch.blocked = receiver
chanYield()
}
receiverState := receiver.state()
receiverState.ptr, receiverState.data = value, 0
ch.blocked, receiverState.next = receiver, ch.blocked
chanDebug(ch)
yield()
ok := receiverState.data == 1
receiverState.ptr, receiverState.data = nil, 0
return ok
}
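A small Go-level illustration of the comma-ok protocol implemented above (in the runtime, state().data == 1 marks a successful receive, and a receive from a drained, closed channel yields the zero value and false); the function and variable names here are illustrative only:

func commaOkDemo() {
    ch := make(chan int, 1)
    ch <- 42
    v, ok := <-ch // buffered value available: v == 42, ok == true
    close(ch)
    z, ok2 := <-ch // closed and drained: z == 0, ok2 == false
    println(v, ok, z, ok2)
}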
// chanClose closes the given channel. If this channel has a receiver or is
@ -153,17 +347,22 @@ func chanClose(ch *channel) {
// before the close.
runtimePanic("close channel during send")
case chanStateRecv:
// The receiver must be re-activated with a zero value.
receiverState := ch.blocked.state()
memzero(receiverState.ptr, uintptr(ch.elementSize))
receiverState.data = 0 // commaOk = false
activateTask(ch.blocked)
ch.state = chanStateClosed
ch.blocked = nil
case chanStateEmpty:
// Easy case. No available sender or receiver.
ch.state = chanStateClosed
// unblock all receivers with the zero value
for rx := unblockChain(&ch.blocked, nil); rx != nil; rx = unblockChain(&ch.blocked, nil) {
// get receiver state
state := rx.state()
// store the zero value
memzero(state.ptr, ch.elementSize)
// set the comma-ok value to false (channel closed)
state.data = 0
}
case chanStateEmpty, chanStateBuf:
// Easy case. No available sender or receiver.
}
ch.state = chanStateClosed
chanDebug(ch)
}
// chanSelect is the runtime implementation of the select statement. This is
@ -175,47 +374,17 @@ func chanClose(ch *channel) {
func chanSelect(recvbuf unsafe.Pointer, states []chanSelectState, blocking bool) (uintptr, bool) {
// See whether we can receive from one of the channels.
for i, state := range states {
if state.ch == nil {
// A nil channel blocks forever, so don't consider it here.
continue
}
if state.value == nil {
// A receive operation.
switch state.ch.state {
case chanStateSend:
// We can receive immediately.
sender := state.ch.blocked
senderState := sender.state()
memcpy(recvbuf, senderState.ptr, uintptr(state.ch.elementSize))
state.ch.blocked = senderState.next
senderState.next = nil
activateTask(sender)
if state.ch.blocked == nil {
state.ch.state = chanStateEmpty
}
return uintptr(i), true // commaOk = true
case chanStateClosed:
// Receive the zero value.
memzero(recvbuf, uintptr(state.ch.elementSize))
return uintptr(i), false // commaOk = false
if rx, ok := state.ch.tryRecv(recvbuf); rx {
chanDebug(state.ch)
return uintptr(i), ok
}
} else {
// A send operation: state.value is not nil.
switch state.ch.state {
case chanStateRecv:
receiver := state.ch.blocked
receiverState := receiver.state()
memcpy(receiverState.ptr, state.value, uintptr(state.ch.elementSize))
receiverState.data = 1 // commaOk = true
state.ch.blocked = receiverState.next
receiverState.next = nil
activateTask(receiver)
if state.ch.blocked == nil {
state.ch.state = chanStateEmpty
}
return uintptr(i), false
case chanStateClosed:
runtimePanic("send on closed channel")
if state.ch.trySend(state.value) {
chanDebug(state.ch)
return uintptr(i), true
}
}
}

View file

@ -59,16 +59,78 @@ func scheduleLogChan(msg string, ch *channel, t *task) {
}
}
// Set the task to sleep for a given time.
// deadlock is called when a goroutine cannot proceed any more, but is in theory
// not exited (so deferred calls won't run). This can happen for example in code
// like this, that blocks forever:
//
// This is a compiler intrinsic.
func sleepTask(caller *task, duration int64) {
if schedulerDebug {
println(" set sleep:", caller, uint(duration/tickMicros))
// select{}
//go:noinline
func deadlock() {
// call yield without requesting a wakeup
yield()
panic("unreachable")
}
// Goexit terminates the currently running goroutine. No other goroutines are affected.
//
// Unlike the main Go implementation, no deferred calls will be run.
//go:inline
func Goexit() {
// it's really just a deadlock
deadlock()
}
// unblock unblocks a task and returns the next value
func unblock(t *task) *task {
state := t.state()
next := state.next
state.next = nil
activateTask(t)
return next
}
// unblockChain unblocks the next task on the stack/queue, returning it
// also updates the chain, putting the next element into the chain pointer
// if the chain is used as a queue, tail is used as a pointer to the final insertion point
// if the chain is used as a stack, tail should be nil
func unblockChain(chain **task, tail ***task) *task {
t := *chain
if t == nil {
return nil
}
state := caller.state()
state.data = uint(duration / tickMicros) // TODO: longer durations
addSleepTask(caller)
*chain = unblock(t)
if tail != nil && *chain == nil {
*tail = chain
}
return t
}
// dropChain drops a task from the given stack or queue
// if the chain is used as a queue, tail is used as a pointer to the field containing a pointer to the next insertion point
// if the chain is used as a stack, tail should be nil
func dropChain(t *task, chain **task, tail ***task) {
for c := chain; *c != nil; c = &((*c).state().next) {
if *c == t {
next := (*c).state().next
if next == nil && tail != nil {
*tail = c
}
*c = next
return
}
}
panic("runtime: task not in chain")
}
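The blocked chain above is an intrusive singly linked list threaded through state().next: callers push by prepending (as chanSend does with ch.blocked, senderState.next = sender, ch.blocked) and pop with unblockChain, optionally maintaining a tail pointer for FIFO use. A minimal sketch of the same pattern on a simplified node type (illustrative only, not the runtime's task type):

type node struct{ next *node }

// push prepends n to the chain (LIFO, as the channel code uses ch.blocked).
func push(chain **node, n *node) {
    *chain, n.next = n, *chain
}

// popChain removes and returns the head of the chain, or nil if it is empty.
// When tail is non-nil and the chain becomes empty, the insertion point is
// reset, mirroring unblockChain above.
func popChain(chain **node, tail ***node) *node {
    n := *chain
    if n == nil {
        return nil
    }
    *chain = n.next
    n.next = nil
    if tail != nil && *chain == nil {
        *tail = chain
    }
    return n
}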
// Pause the current task for a given time.
//go:linkname sleep time.Sleep
func sleep(duration int64) {
addSleepTask(getCoroutine(), duration)
yield()
}
// avrSleep sleeps without involving the scheduler; on AVR, time.Sleep calls
// are lowered to this when no scheduler is needed.
func avrSleep(duration int64) {
sleepTicks(timeUnit(duration / tickMicros))
}
// Add a non-queued task to the run queue.
@ -85,6 +147,7 @@ func activateTask(t *task) {
// getTaskStateData is a helper function to get the current .data field of the
// goroutine state.
//go:inline
func getTaskStateData(t *task) uint {
return t.state().data
}
@ -93,6 +156,7 @@ func getTaskStateData(t *task) uint {
// done.
func runqueuePushBack(t *task) {
if schedulerDebug {
scheduleLogTask(" pushing back:", t)
if t.state().next != nil {
panic("runtime: runqueuePushBack: expected next task to be nil")
}
@ -124,12 +188,14 @@ func runqueuePopFront() *task {
}
// Add this task to the sleep queue, assuming its state is set to sleeping.
func addSleepTask(t *task) {
func addSleepTask(t *task, duration int64) {
if schedulerDebug {
println(" set sleep:", t, uint(duration/tickMicros))
if t.state().next != nil {
panic("runtime: addSleepTask: expected next task to be nil")
}
}
t.state().data = uint(duration / tickMicros) // TODO: longer durations
now := ticks()
if sleepQueue == nil {
scheduleLog(" -> sleep new queue")
@ -209,3 +275,8 @@ func scheduler() {
t.resume()
}
}
// Gosched yields the processor, allowing other goroutines to run.
func Gosched() {
runqueuePushBack(getCoroutine())
yield()
}

View file

@ -50,7 +50,7 @@ func makeGoroutine(uintptr) uintptr
// removed in the goroutine lowering pass.
func getCoroutine() *task
// getTaskStatePtr is a helper function to set the current .ptr field of a
// setTaskStatePtr is a helper function to set the current .ptr field of a
// coroutine promise.
func setTaskStatePtr(t *task, value unsafe.Pointer) {
t.state().ptr = value
@ -65,37 +65,40 @@ func getTaskStatePtr(t *task) unsafe.Pointer {
return t.state().ptr
}
//go:linkname sleep time.Sleep
func sleep(d int64) {
sleepTicks(timeUnit(d / tickMicros))
}
// deadlock is called when a goroutine cannot proceed any more, but is in theory
// not exited (so deferred calls won't run). This can happen for example in code
// like this, that blocks forever:
//
// select{}
//
// The coroutine version is implemented directly in the compiler but it needs
// this definition to work.
func deadlock()
// reactivateParent reactivates the parent goroutine. It is necessary in case of
// the coroutine-based scheduler.
func reactivateParent(t *task) {
activateTask(t)
}
// chanYield exits the current goroutine. Used in the channel implementation, to
// suspend the current goroutine until it is reactivated by a channel operation
// of a different goroutine. It is a no-op in the coroutine implementation.
func chanYield() {
// Nothing to do here, simply returning from the channel operation also exits
// the goroutine temporarily.
}
// yield suspends execution of the current goroutine
// any wakeups must be configured before calling yield
func yield()
// getSystemStackPointer returns the current stack pointer of the system stack.
// This is always the current stack pointer.
func getSystemStackPointer() uintptr {
return getCurrentStackPointer()
}
func fakeCoroutine(dst **task) {
*dst = getCoroutine()
for {
yield()
}
}
func getFakeCoroutine() *task {
// this isn't defined behavior, but it is what our implementation does
// this is really a horrible hack
var t *task
go fakeCoroutine(&t)
// the first line of fakeCoroutine will have completed by now
return t
}
// noret is a placeholder that can be used to indicate that an async function is not going to directly return here
func noret()
func getParentHandle() *task
func llvmCoroRefHolder() {
noret()
getParentHandle()
getCoroutine()
}

View file

@ -17,7 +17,7 @@ tinygo_startTask:
blx r4
// After return, exit this goroutine. This is a tail call.
bl runtime.Goexit
bl runtime.yield
.section .text.tinygo_swapTask
.global tinygo_swapTask

View file

@ -31,6 +31,7 @@ type task struct {
// getCoroutine returns the currently executing goroutine. It is used as an
// intrinsic when compiling channel operations, but is not necessary with the
// task-based scheduler.
//go:inline
func getCoroutine() *task {
return currentTask
}
@ -67,15 +68,6 @@ func swapTask(oldTask, newTask *task) {
//go:linkname swapTaskLower tinygo_swapTask
func swapTaskLower(oldTask, newTask *task)
// Goexit terminates the currently running goroutine. No other goroutines are affected.
//
// Unlike the main Go implementation, no deffered calls will be run.
//export runtime.Goexit
func Goexit() {
// Swap without rescheduling first, effectively exiting the goroutine.
swapTask(currentTask, &schedulerState)
}
// startTask is a small wrapper function that sets up the first (and only)
// argument to the new goroutine and makes sure it is exited when the goroutine
// finishes.
@ -96,40 +88,13 @@ func startGoroutine(fn, args uintptr) {
runqueuePushBack(t)
}
//go:linkname sleep time.Sleep
func sleep(d int64) {
sleepTicks(timeUnit(d / tickMicros))
}
// sleepCurrentTask suspends the current goroutine. This is a compiler
// intrinsic. It replaces calls to time.Sleep when a scheduler is in use.
func sleepCurrentTask(d int64) {
sleepTask(currentTask, d)
// yield suspends execution of the current goroutine
// any wakeups must be configured before calling yield
//export runtime.yield
func yield() {
swapTask(currentTask, &schedulerState)
}
// deadlock is called when a goroutine cannot proceed any more, but is in theory
// not exited (so deferred calls won't run). This can happen for example in code
// like this, that blocks forever:
//
// select{}
func deadlock() {
Goexit()
}
// reactivateParent reactivates the parent goroutine. It is a no-op for the task
// based scheduler.
func reactivateParent(t *task) {
// Nothing to do here, tasks don't stop automatically.
}
// chanYield exits the current goroutine. Used in the channel implementation, to
// suspend the current goroutine until it is reactivated by a channel operation
// of a different goroutine.
func chanYield() {
Goexit()
}
// getSystemStackPointer returns the current stack pointer of the system stack.
// This is not necessarily the same as the current stack pointer.
func getSystemStackPointer() uintptr {

112
testdata/channel.go vendored
View file

@ -1,65 +1,110 @@
package main
import "time"
import (
"time"
"runtime"
)
// waitGroup is a small type reimplementing some of the behavior of sync.WaitGroup
type waitGroup uint
func (wg *waitGroup) wait() {
n := 0
for *wg != 0 {
// pause and wait to be rescheduled
runtime.Gosched()
if n > 100 {
// if something is using the sleep queue, this may be necessary
time.Sleep(time.Millisecond)
}
n++
}
}
func (wg *waitGroup) add(n uint) {
*wg += waitGroup(n)
}
func (wg *waitGroup) done() {
if *wg == 0 {
panic("wait group underflow")
}
*wg--
}
var wg waitGroup
func main() {
ch := make(chan int)
println("len, cap of channel:", len(ch), cap(ch), ch == nil)
wg.add(1)
go sender(ch)
n, ok := <-ch
println("recv from open channel:", n, ok)
for n := range ch {
if n == 6 {
time.Sleep(time.Microsecond)
}
println("received num:", n)
}
wg.wait()
n, ok = <-ch
println("recv from closed channel:", n, ok)
// Test bigger values
ch2 := make(chan complex128)
wg.add(1)
go sendComplex(ch2)
println("complex128:", <-ch2)
wg.wait()
// Test multi-sender.
ch = make(chan int)
wg.add(3)
go fastsender(ch, 10)
go fastsender(ch, 23)
go fastsender(ch, 40)
slowreceiver(ch)
wg.wait()
// Test multi-receiver.
ch = make(chan int)
wg.add(3)
go fastreceiver(ch)
go fastreceiver(ch)
go fastreceiver(ch)
slowsender(ch)
wg.wait()
// Test iterator style channel.
ch = make(chan int)
wg.add(1)
go iterator(ch, 100)
sum := 0
for i := range ch {
sum += i
}
wg.wait()
println("sum(100):", sum)
// Test simple selects.
go selectDeadlock()
go selectDeadlock() // cannot use waitGroup here - never terminates
wg.add(1)
go selectNoOp()
wg.wait()
// Test select with a single send operation (transformed into chan send).
ch = make(chan int)
wg.add(1)
go fastreceiver(ch)
select {
case ch <- 5:
}
close(ch)
time.Sleep(time.Millisecond)
wg.wait()
println("did send one")
// Test select with a single recv operation (transformed into chan recv).
@ -70,9 +115,12 @@ func main() {
// Test select recv with channel that has one entry.
ch = make(chan int)
wg.add(1)
go func(ch chan int) {
ch <- 55
wg.done()
}(ch)
// not defined behavior, but we can't really fix this until select has been fixed
time.Sleep(time.Millisecond)
select {
case make(chan int) <- 3:
@ -82,6 +130,7 @@ func main() {
case n := <-make(chan int):
println("unreachable:", n)
}
wg.wait()
// Test select recv with closed channel.
close(ch)
@ -96,6 +145,7 @@ func main() {
// Test select send.
ch = make(chan int)
wg.add(1)
go fastreceiver(ch)
time.Sleep(time.Millisecond)
select {
@ -105,9 +155,49 @@ func main() {
println("unreachable:", n)
}
close(ch)
wg.wait()
// Allow goroutines to exit.
time.Sleep(time.Microsecond)
// test non-concurrent buffered channels
ch = make(chan int, 2)
ch <- 1
ch <- 2
println("non-concurrent channel recieve:", <-ch)
println("non-concurrent channel recieve:", <-ch)
// test closing channels with buffered data
ch <- 3
ch <- 4
close(ch)
println("closed buffered channel recieve:", <-ch)
println("closed buffered channel recieve:", <-ch)
println("closed buffered channel recieve:", <-ch)
// test using buffered channels as regular channels with special properties
wg.add(6)
ch = make(chan int, 2)
go send(ch)
go send(ch)
go send(ch)
go send(ch)
go receive(ch)
go receive(ch)
wg.wait()
close(ch)
var count int
for range ch {
count++
}
println("hybrid buffered channel recieve:", count)
}
func send(ch chan<- int) {
ch <- 1
wg.done()
}
func receive(ch <-chan int) {
<-ch
wg.done()
}
func sender(ch chan int) {
@ -119,15 +209,18 @@ func sender(ch chan int) {
ch <- i
}
close(ch)
wg.done()
}
func sendComplex(ch chan complex128) {
ch <- 7 + 10.5i
wg.done()
}
func fastsender(ch chan int, n int) {
ch <- n
ch <- n + 1
wg.done()
}
func slowreceiver(ch chan int) {
@ -153,6 +246,7 @@ func fastreceiver(ch chan int) {
sum += n
}
println("sum:", sum)
wg.done()
}
func iterator(ch chan int, top int) {
@ -160,6 +254,7 @@ func iterator(ch chan int, top int) {
ch <- i
}
close(ch)
wg.done()
}
func selectDeadlock() {
@ -174,4 +269,5 @@ func selectNoOp() {
default:
}
println("after no-op")
wg.done()
}

6
testdata/channel.txt vendored
View file

@ -25,3 +25,9 @@ select n from chan: 55
select n from closed chan: 0
select send
sum: 235
non-concurrent channel receive: 1
non-concurrent channel receive: 2
closed buffered channel receive: 3
closed buffered channel receive: 4
closed buffered channel receive: 0
hybrid buffered channel receive: 2