From 9fa667ce6341fe653f2cac571dd0a262fb390cdf Mon Sep 17 00:00:00 2001
From: Nia Waldvogel
Date: Mon, 22 Nov 2021 18:01:50 -0500
Subject: [PATCH] runtime: implement __sync libcalls as critical sections

This change implements the __sync atomic polyfill libcalls by disabling
interrupts. This was previously done in a limited capacity on some
targets, but this change uses a go:generate directive to emit all of
the calls on all microcontroller targets.
---
 src/runtime/arch_cortexm.go                   |  68 +----
 src/runtime/arch_tinygoriscv.go               | 105 +------
 src/runtime/atomics_critical.go               | 286 ++++++++++++++++++
 src/runtime/runtime.go                        |   2 +
 .../gen-critical-atomics.go                   | 182 +++++++++++
 5 files changed, 472 insertions(+), 171 deletions(-)
 create mode 100644 src/runtime/atomics_critical.go
 create mode 100644 tools/gen-critical-atomics/gen-critical-atomics.go

diff --git a/src/runtime/arch_cortexm.go b/src/runtime/arch_cortexm.go
index e4c88609..fbc4427c 100644
--- a/src/runtime/arch_cortexm.go
+++ b/src/runtime/arch_cortexm.go
@@ -1,3 +1,4 @@
+//go:build cortexm
 // +build cortexm
 
 package runtime
@@ -20,73 +21,6 @@ func getCurrentStackPointer() uintptr {
 	return uintptr(stacksave())
 }
 
-// Documentation:
-// * https://llvm.org/docs/Atomics.html
-// * https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html
-//
-// In the case of Cortex-M, some atomic operations are emitted inline while
-// others are emitted as libcalls. How many are emitted as libcalls depends on
-// the MCU core variant (M3 and higher support some 32-bit atomic operations
-// while M0 and M0+ do not).
-
-//export __sync_fetch_and_add_4
-func __sync_fetch_and_add_4(ptr *uint32, value uint32) uint32 {
-	mask := arm.DisableInterrupts()
-	oldValue := *ptr
-	*ptr = oldValue + value
-	arm.EnableInterrupts(mask)
-	return oldValue
-}
-
-//export __sync_fetch_and_add_8
-func __sync_fetch_and_add_8(ptr *uint64, value uint64) uint64 {
-	mask := arm.DisableInterrupts()
-	oldValue := *ptr
-	*ptr = oldValue + value
-	arm.EnableInterrupts(mask)
-	return oldValue
-}
-
-//export __sync_lock_test_and_set_4
-func __sync_lock_test_and_set_4(ptr *uint32, value uint32) uint32 {
-	mask := arm.DisableInterrupts()
-	oldValue := *ptr
-	*ptr = value
-	arm.EnableInterrupts(mask)
-	return oldValue
-}
-
-//export __sync_lock_test_and_set_8
-func __sync_lock_test_and_set_8(ptr *uint64, value uint64) uint64 {
-	mask := arm.DisableInterrupts()
-	oldValue := *ptr
-	*ptr = value
-	arm.EnableInterrupts(mask)
-	return oldValue
-}
-
-//export __sync_val_compare_and_swap_4
-func __sync_val_compare_and_swap_4(ptr *uint32, expected, desired uint32) uint32 {
-	mask := arm.DisableInterrupts()
-	oldValue := *ptr
-	if oldValue == expected {
-		*ptr = desired
-	}
-	arm.EnableInterrupts(mask)
-	return oldValue
-}
-
-//export __sync_val_compare_and_swap_8
-func __sync_val_compare_and_swap_8(ptr *uint64, expected, desired uint64) uint64 {
-	mask := arm.DisableInterrupts()
-	oldValue := *ptr
-	if oldValue == expected {
-		*ptr = desired
-	}
-	arm.EnableInterrupts(mask)
-	return oldValue
-}
-
 // The safest thing to do here would just be to disable interrupts for
 // procPin/procUnpin. Note that a global variable is safe in this case, as any
 // access to procPinnedMask will happen with interrupts disabled.
diff --git a/src/runtime/arch_tinygoriscv.go b/src/runtime/arch_tinygoriscv.go index 3b4ff798..d4ec961b 100644 --- a/src/runtime/arch_tinygoriscv.go +++ b/src/runtime/arch_tinygoriscv.go @@ -1,3 +1,4 @@ +//go:build tinygo.riscv // +build tinygo.riscv package runtime @@ -8,110 +9,6 @@ func getCurrentStackPointer() uintptr { return uintptr(stacksave()) } -// Documentation: -// * https://llvm.org/docs/Atomics.html -// * https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html -// -// In the case of RISC-V, some operations may be implemented with libcalls if -// the operation is too big to be handled by assembly. Officially, these calls -// should be implemented with a lock-free algorithm but as (as of this time) all -// supported RISC-V chips have a single hart, we can simply disable interrupts -// to get the same behavior. - -//export __atomic_load_4 -func __atomic_load_4(ptr *uint32, ordering int32) uint32 { - mask := riscv.DisableInterrupts() - value := *ptr - riscv.EnableInterrupts(mask) - return value -} - -//export __atomic_store_4 -func __atomic_store_4(ptr *uint32, value uint32, ordering int32) { - mask := riscv.DisableInterrupts() - *ptr = value - riscv.EnableInterrupts(mask) -} - -//export __atomic_exchange_4 -func __atomic_exchange_4(ptr *uint32, value uint32, ordering int32) uint32 { - mask := riscv.DisableInterrupts() - oldValue := *ptr - *ptr = value - riscv.EnableInterrupts(mask) - return oldValue -} - -//export __atomic_compare_exchange_4 -func __atomic_compare_exchange_4(ptr, expected *uint32, desired uint32, success_ordering, failure_ordering int32) bool { - mask := riscv.DisableInterrupts() - oldValue := *ptr - success := oldValue == *expected - if success { - *ptr = desired - } else { - *expected = oldValue - } - riscv.EnableInterrupts(mask) - return success -} - -//export __atomic_fetch_add_4 -func __atomic_fetch_add_4(ptr *uint32, value uint32, ordering int32) uint32 { - mask := riscv.DisableInterrupts() - oldValue := *ptr - *ptr = oldValue + value - riscv.EnableInterrupts(mask) - return oldValue -} - -//export __atomic_load_8 -func __atomic_load_8(ptr *uint64, ordering int32) uint64 { - mask := riscv.DisableInterrupts() - value := *ptr - riscv.EnableInterrupts(mask) - return value -} - -//export __atomic_store_8 -func __atomic_store_8(ptr *uint64, value uint64, ordering int32) { - mask := riscv.DisableInterrupts() - *ptr = value - riscv.EnableInterrupts(mask) -} - -//export __atomic_exchange_8 -func __atomic_exchange_8(ptr *uint64, value uint64, ordering int32) uint64 { - mask := riscv.DisableInterrupts() - oldValue := *ptr - *ptr = value - riscv.EnableInterrupts(mask) - return oldValue -} - -//export __atomic_compare_exchange_8 -func __atomic_compare_exchange_8(ptr, expected *uint64, desired uint64, success_ordering, failure_ordering int32) bool { - mask := riscv.DisableInterrupts() - oldValue := *ptr - success := oldValue == *expected - if success { - *ptr = desired - } else { - *expected = oldValue - } - riscv.EnableInterrupts(mask) - return success -} - -//export __atomic_fetch_add_8 -func __atomic_fetch_add_8(ptr *uint64, value uint64, ordering int32) uint64 { - mask := riscv.DisableInterrupts() - oldValue := *ptr - *ptr = oldValue + value - riscv.EnableInterrupts(mask) - return oldValue -} - // The safest thing to do here would just be to disable interrupts for // procPin/procUnpin. Note that a global variable is safe in this case, as any // access to procPinnedMask will happen with interrupts disabled. 
diff --git a/src/runtime/atomics_critical.go b/src/runtime/atomics_critical.go new file mode 100644 index 00000000..615426ae --- /dev/null +++ b/src/runtime/atomics_critical.go @@ -0,0 +1,286 @@ +//go:build baremetal && !tinygo.wasm +// +build baremetal,!tinygo.wasm + +// Automatically generated file. DO NOT EDIT. +// This file implements standins for non-native atomics using critical sections. + +package runtime + +import ( + "runtime/interrupt" + _ "unsafe" +) + +// Documentation: +// * https://llvm.org/docs/Atomics.html +// * https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html +// +// Some atomic operations are emitted inline while others are emitted as libcalls. +// How many are emitted as libcalls depends on the MCU arch and core variant. + +// 16-bit atomics. + +//export __atomic_load_2 +func __atomic_load_2(ptr *uint16, ordering uintptr) uint16 { + // The LLVM docs for this say that there is a val argument after the pointer. + // That is a typo, and the GCC docs omit it. + mask := interrupt.Disable() + val := *ptr + interrupt.Restore(mask) + return val +} + +//export __atomic_store_2 +func __atomic_store_2(ptr *uint16, val uint16, ordering uintptr) { + mask := interrupt.Disable() + *ptr = val + interrupt.Restore(mask) +} + +//go:inline +func doAtomicCAS16(ptr *uint16, expected, desired uint16) uint16 { + mask := interrupt.Disable() + old := *ptr + if old == expected { + *ptr = desired + } + interrupt.Restore(mask) + return old +} + +//export __sync_val_compare_and_swap_2 +func __sync_val_compare_and_swap_2(ptr *uint16, expected, desired uint16) uint16 { + return doAtomicCAS16(ptr, expected, desired) +} + +//export __atomic_compare_exchange_2 +func __atomic_compare_exchange_2(ptr, expected *uint16, desired uint16, successOrder, failureOrder uintptr) bool { + exp := *expected + old := doAtomicCAS16(ptr, exp, desired) + return old == exp +} + +//go:inline +func doAtomicSwap16(ptr *uint16, new uint16) uint16 { + mask := interrupt.Disable() + old := *ptr + *ptr = new + interrupt.Restore(mask) + return old +} + +//export __sync_lock_test_and_set_2 +func __sync_lock_test_and_set_2(ptr *uint16, new uint16) uint16 { + return doAtomicSwap16(ptr, new) +} + +//export __atomic_exchange_2 +func __atomic_exchange_2(ptr *uint16, new uint16, ordering uintptr) uint16 { + return doAtomicSwap16(ptr, new) +} + +//go:inline +func doAtomicAdd16(ptr *uint16, value uint16) (old, new uint16) { + mask := interrupt.Disable() + old = *ptr + new = old + value + *ptr = new + interrupt.Restore(mask) + return old, new +} + +//export __atomic_fetch_add_2 +func __atomic_fetch_add_2(ptr *uint16, value uint16, ordering uintptr) uint16 { + old, _ := doAtomicAdd16(ptr, value) + return old +} + +//export __sync_fetch_and_add_2 +func __sync_fetch_and_add_2(ptr *uint16, value uint16) uint16 { + old, _ := doAtomicAdd16(ptr, value) + return old +} + +//export __atomic_add_fetch_2 +func __atomic_add_fetch_2(ptr *uint16, value uint16, ordering uintptr) uint16 { + _, new := doAtomicAdd16(ptr, value) + return new +} + +// 32-bit atomics. + +//export __atomic_load_4 +func __atomic_load_4(ptr *uint32, ordering uintptr) uint32 { + // The LLVM docs for this say that there is a val argument after the pointer. + // That is a typo, and the GCC docs omit it. 
+ mask := interrupt.Disable() + val := *ptr + interrupt.Restore(mask) + return val +} + +//export __atomic_store_4 +func __atomic_store_4(ptr *uint32, val uint32, ordering uintptr) { + mask := interrupt.Disable() + *ptr = val + interrupt.Restore(mask) +} + +//go:inline +func doAtomicCAS32(ptr *uint32, expected, desired uint32) uint32 { + mask := interrupt.Disable() + old := *ptr + if old == expected { + *ptr = desired + } + interrupt.Restore(mask) + return old +} + +//export __sync_val_compare_and_swap_4 +func __sync_val_compare_and_swap_4(ptr *uint32, expected, desired uint32) uint32 { + return doAtomicCAS32(ptr, expected, desired) +} + +//export __atomic_compare_exchange_4 +func __atomic_compare_exchange_4(ptr, expected *uint32, desired uint32, successOrder, failureOrder uintptr) bool { + exp := *expected + old := doAtomicCAS32(ptr, exp, desired) + return old == exp +} + +//go:inline +func doAtomicSwap32(ptr *uint32, new uint32) uint32 { + mask := interrupt.Disable() + old := *ptr + *ptr = new + interrupt.Restore(mask) + return old +} + +//export __sync_lock_test_and_set_4 +func __sync_lock_test_and_set_4(ptr *uint32, new uint32) uint32 { + return doAtomicSwap32(ptr, new) +} + +//export __atomic_exchange_4 +func __atomic_exchange_4(ptr *uint32, new uint32, ordering uintptr) uint32 { + return doAtomicSwap32(ptr, new) +} + +//go:inline +func doAtomicAdd32(ptr *uint32, value uint32) (old, new uint32) { + mask := interrupt.Disable() + old = *ptr + new = old + value + *ptr = new + interrupt.Restore(mask) + return old, new +} + +//export __atomic_fetch_add_4 +func __atomic_fetch_add_4(ptr *uint32, value uint32, ordering uintptr) uint32 { + old, _ := doAtomicAdd32(ptr, value) + return old +} + +//export __sync_fetch_and_add_4 +func __sync_fetch_and_add_4(ptr *uint32, value uint32) uint32 { + old, _ := doAtomicAdd32(ptr, value) + return old +} + +//export __atomic_add_fetch_4 +func __atomic_add_fetch_4(ptr *uint32, value uint32, ordering uintptr) uint32 { + _, new := doAtomicAdd32(ptr, value) + return new +} + +// 64-bit atomics. + +//export __atomic_load_8 +func __atomic_load_8(ptr *uint64, ordering uintptr) uint64 { + // The LLVM docs for this say that there is a val argument after the pointer. + // That is a typo, and the GCC docs omit it. 
+ mask := interrupt.Disable() + val := *ptr + interrupt.Restore(mask) + return val +} + +//export __atomic_store_8 +func __atomic_store_8(ptr *uint64, val uint64, ordering uintptr) { + mask := interrupt.Disable() + *ptr = val + interrupt.Restore(mask) +} + +//go:inline +func doAtomicCAS64(ptr *uint64, expected, desired uint64) uint64 { + mask := interrupt.Disable() + old := *ptr + if old == expected { + *ptr = desired + } + interrupt.Restore(mask) + return old +} + +//export __sync_val_compare_and_swap_8 +func __sync_val_compare_and_swap_8(ptr *uint64, expected, desired uint64) uint64 { + return doAtomicCAS64(ptr, expected, desired) +} + +//export __atomic_compare_exchange_8 +func __atomic_compare_exchange_8(ptr, expected *uint64, desired uint64, successOrder, failureOrder uintptr) bool { + exp := *expected + old := doAtomicCAS64(ptr, exp, desired) + return old == exp +} + +//go:inline +func doAtomicSwap64(ptr *uint64, new uint64) uint64 { + mask := interrupt.Disable() + old := *ptr + *ptr = new + interrupt.Restore(mask) + return old +} + +//export __sync_lock_test_and_set_8 +func __sync_lock_test_and_set_8(ptr *uint64, new uint64) uint64 { + return doAtomicSwap64(ptr, new) +} + +//export __atomic_exchange_8 +func __atomic_exchange_8(ptr *uint64, new uint64, ordering uintptr) uint64 { + return doAtomicSwap64(ptr, new) +} + +//go:inline +func doAtomicAdd64(ptr *uint64, value uint64) (old, new uint64) { + mask := interrupt.Disable() + old = *ptr + new = old + value + *ptr = new + interrupt.Restore(mask) + return old, new +} + +//export __atomic_fetch_add_8 +func __atomic_fetch_add_8(ptr *uint64, value uint64, ordering uintptr) uint64 { + old, _ := doAtomicAdd64(ptr, value) + return old +} + +//export __sync_fetch_and_add_8 +func __sync_fetch_and_add_8(ptr *uint64, value uint64) uint64 { + old, _ := doAtomicAdd64(ptr, value) + return old +} + +//export __atomic_add_fetch_8 +func __atomic_add_fetch_8(ptr *uint64, value uint64, ordering uintptr) uint64 { + _, new := doAtomicAdd64(ptr, value) + return new +} diff --git a/src/runtime/runtime.go b/src/runtime/runtime.go index dc4c9f87..f22e2cdd 100644 --- a/src/runtime/runtime.go +++ b/src/runtime/runtime.go @@ -4,6 +4,8 @@ import ( "unsafe" ) +//go:generate go run ../../tools/gen-critical-atomics -out ./atomics_critical.go + const Compiler = "tinygo" // The compiler will fill this with calls to the initialization function of each diff --git a/tools/gen-critical-atomics/gen-critical-atomics.go b/tools/gen-critical-atomics/gen-critical-atomics.go new file mode 100644 index 00000000..f444c1c3 --- /dev/null +++ b/tools/gen-critical-atomics/gen-critical-atomics.go @@ -0,0 +1,182 @@ +package main + +import ( + "bytes" + "flag" + "os" + "os/exec" + "strings" + "text/template" +) + +var tmpl = template.Must(template.New("go").Funcs(template.FuncMap{ + "mul": func(x, y int) int { + return x * y + }, + "tuple": func(v ...interface{}) []interface{} { + return v + }, + "title": strings.Title, +}).Parse(`//+build baremetal,!tinygo.wasm + +// Automatically generated file. DO NOT EDIT. +// This file implements standins for non-native atomics using critical sections. + +package runtime + +import ( + _ "unsafe" + "runtime/interrupt" +) + +// Documentation: +// * https://llvm.org/docs/Atomics.html +// * https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html +// +// Some atomic operations are emitted inline while others are emitted as libcalls. +// How many are emitted as libcalls depends on the MCU arch and core variant. 
+ +{{- define "load"}}{{$bits := mul . 8 -}} +//export __atomic_load_{{.}} +func __atomic_load_{{.}}(ptr *uint{{$bits}}, ordering uintptr) uint{{$bits}} { + // The LLVM docs for this say that there is a val argument after the pointer. + // That is a typo, and the GCC docs omit it. + mask := interrupt.Disable() + val := *ptr + interrupt.Restore(mask) + return val +} +{{end}} +{{- define "store"}}{{$bits := mul . 8 -}} +//export __atomic_store_{{.}} +func __atomic_store_{{.}}(ptr *uint{{$bits}}, val uint{{$bits}}, ordering uintptr) { + mask := interrupt.Disable() + *ptr = val + interrupt.Restore(mask) +} +{{end}} +{{- define "cas"}}{{$bits := mul . 8 -}} +//go:inline +func doAtomicCAS{{$bits}}(ptr *uint{{$bits}}, expected, desired uint{{$bits}}) uint{{$bits}} { + mask := interrupt.Disable() + old := *ptr + if old == expected { + *ptr = desired + } + interrupt.Restore(mask) + return old +} + +//export __sync_val_compare_and_swap_{{.}} +func __sync_val_compare_and_swap_{{.}}(ptr *uint{{$bits}}, expected, desired uint{{$bits}}) uint{{$bits}} { + return doAtomicCAS{{$bits}}(ptr, expected, desired) +} + +//export __atomic_compare_exchange_{{.}} +func __atomic_compare_exchange_{{.}}(ptr, expected *uint{{$bits}}, desired uint{{$bits}}, successOrder, failureOrder uintptr) bool { + exp := *expected + old := doAtomicCAS{{$bits}}(ptr, exp, desired) + return old == exp +} +{{end}} +{{- define "swap"}}{{$bits := mul . 8 -}} +//go:inline +func doAtomicSwap{{$bits}}(ptr *uint{{$bits}}, new uint{{$bits}}) uint{{$bits}} { + mask := interrupt.Disable() + old := *ptr + *ptr = new + interrupt.Restore(mask) + return old +} + +//export __sync_lock_test_and_set_{{.}} +func __sync_lock_test_and_set_{{.}}(ptr *uint{{$bits}}, new uint{{$bits}}) uint{{$bits}} { + return doAtomicSwap{{$bits}}(ptr, new) +} + +//export __atomic_exchange_{{.}} +func __atomic_exchange_{{.}}(ptr *uint{{$bits}}, new uint{{$bits}}, ordering uintptr) uint{{$bits}} { + return doAtomicSwap{{$bits}}(ptr, new) +} +{{end}} +{{- define "rmw"}} + {{- $opname := index . 0}} + {{- $bytes := index . 1}}{{$bits := mul $bytes 8}} + {{- $signed := index . 2}} + {{- $opdef := index . 3}} + +{{- $type := printf "int%d" $bits}} +{{- if not $signed}}{{$type = printf "u%s" $type}}{{end -}} +{{- $opfn := printf "doAtomic%s%d" (title $opname) $bits}} + +//go:inline +func {{$opfn}}(ptr *{{$type}}, value {{$type}}) (old, new {{$type}}) { + mask := interrupt.Disable() + old = *ptr + {{$opdef}} + *ptr = new + interrupt.Restore(mask) + return old, new +} + +//export __atomic_fetch_{{$opname}}_{{$bytes}} +func __atomic_fetch_{{$opname}}_{{$bytes}}(ptr *{{$type}}, value {{$type}}, ordering uintptr) {{$type}} { + old, _ := {{$opfn}}(ptr, value) + return old +} + +//export __sync_fetch_and_{{$opname}}_{{$bytes}} +func __sync_fetch_and_{{$opname}}_{{$bytes}}(ptr *{{$type}}, value {{$type}}) {{$type}} { + old, _ := {{$opfn}}(ptr, value) + return old +} + +//export __atomic_{{$opname}}_fetch_{{$bytes}} +func __atomic_{{$opname}}_fetch_{{$bytes}}(ptr *{{$type}}, value {{$type}}, ordering uintptr) {{$type}} { + _, new := {{$opfn}}(ptr, value) + return new +} +{{end}} +{{- define "atomics"}} +// {{mul . 8}}-bit atomics. + +{{/* These atomics are accessible directly from sync/atomic. */ -}} +{{template "load" .}} +{{template "store" .}} +{{template "cas" .}} +{{template "swap" .}} +{{template "rmw" (tuple "add" . 
false "new = old + value")}} + +{{- end}} +{{template "atomics" 2 -}} +{{template "atomics" 4 -}} +{{template "atomics" 8}} +`)) + +func main() { + var out string + flag.StringVar(&out, "out", "-", "output path") + flag.Parse() + f := os.Stdout + if out != "-" { + var err error + f, err = os.Create(out) + if err != nil { + panic(err) + } + defer f.Close() + } + var buf bytes.Buffer + err := tmpl.Execute(&buf, nil) + if err != nil { + panic(err) + } + cmd := exec.Command("gofmt") + cmd.Stdin = &buf + cmd.Stdout = f + cmd.Stderr = os.Stderr + err = cmd.Run() + if err != nil { + panic(err) + } +}