runtime: fix UTF-8 decoding

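The decoder now rejects invalid UTF-8 sequences (bytes that are not valid continuation bytes, overlong encodings, surrogate code points, and values above U+10FFFF) and returns U+FFFD with a length of 1 for them, as required by the Go spec. This gets the tests of the unicode/utf8 package to pass. Also add bytes.Equal for Go 1.11, which is likewise needed for the unicode/utf8 package.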
2020-09-17 22:45:51 +02:00 · 2020-09-17 22:45:51 +02:00 · ec54e7763d
--- a/1
+++ b/1
@ -198,6 +198,7 @@ tinygo-test:
 	$(TINYGO) test container/list
 	$(TINYGO) test container/ring
 	$(TINYGO) test text/scanner
+	$(TINYGO) test unicode/utf8

 .PHONY: smoketest
 smoketest:
--- a/src/runtime/string.go
+++ b/src/runtime/string.go
@ -186,21 +186,47 @@ func decodeUTF8(s string, index uintptr) (rune, uintptr) {
 	case x&0x80 == 0x00: // 0xxxxxxx
 		return rune(x), 1
 	case x&0xe0 == 0xc0: // 110xxxxx
-		if remaining < 2 {
+		if remaining < 2 || !isContinuation(s[index+1]) {
 			return 0xfffd, 1
 		}
-		return (rune(x&0x1f) << 6) | (rune(s[index+1]) & 0x3f), 2
+		r := (rune(x&0x1f) << 6) | (rune(s[index+1]) & 0x3f)
+		if r >= 1<<7 {
+			// Check whether the rune really needed to be encoded as a two-byte
+			// sequence. UTF-8 requires every rune to be encoded in the smallest
+			// sequence possible.
+			return r, 2
+		}
 	case x&0xf0 == 0xe0: // 1110xxxx
-		if remaining < 3 {
+		if remaining < 3 || !isContinuation(s[index+1]) || !isContinuation(s[index+2]) {
 			return 0xfffd, 1
 		}
-		return (rune(x&0x0f) << 12) | ((rune(s[index+1]) & 0x3f) << 6) | (rune(s[index+2]) & 0x3f), 3
+		r := (rune(x&0x0f) << 12) | ((rune(s[index+1]) & 0x3f) << 6) | (rune(s[index+2]) & 0x3f)
+		if r >= 1<<11 && !(r >= 0xD800 && r <= 0xDFFF) {
+			// Check whether the rune really needed to be encoded as a
+			// three-byte sequence and check that this is not a Unicode
+			// surrogate code point (which is not allowed in UTF-8).
+			return r, 3
+		}
 	case x&0xf8 == 0xf0: // 11110xxx
-		if remaining < 4 {
+		if remaining < 4 || !isContinuation(s[index+1]) || !isContinuation(s[index+2]) || !isContinuation(s[index+3]) {
 			return 0xfffd, 1
 		}
-		return (rune(x&0x07) << 18) | ((rune(s[index+1]) & 0x3f) << 12) | ((rune(s[index+2]) & 0x3f) << 6) | (rune(s[index+3]) & 0x3f), 4
-	default:
-		return 0xfffd, 1
+		r := (rune(x&0x07) << 18) | ((rune(s[index+1]) & 0x3f) << 12) | ((rune(s[index+2]) & 0x3f) << 6) | (rune(s[index+3]) & 0x3f)
+		if r >= 1<<16 && r <= '\U0010FFFF' {
+			// Check whether this rune really needed to be encoded as a four
+			// byte sequence and check that the resulting rune is in the valid
+			// range (up to at most U+10FFFF).
+			return r, 4
+		}
 	}
+
+	// Failed to decode. Return the Unicode replacement character and a length of 1.
+	return 0xfffd, 1
+}
+
+// isContinuation returns true if (and only if) this is a UTF-8 continuation
+// byte.
+func isContinuation(b byte) bool {
+	// Continuation bytes have their topmost bits set to 0b10.
+	return b&0xc0 == 0x80
 }
--- a/src/runtime/strings_go111.go
+++ b/src/runtime/strings_go111.go
@ -4,11 +4,17 @@ package runtime

 import "internal/bytealg"

-// indexByte provides compatibility with Go 1.11.
+// The following functions provide compatibility with Go 1.11.
 // See the following:
 // https://github.com/tinygo-org/tinygo/issues/351
 // https://github.com/golang/go/commit/ad4a58e31501bce5de2aad90a620eaecdc1eecb8
+
 //go:linkname indexByte strings.IndexByte
 func indexByte(s string, c byte) int {
 	return bytealg.IndexByteString(s, c)
 }
+
+//go:linkname bytesEqual bytes.Equal
+func bytesEqual(a, b []byte) bool {
+	return bytealg.Equal(a, b)
+}