diff --git a/Makefile b/Makefile index e59799ca..9e7819ec 100644 --- a/Makefile +++ b/Makefile @@ -198,6 +198,7 @@ tinygo-test: $(TINYGO) test container/list $(TINYGO) test container/ring $(TINYGO) test text/scanner + $(TINYGO) test unicode/utf8 .PHONY: smoketest smoketest: diff --git a/src/runtime/string.go b/src/runtime/string.go index e0d0a226..c1c12cc7 100644 --- a/src/runtime/string.go +++ b/src/runtime/string.go @@ -186,21 +186,47 @@ func decodeUTF8(s string, index uintptr) (rune, uintptr) { case x&0x80 == 0x00: // 0xxxxxxx return rune(x), 1 case x&0xe0 == 0xc0: // 110xxxxx - if remaining < 2 { + if remaining < 2 || !isContinuation(s[index+1]) { return 0xfffd, 1 } - return (rune(x&0x1f) << 6) | (rune(s[index+1]) & 0x3f), 2 + r := (rune(x&0x1f) << 6) | (rune(s[index+1]) & 0x3f) + if r >= 1<<7 { + // Check whether the rune really needed to be encoded as a two-byte + // sequence. UTF-8 requires every rune to be encoded in the smallest + // sequence possible. + return r, 2 + } case x&0xf0 == 0xe0: // 1110xxxx - if remaining < 3 { + if remaining < 3 || !isContinuation(s[index+1]) || !isContinuation(s[index+2]) { return 0xfffd, 1 } - return (rune(x&0x0f) << 12) | ((rune(s[index+1]) & 0x3f) << 6) | (rune(s[index+2]) & 0x3f), 3 + r := (rune(x&0x0f) << 12) | ((rune(s[index+1]) & 0x3f) << 6) | (rune(s[index+2]) & 0x3f) + if r >= 1<<11 && !(r >= 0xD800 && r <= 0xDFFF) { + // Check whether the rune really needed to be encoded as a + // three-byte sequence and check that this is not a Unicode + // surrogate code point (which is not allowed in UTF-8).
+ return r, 3 + } case x&0xf8 == 0xf0: // 11110xxx - if remaining < 4 { + if remaining < 4 || !isContinuation(s[index+1]) || !isContinuation(s[index+2]) || !isContinuation(s[index+3]) { return 0xfffd, 1 } - return (rune(x&0x07) << 18) | ((rune(s[index+1]) & 0x3f) << 12) | ((rune(s[index+2]) & 0x3f) << 6) | (rune(s[index+3]) & 0x3f), 4 - default: - return 0xfffd, 1 + r := (rune(x&0x07) << 18) | ((rune(s[index+1]) & 0x3f) << 12) | ((rune(s[index+2]) & 0x3f) << 6) | (rune(s[index+3]) & 0x3f) + if r >= 1<<16 && r <= '\U0010FFFF' { + // Check whether this rune really needed to be encoded as a + // four-byte sequence and check that the resulting rune is in the valid + // range (at most U+10FFFF). + return r, 4 + } } + + // Failed to decode. Return the Unicode replacement character and a length of 1. + return 0xfffd, 1 +} + +// isContinuation returns true if (and only if) this is a UTF-8 continuation +// byte. +func isContinuation(b byte) bool { + // Continuation bytes have their two topmost bits set to 0b10. + return b&0xc0 == 0x80 } diff --git a/src/runtime/strings_go111.go b/src/runtime/strings_go111.go index 866cb92a..2f14df33 100644 --- a/src/runtime/strings_go111.go +++ b/src/runtime/strings_go111.go @@ -4,11 +4,17 @@ package runtime import "internal/bytealg" -// indexByte provides compatibility with Go 1.11. +// The following functions provide compatibility with Go 1.11. // See the following: // https://github.com/tinygo-org/tinygo/issues/351 // https://github.com/golang/go/commit/ad4a58e31501bce5de2aad90a620eaecdc1eecb8 + //go:linkname indexByte strings.IndexByte func indexByte(s string, c byte) int { return bytealg.IndexByteString(s, c) } + +//go:linkname bytesEqual bytes.Equal +func bytesEqual(a, b []byte) bool { + return bytealg.Equal(a, b) +}