runtime: fix UTF-8 decoding
The algorithm now checks for invalid UTF-8 sequences, which is required by the Go spec. This gets the tests of the unicode/utf8 package to pass. Also add bytes.Equal for Go 1.11, which again is necessary for the unicode/utf8 package.
Этот коммит содержится в:
родитель
41afb77080
коммит
ec54e7763d
3 изменённых файлов: 42 добавлений и 9 удалений
1
Makefile
1
Makefile
|
@ -198,6 +198,7 @@ tinygo-test:
|
|||
$(TINYGO) test container/list
|
||||
$(TINYGO) test container/ring
|
||||
$(TINYGO) test text/scanner
|
||||
$(TINYGO) test unicode/utf8
|
||||
|
||||
.PHONY: smoketest
|
||||
smoketest:
|
||||
|
|
|
@ -186,21 +186,47 @@ func decodeUTF8(s string, index uintptr) (rune, uintptr) {
|
|||
case x&0x80 == 0x00: // 0xxxxxxx
|
||||
return rune(x), 1
|
||||
case x&0xe0 == 0xc0: // 110xxxxx
|
||||
if remaining < 2 {
|
||||
if remaining < 2 || !isContinuation(s[index+1]) {
|
||||
return 0xfffd, 1
|
||||
}
|
||||
return (rune(x&0x1f) << 6) | (rune(s[index+1]) & 0x3f), 2
|
||||
r := (rune(x&0x1f) << 6) | (rune(s[index+1]) & 0x3f)
|
||||
if r >= 1<<7 {
|
||||
// Check whether the rune really needed to be encoded as a two-byte
|
||||
// sequence. UTF-8 requires every rune to be encoded in the smallest
|
||||
// sequence possible.
|
||||
return r, 2
|
||||
}
|
||||
case x&0xf0 == 0xe0: // 1110xxxx
|
||||
if remaining < 3 {
|
||||
if remaining < 3 || !isContinuation(s[index+1]) || !isContinuation(s[index+2]) {
|
||||
return 0xfffd, 1
|
||||
}
|
||||
return (rune(x&0x0f) << 12) | ((rune(s[index+1]) & 0x3f) << 6) | (rune(s[index+2]) & 0x3f), 3
|
||||
r := (rune(x&0x0f) << 12) | ((rune(s[index+1]) & 0x3f) << 6) | (rune(s[index+2]) & 0x3f)
|
||||
if r >= 1<<11 && !(r >= 0xD800 && r <= 0xDFFF) {
|
||||
// Check whether the rune really needed to be encoded as a
|
||||
// three-byte sequence and check that this is not a Unicode
|
||||
// surrogate code point (U+D800..U+DFFF, which is not allowed in UTF-8).
|
||||
return r, 3
|
||||
}
|
||||
case x&0xf8 == 0xf0: // 11110xxx
|
||||
if remaining < 4 {
|
||||
if remaining < 4 || !isContinuation(s[index+1]) || !isContinuation(s[index+2]) || !isContinuation(s[index+3]) {
|
||||
return 0xfffd, 1
|
||||
}
|
||||
return (rune(x&0x07) << 18) | ((rune(s[index+1]) & 0x3f) << 12) | ((rune(s[index+2]) & 0x3f) << 6) | (rune(s[index+3]) & 0x3f), 4
|
||||
default:
|
||||
return 0xfffd, 1
|
||||
r := (rune(x&0x07) << 18) | ((rune(s[index+1]) & 0x3f) << 12) | ((rune(s[index+2]) & 0x3f) << 6) | (rune(s[index+3]) & 0x3f)
|
||||
if r >= 1<<16 && r <= '\U0010FFFF' {
|
||||
// Check whether this rune really needed to be encoded as a four
|
||||
// byte sequence and check that the resulting rune is in the valid
|
||||
// range (up to at most U+10FFFF).
|
||||
return r, 4
|
||||
}
|
||||
}
|
||||
|
||||
// Failed to decode. Return the Unicode replacement character and a length of 1.
|
||||
return 0xfffd, 1
|
||||
}
|
||||
|
||||
// isContinuation reports whether b is a UTF-8 continuation byte, that is,
|
||||
// whether its two most significant bits are 0b10.
|
||||
func isContinuation(b byte) bool {
|
||||
// Mask away the low six bits and compare the remaining top two with 0b10.
|
||||
return b&0xc0 == 0x80
|
||||
}
|
||||
|
|
|
@ -4,11 +4,17 @@ package runtime
|
|||
|
||||
import "internal/bytealg"
|
||||
|
||||
// indexByte provides compatibility with Go 1.11.
|
||||
// The following functions provide compatibility with Go 1.11.
|
||||
// See the following:
|
||||
// https://github.com/tinygo-org/tinygo/issues/351
|
||||
// https://github.com/golang/go/commit/ad4a58e31501bce5de2aad90a620eaecdc1eecb8
|
||||
|
||||
// indexByte is exposed under the symbol name strings.IndexByte through the
// linkname directive below and simply forwards to bytealg.IndexByteString.
//go:linkname indexByte strings.IndexByte
|
||||
func indexByte(s string, c byte) int {
|
||||
// Delegate the search for c in s to the internal/bytealg implementation.
return bytealg.IndexByteString(s, c)
|
||||
}
|
||||
|
||||
// bytesEqual is exposed under the symbol name bytes.Equal through the
// linkname directive below and simply forwards to bytealg.Equal.
//go:linkname bytesEqual bytes.Equal
|
||||
func bytesEqual(a, b []byte) bool {
|
||||
// Delegate the comparison of a and b to the internal/bytealg implementation.
return bytealg.Equal(a, b)
|
||||
}
|
||||
|
|
Загрузка…
Создание таблицы
Сослаться в новой задаче