runtime: fix UTF-8 decoding
The algorithm now checks for invalid UTF-8 sequences, which is required by the Go spec. This gets the tests of the unicode/utf8 package to pass. Also add bytes.Equal for Go 1.11, which again is necessary for the unicode/utf8 package.
Этот коммит содержится в:
родитель
41afb77080
коммит
ec54e7763d
3 изменённых файлов: 42 добавлений и 9 удалений
1
Makefile
1
Makefile
|
@ -198,6 +198,7 @@ tinygo-test:
|
||||||
$(TINYGO) test container/list
|
$(TINYGO) test container/list
|
||||||
$(TINYGO) test container/ring
|
$(TINYGO) test container/ring
|
||||||
$(TINYGO) test text/scanner
|
$(TINYGO) test text/scanner
|
||||||
|
$(TINYGO) test unicode/utf8
|
||||||
|
|
||||||
.PHONY: smoketest
|
.PHONY: smoketest
|
||||||
smoketest:
|
smoketest:
|
||||||
|
|
|
@ -186,21 +186,47 @@ func decodeUTF8(s string, index uintptr) (rune, uintptr) {
|
||||||
case x&0x80 == 0x00: // 0xxxxxxx
|
case x&0x80 == 0x00: // 0xxxxxxx
|
||||||
return rune(x), 1
|
return rune(x), 1
|
||||||
case x&0xe0 == 0xc0: // 110xxxxx
|
case x&0xe0 == 0xc0: // 110xxxxx
|
||||||
if remaining < 2 {
|
if remaining < 2 || !isContinuation(s[index+1]) {
|
||||||
return 0xfffd, 1
|
return 0xfffd, 1
|
||||||
}
|
}
|
||||||
return (rune(x&0x1f) << 6) | (rune(s[index+1]) & 0x3f), 2
|
r := (rune(x&0x1f) << 6) | (rune(s[index+1]) & 0x3f)
|
||||||
|
if r >= 1<<7 {
|
||||||
|
// Check whether the rune really needed to be encoded as a two-byte
|
||||||
|
// sequence. UTF-8 requires every rune to be encoded in the smallest
|
||||||
|
// sequence possible.
|
||||||
|
return r, 2
|
||||||
|
}
|
||||||
case x&0xf0 == 0xe0: // 1110xxxx
|
case x&0xf0 == 0xe0: // 1110xxxx
|
||||||
if remaining < 3 {
|
if remaining < 3 || !isContinuation(s[index+1]) || !isContinuation(s[index+2]) {
|
||||||
return 0xfffd, 1
|
return 0xfffd, 1
|
||||||
}
|
}
|
||||||
return (rune(x&0x0f) << 12) | ((rune(s[index+1]) & 0x3f) << 6) | (rune(s[index+2]) & 0x3f), 3
|
r := (rune(x&0x0f) << 12) | ((rune(s[index+1]) & 0x3f) << 6) | (rune(s[index+2]) & 0x3f)
|
||||||
|
if r >= 1<<11 && !(r >= 0xD800 && r <= 0xDFFF) {
|
||||||
|
// Check whether the rune really needed to be encoded as a
|
||||||
|
// three-byte sequence and check that this is not a Unicode
|
||||||
|
// surrogate code point (U+D800..U+DFFF, which is not allowed in UTF-8).
|
||||||
|
return r, 3
|
||||||
|
}
|
||||||
case x&0xf8 == 0xf0: // 11110xxx
|
case x&0xf8 == 0xf0: // 11110xxx
|
||||||
if remaining < 4 {
|
if remaining < 4 || !isContinuation(s[index+1]) || !isContinuation(s[index+2]) || !isContinuation(s[index+3]) {
|
||||||
return 0xfffd, 1
|
return 0xfffd, 1
|
||||||
}
|
}
|
||||||
return (rune(x&0x07) << 18) | ((rune(s[index+1]) & 0x3f) << 12) | ((rune(s[index+2]) & 0x3f) << 6) | (rune(s[index+3]) & 0x3f), 4
|
r := (rune(x&0x07) << 18) | ((rune(s[index+1]) & 0x3f) << 12) | ((rune(s[index+2]) & 0x3f) << 6) | (rune(s[index+3]) & 0x3f)
|
||||||
default:
|
if r >= 1<<16 && r <= '\U0010FFFF' {
|
||||||
return 0xfffd, 1
|
// Check whether this rune really needed to be encoded as a
|
||||||
|
// four-byte sequence and check that the resulting rune is in the valid
|
||||||
|
// range (up to at most U+10FFFF).
|
||||||
|
return r, 4
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Failed to decode. Return the Unicode replacement character and a length of 1.
|
||||||
|
return 0xfffd, 1
|
||||||
|
}
|
||||||
|
|
||||||
|
// isContinuation returns true if (and only if) this is a UTF-8 continuation
|
||||||
|
// byte.
|
||||||
|
func isContinuation(b byte) bool {
|
||||||
|
// Continuation bytes have their two topmost bits set to 0b10 (10xxxxxx).
|
||||||
|
return b&0xc0 == 0x80
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,11 +4,17 @@ package runtime
|
||||||
|
|
||||||
import "internal/bytealg"
|
import "internal/bytealg"
|
||||||
|
|
||||||
// indexByte provides compatibility with Go 1.11.
|
// The following functions provide compatibility with Go 1.11.
|
||||||
// See the following:
|
// See the following:
|
||||||
// https://github.com/tinygo-org/tinygo/issues/351
|
// https://github.com/tinygo-org/tinygo/issues/351
|
||||||
// https://github.com/golang/go/commit/ad4a58e31501bce5de2aad90a620eaecdc1eecb8
|
// https://github.com/golang/go/commit/ad4a58e31501bce5de2aad90a620eaecdc1eecb8
|
||||||
|
|
||||||
//go:linkname indexByte strings.IndexByte
|
//go:linkname indexByte strings.IndexByte
|
||||||
func indexByte(s string, c byte) int {
|
func indexByte(s string, c byte) int {
|
||||||
return bytealg.IndexByteString(s, c)
|
return bytealg.IndexByteString(s, c)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//go:linkname bytesEqual bytes.Equal
|
||||||
|
func bytesEqual(a, b []byte) bool {
|
||||||
|
return bytealg.Equal(a, b)
|
||||||
|
}
|
||||||
|
|
Загрузка…
Создание таблицы
Сослаться в новой задаче