Commit b8b1aa4e authored by Christian Persch

utf8: Make decoder conform to recommendation on replacement characters

With this change, the decoder conforms to the W3 Encoding TR and
the Unicode recommendation on inserting replacement characters
from §3.9 of the Unicode core spec.

https://gitlab.gnome.org/GNOME/vte/issues/30
parent 9bda7de4
...@@ -347,18 +347,15 @@ process_file_utf8(int fd, ...@@ -347,18 +347,15 @@ process_file_utf8(int fd,
for (auto sptr = buf; sptr < bufend; ++sptr) { for (auto sptr = buf; sptr < bufend; ++sptr) {
switch (decoder.decode(*sptr)) { switch (decoder.decode(*sptr)) {
case vte::base::UTF8Decoder::REJECT: case vte::base::UTF8Decoder::REJECT_REWIND:
decoder.reset(); /* Rewind the stream.
/* If a start byte occurred in the middle of a sequence,
* rewind the stream so we try to start a new character
* with it.
* Note that this will never lead to a loop, since in the * Note that this will never lead to a loop, since in the
* next round this byte *will* be consumed. * next round this byte *will* be consumed.
*/ */
if (decoder.is_start_byte(*sptr)) --sptr;
--sptr; /* [[fallthrough]]; */
case vte::base::UTF8Decoder::REJECT:
decoder.reset();
/* Fall through to insert the U+FFFD replacement character. */ /* Fall through to insert the U+FFFD replacement character. */
/* [[fallthrough]]; */ /* [[fallthrough]]; */
case vte::base::UTF8Decoder::ACCEPT: { case vte::base::UTF8Decoder::ACCEPT: {
......
...@@ -48,21 +48,6 @@ test_utf8_decoder_decode(void) ...@@ -48,21 +48,6 @@ test_utf8_decoder_decode(void)
} }
} }
static constexpr bool
is_utf8_start_byte(uint32_t c)
{
return (c < 0x80u || (c >= 0xc2u && c <= 0xf4u));
}
static void
test_utf8_decoder_start(void)
{
decoder.reset();
for (uint32_t c = 0; c < 0x100u; ++c) {
g_assert_cmpint(decoder.is_start_byte(c), ==, is_utf8_start_byte(c));
}
}
static void static void
decode(uint8_t const* in, decode(uint8_t const* in,
size_t len, size_t len,
...@@ -74,19 +59,15 @@ decode(uint8_t const* in, ...@@ -74,19 +59,15 @@ decode(uint8_t const* in,
uint32_t state = UTF8Decoder::ACCEPT; uint32_t state = UTF8Decoder::ACCEPT;
for (auto iptr = in; iptr < iend; ++iptr) { for (auto iptr = in; iptr < iend; ++iptr) {
switch ((state = decoder.decode(*iptr))) { switch ((state = decoder.decode(*iptr))) {
case vte::base::UTF8Decoder::REJECT_REWIND:
/* Note that this will never lead to a loop, since in the
* next round this byte *will* be consumed.
*/
--iptr;
/* [[fallthrough]]; */
case vte::base::UTF8Decoder::REJECT: case vte::base::UTF8Decoder::REJECT:
decoder.reset(); decoder.reset();
state = UTF8Decoder::ACCEPT; state = UTF8Decoder::ACCEPT;
/* If a start byte occurred in the middle of a sequence,
* rewind the stream so we try to start a new character
* with it.
* Note that this will never lead to a loop, since in the
* next round this byte *will* be consumed.
*/
if (decoder.is_start_byte(*iptr))
--iptr;
/* Fall through to insert the U+FFFD replacement character. */ /* Fall through to insert the U+FFFD replacement character. */
/* [[fallthrough]]; */ /* [[fallthrough]]; */
case vte::base::UTF8Decoder::ACCEPT: case vte::base::UTF8Decoder::ACCEPT:
...@@ -210,13 +191,11 @@ test_utf8_decoder_replacement(void) ...@@ -210,13 +191,11 @@ test_utf8_decoder_replacement(void)
assert_decode("a\xC0\x80", -1, U"a\uFFFD\uFFFD"s); assert_decode("a\xC0\x80", -1, U"a\uFFFD\uFFFD"s);
assert_decode("a\xC0\x80Z", -1, U"a\uFFFD\uFFFDZ"s); assert_decode("a\xC0\x80Z", -1, U"a\uFFFD\uFFFDZ"s);
// Lowest single-byte as three-byte overlong sequence // Lowest single-byte as three-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xE0\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD"s); assert_decode("a\xE0\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xE0\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFDZ"s); assert_decode("a\xE0\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
// Lowest single-byte as four-byte overlong sequence // Lowest single-byte as four-byte overlong sequence
assert_decode("a\xF0\x80\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s); assert_decode("a\xF0\x80\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x80\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s); assert_decode("a\xF0\x80\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// One below lowest single-byte // One below lowest single-byte
assert_decode("a\xFF", -1, U"a\uFFFD"s); assert_decode("a\xFF", -1, U"a\uFFFD"s);
assert_decode("a\xFFZ", -1, U"a\uFFFDZ"s); assert_decode("a\xFFZ", -1, U"a\uFFFDZ"s);
...@@ -227,13 +206,11 @@ test_utf8_decoder_replacement(void) ...@@ -227,13 +206,11 @@ test_utf8_decoder_replacement(void)
assert_decode("a\xC1\xBF", -1, U"a\uFFFD\uFFFD"s); assert_decode("a\xC1\xBF", -1, U"a\uFFFD\uFFFD"s);
assert_decode("a\xC1\xBFZ", -1, U"a\uFFFD\uFFFDZ"s); assert_decode("a\xC1\xBFZ", -1, U"a\uFFFD\uFFFDZ"s);
// Highest single-byte as three-byte overlong sequence // Highest single-byte as three-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xE0\x81\xBF", -1, U"a\uFFFD\uFFFD\uFFFD"s); assert_decode("a\xE0\x81\xBF", -1, U"a\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xE0\x81\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFDZ"s); assert_decode("a\xE0\x81\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
// Highest single-byte as four-byte overlong sequence // Highest single-byte as four-byte overlong sequence
assert_decode("a\xF0\x80\x81\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s); assert_decode("a\xF0\x80\x81\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x80\x81\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s); assert_decode("a\xF0\x80\x81\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// One past highest single byte (also lone continuation) // One past highest single byte (also lone continuation)
assert_decode("a\x80Z", -1, U"a\uFFFDZ"s); assert_decode("a\x80Z", -1, U"a\uFFFDZ"s);
assert_decode("a\x80", -1, U"a\uFFFD"s); assert_decode("a\x80", -1, U"a\uFFFD"s);
...@@ -250,13 +227,11 @@ test_utf8_decoder_replacement(void) ...@@ -250,13 +227,11 @@ test_utf8_decoder_replacement(void)
assert_decode("a\xC2\x80", -1, U"a\u0080"s); assert_decode("a\xC2\x80", -1, U"a\u0080"s);
assert_decode("a\xC2\x80Z", -1, U"a\u0080Z"s); assert_decode("a\xC2\x80Z", -1, U"a\u0080Z"s);
// Lowest two-byte as three-byte overlong sequence // Lowest two-byte as three-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xE0\x82\x80", -1, U"a\uFFFD\uFFFD\uFFFD"s); assert_decode("a\xE0\x82\x80", -1, U"a\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xE0\x82\x80Z", -1, U"a\uFFFD\uFFFD\uFFFDZ"s); assert_decode("a\xE0\x82\x80Z", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
// Lowest two-byte as four-byte overlong sequence // Lowest two-byte as four-byte overlong sequence
assert_decode("a\xF0\x80\x82\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s); assert_decode("a\xF0\x80\x82\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x80\x82\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s); assert_decode("a\xF0\x80\x82\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Lead one below lowest two-byte // Lead one below lowest two-byte
assert_decode("a\xC1\x80", -1, U"a\uFFFD\uFFFD"s); assert_decode("a\xC1\x80", -1, U"a\uFFFD\uFFFD"s);
assert_decode("a\xC1\x80Z", -1, U"a\uFFFD\uFFFDZ"s); assert_decode("a\xC1\x80Z", -1, U"a\uFFFD\uFFFDZ"s);
...@@ -267,26 +242,21 @@ test_utf8_decoder_replacement(void) ...@@ -267,26 +242,21 @@ test_utf8_decoder_replacement(void)
assert_decode("a\xDF\xBF", -1, U"a\u07FF"s); assert_decode("a\xDF\xBF", -1, U"a\u07FF"s);
assert_decode("a\xDF\xBFZ", -1, U"a\u07FFZ"s); assert_decode("a\xDF\xBFZ", -1, U"a\u07FFZ"s);
// Highest two-byte as three-byte overlong sequence // Highest two-byte as three-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xE0\x9F\xBF", -1, U"a\uFFFD\uFFFD\uFFFD"s); assert_decode("a\xE0\x9F\xBF", -1, U"a\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xE0\x9F\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFDZ"s); assert_decode("a\xE0\x9F\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
// Highest two-byte as four-byte overlong sequence // Highest two-byte as four-byte overlong sequence
assert_decode("a\xF0\x80\x9F\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s); assert_decode("a\xF0\x80\x9F\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x80\x9F\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s); assert_decode("a\xF0\x80\x9F\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Lowest three-byte // Lowest three-byte
assert_decode("a\xE0\xA0\x80", -1, U"a\u0800"s); assert_decode("a\xE0\xA0\x80", -1, U"a\u0800"s);
assert_decode("a\xE0\xA0\x80Z", -1, U"a\u0800Z"s); assert_decode("a\xE0\xA0\x80Z", -1, U"a\u0800Z"s);
// Lowest three-byte as four-byte overlong sequence // Lowest three-byte as four-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xF0\x80\xA0\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s); assert_decode("a\xF0\x80\xA0\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x80\xA0\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s); assert_decode("a\xF0\x80\xA0\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Highest below surrogates // Highest below surrogates
assert_decode("a\xED\x9F\xBF", -1, U"a\uD7FF"s); assert_decode("a\xED\x9F\xBF", -1, U"a\uD7FF"s);
assert_decode("a\xED\x9F\xBFZ", -1, U"a\uD7FFZ"s); assert_decode("a\xED\x9F\xBFZ", -1, U"a\uD7FFZ"s);
// Highest below surrogates as four-byte overlong sequence // Highest below surrogates as four-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xF0\x8D\x9F\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s); assert_decode("a\xF0\x8D\x9F\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x8D\x9F\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s); assert_decode("a\xF0\x8D\x9F\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
// First surrogate // First surrogate
...@@ -301,38 +271,31 @@ test_utf8_decoder_replacement(void) ...@@ -301,38 +271,31 @@ test_utf8_decoder_replacement(void)
// Last surrogate as four-byte overlong sequence // Last surrogate as four-byte overlong sequence
assert_decode("a\xF0\x8D\xBF\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s); assert_decode("a\xF0\x8D\xBF\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x8D\xBF\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s); assert_decode("a\xF0\x8D\xBF\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Lowest above surrogates // Lowest above surrogates
assert_decode("a\xEE\x80\x80", -1, U"a\uE000"s); assert_decode("a\xEE\x80\x80", -1, U"a\uE000"s);
assert_decode("a\xEE\x80\x80Z", -1, U"a\uE000Z"s); assert_decode("a\xEE\x80\x80Z", -1, U"a\uE000Z"s);
// Lowest above surrogates as four-byte overlong sequence // Lowest above surrogates as four-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xF0\x8E\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s); assert_decode("a\xF0\x8E\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x8E\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s); assert_decode("a\xF0\x8E\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Highest three-byte // Highest three-byte
assert_decode("a\xEF\xBF\xBF", -1, U"a\uFFFF"s); assert_decode("a\xEF\xBF\xBF", -1, U"a\uFFFF"s);
assert_decode("a\xEF\xBF\xBFZ", -1, U"a\uFFFFZ"s); assert_decode("a\xEF\xBF\xBFZ", -1, U"a\uFFFFZ"s);
// Highest three-byte as four-byte overlong sequence // Highest three-byte as four-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xF0\x8F\xBF\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s); assert_decode("a\xF0\x8F\xBF\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x8F\xBF\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s); assert_decode("a\xF0\x8F\xBF\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
// Lowest four-byte // Lowest four-byte
assert_decode("a\xF0\x90\x80\x80", -1, U"a\u10000"s); assert_decode("a\xF0\x90\x80\x80", -1, U"a\U00010000"s);
assert_decode("a\xF0\x90\x80\x80Z", -1, U"a\u10000Z"s); assert_decode("a\xF0\x90\x80\x80Z", -1, U"a\U00010000Z"s);
// Highest four-byte // Highest four-byte
assert_decode("a\xF4\x8F\xBF\xBF", -1, U"a\u10FFFF"s); assert_decode("a\xF4\x8F\xBF\xBF", -1, U"a\U0010FFFF"s);
assert_decode("a\xF4\x8F\xBF\xBFZ", -1, U"a\u10FFFFZ"s); assert_decode("a\xF4\x8F\xBF\xBFZ", -1, U"a\U0010FFFFZ"s);
// One past highest four-byte // One past highest four-byte
assert_decode("a\xF4\x90\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s); assert_decode("a\xF4\x90\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF4\x90\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s); assert_decode("a\xF4\x90\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Highest four-byte with last byte replaced with 0xFF // Highest four-byte with last byte replaced with 0xFF
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xF4\x8F\xBF\xFF", -1, U"a\uFFFD\uFFFD"s); assert_decode("a\xF4\x8F\xBF\xFF", -1, U"a\uFFFD\uFFFD"s);
assert_decode("a\xF4\x8F\xBF\xFFZ", -1, U"a\uFFFD\uFFFDZ"s); assert_decode("a\xF4\x8F\xBF\xFFZ", -1, U"a\uFFFD\uFFFDZ"s);
#endif
} }
int int
...@@ -342,7 +305,6 @@ main(int argc, ...@@ -342,7 +305,6 @@ main(int argc,
g_test_init(&argc, &argv, nullptr); g_test_init(&argc, &argv, nullptr);
g_test_add_func("/vte/utf8/decoder/decode", test_utf8_decoder_decode); g_test_add_func("/vte/utf8/decoder/decode", test_utf8_decoder_decode);
g_test_add_func("/vte/utf8/decoder/start", test_utf8_decoder_start);
g_test_add_func("/vte/utf8/decoder/replacement", test_utf8_decoder_replacement); g_test_add_func("/vte/utf8/decoder/replacement", test_utf8_decoder_replacement);
return g_test_run(); return g_test_run();
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include "utf8.hh" #include "utf8.hh"
#define RJ vte::base::UTF8Decoder::REJECT #define RJ vte::base::UTF8Decoder::REJECT
#define RW vte::base::UTF8Decoder::REJECT_REWIND
uint8_t const vte::base::UTF8Decoder::kTable[] = { uint8_t const vte::base::UTF8Decoder::kTable[] = {
// The first part of the table maps bytes to character classes that // The first part of the table maps bytes to character classes that
...@@ -40,7 +41,7 @@ uint8_t const vte::base::UTF8Decoder::kTable[] = { ...@@ -40,7 +41,7 @@ uint8_t const vte::base::UTF8Decoder::kTable[] = {
// 0xe0: 10 // 0xe0: 10
// 0xe1..0xec: 3 // 0xe1..0xec: 3
// 0xed: 4 // 0xed: 4
// 0xee..0xff: 3 // 0xee..0xef: 3
// 0xf0: 11 // 0xf0: 11
// 0xf1..0xf3: 6 // 0xf1..0xf3: 6
// 0xf4: 5 // 0xf4: 5
...@@ -64,6 +65,35 @@ uint8_t const vte::base::UTF8Decoder::kTable[] = { ...@@ -64,6 +65,35 @@ uint8_t const vte::base::UTF8Decoder::kTable[] = {
// To understand this DFA, see transitions graph on the website // To understand this DFA, see transitions graph on the website
// linked above. // linked above.
//
// The following translates the states of the DFA to the
// algorithm of the UTF-8 decoder from the W3 Encodings spec
// [https://www.w3.org/TR/encoding/#utf-8]:
//
// DFA │ bytes bytes lower upper
// state │ seen needed bound bound
// ──────┼─────────────────────────────────
// 0 │ 0 0 0x80 0xbf
// 12 │
// 24 │ 1,2,3 1 0x80 0xbf
// 36 │ 1,2 2 0x80 0xbf
// 48 │ 1 2 0xa0 0xbf
// 60 │ 1 2 0x80 0x9f
// 72 │ 1 3 0x90 0xbf
// 84 │ 1 3 0x80 0xbf
// 96 │ 1 3 0x80 0x8f
// 108 │
//
// If an unexpected byte is read in a non-ACCEPT/REJECT* state,
// transition to REJECT_REWIND so that the decoder will read that
// byte again after being reset; this makes the decoder conform
// to the Unicode recommendation for inserting replacement
// characters, and to the W3 Encoding TR spec.
//
// If an unexpected byte is read in the ACCEPT or a REJECT* state,
// transition to REJECT; that byte must not be read again, since
// that would lead to an infinite loop.
//
// For each state (row), the table records which state will // For each state (row), the table records which state will
// be transitioned to when consuming a character of the class // be transitioned to when consuming a character of the class
// (column). // (column).
...@@ -72,11 +102,12 @@ uint8_t const vte::base::UTF8Decoder::kTable[] = { ...@@ -72,11 +102,12 @@ uint8_t const vte::base::UTF8Decoder::kTable[] = {
*/ */
0, RJ, 24, 36, 60, 96, 84, RJ, RJ, RJ, 48, 72, // state 0 (accept) 0, RJ, 24, 36, 60, 96, 84, RJ, RJ, RJ, 48, 72, // state 0 (accept)
RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, // state 12 (reject) RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, // state 12 (reject)
RJ, 0, RJ, RJ, RJ, RJ, RJ, 0, RJ, 0, RJ, RJ, // state 24 RW, 0, RW, RW, RW, RW, RW, 0, RW, 0, RW, RW, // state 24
RJ, 24, RJ, RJ, RJ, RJ, RJ, 24, RJ, 24, RJ, RJ, // state 36 RW, 24, RW, RW, RW, RW, RW, 24, RW, 24, RW, RW, // state 36
RJ, RJ, RJ, RJ, RJ, RJ, RJ, 24, RJ, RJ, RJ, RJ, // state 48 RW, RW, RW, RW, RW, RW, RW, 24, RW, RW, RW, RW, // state 48
RJ, 24, RJ, RJ, RJ, RJ, RJ, RJ, RJ, 24, RJ, RJ, // state 60 RW, 24, RW, RW, RW, RW, RW, RW, RW, 24, RW, RW, // state 60
RJ, RJ, RJ, RJ, RJ, RJ, RJ, 36, RJ, 36, RJ, RJ, // state 72 RW, RW, RW, RW, RW, RW, RW, 36, RW, 36, RW, RW, // state 72
RJ, 36, RJ, RJ, RJ, RJ, RJ, 36, RJ, 36, RJ, RJ, // state 84 RW, 36, RW, RW, RW, RW, RW, 36, RW, 36, RW, RW, // state 84
RJ, 36, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, // state 96 RW, 36, RW, RW, RW, RW, RW, RW, RW, RW, RW, RW, // state 96
RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, // state 108 (reject-rewind)
}; };
...@@ -38,7 +38,8 @@ class UTF8Decoder { ...@@ -38,7 +38,8 @@ class UTF8Decoder {
public: public:
enum { enum {
ACCEPT = 0, ACCEPT = 0,
REJECT = 12 REJECT = 12,
REJECT_REWIND = 108
}; };
UTF8Decoder() noexcept = default; UTF8Decoder() noexcept = default;
...@@ -66,10 +67,6 @@ public: ...@@ -66,10 +67,6 @@ public:
m_codepoint = 0xfffdU; m_codepoint = 0xfffdU;
} }
inline bool is_start_byte(uint32_t byte) const noexcept {
return kTable[256 + 0 /* start state */ + kTable[byte]] != REJECT;
}
private: private:
uint32_t m_state{ACCEPT}; uint32_t m_state{ACCEPT};
uint32_t m_codepoint{0}; uint32_t m_codepoint{0};
......
...@@ -3588,18 +3588,15 @@ Terminal::process_incoming() ...@@ -3588,18 +3588,15 @@ Terminal::process_incoming()
for ( ; ip < iend; ++ip) { for ( ; ip < iend; ++ip) {
switch (m_utf8_decoder.decode(*ip)) { switch (m_utf8_decoder.decode(*ip)) {
case vte::base::UTF8Decoder::REJECT: case vte::base::UTF8Decoder::REJECT_REWIND:
m_utf8_decoder.reset(); /* Rewind the stream.
/* If a start byte occurred in the middle of a sequence,
* rewind the stream so we try to start a new character
* with it.
* Note that this will never lead to a loop, since in the * Note that this will never lead to a loop, since in the
* next round this byte *will* be consumed. * next round this byte *will* be consumed.
*/ */
if (m_utf8_decoder.is_start_byte(*ip)) --ip;
--ip; /* [[fallthrough]]; */
case vte::base::UTF8Decoder::REJECT:
m_utf8_decoder.reset();
/* Fall through to insert the U+FFFD replacement character. */ /* Fall through to insert the U+FFFD replacement character. */
/* [[fallthrough]]; */ /* [[fallthrough]]; */
case vte::base::UTF8Decoder::ACCEPT: { case vte::base::UTF8Decoder::ACCEPT: {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment