Commit b8b1aa4e authored by Christian Persch's avatar Christian Persch

utf8: Make decoder conform to recommendation on replacement characters

With this change, the decoder conforms to the W3 Encoding TR and
the Unicode recommendation on inserting replacement characters
from §3.9 of the Unicode core spec.

https://gitlab.gnome.org/GNOME/vte/issues/30
parent 9bda7de4
......@@ -347,18 +347,15 @@ process_file_utf8(int fd,
for (auto sptr = buf; sptr < bufend; ++sptr) {
switch (decoder.decode(*sptr)) {
case vte::base::UTF8Decoder::REJECT:
decoder.reset();
/* If a start byte occurred in the middle of a sequence,
* rewind the stream so we try to start a new character
* with it.
case vte::base::UTF8Decoder::REJECT_REWIND:
/* Rewind the stream.
* Note that this will never lead to a loop, since in the
* next round this byte *will* be consumed.
*/
if (decoder.is_start_byte(*sptr))
--sptr;
--sptr;
/* [[fallthrough]]; */
case vte::base::UTF8Decoder::REJECT:
decoder.reset();
/* Fall through to insert the U+FFFD replacement character. */
/* [[fallthrough]]; */
case vte::base::UTF8Decoder::ACCEPT: {
......
......@@ -48,21 +48,6 @@ test_utf8_decoder_decode(void)
}
}
static constexpr bool
is_utf8_start_byte(uint32_t c)
{
return (c < 0x80u || (c >= 0xc2u && c <= 0xf4u));
}
static void
test_utf8_decoder_start(void)
{
decoder.reset();
for (uint32_t c = 0; c < 0x100u; ++c) {
g_assert_cmpint(decoder.is_start_byte(c), ==, is_utf8_start_byte(c));
}
}
static void
decode(uint8_t const* in,
size_t len,
......@@ -74,19 +59,15 @@ decode(uint8_t const* in,
uint32_t state = UTF8Decoder::ACCEPT;
for (auto iptr = in; iptr < iend; ++iptr) {
switch ((state = decoder.decode(*iptr))) {
case vte::base::UTF8Decoder::REJECT_REWIND:
/* Note that this will never lead to a loop, since in the
* next round this byte *will* be consumed.
*/
--iptr;
// [[fallthrough]]; */
case vte::base::UTF8Decoder::REJECT:
decoder.reset();
state = UTF8Decoder::ACCEPT;
/* If a start byte occurred in the middle of a sequence,
* rewind the stream so we try to start a new character
* with it.
* Note that this will never lead to a loop, since in the
* next round this byte *will* be consumed.
*/
if (decoder.is_start_byte(*iptr))
--iptr;
/* Fall through to insert the U+FFFD replacement character. */
/* [[fallthrough]]; */
case vte::base::UTF8Decoder::ACCEPT:
......@@ -210,13 +191,11 @@ test_utf8_decoder_replacement(void)
assert_decode("a\xC0\x80", -1, U"a\uFFFD\uFFFD"s);
assert_decode("a\xC0\x80Z", -1, U"a\uFFFD\uFFFDZ"s);
// Lowest single-byte as three-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xE0\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xE0\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
// Lowest single-byte as four-byte overlong sequence
assert_decode("a\xF0\x80\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x80\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// One below lowest single-byte
assert_decode("a\xFF", -1, U"a\uFFFD"s);
assert_decode("a\xFFZ", -1, U"a\uFFFDZ"s);
......@@ -227,13 +206,11 @@ test_utf8_decoder_replacement(void)
assert_decode("a\xC1\xBF", -1, U"a\uFFFD\uFFFD"s);
assert_decode("a\xC1\xBFZ", -1, U"a\uFFFD\uFFFDZ"s);
// Highest single-byte as three-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xE0\x81\xBF", -1, U"a\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xE0\x81\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
// Highest single-byte as four-byte overlong sequence
assert_decode("a\xF0\x80\x81\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x80\x81\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// One past highest single byte (also lone continuation)
assert_decode("a\x80Z", -1, U"a\uFFFDZ"s);
assert_decode("a\x80", -1, U"a\uFFFD"s);
......@@ -250,13 +227,11 @@ test_utf8_decoder_replacement(void)
assert_decode("a\xC2\x80", -1, U"a\u0080"s);
assert_decode("a\xC2\x80Z", -1, U"a\u0080Z"s);
// Lowest two-byte as three-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xE0\x82\x80", -1, U"a\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xE0\x82\x80Z", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
// Lowest two-byte as four-byte overlong sequence
assert_decode("a\xF0\x80\x82\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x80\x82\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Lead one below lowest two-byte
assert_decode("a\xC1\x80", -1, U"a\uFFFD\uFFFD"s);
assert_decode("a\xC1\x80Z", -1, U"a\uFFFD\uFFFDZ"s);
......@@ -267,26 +242,21 @@ test_utf8_decoder_replacement(void)
assert_decode("a\xDF\xBF", -1, U"a\u07FF"s);
assert_decode("a\xDF\xBFZ", -1, U"a\u07FFZ"s);
// Highest two-byte as three-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xE0\x9F\xBF", -1, U"a\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xE0\x9F\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
// Highest two-byte as four-byte overlong sequence
assert_decode("a\xF0\x80\x9F\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x80\x9F\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Lowest three-byte
assert_decode("a\xE0\xA0\x80", -1, U"a\u0800"s);
assert_decode("a\xE0\xA0\x80Z", -1, U"a\u0800Z"s);
// Lowest three-byte as four-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xF0\x80\xA0\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x80\xA0\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Highest below surrogates
assert_decode("a\xED\x9F\xBF", -1, U"a\uD7FF"s);
assert_decode("a\xED\x9F\xBFZ", -1, U"a\uD7FFZ"s);
// Highest below surrogates as four-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xF0\x8D\x9F\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x8D\x9F\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
// First surrogate
......@@ -301,38 +271,31 @@ test_utf8_decoder_replacement(void)
// Last surrogate as four-byte overlong sequence
assert_decode("a\xF0\x8D\xBF\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x8D\xBF\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Lowest above surrogates
assert_decode("a\xEE\x80\x80", -1, U"a\uE000"s);
assert_decode("a\xEE\x80\x80Z", -1, U"a\uE000Z"s);
// Lowest above surrogates as four-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xF0\x8E\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x8E\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Highest three-byte
assert_decode("a\xEF\xBF\xBF", -1, U"a\uFFFF"s);
assert_decode("a\xEF\xBF\xBFZ", -1, U"a\uFFFFZ"s);
// Highest three-byte as four-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xF0\x8F\xBF\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x8F\xBF\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
// Lowest four-byte
assert_decode("a\xF0\x90\x80\x80", -1, U"a\u10000"s);
assert_decode("a\xF0\x90\x80\x80Z", -1, U"a\u10000Z"s);
assert_decode("a\xF0\x90\x80\x80", -1, U"a\U00010000"s);
assert_decode("a\xF0\x90\x80\x80Z", -1, U"a\U00010000Z"s);
// Highest four-byte
assert_decode("a\xF4\x8F\xBF\xBF", -1, U"a\u10FFFF"s);
assert_decode("a\xF4\x8F\xBF\xBFZ", -1, U"a\u10FFFFZ"s);
assert_decode("a\xF4\x8F\xBF\xBF", -1, U"a\U0010FFFF"s);
assert_decode("a\xF4\x8F\xBF\xBFZ", -1, U"a\U0010FFFFZ"s);
// One past highest four-byte
assert_decode("a\xF4\x90\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF4\x90\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Highest four-byte with last byte replaced with 0xFF
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xF4\x8F\xBF\xFF", -1, U"a\uFFFD\uFFFD"s);
assert_decode("a\xF4\x8F\xBF\xFFZ", -1, U"a\uFFFD\uFFFDZ"s);
#endif
}
int
......@@ -342,7 +305,6 @@ main(int argc,
g_test_init(&argc, &argv, nullptr);
g_test_add_func("/vte/utf8/decoder/decode", test_utf8_decoder_decode);
g_test_add_func("/vte/utf8/decoder/start", test_utf8_decoder_start);
g_test_add_func("/vte/utf8/decoder/replacement", test_utf8_decoder_replacement);
return g_test_run();
......
......@@ -26,6 +26,7 @@
#include "utf8.hh"
#define RJ vte::base::UTF8Decoder::REJECT
#define RW vte::base::UTF8Decoder::REJECT_REWIND
uint8_t const vte::base::UTF8Decoder::kTable[] = {
// The first part of the table maps bytes to character classes that
......@@ -40,7 +41,7 @@ uint8_t const vte::base::UTF8Decoder::kTable[] = {
// 0xe0: 10
// 0xe1..0xec: 3
// 0xed: 4
// 0xee..0xff: 3
// 0xee..0xef: 3
// 0xf0: 11
// 0xf1..0xf3: 6
// 0xf4: 5
......@@ -64,6 +65,35 @@ uint8_t const vte::base::UTF8Decoder::kTable[] = {
// To understand this DFA, see transitions graph on the website
// linked above.
//
// The following translates the states of the DFA to the
// algorithm of the UTF-8 decoder from the W3 Encodings spec
// [https://www.w3.org/TR/encoding/#utf-8]:
//
// DFA │ bytes bytes lower upper
// state │ seen needed bound bound
// ──────┼─────────────────────────────────
// 0 │ 0 0 0x80 0xbf
// 12 │
// 24 │ 1,2,3 1 0x80 0xbf
// 36 │ 1,2 2 0x80 0xbf
// 48 │ 1 2 0xa0 0xbf
// 60 │ 1 2 0x80 0x9f
// 72 │ 1 3 0x90 0xbf
// 84 │ 1 3 0x80 0xbf
// 96 │ 1 3 0x80 0x8f
// 108 │
//
// If an unexpected byte is read in a non-ACCEPT/REJECT* state,
// transition to REJECT_REWIND so that the decoder will read that
// byte again after being reset; this makes the decoder conform
// to the Unicode recommendation for insering replacement
// characters, and to the W3 Encoding TR spec.
//
// If an unexpected byte is read in the ACCEPT or a REJECT* state,
// transition to REJECT; that byte must not be read again, since
// that would lead to an infinite loop.
//
// For each state (row), the table records which state will
// be transitioned to when consuming a character of the class
// (column).
......@@ -72,11 +102,12 @@ uint8_t const vte::base::UTF8Decoder::kTable[] = {
*/
0, RJ, 24, 36, 60, 96, 84, RJ, RJ, RJ, 48, 72, // state 0 (accept)
RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, // state 12 (reject)
RJ, 0, RJ, RJ, RJ, RJ, RJ, 0, RJ, 0, RJ, RJ, // state 24
RJ, 24, RJ, RJ, RJ, RJ, RJ, 24, RJ, 24, RJ, RJ, // state 36
RJ, RJ, RJ, RJ, RJ, RJ, RJ, 24, RJ, RJ, RJ, RJ, // state 48
RJ, 24, RJ, RJ, RJ, RJ, RJ, RJ, RJ, 24, RJ, RJ, // state 60
RJ, RJ, RJ, RJ, RJ, RJ, RJ, 36, RJ, 36, RJ, RJ, // state 72
RJ, 36, RJ, RJ, RJ, RJ, RJ, 36, RJ, 36, RJ, RJ, // state 84
RJ, 36, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, // state 96
RW, 0, RW, RW, RW, RW, RW, 0, RW, 0, RW, RW, // state 24
RW, 24, RW, RW, RW, RW, RW, 24, RW, 24, RW, RW, // state 36
RW, RW, RW, RW, RW, RW, RW, 24, RW, RW, RW, RW, // state 48
RW, 24, RW, RW, RW, RW, RW, RW, RW, 24, RW, RW, // state 60
RW, RW, RW, RW, RW, RW, RW, 36, RW, 36, RW, RW, // state 72
RW, 36, RW, RW, RW, RW, RW, 36, RW, 36, RW, RW, // state 84
RW, 36, RW, RW, RW, RW, RW, RW, RW, RW, RW, RW, // state 96
RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, RJ, // state 108 (reject-rewind)
};
......@@ -38,7 +38,8 @@ class UTF8Decoder {
public:
enum {
ACCEPT = 0,
REJECT = 12
REJECT = 12,
REJECT_REWIND = 108
};
UTF8Decoder() noexcept = default;
......@@ -66,10 +67,6 @@ public:
m_codepoint = 0xfffdU;
}
inline bool is_start_byte(uint32_t byte) const noexcept {
return kTable[256 + 0 /* start state */ + kTable[byte]] != REJECT;
}
private:
uint32_t m_state{ACCEPT};
uint32_t m_codepoint{0};
......
......@@ -3588,18 +3588,15 @@ Terminal::process_incoming()
for ( ; ip < iend; ++ip) {
switch (m_utf8_decoder.decode(*ip)) {
case vte::base::UTF8Decoder::REJECT:
m_utf8_decoder.reset();
/* If a start byte occurred in the middle of a sequence,
* rewind the stream so we try to start a new character
* with it.
case vte::base::UTF8Decoder::REJECT_REWIND:
/* Rewind the stream.
* Note that this will never lead to a loop, since in the
* next round this byte *will* be consumed.
*/
if (m_utf8_decoder.is_start_byte(*ip))
--ip;
--ip;
/* [[fallthrough]]; */
case vte::base::UTF8Decoder::REJECT:
m_utf8_decoder.reset();
/* Fall through to insert the U+FFFD replacement character. */
/* [[fallthrough]]; */
case vte::base::UTF8Decoder::ACCEPT: {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment