Commit bc2f1b39 authored by Christian Persch's avatar Christian Persch

utf8: Don't swallow start bytes in the middle of a sequence

https://gitlab.gnome.org/GNOME/vte/issues/30
parent 57c3a079
......@@ -189,7 +189,7 @@ vteresources.cc: vte.gresource.xml Makefile $(shell $(GLIB_COMPILE_RESOURCES) --
# Misc unit tests and utilities
noinst_PROGRAMS += parser-cat slowcat test-modes test-tabstops test-parser test-refptr
noinst_PROGRAMS += parser-cat slowcat test-modes test-tabstops test-parser test-refptr test-utf8
noinst_SCRIPTS = decset osc window
EXTRA_DIST += $(noinst_SCRIPTS)
......@@ -213,6 +213,7 @@ TESTS = \
test-parser \
test-refptr \
test-tabstops \
test-utf8 \
reaper \
test-vtetypes \
vtestream-file \
......@@ -365,6 +366,22 @@ test_refptr_LDADD = \
$(GOBJECT_LIBS) \
$(NULL)
test_utf8_SOURCES = \
utf8-test.cc \
utf8.cc \
utf8.hh \
$(NULL)
test_utf8_CPPFLAGS = \
-I$(builddir) \
-I$(srcdir) \
$(AM_CPPFLAGS)
test_utf8_CXXFLAGS = \
$(GLIB_CFLAGS) \
$(AM_CXXFLAGS)
test_utf8_LDADD = \
$(GLIB_LIBS) \
$(NULL)
test_vtetypes_SOURCES = \
vtetypes.cc \
vtetypes.hh \
......
......@@ -349,6 +349,17 @@ process_file_utf8(int fd,
switch (decoder.decode(*sptr)) {
case vte::base::UTF8Decoder::REJECT:
decoder.reset();
/* If a start byte occurred in the middle of a sequence,
* rewind the stream so we try to start a new character
* with it.
* Note that this will never lead to a loop, since in the
* next round this byte *will* be consumed.
*/
if (decoder.is_start_byte(*sptr))
--sptr;
/* Fall through to insert the U+FFFD replacement character. */
/* [[fallthrough]]; */
case vte::base::UTF8Decoder::ACCEPT: {
auto ret = vte_parser_feed(&parser, decoder.codepoint());
......
/*
* Copyright © 2018 Christian Persch
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
#include "config.h"
#include "utf8.hh"
#include <cstring>
#include <string>
#include <glib.h>
using namespace std::literals;
using namespace vte::base;
/* Single decoder instance shared by all tests in this file.
 * Each user calls decoder.reset() before feeding it any bytes.
 */
UTF8Decoder decoder{};
/* Round-trip check: every code point below 0x110000, except the
 * surrogate range, is encoded with g_unichar_to_utf8() and fed to the
 * decoder byte by byte. After the last byte the decoder must be in
 * the ACCEPT state and report the original code point.
 */
static void
test_utf8_decoder_decode(void)
{
        decoder.reset();

        uint8_t buf[7];
        uint32_t state = UTF8Decoder::ACCEPT;

        for (uint32_t cp = 0; cp < 0x110000u; ++cp) {
                /* U+D800..U+DFFF are surrogates; skip them. */
                if (cp >= 0xd800u && cp <= 0xdfffu)
                        continue;

                int const len = g_unichar_to_utf8(cp, reinterpret_cast<char*>(buf));

                /* Feed the encoded bytes in order; only the final
                 * state is checked below.
                 */
                for (uint8_t const* p = buf; p < buf + len; ++p)
                        state = decoder.decode(*p);

                g_assert_cmpint(state, ==, UTF8Decoder::ACCEPT);
                g_assert_cmpuint(decoder.codepoint(), ==, cp);
        }
}
/* Reference predicate used to cross-check the decoder: returns true
 * for values below 0x80 and for values in the range 0xC2..0xF4
 * inclusive, false for everything else.
 */
static constexpr bool
is_utf8_start_byte(uint32_t c)
{
        return c <= 0x7fu || (0xc2u <= c && c <= 0xf4u);
}
/* Checks that the decoder's is_start_byte() agrees with the local
 * reference predicate is_utf8_start_byte() for every value 0x00..0xFF.
 */
static void
test_utf8_decoder_start(void)
{
        decoder.reset();

        uint32_t c = 0;
        while (c <= 0xffu) {
                g_assert_cmpint(decoder.is_start_byte(c), ==, is_utf8_start_byte(c));
                ++c;
        }
}
/* Decodes @len bytes starting at @in with the shared decoder and
 * appends the resulting code points to @out.
 *
 * Mirrors the replacement strategy exercised by this commit: a
 * rejected byte yields whatever decoder.codepoint() reports after a
 * reset(), and an input that ends in the middle of a sequence yields
 * one trailing U+FFFD.
 */
static void
decode(uint8_t const* in,
       size_t len,
       std::u32string& out)
{
        decoder.reset();

        uint8_t const* const iend = in + len;
        uint32_t state = UTF8Decoder::ACCEPT;

        uint8_t const* iptr = in;
        while (iptr < iend) {
                state = decoder.decode(*iptr);

                if (state == vte::base::UTF8Decoder::REJECT) {
                        decoder.reset();
                        state = UTF8Decoder::ACCEPT;

                        /* A start byte that showed up inside an
                         * unfinished sequence must not be swallowed:
                         * step back so the next iteration decodes it
                         * again as the beginning of a new character.
                         * This cannot loop forever, because on the
                         * second attempt the byte is consumed.
                         */
                        if (decoder.is_start_byte(*iptr))
                                --iptr;

                        /* Emit the code point the decoder reports
                         * after the reset, in place of the bad input.
                         */
                        out.push_back(decoder.codepoint());
                } else if (state == vte::base::UTF8Decoder::ACCEPT) {
                        out.push_back(decoder.codepoint());
                }
                /* Any other state: still inside a multi-byte
                 * sequence, nothing to emit yet.
                 */

                ++iptr;
        }

        /* Input ended while a sequence was still pending; the
         * truncated sequence is represented by one replacement
         * character.
         */
        if (state != UTF8Decoder::ACCEPT)
                out.push_back(0xfffdu);
}
/* Asserts that two UTF-32 strings are equal.
 * The sizes are compared first, with g_assert_cmpuint(), and the
 * contents are compared afterwards with g_assert_true().
 */
static void
assert_u32streq(std::u32string const& str1,
std::u32string const& str2)
{
g_assert_cmpuint(str1.size(), ==, str2.size());
g_assert_true(str1 == str2);
}
/* Decodes @in with decode() and asserts that the result equals
 * @expected.
 *
 * @len is the number of bytes to decode, or -1 to use strlen(@in);
 * pass an explicit length when the input contains embedded NUL bytes.
 */
static void
assert_decode(char const* in,
              ssize_t len,
              std::u32string const& expected)
{
        size_t const nbytes = (len == -1) ? strlen(in) : size_t(len);
        auto const* const bytes = reinterpret_cast<uint8_t const*>(in);

        std::u32string converted;
        decode(bytes, nbytes, converted);

        assert_u32streq(converted, expected);
}
static void
test_utf8_decoder_replacement(void)
{
/* The following test vectors are copied from rust encoding_rs/src/utf8.rs:
*
* Copyright 2015-2016 Mozilla Foundation
*
* Permission is hereby granted, free of charge, to any
* person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the
* Software without restriction, including without
* limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software
* is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice
* shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
* ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
* TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
* PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
* SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
* IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
// Empty
assert_decode("", -1, U""s);
// NUL
assert_decode("\0", 1, U"\0"s);
// ASCII
assert_decode("ab", -1, U"ab"s);
// Low BMP
assert_decode("a\xC3\xA4Z", -1, U"a\u00E4Z"s);
// High BMP
assert_decode("a\xE2\x98\x83Z", -1, U"a\u2603Z"s);
// Astral
assert_decode("a\xF0\x9F\x92\xa9Z", -1, U"a\U0001F4A9Z"s);
// Low BMP with last byte missing
assert_decode("a\xC3Z", -1, U"a\uFFFDZ"s);
assert_decode("a\xC3", -1, U"a\uFFFD"s);
// High BMP with last byte missing
assert_decode("a\xE2\x98Z", -1, U"a\uFFFDZ"s);
assert_decode("a\xE2\x98", -1, U"a\uFFFD"s);
// Astral with last byte missing
assert_decode("a\xF0\x9F\x92Z", -1, U"a\uFFFDZ"s);
assert_decode("a\xF0\x9F\x92", -1, U"a\uFFFD"s);
// Lone highest continuation
assert_decode("a\xBFZ", -1, U"a\uFFFDZ"s);
assert_decode("a\xBF", -1, U"a\uFFFD"s);
// Two lone highest continuations
assert_decode("a\xBF\xBFZ", -1, U"a\uFFFD\uFFFDZ"s);
assert_decode("a\xBF\xBF", -1, U"a\uFFFD\uFFFD"s);
// Low BMP followed by lowest lone continuation
assert_decode("a\xC3\xA4\x80Z", -1, U"a\u00E4\uFFFDZ"s);
assert_decode("a\xC3\xA4\x80", -1, U"a\u00E4\uFFFD"s);
// Low BMP followed by highest lone continuation
assert_decode("a\xC3\xA4\xBFZ", -1, U"a\u00E4\uFFFDZ"s);
assert_decode("a\xC3\xA4\xBF", -1, U"a\u00E4\uFFFD"s);
// High BMP followed by lowest lone continuation
assert_decode("a\xE2\x98\x83\x80Z", -1, U"a\u2603\uFFFDZ"s);
assert_decode("a\xE2\x98\x83\x80", -1, U"a\u2603\uFFFD"s);
// High BMP followed by highest lone continuation
assert_decode("a\xE2\x98\x83\xBFZ", -1, U"a\u2603\uFFFDZ"s);
assert_decode("a\xE2\x98\x83\xBF", -1, U"a\u2603\uFFFD"s);
// Astral followed by lowest lone continuation
assert_decode("a\xF0\x9F\x92\xA9\x80Z", -1, U"a\U0001F4A9\uFFFDZ"s);
assert_decode("a\xF0\x9F\x92\xA9\x80", -1, U"a\U0001F4A9\uFFFD"s);
// Astral followed by highest lone continuation
assert_decode("a\xF0\x9F\x92\xA9\xBFZ", -1, U"a\U0001F4A9\uFFFDZ"s);
assert_decode("a\xF0\x9F\x92\xA9\xBF", -1, U"a\U0001F4A9\uFFFD"s);
// Boundary conditions
// Lowest single-byte
assert_decode("Z\x00", 2, U"Z\0"s);
assert_decode("Z\x00Z", 3, U"Z\0Z"s);
// Lowest single-byte as two-byte overlong sequence
assert_decode("a\xC0\x80", -1, U"a\uFFFD\uFFFD"s);
assert_decode("a\xC0\x80Z", -1, U"a\uFFFD\uFFFDZ"s);
// Lowest single-byte as three-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xE0\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xE0\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
// Lowest single-byte as four-byte overlong sequence
assert_decode("a\xF0\x80\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x80\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// One below lowest single-byte
assert_decode("a\xFF", -1, U"a\uFFFD"s);
assert_decode("a\xFFZ", -1, U"a\uFFFDZ"s);
// Highest single-byte
assert_decode("a\x7F", -1, U"a\u007F"s);
assert_decode("a\x7FZ", -1, U"a\u007FZ"s);
// Highest single-byte as two-byte overlong sequence
assert_decode("a\xC1\xBF", -1, U"a\uFFFD\uFFFD"s);
assert_decode("a\xC1\xBFZ", -1, U"a\uFFFD\uFFFDZ"s);
// Highest single-byte as three-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xE0\x81\xBF", -1, U"a\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xE0\x81\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
// Highest single-byte as four-byte overlong sequence
assert_decode("a\xF0\x80\x81\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x80\x81\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// One past highest single byte (also lone continuation)
assert_decode("a\x80Z", -1, U"a\uFFFDZ"s);
assert_decode("a\x80", -1, U"a\uFFFD"s);
// Two lone continuations
assert_decode("a\x80\x80Z", -1, U"a\uFFFD\uFFFDZ"s);
assert_decode("a\x80\x80", -1, U"a\uFFFD\uFFFD"s);
// Three lone continuations
assert_decode("a\x80\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
assert_decode("a\x80\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD"s);
// Four lone continuations
assert_decode("a\x80\x80\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
assert_decode("a\x80\x80\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
// Lowest two-byte
assert_decode("a\xC2\x80", -1, U"a\u0080"s);
assert_decode("a\xC2\x80Z", -1, U"a\u0080Z"s);
// Lowest two-byte as three-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xE0\x82\x80", -1, U"a\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xE0\x82\x80Z", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
// Lowest two-byte as four-byte overlong sequence
assert_decode("a\xF0\x80\x82\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x80\x82\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Lead one below lowest two-byte
assert_decode("a\xC1\x80", -1, U"a\uFFFD\uFFFD"s);
assert_decode("a\xC1\x80Z", -1, U"a\uFFFD\uFFFDZ"s);
// Trail one below lowest two-byte
assert_decode("a\xC2\x7F", -1, U"a\uFFFD\u007F"s);
assert_decode("a\xC2\x7FZ", -1, U"a\uFFFD\u007FZ"s);
// Highest two-byte
assert_decode("a\xDF\xBF", -1, U"a\u07FF"s);
assert_decode("a\xDF\xBFZ", -1, U"a\u07FFZ"s);
// Highest two-byte as three-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xE0\x9F\xBF", -1, U"a\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xE0\x9F\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
// Highest two-byte as four-byte overlong sequence
assert_decode("a\xF0\x80\x9F\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x80\x9F\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Lowest three-byte
assert_decode("a\xE0\xA0\x80", -1, U"a\u0800"s);
assert_decode("a\xE0\xA0\x80Z", -1, U"a\u0800Z"s);
// Lowest three-byte as four-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xF0\x80\xA0\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x80\xA0\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Highest below surrogates
assert_decode("a\xED\x9F\xBF", -1, U"a\uD7FF"s);
assert_decode("a\xED\x9F\xBFZ", -1, U"a\uD7FFZ"s);
// Highest below surrogates as four-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xF0\x8D\x9F\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x8D\x9F\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
// First surrogate
assert_decode("a\xED\xA0\x80", -1, U"a\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xED\xA0\x80Z", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
// First surrogate as four-byte overlong sequence
assert_decode("a\xF0\x8D\xA0\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x8D\xA0\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
// Last surrogate
assert_decode("a\xED\xBF\xBF", -1, U"a\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xED\xBF\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFDZ"s);
// Last surrogate as four-byte overlong sequence
assert_decode("a\xF0\x8D\xBF\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x8D\xBF\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Lowest above surrogates
assert_decode("a\xEE\x80\x80", -1, U"a\uE000"s);
assert_decode("a\xEE\x80\x80Z", -1, U"a\uE000Z"s);
// Lowest above surrogates as four-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xF0\x8E\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x8E\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Highest three-byte
assert_decode("a\xEF\xBF\xBF", -1, U"a\uFFFF"s);
assert_decode("a\xEF\xBF\xBFZ", -1, U"a\uFFFFZ"s);
// Highest three-byte as four-byte overlong sequence
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xF0\x8F\xBF\xBF", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF0\x8F\xBF\xBFZ", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
// Lowest four-byte
assert_decode("a\xF0\x90\x80\x80", -1, U"a\u10000"s);
assert_decode("a\xF0\x90\x80\x80Z", -1, U"a\u10000Z"s);
// Highest four-byte
assert_decode("a\xF4\x8F\xBF\xBF", -1, U"a\u10FFFF"s);
assert_decode("a\xF4\x8F\xBF\xBFZ", -1, U"a\u10FFFFZ"s);
// One past highest four-byte
assert_decode("a\xF4\x90\x80\x80", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFD"s);
assert_decode("a\xF4\x90\x80\x80Z", -1, U"a\uFFFD\uFFFD\uFFFD\uFFFDZ"s);
#endif
// Highest four-byte with last byte replaced with 0xFF
#ifdef INCLUDE_KNOWN_FAIL
assert_decode("a\xF4\x8F\xBF\xFF", -1, U"a\uFFFD\uFFFD"s);
assert_decode("a\xF4\x8F\xBF\xFFZ", -1, U"a\uFFFD\uFFFDZ"s);
#endif
}
/* Test program entry point: initialises the GLib test framework,
 * registers the three UTF-8 decoder tests under /vte/utf8/decoder/,
 * and returns the result of g_test_run().
 */
int
main(int argc,
     char* argv[])
{
        g_test_init(&argc, &argv, nullptr);

        /* Registered in table order. */
        struct {
                char const* path;
                void (*func)(void);
        } const tests[] = {
                { "/vte/utf8/decoder/decode", test_utf8_decoder_decode },
                { "/vte/utf8/decoder/start", test_utf8_decoder_start },
                { "/vte/utf8/decoder/replacement", test_utf8_decoder_replacement },
        };

        for (auto const& t : tests)
                g_test_add_func(t.path, t.func);

        return g_test_run();
}
......@@ -66,6 +66,10 @@ public:
m_codepoint = 0xfffdU;
}
/* Returns true if feeding @byte to a decoder in its start state
 * would not lead to REJECT, i.e. @byte can begin a new character.
 * Does not touch the decoder's current state.
 * NOTE(review): @byte is used directly as a table index; callers are
 * presumably expected to pass values 0..255 only -- confirm.
 */
inline bool is_start_byte(uint32_t byte) const noexcept {
        /* First lookup: table entry for this byte value. */
        auto const entry = kTable[byte];
        /* Second lookup: transition out of the start state (offset 0). */
        auto const next = kTable[256 + 0 /* start state */ + entry];
        return next != REJECT;
}
private:
uint32_t m_state{ACCEPT};
uint32_t m_codepoint{0};
......
......@@ -3590,6 +3590,17 @@ Terminal::process_incoming()
switch (m_utf8_decoder.decode(*ip)) {
case vte::base::UTF8Decoder::REJECT:
m_utf8_decoder.reset();
/* If a start byte occurred in the middle of a sequence,
* rewind the stream so we try to start a new character
* with it.
* Note that this will never lead to a loop, since in the
* next round this byte *will* be consumed.
*/
if (m_utf8_decoder.is_start_byte(*ip))
--ip;
/* Fall through to insert the U+FFFD replacement character. */
/* [[fallthrough]]; */
case vte::base::UTF8Decoder::ACCEPT: {
auto rv = m_parser.feed(m_utf8_decoder.codepoint());
......@@ -3741,6 +3752,7 @@ Terminal::process_incoming()
break;
}
}
break;
}
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment