1
1
forked from 0ad/0ad

Decode invalid utf-8 more gracefully (invalid bytes become U+FFFD)

This was SVN commit r6999.
This commit is contained in:
Ykkrosh 2009-07-16 15:52:18 +00:00
parent 271823cf7e
commit fcf9db0d53
3 changed files with 23 additions and 12 deletions

View File

@ -156,13 +156,17 @@ CStrW CStr8::FromUTF8() const
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
if (source + extraBytesToRead >= sourceEnd)
{
//debug_warn("Invalid UTF-8 (fell off end)");
return L"";
// Error - fell off the end of the string
result += (wchar_t)0xFFFD;
source++;
continue;
}
if (! isLegalUTF8(source, extraBytesToRead+1)) {
//debug_warn("Invalid UTF-8 (illegal data)");
return L"";
// Error - illegal data
result += (wchar_t)0xFFFD;
source++;
continue;
}
switch (extraBytesToRead)

View File

@ -161,8 +161,7 @@ public:
// Conversion to/from UTF-8, encoded in a CStr8.
// Common non-ASCII characters are handled correctly.
// Characters outside the BMP (above 0xFFFF) are *not* handled correctly.
// FromUTF8 may fail, if converting from invalid UTF-8 data - the empty
// string will be returned.
// FromUTF8 will silently convert invalid bytes to U+FFFD replacement characters.
#ifdef _UNICODE
CStr8 ToUTF8() const;
#else

View File

@ -58,12 +58,20 @@ public:
// Checks how CStr8::FromUTF8 decodes byte sequences that are not valid UTF-8.
// NOTE(review): this block appears to hold two sets of expectations for the
// same inputs - the chr_utf8_* arrays with their TS_ASSERT_WSTR_EQUALS lines
// expect an empty result, while the tests[] table expects U+FFFD replacement
// characters. Both cannot pass at once; confirm which lines are current.
void test_invalid_utf8()
{
// 0xef is the lead byte of a three-byte sequence: (a) and (b) end before the
// sequence is complete, and (c) follows it with 0x01, which is not a
// continuation byte.
const unsigned char chr_utf8_a[] = { 'a', 0xef };
const unsigned char chr_utf8_b[] = { 'b', 0xef, 0xbf };
const unsigned char chr_utf8_c[] = { 'c', 0xef, 0xbf, 0x01 };
// Table of malformed UTF-8 inputs paired with the expected wide-string result:
// every invalid byte maps to one U+FFFD, while the plain ASCII bytes around it
// (including the 0x01 in the third case) are passed through unchanged.
struct { const char* utf8; const wchar_t* utf16; } tests[] = {
{ "a\xef", L"a\xfffd" },
{ "b\xef\xbf", L"b\xfffd\xfffd" },
{ "c\xef\xbf\x01", L"c\xfffd\xfffd\x0001" },
{ "d\xffX\x80Y\x80" , L"d\xfffdX\xfffdY\xfffd" }
};
for (size_t i = 0; i < ARRAY_SIZE(tests); ++i)
{
CStr8 str_utf8 (tests[i].utf8);
CStrW str_utf16 (tests[i].utf16);
// NOTE(review): these three assertions do not depend on i and expect L"" for
// inputs that the table above expects to decode to text containing U+FFFD.
TS_ASSERT_WSTR_EQUALS(CStr8((const char*)chr_utf8_a, sizeof(chr_utf8_a)).FromUTF8(), L"");
TS_ASSERT_WSTR_EQUALS(CStr8((const char*)chr_utf8_b, sizeof(chr_utf8_b)).FromUTF8(), L"");
TS_ASSERT_WSTR_EQUALS(CStr8((const char*)chr_utf8_c, sizeof(chr_utf8_c)).FromUTF8(), L"");
// Decode the table entry, then compare the length first and the raw
// wchar_t contents second against the expected string.
CStrW str_utf8to16 = str_utf8.FromUTF8();
TS_ASSERT_EQUALS(str_utf16.length(), str_utf8to16.length());
TS_ASSERT_SAME_DATA(str_utf8to16.data(), str_utf16.data(), str_utf16.length()*sizeof(wchar_t));
}
}
};