Decode invalid UTF-8 more gracefully (invalid bytes become U+FFFD)
This was SVN commit r6999.
This commit is contained in:
parent
271823cf7e
commit
fcf9db0d53
@ -156,13 +156,17 @@ CStrW CStr8::FromUTF8() const
|
||||
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
|
||||
if (source + extraBytesToRead >= sourceEnd)
|
||||
{
|
||||
//debug_warn("Invalid UTF-8 (fell off end)");
|
||||
return L"";
|
||||
// Error - fell off the end of the string
|
||||
result += (wchar_t)0xFFFD;
|
||||
source++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (! isLegalUTF8(source, extraBytesToRead+1)) {
|
||||
//debug_warn("Invalid UTF-8 (illegal data)");
|
||||
return L"";
|
||||
// Error - illegal data
|
||||
result += (wchar_t)0xFFFD;
|
||||
source++;
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (extraBytesToRead)
|
||||
|
@ -161,8 +161,7 @@ public:
|
||||
// Conversion to/from UTF-8, encoded in a CStr8.
|
||||
// Common non-ASCII characters are handled correctly.
|
||||
// Characters outside the BMP (above 0xFFFF) are *not* handled correctly.
|
||||
// FromUTF8 may fail, if converting from invalid UTF-8 data - the empty
|
||||
// string will be returned.
|
||||
// FromUTF8 will silently convert invalid bytes to U+FFFD replacement characters.
|
||||
#ifdef _UNICODE
|
||||
CStr8 ToUTF8() const;
|
||||
#else
|
||||
|
@ -58,12 +58,20 @@ public:
|
||||
|
||||
void test_invalid_utf8()
|
||||
{
|
||||
const unsigned char chr_utf8_a[] = { 'a', 0xef };
|
||||
const unsigned char chr_utf8_b[] = { 'b', 0xef, 0xbf };
|
||||
const unsigned char chr_utf8_c[] = { 'c', 0xef, 0xbf, 0x01 };
|
||||
struct { const char* utf8; const wchar_t* utf16; } tests[] = {
|
||||
{ "a\xef", L"a\xfffd" },
|
||||
{ "b\xef\xbf", L"b\xfffd\xfffd" },
|
||||
{ "c\xef\xbf\x01", L"c\xfffd\xfffd\x0001" },
|
||||
{ "d\xffX\x80Y\x80" , L"d\xfffdX\xfffdY\xfffd" }
|
||||
};
|
||||
for (size_t i = 0; i < ARRAY_SIZE(tests); ++i)
|
||||
{
|
||||
CStr8 str_utf8 (tests[i].utf8);
|
||||
CStrW str_utf16 (tests[i].utf16);
|
||||
|
||||
TS_ASSERT_WSTR_EQUALS(CStr8((const char*)chr_utf8_a, sizeof(chr_utf8_a)).FromUTF8(), L"");
|
||||
TS_ASSERT_WSTR_EQUALS(CStr8((const char*)chr_utf8_b, sizeof(chr_utf8_b)).FromUTF8(), L"");
|
||||
TS_ASSERT_WSTR_EQUALS(CStr8((const char*)chr_utf8_c, sizeof(chr_utf8_c)).FromUTF8(), L"");
|
||||
CStrW str_utf8to16 = str_utf8.FromUTF8();
|
||||
TS_ASSERT_EQUALS(str_utf16.length(), str_utf8to16.length());
|
||||
TS_ASSERT_SAME_DATA(str_utf8to16.data(), str_utf16.data(), str_utf16.length()*sizeof(wchar_t));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user