From fcf9db0d5380fb046a3b9ddc1a31683549ed8e72 Mon Sep 17 00:00:00 2001 From: Ykkrosh Date: Thu, 16 Jul 2009 15:52:18 +0000 Subject: [PATCH] Decode invalid utf-8 more gracefully (invalid bytes become U+FFFD) This was SVN commit r6999. --- source/ps/CStr.cpp | 12 ++++++++---- source/ps/CStr.h | 3 +-- source/ps/tests/test_CStr.h | 20 ++++++++++++++------ 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/source/ps/CStr.cpp b/source/ps/CStr.cpp index 036c15e893..bf060aae2c 100644 --- a/source/ps/CStr.cpp +++ b/source/ps/CStr.cpp @@ -156,13 +156,17 @@ CStrW CStr8::FromUTF8() const unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; if (source + extraBytesToRead >= sourceEnd) { - //debug_warn("Invalid UTF-8 (fell off end)"); - return L""; + // Error - fell off the end of the string + result += (wchar_t)0xFFFD; + source++; + continue; } if (! isLegalUTF8(source, extraBytesToRead+1)) { - //debug_warn("Invalid UTF-8 (illegal data)"); - return L""; + // Error - illegal data + result += (wchar_t)0xFFFD; + source++; + continue; } switch (extraBytesToRead) diff --git a/source/ps/CStr.h b/source/ps/CStr.h index fd4de3918c..5238dccfe2 100644 --- a/source/ps/CStr.h +++ b/source/ps/CStr.h @@ -161,8 +161,7 @@ public: // Conversion to/from UTF-8, encoded in a CStr8. // Common non-ASCII characters are handled correctly. // Characters outside the BMP (above 0xFFFF) are *not* handled correctly. - // FromUTF8 may fail, if converting from invalid UTF-8 data - the empty - // string will be returned. + // FromUTF8 will silently convert invalid bytes to U+FFFD replacement characters. 
#ifdef _UNICODE CStr8 ToUTF8() const; #else diff --git a/source/ps/tests/test_CStr.h b/source/ps/tests/test_CStr.h index 56a0618106..74a860df46 100644 --- a/source/ps/tests/test_CStr.h +++ b/source/ps/tests/test_CStr.h @@ -58,12 +58,20 @@ public: void test_invalid_utf8() { - const unsigned char chr_utf8_a[] = { 'a', 0xef }; - const unsigned char chr_utf8_b[] = { 'b', 0xef, 0xbf }; - const unsigned char chr_utf8_c[] = { 'c', 0xef, 0xbf, 0x01 }; + struct { const char* utf8; const wchar_t* utf16; } tests[] = { + { "a\xef", L"a\xfffd" }, + { "b\xef\xbf", L"b\xfffd\xfffd" }, + { "c\xef\xbf\x01", L"c\xfffd\xfffd\x0001" }, + { "d\xffX\x80Y\x80" , L"d\xfffdX\xfffdY\xfffd" } + }; + for (size_t i = 0; i < ARRAY_SIZE(tests); ++i) + { + CStr8 str_utf8 (tests[i].utf8); + CStrW str_utf16 (tests[i].utf16); - TS_ASSERT_WSTR_EQUALS(CStr8((const char*)chr_utf8_a, sizeof(chr_utf8_a)).FromUTF8(), L""); - TS_ASSERT_WSTR_EQUALS(CStr8((const char*)chr_utf8_b, sizeof(chr_utf8_b)).FromUTF8(), L""); - TS_ASSERT_WSTR_EQUALS(CStr8((const char*)chr_utf8_c, sizeof(chr_utf8_c)).FromUTF8(), L""); + CStrW str_utf8to16 = str_utf8.FromUTF8(); + TS_ASSERT_EQUALS(str_utf16.length(), str_utf8to16.length()); + TS_ASSERT_SAME_DATA(str_utf8to16.data(), str_utf16.data(), str_utf16.length()*sizeof(wchar_t)); + } } };