Decode invalid UTF-8 more gracefully (invalid bytes become U+FFFD)

This was SVN commit r6999.
2009-07-16 15:52:18 +00:00 · 2009-07-16 15:52:18 +00:00 · fcf9db0d53
commit fcf9db0d53
parent 271823cf7e
3 changed files with 23 additions and 12 deletions
--- a/source/ps/CStr.cpp
+++ b/source/ps/CStr.cpp
@@ -156,13 +156,17 @@ CStrW CStr8::FromUTF8() const
 		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 		if (source + extraBytesToRead >= sourceEnd)
 		{
-			//debug_warn("Invalid UTF-8 (fell off end)");
-			return L"";
+			// Error - fell off the end of the string
+			result += (wchar_t)0xFFFD;
+			source++;
+			continue;
 		}

 		if (! isLegalUTF8(source, extraBytesToRead+1)) {
-			//debug_warn("Invalid UTF-8 (illegal data)");
-			return L"";
+			// Error - illegal data
+			result += (wchar_t)0xFFFD;
+			source++;
+			continue;
 		}

 		switch (extraBytesToRead)
--- a/source/ps/CStr.h
+++ b/source/ps/CStr.h
@@ -161,8 +161,7 @@ public:
 	// Conversion to/from UTF-8, encoded in a CStr8.
 	// Common non-ASCII characters are handled correctly.
 	// Characters outside the BMP (above 0xFFFF) are *not* handled correctly.
-	// FromUTF8 may fail, if converting from invalid UTF-8 data - the empty
-	// string will be returned.
+	// FromUTF8 will silently convert invalid bytes to U+FFFD replacement characters.
 	#ifdef _UNICODE
 		CStr8 ToUTF8() const;
 	#else
--- a/source/ps/tests/test_CStr.h
+++ b/source/ps/tests/test_CStr.h
@@ -58,12 +58,20 @@ public:

 	void test_invalid_utf8()
 	{
-		const unsigned char chr_utf8_a[] = { 'a', 0xef };
-		const unsigned char chr_utf8_b[] = { 'b', 0xef, 0xbf };
-		const unsigned char chr_utf8_c[] = { 'c', 0xef, 0xbf, 0x01 };
+		struct { const char* utf8; const wchar_t* utf16; } tests[] = {
+			{ "a\xef", L"a\xfffd" },
+			{ "b\xef\xbf", L"b\xfffd\xfffd" },
+			{ "c\xef\xbf\x01", L"c\xfffd\xfffd\x0001" },
+			{ "d\xffX\x80Y\x80" , L"d\xfffdX\xfffdY\xfffd" }
+		};
+		for (size_t i = 0; i < ARRAY_SIZE(tests); ++i)
+		{
+			CStr8 str_utf8 (tests[i].utf8);
+			CStrW str_utf16 (tests[i].utf16);

-		TS_ASSERT_WSTR_EQUALS(CStr8((const char*)chr_utf8_a, sizeof(chr_utf8_a)).FromUTF8(), L"");
-		TS_ASSERT_WSTR_EQUALS(CStr8((const char*)chr_utf8_b, sizeof(chr_utf8_b)).FromUTF8(), L"");
-		TS_ASSERT_WSTR_EQUALS(CStr8((const char*)chr_utf8_c, sizeof(chr_utf8_c)).FromUTF8(), L"");
+			CStrW str_utf8to16 = str_utf8.FromUTF8();
+			TS_ASSERT_EQUALS(str_utf16.length(), str_utf8to16.length());
+			TS_ASSERT_SAME_DATA(str_utf8to16.data(), str_utf16.data(), str_utf16.length()*sizeof(wchar_t));
+		}
 	}
 };