From fcf9db0d5380fb046a3b9ddc1a31683549ed8e72 Mon Sep 17 00:00:00 2001
From: Ykkrosh <philip@wildfiregames.com>
Date: Thu, 16 Jul 2009 15:52:18 +0000
Subject: [PATCH] Decode invalid UTF-8 more gracefully (invalid bytes become
 U+FFFD)

This was SVN commit r6999.
---
 source/ps/CStr.cpp          | 12 ++++++++----
 source/ps/CStr.h            |  3 +--
 source/ps/tests/test_CStr.h | 20 ++++++++++++++------
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/source/ps/CStr.cpp b/source/ps/CStr.cpp
index 036c15e893..bf060aae2c 100644
--- a/source/ps/CStr.cpp
+++ b/source/ps/CStr.cpp
@@ -156,13 +156,17 @@ CStrW CStr8::FromUTF8() const
 		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 		if (source + extraBytesToRead >= sourceEnd)
 		{
-			//debug_warn("Invalid UTF-8 (fell off end)");
-			return L"";
+			// Error - fell off the end of the string
+			result += (wchar_t)0xFFFD;
+			source++;
+			continue;
 		}
 
 		if (! isLegalUTF8(source, extraBytesToRead+1)) {
-			//debug_warn("Invalid UTF-8 (illegal data)");
-			return L"";
+			// Error - illegal data
+			result += (wchar_t)0xFFFD;
+			source++;
+			continue;
 		}
 
 		switch (extraBytesToRead)
diff --git a/source/ps/CStr.h b/source/ps/CStr.h
index fd4de3918c..5238dccfe2 100644
--- a/source/ps/CStr.h
+++ b/source/ps/CStr.h
@@ -161,8 +161,7 @@ public:
 	// Conversion to/from UTF-8, encoded in a CStr8.
 	// Common non-ASCII characters are handled correctly.
 	// Characters outside the BMP (above 0xFFFF) are *not* handled correctly.
-	// FromUTF8 may fail, if converting from invalid UTF-8 data - the empty
-	// string will be returned.
+	// FromUTF8 will silently convert invalid bytes to U+FFFD replacement characters.
 	#ifdef _UNICODE
 		CStr8 ToUTF8() const;
 	#else
diff --git a/source/ps/tests/test_CStr.h b/source/ps/tests/test_CStr.h
index 56a0618106..74a860df46 100644
--- a/source/ps/tests/test_CStr.h
+++ b/source/ps/tests/test_CStr.h
@@ -58,12 +58,20 @@ public:
 
 	void test_invalid_utf8()
 	{
-		const unsigned char chr_utf8_a[] = { 'a', 0xef };
-		const unsigned char chr_utf8_b[] = { 'b', 0xef, 0xbf };
-		const unsigned char chr_utf8_c[] = { 'c', 0xef, 0xbf, 0x01 };
+		struct { const char* utf8; const wchar_t* utf16; } tests[] = { // malformed input paired with the wide string FromUTF8() is expected to return
+			{ "a\xef", L"a\xfffd" }, // 0xEF is the last byte of the input: expected to become a single U+FFFD
+			{ "b\xef\xbf", L"b\xfffd\xfffd" }, // two trailing bytes: each is expected to yield its own U+FFFD
+			{ "c\xef\xbf\x01", L"c\xfffd\xfffd\x0001" }, // 0x01 is expected to come through unchanged after two U+FFFD
+			{ "d\xffX\x80Y\x80" , L"d\xfffdX\xfffdY\xfffd" } // 0xFF / 0x80 between ASCII letters: letters kept, other bytes replaced
+		};
+		for (size_t i = 0; i < ARRAY_SIZE(tests); ++i) // run every table entry through the same checks
+		{
+			CStr8 str_utf8 (tests[i].utf8); // input under test
+			CStrW str_utf16 (tests[i].utf16); // expected result
 
-		TS_ASSERT_WSTR_EQUALS(CStr8((const char*)chr_utf8_a, sizeof(chr_utf8_a)).FromUTF8(), L"");
-		TS_ASSERT_WSTR_EQUALS(CStr8((const char*)chr_utf8_b, sizeof(chr_utf8_b)).FromUTF8(), L"");
-		TS_ASSERT_WSTR_EQUALS(CStr8((const char*)chr_utf8_c, sizeof(chr_utf8_c)).FromUTF8(), L"");
+			CStrW str_utf8to16 = str_utf8.FromUTF8(); // actual decoded result
+			TS_ASSERT_EQUALS(str_utf16.length(), str_utf8to16.length()); // lengths must agree
+			TS_ASSERT_SAME_DATA(str_utf8to16.data(), str_utf16.data(), str_utf16.length()*sizeof(wchar_t)); // compare contents, sized in bytes from the expected string's length
+		}
 	}
 };