use lib/wchar.h for UTF8 conversion (avoid duplication, more aware of wchar_t differences and surrogate pairs)

Fixes #400. This was SVN commit r7201.
2009-11-16 20:05:03 +00:00 · 2009-11-16 20:05:03 +00:00 · 781538313c
commit 781538313c
parent b51a0187bf
1 changed files with 3 additions and 125 deletions
--- a/source/ps/CStr.cpp
+++ b/source/ps/CStr.cpp
@ -28,6 +28,7 @@
 #include "lib/posix/posix_sock.h" // htons, ntohs
 #include "lib/fnv_hash.h"
 #include "lib/wchar.h"
 #include "network/Serialization.h"
 #include <cassert>
@ -42,31 +43,6 @@
 // Converting constructors between the narrow and wide string classes.
 // Both copy the source element by element through the std string iterator-range
 // constructor; each character is implicitly converted to the destination
 // character type and no UTF-8 encoding or decoding takes place here.
 CStrW::CStrW(const CStr8 &asciStr) : std::wstring(asciStr.begin(), asciStr.end()) {}
 CStr8::CStr8(const CStrW& wideStr) : std:: string(wideStr.begin(), wideStr.end()) {}
 // UTF conversion code adapted from http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
 /**
 * Used by ToUTF8
 *
 * Marker bits that ToUTF8 ORs into the first byte of an encoded character.
 * The table is indexed by the total number of bytes in the encoding;
 * ToUTF8 only ever uses indices 1 to 4.
 **/
 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 /**
 * Used by FromUTF8
 *
 * Number of bytes that follow the first byte of a sequence, indexed by the
 * value of that first byte: 0 for 0x00-0xBF, 1 for 0xC0-0xDF, 2 for 0xE0-0xEF,
 * 3 for 0xF0-0xF7, 4 for 0xF8-0xFB and 5 for 0xFC-0xFF.
 **/
 static const char trailingBytesForUTF8[256] = {
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
 /**
 * Used by FromUTF8
 *
 * Indexed by the number of trailing bytes in a sequence. FromUTF8 sums the
 * raw bytes of a sequence (shifting by six bits between them, marker bits
 * included) and then subtracts the matching entry to cancel those marker
 * bits, leaving the decoded value.
 **/
 static const u32 offsetsFromUTF8[6] = {
 	0x00000000UL, 0x00003080UL, 0x000E2080UL,
 	0x03C82080UL, 0xFA082080UL, 0x82082080UL };
 /**
 * Convert CStr to UTF-8
 *
@ -74,68 +50,9 @@ static const u32 offsetsFromUTF8[6] = {
 **/
 CStr8 CStrW::ToUTF8() const
 {
-	CStr8 result;
+	return utf8_from_wstring(*this);
 	// Encode each wide character on its own and append its bytes to result.
 	for (size_t i = 0; i < length(); ++i)
 	{
 		unsigned short bytesToWrite;
 		wchar_t ch = (*this)[i];
 		// The encoded length depends only on the magnitude of the character.
 		if (ch < 0x80) bytesToWrite = 1;
 		else if (ch < 0x800) bytesToWrite = 2;
 		else if (ch < 0x10000) bytesToWrite = 3;
 		else if (ch < 0x110000) bytesToWrite = 4;
 		else bytesToWrite = 3, ch = 0xFFFD; // replacement character
 		char buf[4];
 		// buf is filled back to front: target starts one past the last byte to write.
 		char* target = &buf[bytesToWrite];
 		switch (bytesToWrite)
 		{
 		// The cases fall through: each of the first three stores the low six
 		// bits of ch with the top two bits forced to 10, then shifts them out.
 		case 4: *--target = ((ch | 0x80) & 0xBF); ch >>= 6;
 		case 3: *--target = ((ch | 0x80) & 0xBF); ch >>= 6;
 		case 2: *--target = ((ch | 0x80) & 0xBF); ch >>= 6;
 		// First byte: whatever bits remain, combined with the length marker.
 		case 1: *--target = (ch | firstByteMark[bytesToWrite]);
 		}
 		result += CStr8(buf, bytesToWrite);
 	}
 	return result;
 }
 /**
 * Test whether one UTF-8 sequence (the encoding of a single character) is legal
 *
 * Only the byte values are inspected. The function does not check that
 * Length agrees with the length implied by the first byte; the caller is
 * expected to pass the matching Length.
 *
 * @param const unsigned char * source pointer to the first byte of the
 *				sequence; Length bytes are read from it.
 * @param int Length number of bytes in the sequence, first byte included.
 * @return bool true if the sequence is legal UTF-8,
 *				false if not (including any Length outside 1..4).
 **/
 static bool isLegalUTF8(const unsigned char *source, int Length)
 {
 	if (Length < 1 || Length > 4)
 		return false;
 	const unsigned char lead = source[0];
 	// The third and fourth bytes, when present, must be continuation bytes.
 	for (int i = Length - 1; i >= 2; --i)
 	{
 		if (source[i] < 0x80 || source[i] > 0xBF)
 			return false;
 	}
 	if (Length >= 2)
 	{
 		// The second byte is a continuation byte whose range is narrowed
 		// for certain first bytes. The lower bound 0x80 applies to every
 		// first byte, so a non-continuation byte after 0xED or 0xF4 is
 		// rejected as well.
 		unsigned char lowest = 0x80;
 		unsigned char highest = 0xBF;
 		switch (lead)
 		{
 		case 0xE0: lowest = 0xA0; break;	// below this: overlong 3-byte form
 		case 0xED: highest = 0x9F; break;	// above this: surrogate range
 		case 0xF0: lowest = 0x90; break;	// below this: overlong 4-byte form
 		case 0xF4: highest = 0x8F; break;	// above this: beyond U+10FFFF
 		default: break;
 		}
 		const unsigned char second = source[1];
 		if (second < lowest || second > highest)
 			return false;
 	}
 	// 0x80..0xBF are continuation bytes and 0xC0/0xC1 can only start overlong forms.
 	if (lead >= 0x80 && lead < 0xC2)
 		return false;
 	// First bytes above 0xF4 would encode values beyond U+10FFFF.
 	if (lead > 0xF4)
 		return false;
 	return true;
 }
 /**
 * Convert UTF-8 to CStr
 *
@ -143,46 +60,7 @@ static bool isLegalUTF8(const unsigned char *source, int Length)
 **/
 CStrW CStr8::FromUTF8() const
 {
-	CStrW result;
+	return wstring_from_utf8(*this);
 	// Nothing to decode; this also keeps &*begin() below off an empty string.
 	if (empty())
 		return result;
 	const unsigned char* source = (const unsigned char*)&*begin();
 	const unsigned char* sourceEnd = source + length();
 	// Decode one sequence per iteration; every error emits 0xFFFD and skips one byte.
 	while (source < sourceEnd)
 	{
 		wchar_t ch = 0;
 		// Number of bytes that follow the first byte, looked up from its value.
 		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 		if (source + extraBytesToRead >= sourceEnd)
 		{
 			// Error - fell off the end of the string
 			result += (wchar_t)0xFFFD;
 			source++;
 			continue;
 		}
 		if (! isLegalUTF8(source, extraBytesToRead+1)) {
 			// Error - illegal data
 			result += (wchar_t)0xFFFD;
 			source++;
 			continue;
 		}
 		// The cases fall through: the raw bytes are summed with a six-bit shift
 		// between them, and the marker bits are cancelled afterwards by
 		// subtracting offsetsFromUTF8. isLegalUTF8 rejects lengths above 4,
 		// so execution never enters at case 5 or case 4.
 		switch (extraBytesToRead)
 		{
 		case 5: ch += *source++; ch <<= 6;
 		case 4: ch += *source++; ch <<= 6;
 		case 3: ch += *source++; ch <<= 6;
 		case 2: ch += *source++; ch <<= 6;
 		case 1: ch += *source++; ch <<= 6;
 		case 0: ch += *source++;
 		}
 		ch -= offsetsFromUTF8[extraBytesToRead];
 		// NOTE(review): the result is stored in a single wchar_t; whether that
 		// can hold values from 4-byte sequences depends on the platform's
 		// wchar_t width - confirm.
 		result += ch;
 	}
 	return result;
 }
 #else