Use lib/wchar.h for UTF-8 conversion (avoids duplicated code; more aware of wchar_t differences and surrogate pairs)

Fixes #400. This was SVN commit r7201.
2009-11-16 20:05:03 +00:00 · 2009-11-16 20:05:03 +00:00 · 781538313c
commit 781538313c
parent b51a0187bf
1 changed files with 3 additions and 125 deletions
--- a/source/ps/CStr.cpp
+++ b/source/ps/CStr.cpp
@ -28,6 +28,7 @@

 #include "lib/posix/posix_sock.h" // htons, ntohs
 #include "lib/fnv_hash.h"
+#include "lib/wchar.h"
 #include "network/Serialization.h"
 #include <cassert>

@ -42,31 +43,6 @@
 CStrW::CStrW(const CStr8 &asciStr) : std::wstring(asciStr.begin(), asciStr.end()) {}
 CStr8::CStr8(const CStrW& wideStr) : std:: string(wideStr.begin(), wideStr.end()) {}

-// UTF conversion code adapted from http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
-
-/**
- * Used by ToUTF8
- **/
-static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
-/**
- * Used by FromUTF8
- **/
-static const char trailingBytesForUTF8[256] = {
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
-/**
- * Used by FromUTF8
- **/
-static const u32 offsetsFromUTF8[6] = {
-	0x00000000UL, 0x00003080UL, 0x000E2080UL,
-	0x03C82080UL, 0xFA082080UL, 0x82082080UL };
-
 /**
 * Convert CStr to UTF-8
 *
@ -74,68 +50,9 @@ static const u32 offsetsFromUTF8[6] = {
 **/
 CStr8 CStrW::ToUTF8() const
 {
-	CStr8 result;
-
-	for (size_t i = 0; i < length(); ++i)
-	{
-		unsigned short bytesToWrite;
-		wchar_t ch = (*this)[i];
-
-		if (ch < 0x80) bytesToWrite = 1;
-		else if (ch < 0x800) bytesToWrite = 2;
-		else if (ch < 0x10000) bytesToWrite = 3;
-		else if (ch < 0x110000) bytesToWrite = 4;
-		else bytesToWrite = 3, ch = 0xFFFD; // replacement character
-
-		char buf[4];
-		char* target = &buf[bytesToWrite];
-		switch (bytesToWrite)
-		{
-		case 4: *--target = ((ch | 0x80) & 0xBF); ch >>= 6;
-		case 3: *--target = ((ch | 0x80) & 0xBF); ch >>= 6;
-		case 2: *--target = ((ch | 0x80) & 0xBF); ch >>= 6;
-		case 1: *--target = (ch | firstByteMark[bytesToWrite]);
-		}
-		result += CStr8(buf, bytesToWrite);
-	}
-
-	return result;
+	return utf8_from_wstring(*this);
 }

-/**
- * Test for valid UTF-8 string
- *
- * @param const unsigned char * source pointer to string to test.
- * @param int Length of string to test.
- * @return bool true if source string is legal UTF-8,
- *				false if not.
- **/
-static bool isLegalUTF8(const unsigned char *source, int Length)
-{
-	unsigned char a;
-	const unsigned char *srcptr = source+Length;
-
-	switch (Length) {
-	default: return false;
-	case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
-	case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
-	case 2: if ((a = (*--srcptr)) > 0xBF) return false;
-
-	switch (*source) {
-		case 0xE0: if (a < 0xA0) return false; break;
-		case 0xED: if (a > 0x9F) return false; break;
-		case 0xF0: if (a < 0x90) return false; break;
-		case 0xF4: if (a > 0x8F) return false; break;
-		default:   if (a < 0x80) return false;
-	}
-	case 1: if (*source >= 0x80 && *source < 0xC2) return false;
-	}
-
-	if (*source > 0xF4) return false;
-	return true;
-}
-
-
 /**
 * Convert UTF-8 to CStr
 *
@ -143,46 +60,7 @@ static bool isLegalUTF8(const unsigned char *source, int Length)
 **/
 CStrW CStr8::FromUTF8() const
 {
-	CStrW result;
-
-	if (empty())
-		return result;
-
-	const unsigned char* source = (const unsigned char*)&*begin();
-	const unsigned char* sourceEnd = source + length();
-	while (source < sourceEnd)
-	{
-		wchar_t ch = 0;
-		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
-		if (source + extraBytesToRead >= sourceEnd)
-		{
-			// Error - fell of the the end of the string
-			result += (wchar_t)0xFFFD;
-			source++;
-			continue;
-		}
-
-		if (! isLegalUTF8(source, extraBytesToRead+1)) {
-			// Error - illegal data
-			result += (wchar_t)0xFFFD;
-			source++;
-			continue;
-		}
-
-		switch (extraBytesToRead)
-		{
-		case 5: ch += *source++; ch <<= 6;
-		case 4: ch += *source++; ch <<= 6;
-		case 3: ch += *source++; ch <<= 6;
-		case 2: ch += *source++; ch <<= 6;
-		case 1: ch += *source++; ch <<= 6;
-		case 0: ch += *source++;
-		}
-		ch -= offsetsFromUTF8[extraBytesToRead];
-
-		result += ch;
-	}
-	return result;
+	return wstring_from_utf8(*this);
 }

 #else