From 781538313c349479c4375e36fda22bea56058e5d Mon Sep 17 00:00:00 2001 From: janwas Date: Mon, 16 Nov 2009 20:05:03 +0000 Subject: [PATCH] use lib/wchar.h for UTF8 conversion (avoid duplication, more aware of wchar_t differences and surrogate pairs) fixes #400 This was SVN commit r7201. --- source/ps/CStr.cpp | 128 ++------------------------------------------- 1 file changed, 3 insertions(+), 125 deletions(-) diff --git a/source/ps/CStr.cpp b/source/ps/CStr.cpp index a9827f2558..06d166a0d5 100644 --- a/source/ps/CStr.cpp +++ b/source/ps/CStr.cpp @@ -28,6 +28,7 @@ #include "lib/posix/posix_sock.h" // htons, ntohs #include "lib/fnv_hash.h" +#include "lib/wchar.h" #include "network/Serialization.h" #include @@ -42,31 +43,6 @@ CStrW::CStrW(const CStr8 &asciStr) : std::wstring(asciStr.begin(), asciStr.end()) {} CStr8::CStr8(const CStrW& wideStr) : std:: string(wideStr.begin(), wideStr.end()) {} -// UTF conversion code adapted from http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c - -/** - * Used by ToUTF8 - **/ -static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; -/** - * Used by FromUTF8 - **/ -static const char trailingBytesForUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; -/** - * Used by FromUTF8 - **/ -static const u32 offsetsFromUTF8[6] = { - 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; - /** * Convert CStr to UTF-8 * @@ -74,68 +50,9 @@ static const u32 offsetsFromUTF8[6] = { **/ CStr8 CStrW::ToUTF8() const { - CStr8 result; - - for (size_t i = 0; i < length(); ++i) - { - unsigned short bytesToWrite; - wchar_t ch = (*this)[i]; - - if (ch < 0x80) bytesToWrite = 1; - else if (ch < 0x800) bytesToWrite = 2; - else if (ch < 0x10000) bytesToWrite = 3; - else if (ch < 0x110000) bytesToWrite = 4; - else bytesToWrite = 3, ch = 0xFFFD; // replacement character - - char buf[4]; - char* target = &buf[bytesToWrite]; - switch (bytesToWrite) - { - case 4: *--target = ((ch | 0x80) & 0xBF); ch >>= 6; - case 3: *--target = ((ch | 0x80) & 0xBF); ch >>= 6; - case 2: *--target = ((ch | 0x80) & 0xBF); ch >>= 6; - case 1: *--target = (ch | firstByteMark[bytesToWrite]); - } - result += CStr8(buf, bytesToWrite); - } - - return result; + return utf8_from_wstring(*this); } -/** - * Test for valid UTF-8 string - * - * @param const unsigned char * source pointer to string to test. - * @param int Length of string to test. - * @return bool true if source string is legal UTF-8, - * false if not. - **/ -static bool isLegalUTF8(const unsigned char *source, int Length) -{ - unsigned char a; - const unsigned char *srcptr = source+Length; - - switch (Length) { - default: return false; - case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 2: if ((a = (*--srcptr)) > 0xBF) return false; - - switch (*source) { - case 0xE0: if (a < 0xA0) return false; break; - case 0xED: if (a > 0x9F) return false; break; - case 0xF0: if (a < 0x90) return false; break; - case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; - } - case 1: if (*source >= 0x80 && *source < 0xC2) return false; - } - - if (*source > 0xF4) return false; - return true; -} - - /** * Convert UTF-8 to CStr * @@ -143,46 +60,7 @@ static bool isLegalUTF8(const unsigned char *source, int Length) **/ CStrW CStr8::FromUTF8() const { - CStrW result; - - if (empty()) - return result; - - const unsigned char* source = (const unsigned char*)&*begin(); - const unsigned char* sourceEnd = source + length(); - while (source < sourceEnd) - { - wchar_t ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) - { - // Error - fell of the the end of the string - result += (wchar_t)0xFFFD; - source++; - continue; - } - - if (! isLegalUTF8(source, extraBytesToRead+1)) { - // Error - illegal data - result += (wchar_t)0xFFFD; - source++; - continue; - } - - switch (extraBytesToRead) - { - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - result += ch; - } - return result; + return wstring_from_utf8(*this); } #else