1
0
forked from 0ad/0ad

use lib/wchar.h for UTF8 conversion (avoid duplication, more aware of wchar_t differences and surrogate pairs)

fixes #400

This was SVN commit r7201.
This commit is contained in:
janwas 2009-11-16 20:05:03 +00:00
parent b51a0187bf
commit 781538313c

View File

@ -28,6 +28,7 @@
#include "lib/posix/posix_sock.h" // htons, ntohs
#include "lib/fnv_hash.h"
#include "lib/wchar.h"
#include "network/Serialization.h"
#include <cassert>
@ -42,31 +43,6 @@
/**
* Build a wide string from a narrow one by copying it element by element.
* Each char is converted to wchar_t on its own; no multi-byte decoding is
* performed (see CStr8::FromUTF8 for that).
**/
CStrW::CStrW(const CStr8 &asciStr)
{
	assign(asciStr.begin(), asciStr.end());
}
/**
* Build a narrow string from a wide one by copying it element by element.
* Each wchar_t is converted to char on its own; no UTF-8 encoding is
* performed (see CStrW::ToUTF8 for that).
**/
CStr8::CStr8(const CStrW& wideStr)
{
	assign(wideStr.begin(), wideStr.end());
}
// UTF conversion code adapted from http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
/**
* Marker bits for the first (lead) byte of a UTF-8 sequence, indexed by the
* total number of bytes in the sequence. The entry is OR'ed into the lead
* byte when encoding: 0x00 for a 1-byte sequence, 0xC0 for 2 bytes,
* 0xE0 for 3 bytes, 0xF0 for 4 bytes, and so on.
**/
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
/**
* Number of bytes that follow a lead byte in a UTF-8 sequence, indexed by
* the value of the lead byte:
*   0x00-0xBF -> 0, 0xC0-0xDF -> 1, 0xE0-0xEF -> 2,
*   0xF0-0xF7 -> 3, 0xF8-0xFB -> 4, 0xFC-0xFF -> 5.
* The table only classifies the lead byte; it does not validate it.
**/
static const char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
/**
* Correction values for decoding, indexed by the number of trailing bytes.
* A decoder that simply adds the raw bytes of a sequence together (shifting
* the running sum left by 6 between bytes) also accumulates the lead-byte
* and continuation-byte marker bits; subtracting the matching entry removes
* them. Example for one trailing byte: (0xC0 << 6) + 0x80 == 0x3080.
**/
static const u32 offsetsFromUTF8[6] = {
0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL };
/**
* Convert CStr to UTF-8
*
@ -74,68 +50,9 @@ static const u32 offsetsFromUTF8[6] = {
**/
CStr8 CStrW::ToUTF8() const
{
	// Use the shared converter declared in lib/wchar.h instead of a private
	// encoder. A per-element encoder treats every wchar_t as a complete code
	// point, which splits UTF-16 surrogate pairs into two separate 3-byte
	// sequences where wchar_t is 16 bits wide; keeping a single conversion
	// routine also avoids duplicating the encoding tables in this file.
	return utf8_from_wstring(*this);
}
/**
* Test whether a single UTF-8 encoded sequence is well-formed.
*
* @param source pointer to the first (lead) byte of the sequence.
* @param Length number of bytes in the sequence, lead byte included.
* @return bool true if the bytes form a legal UTF-8 sequence,
*					false if not (including any Length outside 1..4).
**/
static bool isLegalUTF8(const unsigned char *source, int Length)
{
	if (Length < 1 || Length > 4)
		return false;

	const unsigned char lead = source[0];

	// Third and fourth bytes (if present) must be plain continuation bytes.
	for (int idx = Length - 1; idx >= 2; --idx)
	{
		const unsigned char trail = source[idx];
		if (trail < 0x80 || trail > 0xBF)
			return false;
	}

	if (Length >= 2)
	{
		// The second byte is always a continuation byte (0x80..0xBF), but
		// some lead bytes narrow the range further to exclude overlong
		// forms (0xE0, 0xF0), surrogates (0xED) and values above U+10FFFF
		// (0xF4). Both bounds are enforced for every lead byte so that an
		// ASCII second byte after 0xED or 0xF4 is rejected as well.
		unsigned char lowest = 0x80;
		unsigned char highest = 0xBF;
		switch (lead)
		{
		case 0xE0: lowest = 0xA0; break;
		case 0xED: highest = 0x9F; break;
		case 0xF0: lowest = 0x90; break;
		case 0xF4: highest = 0x8F; break;
		default: break;
		}

		const unsigned char second = source[1];
		if (second < lowest || second > highest)
			return false;
	}

	// 0x80..0xBF are continuation bytes and 0xC0/0xC1 could only start an
	// overlong 2-byte form, so none of them may lead a sequence.
	if (lead >= 0x80 && lead < 0xC2)
		return false;

	// Lead bytes above 0xF4 would encode values beyond U+10FFFF.
	if (lead > 0xF4)
		return false;

	return true;
}
/**
* Convert UTF-8 to CStr
*
@ -143,46 +60,7 @@ static bool isLegalUTF8(const unsigned char *source, int Length)
**/
CStrW CStr8::FromUTF8() const
{
	// Use the shared converter declared in lib/wchar.h instead of a private
	// decoder. A decoder that stores each decoded code point in a single
	// wchar_t cannot represent 4-byte sequences where wchar_t is 16 bits
	// wide; keeping a single conversion routine also avoids duplicating the
	// decoding tables and validity checks in this file.
	return wstring_from_utf8(*this);
}
#else