forked from 0ad/0ad
Use lib/wchar.h for UTF-8 conversion (avoids duplication, and is more aware of wchar_t differences and surrogate pairs).
Fixes #400. This was SVN commit r7201.
This commit is contained in:
parent
b51a0187bf
commit
781538313c
@ -28,6 +28,7 @@
|
|||||||
|
|
||||||
#include "lib/posix/posix_sock.h" // htons, ntohs
|
#include "lib/posix/posix_sock.h" // htons, ntohs
|
||||||
#include "lib/fnv_hash.h"
|
#include "lib/fnv_hash.h"
|
||||||
|
#include "lib/wchar.h"
|
||||||
#include "network/Serialization.h"
|
#include "network/Serialization.h"
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
|
||||||
@ -42,31 +43,6 @@
|
|||||||
CStrW::CStrW(const CStr8 &asciStr) : std::wstring(asciStr.begin(), asciStr.end()) {}
|
CStrW::CStrW(const CStr8 &asciStr) : std::wstring(asciStr.begin(), asciStr.end()) {}
|
||||||
CStr8::CStr8(const CStrW& wideStr) : std:: string(wideStr.begin(), wideStr.end()) {}
|
CStr8::CStr8(const CStrW& wideStr) : std:: string(wideStr.begin(), wideStr.end()) {}
|
||||||
|
|
||||||
// UTF conversion code adapted from http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Used by ToUTF8
|
|
||||||
**/
|
|
||||||
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
|
|
||||||
/**
|
|
||||||
* Used by FromUTF8
|
|
||||||
**/
|
|
||||||
static const char trailingBytesForUTF8[256] = {
|
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
||||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
|
|
||||||
/**
|
|
||||||
* Used by FromUTF8
|
|
||||||
**/
|
|
||||||
static const u32 offsetsFromUTF8[6] = {
|
|
||||||
0x00000000UL, 0x00003080UL, 0x000E2080UL,
|
|
||||||
0x03C82080UL, 0xFA082080UL, 0x82082080UL };
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert CStr to UTF-8
|
* Convert CStr to UTF-8
|
||||||
*
|
*
|
||||||
@ -74,68 +50,9 @@ static const u32 offsetsFromUTF8[6] = {
|
|||||||
**/
|
**/
|
||||||
CStr8 CStrW::ToUTF8() const
|
CStr8 CStrW::ToUTF8() const
|
||||||
{
|
{
|
||||||
CStr8 result;
|
return utf8_from_wstring(*this);
|
||||||
|
|
||||||
for (size_t i = 0; i < length(); ++i)
|
|
||||||
{
|
|
||||||
unsigned short bytesToWrite;
|
|
||||||
wchar_t ch = (*this)[i];
|
|
||||||
|
|
||||||
if (ch < 0x80) bytesToWrite = 1;
|
|
||||||
else if (ch < 0x800) bytesToWrite = 2;
|
|
||||||
else if (ch < 0x10000) bytesToWrite = 3;
|
|
||||||
else if (ch < 0x110000) bytesToWrite = 4;
|
|
||||||
else bytesToWrite = 3, ch = 0xFFFD; // replacement character
|
|
||||||
|
|
||||||
char buf[4];
|
|
||||||
char* target = &buf[bytesToWrite];
|
|
||||||
switch (bytesToWrite)
|
|
||||||
{
|
|
||||||
case 4: *--target = ((ch | 0x80) & 0xBF); ch >>= 6;
|
|
||||||
case 3: *--target = ((ch | 0x80) & 0xBF); ch >>= 6;
|
|
||||||
case 2: *--target = ((ch | 0x80) & 0xBF); ch >>= 6;
|
|
||||||
case 1: *--target = (ch | firstByteMark[bytesToWrite]);
|
|
||||||
}
|
|
||||||
result += CStr8(buf, bytesToWrite);
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Test for valid UTF-8 string
|
|
||||||
*
|
|
||||||
* @param const unsigned char * source pointer to string to test.
|
|
||||||
* @param int Length of string to test.
|
|
||||||
* @return bool true if source string is legal UTF-8,
|
|
||||||
* false if not.
|
|
||||||
**/
|
|
||||||
static bool isLegalUTF8(const unsigned char *source, int Length)
|
|
||||||
{
|
|
||||||
unsigned char a;
|
|
||||||
const unsigned char *srcptr = source+Length;
|
|
||||||
|
|
||||||
switch (Length) {
|
|
||||||
default: return false;
|
|
||||||
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
|
|
||||||
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
|
|
||||||
case 2: if ((a = (*--srcptr)) > 0xBF) return false;
|
|
||||||
|
|
||||||
switch (*source) {
|
|
||||||
case 0xE0: if (a < 0xA0) return false; break;
|
|
||||||
case 0xED: if (a > 0x9F) return false; break;
|
|
||||||
case 0xF0: if (a < 0x90) return false; break;
|
|
||||||
case 0xF4: if (a > 0x8F) return false; break;
|
|
||||||
default: if (a < 0x80) return false;
|
|
||||||
}
|
|
||||||
case 1: if (*source >= 0x80 && *source < 0xC2) return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (*source > 0xF4) return false;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert UTF-8 to CStr
|
* Convert UTF-8 to CStr
|
||||||
*
|
*
|
||||||
@ -143,46 +60,7 @@ static bool isLegalUTF8(const unsigned char *source, int Length)
|
|||||||
**/
|
**/
|
||||||
CStrW CStr8::FromUTF8() const
|
CStrW CStr8::FromUTF8() const
|
||||||
{
|
{
|
||||||
CStrW result;
|
return wstring_from_utf8(*this);
|
||||||
|
|
||||||
if (empty())
|
|
||||||
return result;
|
|
||||||
|
|
||||||
const unsigned char* source = (const unsigned char*)&*begin();
|
|
||||||
const unsigned char* sourceEnd = source + length();
|
|
||||||
while (source < sourceEnd)
|
|
||||||
{
|
|
||||||
wchar_t ch = 0;
|
|
||||||
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
|
|
||||||
if (source + extraBytesToRead >= sourceEnd)
|
|
||||||
{
|
|
||||||
// Error - fell off the end of the string
|
|
||||||
result += (wchar_t)0xFFFD;
|
|
||||||
source++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (! isLegalUTF8(source, extraBytesToRead+1)) {
|
|
||||||
// Error - illegal data
|
|
||||||
result += (wchar_t)0xFFFD;
|
|
||||||
source++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
switch (extraBytesToRead)
|
|
||||||
{
|
|
||||||
case 5: ch += *source++; ch <<= 6;
|
|
||||||
case 4: ch += *source++; ch <<= 6;
|
|
||||||
case 3: ch += *source++; ch <<= 6;
|
|
||||||
case 2: ch += *source++; ch <<= 6;
|
|
||||||
case 1: ch += *source++; ch <<= 6;
|
|
||||||
case 0: ch += *source++;
|
|
||||||
}
|
|
||||||
ch -= offsetsFromUTF8[extraBytesToRead];
|
|
||||||
|
|
||||||
result += ch;
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
Loading…
Reference in New Issue
Block a user