1
0
forked from 0ad/0ad

implement changes suggested by Philip:

- add self-test
- allow decoding UTF8 values beyond BMP (and replace them later)
- quietly replace invalid bytes

This was SVN commit r7187.
This commit is contained in:
janwas 2009-11-09 20:53:48 +00:00
parent 593bf6a571
commit ff15c522fe
18 changed files with 138 additions and 60 deletions

View File

@ -1557,7 +1557,7 @@ void CGUI::Xeromyces_ReadImage(XMBElement Element, CXeromyces* pFile, CGUISprite
if (attr_name == "texture")
{
image.m_TextureName = VfsPath(L"art/textures/ui")/wstring_from_UTF8(attr_value);
image.m_TextureName = VfsPath(L"art/textures/ui")/wstring_from_utf8(attr_value);
}
else
if (attr_name == "size")

View File

@ -370,7 +370,7 @@ void GUIRenderer::UpdateDrawCallCache(DrawCalls &Calls, const CStr& SpriteName,
if (SpriteName.substr(0, 10) == "stretched:")
{
SGUIImage Image;
Image.m_TextureName = VfsPath(L"art/textures/ui")/wstring_from_UTF8(SpriteName.substr(10));
Image.m_TextureName = VfsPath(L"art/textures/ui")/wstring_from_utf8(SpriteName.substr(10));
CClientArea ca("0 0 100% 100%");
Image.m_Size = ca;
Image.m_TextureSize = ca;

View File

@ -27,7 +27,7 @@
#include "lib/bits.h"
#include "lib/byte_order.h"
#include "lib/wchar.h" // wstring_from_UTF8
#include "lib/wchar.h" // wstring_from_utf8
#include "lib/fat_time.h"
#include "lib/path_util.h"
#include "lib/allocators/pool.h"
@ -135,7 +135,7 @@ public:
{
const size_t length = (size_t)read_le16(&m_fn_len);
const char* pathname = (const char*)this + sizeof(CDFH); // not 0-terminated!
return wstring_from_UTF8(std::string(pathname, length));
return wstring_from_utf8(std::string(pathname, length));
}
off_t HeaderOffset() const

View File

@ -23,7 +23,7 @@
#include <string>
#include "lib/path_util.h"
#include "lib/wchar.h" // wstring_from_UTF8
#include "lib/wchar.h" // wstring_from_utf8
#include "lib/posix/posix_filesystem.h"
@ -67,7 +67,7 @@ LibError GetDirectoryEntries(const fs::wpath& path, FileInfos* files, DirectoryN
return LibError_from_errno();
}
const std::wstring name = wstring_from_UTF8(osEnt->d_name);
const std::wstring name = wstring_from_utf8(osEnt->d_name);
RETURN_ERR(path_component_validate(name.c_str()));
// get file information (mode, size, mtime)

View File

@ -128,10 +128,10 @@ const wchar_t* path_name_only(const wchar_t* path)
fs::wpath wpath_from_path(const fs::path& pathname)
{
return wstring_from_UTF8(pathname.string());
return wstring_from_utf8(pathname.string());
}
fs::path path_from_wpath(const fs::wpath& pathname)
{
return UTF8_from_wstring(pathname.string());
return utf8_from_wstring(pathname.string());
}

View File

@ -257,8 +257,8 @@ int tsprintf_s(tchar* buf, size_t max_chars, const tchar* fmt, ...)
errno_t _wfopen_s(FILE** pfile, const wchar_t* filename, const wchar_t* mode)
{
*pfile = NULL;
const std::string filename_c = UTF8_from_wstring(filename);
const std::string mode_c = UTF8_from_wstring(mode);
const std::string filename_c = utf8_from_wstring(filename);
const std::string mode_c = utf8_from_wstring(mode);
return fopen_s(pfile, filename_c.c_str(), mode_c.c_str());
}

View File

@ -281,7 +281,7 @@ long __stdcall wseh_ExceptionFilter(struct _EXCEPTION_POINTERS* ep)
if(ep->ExceptionRecord->ExceptionFlags & EXCEPTION_NONCONTINUABLE)
flags = DE_NO_CONTINUE;
const wchar_t* const lastFuncToSkip = WIDEN(STRINGIZE(DECORATED_NAME(wseh_ExceptionFilter)));
ErrorReaction er = debug_DisplayError(message, flags, ep->ContextRecord, lastFuncToSkip, file,line,UTF8_from_wstring(func).c_str(), 0);
ErrorReaction er = debug_DisplayError(message, flags, ep->ContextRecord, lastFuncToSkip, file,line,utf8_from_wstring(func).c_str(), 0);
debug_assert(er == ER_CONTINUE); // nothing else possible
// invoke the Win32 default handler - it calls ExitProcess for

View File

@ -108,7 +108,7 @@ public:
sprintf_s(root, ARRAY_SIZE(root), "%s/pyrogenesis-test-sysdep-XXXXXX", tmpdir);
TS_ASSERT(mkdtemp(root));
std::string rootstr(root);
std::wstring rootstrw(wstring_from_UTF8(rootstr));
std::wstring rootstrw(wstring_from_utf8(rootstr));
const char* dirs[] = {
"/example",

View File

@ -0,0 +1,78 @@
/* Copyright (C) 2009 Wildfire Games.
* This file is part of 0 A.D.
*
* 0 A.D. is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* 0 A.D. is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with 0 A.D. If not, see <http://www.gnu.org/licenses/>.
*/
#include "lib/self_test.h"
#include "lib/wchar.h"
// (copied from CStr test)
// Self-tests for the UTF-8 <-> wchar_t conversion functions declared in
// lib/wchar.h (utf8_from_wstring and wstring_from_utf8).
class Test_wchar : public CxxTest::TestSuite
{
public:
	// Converts a fixed sequence of 16-bit code units to UTF-8, checks the
	// result against the expected byte sequence, then converts back and
	// checks that the original string is recovered.
	void test_utf8_utf16_conversion()
	{
		// one entry per character; covers 1-, 2- and 3-byte UTF-8 encodings
		const wchar_t chr_utf16[] = {
			0x12,
			0xff,
			0x1234,
			0x3456,
			0x5678,
			0x7890,
			0x9abc,
			0xbcde,
			0xfffd
		};
		// expected UTF-8 encoding; one line per entry of chr_utf16
		const unsigned char chr_utf8[] = {
			0x12,
			0xc3, 0xbf,
			0xe1, 0x88, 0xb4,
			0xe3, 0x91, 0x96,
			0xe5, 0x99, 0xb8,
			0xe7, 0xa2, 0x90,
			0xe9, 0xaa, 0xbc,
			0xeb, 0xb3, 0x9e,
			0xef, 0xbf, 0xbd
		};
		const std::wstring str_utf16(chr_utf16, ARRAY_SIZE(chr_utf16));

		const std::string str_utf8 = utf8_from_wstring(str_utf16);
		TS_ASSERT_EQUALS(str_utf8.length(), ARRAY_SIZE(chr_utf8));
		// A failed assertion does not necessarily abort the test, so only
		// compare the bytes when the lengths agree; otherwise the comparison
		// could read past the end of the shorter buffer.
		if(str_utf8.length() == ARRAY_SIZE(chr_utf8))
		{
			TS_ASSERT_SAME_DATA(str_utf8.data(), chr_utf8, ARRAY_SIZE(chr_utf8)*sizeof(char));
		}

		const std::wstring str_utf16b = wstring_from_utf8(str_utf8);
		TS_ASSERT_WSTR_EQUALS(str_utf16b, str_utf16);
	}

	// Feeds malformed UTF-8 to wstring_from_utf8 and checks that every
	// offending byte is turned into U+FFFD while the remaining
	// characters are decoded normally.
	void test_invalid_utf8()
	{
		struct { const char* utf8; const wchar_t* utf16; } tests[] = {
			// lead byte of a 3-byte sequence with no continuation bytes
			{ "a\xef", L"a\xfffd" },
			// 3-byte sequence cut off after the second byte
			{ "b\xef\xbf", L"b\xfffd\xfffd" },
			// third byte is not a continuation byte; it is kept as-is
			{ "c\xef\xbf\x01", L"c\xfffd\xfffd\x0001" },
			// invalid lead byte and stray continuation bytes between ASCII
			{ "d\xffX\x80Y\x80" , L"d\xfffdX\xfffdY\xfffd" }
		};

		for (size_t i = 0; i < ARRAY_SIZE(tests); ++i)
		{
			const std::string str_utf8(tests[i].utf8);
			const std::wstring str_utf16(tests[i].utf16);

			const std::wstring str_utf8to16 = wstring_from_utf8(str_utf8);
			TS_ASSERT_EQUALS(str_utf16.length(), str_utf8to16.length());
			// Only compare contents when the lengths agree, so that a
			// too-short result is never read beyond its end.
			if(str_utf16.length() == str_utf8to16.length())
			{
				TS_ASSERT_SAME_DATA(str_utf8to16.data(), str_utf16.data(), str_utf16.length()*sizeof(wchar_t));
			}
		}
	}
};

View File

@ -55,25 +55,20 @@
typedef u8 UTF8;
typedef u32 UTF32;
static const UTF32 replacementCharacter = 0xFFFDul; // used by ReplaceIfInvalid and UTF8::Decode
static UTF32 ReplaceIfInvalid(UTF32 u)
{
struct IsValid
{
bool operator()(UTF32 u) const
{
// disallow surrogates
if(0xDC00ul <= u && u <= 0xDFFFul)
return false;
// greater: UTF-16 representation would require surrogates
// equal: permanently unassigned codepoint, may correspond to WEOF
// (which raises errors when used in VC's swprintf)
if(u >= 0xFFFFul)
return false;
return true;
}
};
return IsValid()(u)? u : 0xFFFD;
// disallow surrogates
if(0xD800ul <= u && u <= 0xDFFFul)
return replacementCharacter;
// 0xFFFE: byte order marker (invalid character)
// 0xFFFF: permanently unassigned code point, may correspond to WEOF
// (raises errors when used in VC's swprintf)
// greater: UTF-16 representation would require surrogates
if(u >= 0xFFFEul)
return replacementCharacter;
return u;
}
@ -84,21 +79,26 @@ public:
{
const size_t size = Size(u);
static const UTF8 firstByteMarks[1+3] = { 0, 0x00, 0xC0, 0xE0 };
*dstPos++ = (UTF8)(u | firstByteMarks[size]);
for(size_t i = 1; i < size; i++)
{
*dstPos++ = (UTF8)((u|0x80u) & 0xBFu);
dstPos[size-i] = UTF8((u|0x80u) & 0xBFu);
u >>= 6;
}
dstPos[0] = UTF8(u | firstByteMarks[size]);
dstPos += size;
}
static bool Decode(const UTF8*& srcPos, const UTF8* const srcEnd, UTF32& u)
// @return decoded scalar, or replacementCharacter on error
static UTF32 Decode(const UTF8*& srcPos, const UTF8* const srcEnd)
{
const size_t size = SizeFromFirstByte(*srcPos);
if(!IsValid(srcPos, size, srcEnd))
return false;
{
srcPos += 1; // only skip the offending byte (increases chances of resynchronization)
return replacementCharacter;
}
u = 0;
UTF32 u = 0;
for(size_t i = 0; i < size-1; i++)
{
u += UTF32(*srcPos++);
@ -106,10 +106,9 @@ public:
}
u += UTF32(*srcPos++);
static const UTF32 offsets[1+3] = { 0, 0x00000000ul, 0x00003080ul, 0x000E2080ul };
static const UTF32 offsets[1+4] = { 0, 0x00000000ul, 0x00003080ul, 0x000E2080ul, 0x03C82080UL };
u -= offsets[size];
return true;
return u;
}
private:
@ -129,8 +128,10 @@ private:
return 1;
if(firstByte < 0xE0)
return 2;
// IsValid rejects firstByte values that would cause > 3 byte encodings.
return 3;
if(firstByte < 0xF0)
return 3;
// IsValid rejects firstByte values that would cause > 4 byte encodings.
return 4;
}
// c.f. Unicode 3.1 Table 3-7
@ -142,7 +143,7 @@ private:
if(src[0] < 0x80)
return true;
if(!(0xC2 <= src[0] && src[0] <= 0xEF))
if(!(0xC2 <= src[0] && src[0] <= 0xF4))
return false;
// special cases (stricter than the loop)
@ -150,6 +151,10 @@ private:
return false;
if(src[0] == 0xED && src[1] > 0x9F)
return false;
if(src[0] == 0xF0 && src[1] < 0x90)
return false;
if(src[0] == 0xF4 && src[1] > 0x8F)
return false;
for(size_t i = 1; i < size; i++)
{
@ -164,9 +169,9 @@ private:
//-----------------------------------------------------------------------------
std::string UTF8_from_wstring(const std::wstring& src)
std::string utf8_from_wstring(const std::wstring& src)
{
std::string dst(src.size()*3, ' '); // see UTF8Codec::Size
std::string dst(src.size()*3+1, ' '); // see UTF8Codec::Size; +1 ensures &dst[0] is valid
UTF8* dstPos = (UTF8*)&dst[0];
for(size_t i = 0; i < src.size(); i++)
{
@ -178,7 +183,7 @@ std::string UTF8_from_wstring(const std::wstring& src)
}
std::wstring wstring_from_UTF8(const std::string& src)
std::wstring wstring_from_utf8(const std::string& src)
{
std::wstring dst;
dst.reserve(src.size());
@ -186,12 +191,7 @@ std::wstring wstring_from_UTF8(const std::string& src)
const UTF8* const srcEnd = srcPos + src.size();
while(srcPos < srcEnd)
{
UTF32 u;
if(!UTF8Codec::Decode(srcPos, srcEnd, u))
{
debug_assert(0);
return L"(wstring_from_UTF8: invalid input)";
}
const UTF32 u = UTF8Codec::Decode(srcPos, srcEnd);
dst.push_back((wchar_t)ReplaceIfInvalid(u));
}
return dst;

View File

@ -18,7 +18,7 @@
#ifndef INCLUDED_WCHAR
#define INCLUDED_WCHAR
LIB_API std::wstring wstring_from_UTF8(const std::string& s);
LIB_API std::string UTF8_from_wstring(const std::wstring& s);
LIB_API std::wstring wstring_from_utf8(const std::string& s);
LIB_API std::string utf8_from_wstring(const std::wstring& s);
#endif // #ifndef INCLUDED_WCHAR

View File

@ -375,7 +375,7 @@ void CNetLogFileSink::OpenFile( const fs::wpath& fileName, bool append )
if ( m_File.is_open() ) m_File.close();
// Open the file and log start
m_File.open( UTF8_from_wstring(fileName.string()).c_str(), append ? std::ios::app : std::ios::out );
m_File.open( utf8_from_wstring(fileName.string()).c_str(), append ? std::ios::app : std::ios::out );
if ( !m_File.is_open() )
{
// throw std::ios_base::failure

View File

@ -56,10 +56,10 @@ const wchar_t* html_footer = L"";
CLogger::CLogger()
{
fs::wpath mainlogPath(psLogDir()/L"mainlog.html");
m_MainLog = new std::wofstream(UTF8_from_wstring(mainlogPath.string()).c_str(), std::ofstream::out | std::ofstream::trunc);
m_MainLog = new std::wofstream(utf8_from_wstring(mainlogPath.string()).c_str(), std::ofstream::out | std::ofstream::trunc);
fs::wpath interestinglogPath(psLogDir()/L"interestinglog.html");
m_InterestingLog = new std::wofstream(UTF8_from_wstring(interestinglogPath.string()).c_str(), std::ofstream::out | std::ofstream::trunc);
m_InterestingLog = new std::wofstream(utf8_from_wstring(interestinglogPath.string()).c_str(), std::ofstream::out | std::ofstream::trunc);
m_OwnsStreams = true;
m_UseDebugPrintf = true;

View File

@ -74,7 +74,7 @@ CStr g_AutostartMap = "";
static void LoadProfile( const CStr& profile )
{
VfsPath path = VfsPath(L"profiles") / wstring_from_UTF8(profile);
VfsPath path = VfsPath(L"profiles") / wstring_from_utf8(profile);
VfsPath configFilename = path / L"settings/user.cfg";
g_ConfigDB.SetConfigFile(CFG_USER, true, configFilename.string().c_str());

View File

@ -567,7 +567,7 @@ static void InitVfs(const CmdLineArgs& args)
{
size_t priority = i;
int flags = VFS_MOUNT_WATCH|VFS_MOUNT_ARCHIVABLE;
std::wstring modName (wstring_from_UTF8(mods[i]));
std::wstring modName (wstring_from_utf8(mods[i]));
g_VFS->Mount(L"", AddSlash(modLoosePath/modName), flags, priority);
g_VFS->Mount(L"", AddSlash(modArchivePath/modName), flags, priority);
}

View File

@ -51,7 +51,7 @@ Paths::Paths(const CmdLineArgs& args)
#else
const char* envHome = getenv("HOME");
debug_assert(envHome);
const fs::wpath home(wstring_from_UTF8(envHome));
const fs::wpath home(wstring_from_utf8(envHome));
m_data = AddSlash(XDG_Path("XDG_DATA_HOME", home, home/L".local/share/")/subdirectoryName);
m_config = AddSlash(XDG_Path("XDG_CONFIG_HOME", home, home/L".config/")/subdirectoryName);
m_cache = AddSlash(XDG_Path("XDG_CACHE_HOME", home, home/L".cache/")/subdirectoryName);
@ -92,8 +92,8 @@ Paths::Paths(const CmdLineArgs& args)
if(path)
{
if(path[0] != '/') // relative to $HOME
return AddSlash(home/wstring_from_UTF8(path));
return AddSlash(fs::wpath(wstring_from_UTF8(path)));
return AddSlash(home/wstring_from_utf8(path));
return AddSlash(fs::wpath(wstring_from_utf8(path)));
}
return AddSlash(defaultPath);
}

View File

@ -432,7 +432,7 @@ void CProfileViewer::SaveToFile()
// Open the file. (It will be closed when the CProfileViewer
// destructor is called.)
fs::wpath path(psLogDir()/L"profile.txt");
m->outputStream.open(UTF8_from_wstring(path.string()).c_str(), std::ofstream::out | std::ofstream::trunc);
m->outputStream.open(utf8_from_wstring(path.string()).c_str(), std::ofstream::out | std::ofstream::trunc);
if (m->outputStream.fail())
{

View File

@ -59,7 +59,7 @@ bool I18n::LoadLanguage(const char* name)
// Automatically delete the pointer when returning early
std::auto_ptr<CLocale_interface> locale (locale_ptr);
VfsPath dirname = AddSlash(VfsPath(L"language")/wstring_from_UTF8(name));
VfsPath dirname = AddSlash(VfsPath(L"language")/wstring_from_utf8(name));
// Open *.lng with LoadStrings
VfsPaths pathnames;