Source code for C_Unicode
Download
The source files for this module are listed below. You can also download the module C_Unicode as a zip archive; this archive contains all the source files and documentation.
Description
The module Unicode contains functions to convert ASCII, UTF8 and UTF16 strings from and to Windows NT Unicode.
Information
The EncodeFromUnicode functions convert a Windows NT Unicode string into a string with another encoding, while the DecodeToUnicode functions perform the opposite action. The GetExternalStringFormat functions try to infer the source string's encoding by looking for a Unicode BOM; when it is not found, they will assume the encoding is the one you specify.
Files
Each file belonging to this source code module is listed below.
Unicode.h
/*******************************************************************************
Version: 1
Author: Carl Colijn, TwoLogs
Contact: c.colijn@twologs.com
Source: http://www.twologs.com/sourcecode
This code is freely distributable, as long as this comment remains intact.
If you find this source useful, you may use this code in your own projects
free of charge, but some acknowledgement to the author of this code is always
appreciated :)
The source is however distributed 'as is' without waranty and/or support, and
may not be fit for each and every application. Use it at your own discretion
and at your own risk.
The source already has undergone testing. This doesn't mean however that all
bugs are removed from this piece of code. If you find one of them, please
contact me about it. I can however not guarantee when and if the bug will be
fixed.
More information about this module can be found in the accompanying HTML file.
*******************************************************************************/
#ifndef INCLUDE_TWOLOGS_COMMON_UNICODE_H
#define INCLUDE_TWOLOGS_COMMON_UNICODE_H
#include <windows.h>
#include <string>
// All supported unicode formats
// The enumerator values are used as indices into the BOM table in
// Unicode.cpp, and g_casUnicodeEncodings lists the format names in the same
// order, so keep the order of these in sync
enum EUnicodeFormat {
// Converted using the thread's ANSI code page (CP_THREAD_ACP); has no BOM
g_ceUnicodeFormatASCII = 0,
// UTF-8; BOM bytes are EF BB BF
g_ceUnicodeFormatUTF8,
// UTF-16 little endian (copied as-is to/from wchar_t); BOM bytes are FF FE
g_ceUnicodeFormatUTF16,
// UTF-16 big endian (byte-swapped to/from wchar_t); BOM bytes are FE FF
g_ceUnicodeFormatUTF16BE
};
// All format names
// One display name per EUnicodeFormat enumerator, listed in enumerator order
// (defined in Unicode.cpp)
extern const wchar_t* g_casUnicodeEncodings[];
// External unicode format
// Describes how an external (byte) string is encoded: the format itself,
// whether the string starts with a BOM, how many bytes that BOM occupies, and
// whether the format was deduced from the string's content
class CExternalUnicodeFormat {
  // GetExternalStringFormat fills in the private members directly
  friend CExternalUnicodeFormat GetExternalStringFormat(
    const char* sxString,
    EUnicodeFormat eDefaultFormat,
    size_t nNumBytes
  );

public:
  // Constructors; the default one yields ASCII without a BOM
  CExternalUnicodeFormat();
  CExternalUnicodeFormat(EUnicodeFormat eFormat, bool bWithBOM = false);

  // Sets the format; this also resets the "format deduced" indication
  void Set(EUnicodeFormat eFormat, bool bWithBOM = false);

  // The format in use
  EUnicodeFormat Format() const { return m_eFormat; }

  // Whether the string carries a leading BOM
  bool WithBOM() const { return m_bWithBOM; }

  // The size of any leading BOM, in bytes (0 when there is no BOM)
  size_t BOMSize() const { return m_nBOMSize; }

  // Whether the format could be deduced
  bool FormatDeduced() const { return m_bFormatDeduced; }

private:
  // The format in use
  EUnicodeFormat m_eFormat;

  // Whether the string carries a leading BOM
  bool m_bWithBOM;

  // The size of any leading BOM, in bytes
  size_t m_nBOMSize;

  // Whether the format could be deduced
  bool m_bFormatDeduced;
};
// Unknown string size
// Pass this as a size argument to have the size determined from the string's
// terminating null character (defined in Unicode.cpp)
extern const size_t g_cnUnicodeSizeUnknown;
// Encodes the given Unicode string to the given format
// Result of an EncodeFromUnicode call
struct CEncodeFromUnicodeResult {
// False when the conversion failed; sxResult is then empty
bool bSuccess;
// The encoded bytes, starting with the BOM when one was requested
std::string sxResult;
// Set from the "default character used" indication of WideCharToMultiByte
// when encoding to ASCII; always false for the UTF-8 and UTF-16 formats
bool bUnknownCharsReplaced;
};
// Encodes a std::wstring; its full size() is converted
CEncodeFromUnicodeResult EncodeFromUnicode(
const std::wstring& sUnicode,
EUnicodeFormat eFormat,
bool bWithBOM = false
);
// Encodes nNumChars wide characters starting at sUnicode; when nNumChars is
// g_cnUnicodeSizeUnknown the string must be null-terminated and its length
// is measured.  When bWithBOM is set the format's BOM is put in front of
// the encoded text.
CEncodeFromUnicodeResult EncodeFromUnicode(
const wchar_t* sUnicode,
EUnicodeFormat eFormat,
bool bWithBOM = false,
size_t nNumChars = g_cnUnicodeSizeUnknown
);
// Same as above, but takes the format and the BOM indication from oFormat
CEncodeFromUnicodeResult EncodeFromUnicode(
const std::wstring& sUnicode,
const CExternalUnicodeFormat& oFormat
);
// Same as above, but takes the format and the BOM indication from oFormat
CEncodeFromUnicodeResult EncodeFromUnicode(
const wchar_t* sUnicode,
const CExternalUnicodeFormat& oFormat,
size_t nNumChars = g_cnUnicodeSizeUnknown
);
// Decodes the given string from the given format
// Result of a DecodeToUnicode call
struct CDecodeToUnicodeResult {
// False when the conversion failed; sResult is then empty
bool bSuccess;
// The decoded text, without any BOM that was skipped in the source
std::wstring sResult;
};
// Decodes all bytes of sxString as eFormat; no BOM is skipped
CDecodeToUnicodeResult DecodeToUnicode(
const std::string& sxString,
EUnicodeFormat eFormat
);
// Decodes nNumBytes bytes starting at sxString as eFormat; no BOM is
// skipped.  When nNumBytes is g_cnUnicodeSizeUnknown the string must be
// null-terminated (with a wide null for the UTF-16 formats).
CDecodeToUnicodeResult DecodeToUnicode(
const char* sxString,
EUnicodeFormat eFormat,
size_t nNumBytes = g_cnUnicodeSizeUnknown
);
// Decodes sxString using oFormat.Format(), skipping the first
// oFormat.BOMSize() bytes
CDecodeToUnicodeResult DecodeToUnicode(
const std::string& sxString,
const CExternalUnicodeFormat& oFormat
);
// Decodes nNumBytes bytes (BOM included) using oFormat.Format(), skipping
// the first oFormat.BOMSize() bytes
CDecodeToUnicodeResult DecodeToUnicode(
const char* sxString,
const CExternalUnicodeFormat& oFormat,
size_t nNumBytes = g_cnUnicodeSizeUnknown
);
// Tries to detect the format used for the given external string
// The format is inferred from the string itself via a BOM
// If no BOM is found then the given default format is assumed
// FormatDeduced() on the result tells whether a BOM was recognized
CExternalUnicodeFormat GetExternalStringFormat(
const std::string& sxString,
EUnicodeFormat eDefaultFormat
);
CExternalUnicodeFormat GetExternalStringFormat(
const char* sxString,
EUnicodeFormat eDefaultFormat,
size_t nNumBytes = g_cnUnicodeSizeUnknown
);
// Detects if the given string starts with a BOM for the given
// format, and returns the complete format with/without BOM indication
// Only the BOM of eFormat itself is looked for; the returned format is
// always eFormat
CExternalUnicodeFormat DetectBOM(
const std::string& sxString,
EUnicodeFormat eFormat
);
CExternalUnicodeFormat DetectBOM(
const char* sxString,
EUnicodeFormat eFormat,
size_t nNumBytes = g_cnUnicodeSizeUnknown
);
// Returns if the given text string can be represented as 7-bit ASCII
// An empty string yields true
// Byte string overloads; eFormat tells how sxString is encoded
bool StringIs7BitASCII(
const std::string& sxString,
EUnicodeFormat eFormat
);
// When nNumBytes is g_cnUnicodeSizeUnknown the string must be
// null-terminated and its size is measured according to eFormat
bool StringIs7BitASCII(
const char* sxString,
EUnicodeFormat eFormat,
size_t nNumBytes = g_cnUnicodeSizeUnknown
);
// Wide string overloads; every character must be below 0x80
bool StringIs7BitASCII(
const std::wstring& sUnicode
);
// When nNumChars is g_cnUnicodeSizeUnknown the string must be
// null-terminated and its length is measured
bool StringIs7BitASCII(
const wchar_t* sUnicode,
size_t nNumChars = g_cnUnicodeSizeUnknown
);
#endif // INCLUDE_TWOLOGS_COMMON_UNICODE_H
Unicode.cpp
/*******************************************************************************
Version: 1
Author: Carl Colijn, TwoLogs
Contact: c.colijn@twologs.com
Source: http://www.twologs.com/sourcecode
This code is freely distributable, as long as this comment remains intact.
If you find this source useful, you may use this code in your own projects
free of charge, but some acknowledgement to the author of this code is always
appreciated :)
The source is however distributed 'as is' without waranty and/or support, and
may not be fit for each and every application. Use it at your own discretion
and at your own risk.
The source already has undergone testing. This doesn't mean however that all
bugs are removed from this piece of code. If you find one of them, please
contact me about it. I can however not guarantee when and if the bug will be
fixed.
More information about this module can be found in the accompanying HTML file.
*******************************************************************************/
#include "Unicode.h"
#include <climits>
#include <cstring>
// All format names
// The entries are listed in the same order as the EUnicodeFormat enumerators
const wchar_t* g_casUnicodeEncodings[] = {
L"ASCII",
L"UTF-8",
L"UTF-16",
L"UTF-16 BE"
};
// BOM's for the different encodings
// Describes the byte order mark of one encoding
struct CBOMInfo {
// The raw BOM bytes
const char* sSignature;
// The number of bytes in sSignature; 0 when the encoding has no BOM
size_t nSize;
};
// The BOM of each format; this table is indexed by EUnicodeFormat value, so
// the rows must stay in enumerator order
const CBOMInfo g_aoBOMs[] = {
// g_ceUnicodeFormatASCII: no BOM
{"", 0},
// g_ceUnicodeFormatUTF8
{"\xEF\xBB\xBF", 3},
// g_ceUnicodeFormatUTF16 (little endian)
{"\xFF\xFE", 2},
// g_ceUnicodeFormatUTF16BE (big endian)
{"\xFE\xFF", 2}
};
// Unknown string size
// Sentinel for "size not given"; this is the largest value a size_t can hold
const size_t g_cnUnicodeSizeUnknown = static_cast<size_t>(-1);
// Default constructor: ASCII without a BOM, format not deduced
CExternalUnicodeFormat::CExternalUnicodeFormat()
  : m_eFormat(g_ceUnicodeFormatASCII)
  , m_bWithBOM(false)
  , m_nBOMSize(0)
  , m_bFormatDeduced(false)
{
}
// Constructs an explicitly chosen format; the BOM size is that of the
// format's BOM when bWithBOM is set and 0 otherwise.  The format counts as
// "not deduced" since the caller specified it.
CExternalUnicodeFormat::CExternalUnicodeFormat(EUnicodeFormat eFormat, bool bWithBOM)
  : m_eFormat(eFormat)
  , m_bWithBOM(bWithBOM)
  , m_nBOMSize(bWithBOM ? g_aoBOMs[eFormat].nSize : 0)
  , m_bFormatDeduced(false)
{
}
// Sets the format
// The BOM size follows from the format when bWithBOM is set (0 otherwise),
// and the "format deduced" indication is reset
void CExternalUnicodeFormat::Set(EUnicodeFormat eFormat, bool bWithBOM) {
  m_eFormat = eFormat;
  m_bWithBOM = bWithBOM;
  m_nBOMSize = bWithBOM ? g_aoBOMs[eFormat].nSize : 0;
  m_bFormatDeduced = false;
}
// Makes sure the size of the given string is determined
// When nNumBytes holds g_cnUnicodeSizeUnknown it is replaced by the size in
// bytes of the null-terminated string, excluding the terminator; the
// terminator is a single null byte for ASCII/UTF-8 and a wide null character
// for both UTF-16 formats.  A size that is already known is left untouched.
void EnsureSizeKnown(const char* sxString, size_t& nNumBytes, EUnicodeFormat eFormat) {
  // Nothing to do when the caller already supplied the size
  if (nNumBytes != g_cnUnicodeSizeUnknown) {
    return;
  }

  if (eFormat == g_ceUnicodeFormatASCII || eFormat == g_ceUnicodeFormatUTF8) {
    // Byte oriented -> count up to the null byte
    nNumBytes = strlen(sxString);
  } else if (eFormat == g_ceUnicodeFormatUTF16 || eFormat == g_ceUnicodeFormatUTF16BE) {
    // 16-bit oriented -> count wide characters up to the wide null
    const wchar_t* sWide = reinterpret_cast<const wchar_t*>(sxString);
    nNumBytes = wcslen(sWide) * sizeof(wchar_t);
  }
}
// Copies over the buffer but toggles the endianness
// Every pair of bytes in sxSource is written to sxDest with its two bytes
// swapped.  nNumBytes is the size of the source in bytes; when it is odd the
// dangling last byte is not copied, so no byte beyond nNumBytes is ever read
// or written.  sxSource and sxDest may point to the same buffer.
void CopyToggleEndiannes(
  const char* sxSource,
  char* sxDest,
  size_t nNumBytes
) {
  // Only process complete 2-byte units
  const size_t nNumPairedBytes = nNumBytes - (nNumBytes % 2);
  for (size_t nByteNr = 0; nByteNr < nNumPairedBytes; nByteNr += 2) {
    // Read both bytes before writing, so swapping in place works as well
    const char cFirst = sxSource[nByteNr];
    const char cSecond = sxSource[nByteNr + 1];
    sxDest[nByteNr] = cSecond;
    sxDest[nByteNr + 1] = cFirst;
  }
}
// Encodes the given Unicode string to the given format
// std::wstring convenience overload; the complete string is encoded,
// including any embedded null characters
CEncodeFromUnicodeResult EncodeFromUnicode(
  const std::wstring& sUnicode,
  EUnicodeFormat eFormat,
  bool bWithBOM
) {
  const wchar_t* sSource = sUnicode.c_str();
  const size_t nSourceSize = sUnicode.size();
  return EncodeFromUnicode(sSource, eFormat, bWithBOM, nSourceSize);
}
// Encodes nNumChars wide characters starting at sUnicode to eFormat.
//   sUnicode   the source text
//   eFormat    the format to encode to; ASCII uses the thread's ANSI code page
//   bWithBOM   whether to put the format's BOM in front of the result
//   nNumChars  the number of wide characters to encode; when this is
//              g_cnUnicodeSizeUnknown the source must be null-terminated
// Returns the encoded bytes.  On failure bSuccess is false and sxResult is
// empty; GetLastError() then tells what went wrong.  bUnknownCharsReplaced is
// only ever set when encoding to ASCII.
CEncodeFromUnicodeResult EncodeFromUnicode(
  const wchar_t* sUnicode,
  EUnicodeFormat eFormat,
  bool bWithBOM,
  size_t nNumChars
) {
  // Determine the size of the source
  if (nNumChars == g_cnUnicodeSizeUnknown) {
    nNumChars = wcslen(sUnicode);
  }

  // Get info on the BOM to add
  const char* sBOM = "";
  size_t nBOMSize = 0;
  if (bWithBOM) {
    const CBOMInfo& oBOMInfo = g_aoBOMs[eFormat];
    sBOM = oBOMInfo.sSignature;
    nBOMSize = oBOMInfo.nSize;
  }

  CEncodeFromUnicodeResult oResult;
  oResult.bSuccess = true;
  oResult.bUnknownCharsReplaced = false;

  // Look if to do any conversion
  if (nNumChars == 0) {
    // No -> we're done with an empty string, apart from any BOM
    oResult.sxResult.assign(sBOM, nBOMSize);
    return oResult;
  }

  // Yes -> look what to convert to
  switch (eFormat) {
    case g_ceUnicodeFormatASCII:
    case g_ceUnicodeFormatUTF8: {
      // WideCharToMultiByte takes its sizes as int, so refuse sources whose
      // working buffer size would not fit instead of passing truncated values
      const size_t nMaxNumChars = (static_cast<size_t>(INT_MAX) - 5) / 4;
      if (nNumChars > nMaxNumChars) {
        oResult.bSuccess = false;
        SetLastError(ERROR_INVALID_PARAMETER);
        break;
      }

      // Prepare a large enough working buffer
      size_t nBufferSize = nNumChars * 4 + 5;
      oResult.sxResult.resize(nBufferSize);

      // Add the BOM, if needed
      char* sxResultPos = &oResult.sxResult[0];
      if (nBOMSize > 0) {
        memcpy(sxResultPos, sBOM, nBOMSize);
        sxResultPos += nBOMSize;
        nBufferSize -= nBOMSize;
      }

      // Convert the string itself; the "default char used" output may only
      // be requested for non-UTF8 code pages.  It is pre-set to FALSE so it
      // holds a defined value even when the call fails without touching it.
      const bool bToUTF8 = eFormat == g_ceUnicodeFormatUTF8;
      BOOL bUnknownCharsReplaced = FALSE;
      const int nNumBytesWritten = WideCharToMultiByte(
        bToUTF8 ? CP_UTF8 : CP_THREAD_ACP,
        0,
        sUnicode,
        static_cast<int>(nNumChars),
        sxResultPos,
        static_cast<int>(nBufferSize),
        NULL,
        bToUTF8 ? NULL : &bUnknownCharsReplaced
      );
      if (!bToUTF8) {
        oResult.bUnknownCharsReplaced = bUnknownCharsReplaced != FALSE;
      }

      // WideCharToMultiByte returns 0 on failure
      oResult.bSuccess = nNumBytesWritten > 0;
      if (oResult.bSuccess) {
        // Done -> use the correct string size
        oResult.sxResult.resize(static_cast<size_t>(nNumBytesWritten) + nBOMSize);
      } else {
        // Error -> no output
        oResult.sxResult.clear();
      }
      break;
    }

    case g_ceUnicodeFormatUTF16:
    case g_ceUnicodeFormatUTF16BE: {
      // UTF16 LE or BE -> prepare the output string
      const size_t nNumBytes = nNumChars * sizeof(wchar_t);
      oResult.sxResult.resize(nNumBytes + nBOMSize);

      // Add the BOM, if needed
      char* sxResultPos = &oResult.sxResult[0];
      if (nBOMSize > 0) {
        memcpy(sxResultPos, sBOM, nBOMSize);
        sxResultPos += nBOMSize;
      }

      // And look how to copy over the string itself
      if (eFormat == g_ceUnicodeFormatUTF16) {
        // As-is
        memcpy(sxResultPos, sUnicode, nNumBytes);
      } else {
        // Endianness-reversed
        CopyToggleEndiannes(
          reinterpret_cast<const char*>(sUnicode),
          sxResultPos,
          nNumBytes
        );
      }
      break;
    }
  }

  // And return the result
  return oResult;
}
// Encodes a std::wstring using the format and BOM indication held by oFormat;
// the complete string is encoded
CEncodeFromUnicodeResult EncodeFromUnicode(
  const std::wstring& sUnicode,
  const CExternalUnicodeFormat& oFormat
) {
  const EUnicodeFormat eFormat = oFormat.Format();
  const bool bWithBOM = oFormat.WithBOM();
  return EncodeFromUnicode(sUnicode.c_str(), eFormat, bWithBOM, sUnicode.size());
}
// Encodes nNumChars wide characters using the format and BOM indication held
// by oFormat
CEncodeFromUnicodeResult EncodeFromUnicode(
  const wchar_t* sUnicode,
  const CExternalUnicodeFormat& oFormat,
  size_t nNumChars
) {
  const EUnicodeFormat eFormat = oFormat.Format();
  const bool bWithBOM = oFormat.WithBOM();
  return EncodeFromUnicode(sUnicode, eFormat, bWithBOM, nNumChars);
}
// Decodes the given string from the given format
// This is the worker all public DecodeToUnicode overloads forward to.
//   sxString   the source bytes, starting with the BOM if there is one
//   nNumBytes  the size of the source in bytes, BOM included; when this is
//              g_cnUnicodeSizeUnknown the text after the BOM must be
//              null-terminated
//   eFormat    the format the source is encoded in
//   bWithBOM   not used; the BOM is described by nBOMSize alone
//   nBOMSize   the number of leading bytes to skip
// On failure bSuccess is false and sResult is empty; GetLastError() then
// tells what went wrong.  For the UTF-16 formats a dangling odd last byte is
// ignored.
CDecodeToUnicodeResult DecodeToUnicode(
  const char* sxString,
  size_t nNumBytes,
  EUnicodeFormat eFormat,
  bool bWithBOM,
  size_t nBOMSize
) {
  (void)bWithBOM;

  CDecodeToUnicodeResult oResult;
  oResult.bSuccess = true;

  // Get any BOM out of the way
  if (nNumBytes != g_cnUnicodeSizeUnknown) {
    // The size is known -> the BOM must fit; subtracting it otherwise would
    // wrap around to a huge size
    if (nNumBytes < nBOMSize) {
      oResult.bSuccess = false;
      SetLastError(ERROR_INVALID_PARAMETER);
      return oResult;
    }
    sxString += nBOMSize;
    nNumBytes -= nBOMSize;
  } else {
    // The size is unknown -> measure what follows the BOM
    sxString += nBOMSize;
    EnsureSizeKnown(sxString, nNumBytes, eFormat);
  }

  // Look if there is anything to convert
  if (nNumBytes == 0) {
    // No -> we're done with an empty string
    return oResult;
  }

  // Yes -> look what to convert from
  switch (eFormat) {
    case g_ceUnicodeFormatASCII:
    case g_ceUnicodeFormatUTF8: {
      // MultiByteToWideChar takes its sizes as int, so refuse sources whose
      // working buffer size would not fit instead of passing truncated values
      const size_t nMaxNumBytes = (static_cast<size_t>(INT_MAX) - 1) / 3;
      if (nNumBytes > nMaxNumBytes) {
        oResult.bSuccess = false;
        SetLastError(ERROR_INVALID_PARAMETER);
        break;
      }

      // Prepare the conversion buffer
      const size_t nMaxNumCharsNeeded = nNumBytes * 3 + 1;
      oResult.sResult.resize(nMaxNumCharsNeeded);

      // Do the conversion
      const bool bFromUTF8 = eFormat == g_ceUnicodeFormatUTF8;
      const int nNumCharsWritten = MultiByteToWideChar(
        bFromUTF8 ? CP_UTF8 : CP_THREAD_ACP,
        bFromUTF8 ? 0 : MB_ERR_INVALID_CHARS,
        sxString,
        static_cast<int>(nNumBytes),
        &oResult.sResult[0],
        static_cast<int>(nMaxNumCharsNeeded)
      );

      // MultiByteToWideChar reports failure by returning 0; any other value
      // is the number of characters written and must not be mistaken for an
      // error code
      oResult.bSuccess = nNumCharsWritten > 0;
      if (oResult.bSuccess) {
        // Done -> truncate the buffer
        oResult.sResult.resize(static_cast<size_t>(nNumCharsWritten));
      } else {
        // Error -> no result, and make sure GetLastError stays useful
        const DWORD nError = GetLastError();
        oResult.sResult.clear();
        SetLastError(nError);
      }
      break;
    }

    case g_ceUnicodeFormatUTF16:
    case g_ceUnicodeFormatUTF16BE: {
      // Only copy whole 16-bit units, so an odd byte count cannot overrun
      // the result buffer
      const size_t nNumChars = nNumBytes / sizeof(wchar_t);
      if (nNumChars > 0) {
        const size_t nNumBytesToCopy = nNumChars * sizeof(wchar_t);
        oResult.sResult.resize(nNumChars);
        char* sxDest = reinterpret_cast<char*>(&oResult.sResult[0]);
        if (eFormat == g_ceUnicodeFormatUTF16) {
          // UTF16 LE -> just copy over the string as-is
          memcpy(sxDest, sxString, nNumBytesToCopy);
        } else {
          // UTF16 BE -> copy over the string endianness-reversed
          CopyToggleEndiannes(sxString, sxDest, nNumBytesToCopy);
        }
      }
      break;
    }
  }

  // And return the result
  return oResult;
}
// Decodes all bytes of sxString as eFormat; no BOM is skipped
CDecodeToUnicodeResult DecodeToUnicode(
  const std::string& sxString,
  EUnicodeFormat eFormat
) {
  const bool bWithBOM = false;
  const size_t nBOMSize = 0;
  return DecodeToUnicode(sxString.c_str(), sxString.size(), eFormat, bWithBOM, nBOMSize);
}
// Decodes nNumBytes bytes starting at sxString as eFormat; no BOM is skipped
CDecodeToUnicodeResult DecodeToUnicode(
  const char* sxString,
  EUnicodeFormat eFormat,
  size_t nNumBytes
) {
  const bool bWithBOM = false;
  const size_t nBOMSize = 0;
  return DecodeToUnicode(sxString, nNumBytes, eFormat, bWithBOM, nBOMSize);
}
// Decodes all bytes of sxString using the format, BOM indication and BOM
// size held by oFormat
CDecodeToUnicodeResult DecodeToUnicode(
  const std::string& sxString,
  const CExternalUnicodeFormat& oFormat
) {
  const EUnicodeFormat eFormat = oFormat.Format();
  const bool bWithBOM = oFormat.WithBOM();
  const size_t nBOMSize = oFormat.BOMSize();
  return DecodeToUnicode(sxString.c_str(), sxString.size(), eFormat, bWithBOM, nBOMSize);
}
// Decodes nNumBytes bytes starting at sxString using the format, BOM
// indication and BOM size held by oFormat
CDecodeToUnicodeResult DecodeToUnicode(
  const char* sxString,
  const CExternalUnicodeFormat& oFormat,
  size_t nNumBytes
) {
  const EUnicodeFormat eFormat = oFormat.Format();
  const bool bWithBOM = oFormat.WithBOM();
  const size_t nBOMSize = oFormat.BOMSize();
  return DecodeToUnicode(sxString, nNumBytes, eFormat, bWithBOM, nBOMSize);
}
// Tries to detect the format used for the given external string
// The format is inferred from the string itself via a BOM
// If no BOM is found then the given default format is assumed
// This overload inspects the complete std::string
CExternalUnicodeFormat GetExternalStringFormat(
  const std::string& sxString,
  EUnicodeFormat eDefaultFormat
) {
  const char* sxBytes = sxString.c_str();
  const size_t nNumBytes = sxString.size();
  return GetExternalStringFormat(sxBytes, eDefaultFormat, nNumBytes);
}
// Looks for the BOM of any supported format at the start of sxString.
//   sxString        the external string to inspect
//   eDefaultFormat  the format to report when no BOM is recognized
//   nNumBytes       the size of the string in bytes; when this is
//                   g_cnUnicodeSizeUnknown the string must be null-terminated
// When a BOM is found the result holds that BOM's format and size, with
// WithBOM() and FormatDeduced() both true.  Otherwise the result holds
// eDefaultFormat without a BOM and FormatDeduced() is false.
CExternalUnicodeFormat GetExternalStringFormat(
  const char* sxString,
  EUnicodeFormat eDefaultFormat,
  size_t nNumBytes
) {
  // Assume we won't recognize the format
  CExternalUnicodeFormat oResult(eDefaultFormat, false);

  // Look if any of the BOM's match
  for (
    long eFormat = g_ceUnicodeFormatASCII;
    eFormat <= g_ceUnicodeFormatUTF16BE;
    ++eFormat
  ) {
    // Skip formats without a BOM and BOM's that are longer than the string
    const CBOMInfo& oBOMInfo = g_aoBOMs[eFormat];
    if (oBOMInfo.nSize == 0 || oBOMInfo.nSize > nNumBytes) {
      continue;
    }

    // strncmp stops at the first mismatch and the BOM's hold no null bytes,
    // so a null-terminated string of unknown size is never read beyond its
    // terminator
    if (strncmp(sxString, oBOMInfo.sSignature, oBOMInfo.nSize) == 0) {
      // Match -> note the format and stop looking
      oResult.m_eFormat = static_cast<EUnicodeFormat>(eFormat);
      oResult.m_bWithBOM = true;
      oResult.m_nBOMSize = oBOMInfo.nSize;
      oResult.m_bFormatDeduced = true;
      break;
    }
  }

  // And return the result
  return oResult;
}
// Detects if the given string starts with a BOM for the given
// format, and returns the complete format with/without BOM indication
// This overload inspects the complete std::string
CExternalUnicodeFormat DetectBOM(
  const std::string& sxString,
  EUnicodeFormat eFormat
) {
  const char* sxBytes = sxString.c_str();
  const size_t nNumBytes = sxString.size();
  return DetectBOM(sxBytes, eFormat, nNumBytes);
}
// Checks if sxString starts with the BOM of eFormat.
//   sxString   the external string to inspect
//   eFormat    the format whose BOM to look for
//   nNumBytes  the size of the string in bytes; when this is
//              g_cnUnicodeSizeUnknown the string must be null-terminated
// Returns eFormat together with the indication whether its BOM was found;
// formats without a BOM never report one.
CExternalUnicodeFormat DetectBOM(
  const char* sxString,
  EUnicodeFormat eFormat,
  size_t nNumBytes
) {
  // Look if the string is long enough to contain this format's BOM
  const CBOMInfo& oBOMInfo = g_aoBOMs[eFormat];
  bool bDetectedBOM = false;
  if (oBOMInfo.nSize > 0 && oBOMInfo.nSize <= nNumBytes) {
    // Yes -> look if it matches.  strncmp stops at the first mismatch and
    // the BOM's hold no null bytes, so a null-terminated string of unknown
    // size is never read beyond its terminator.
    bDetectedBOM = strncmp(sxString, oBOMInfo.sSignature, oBOMInfo.nSize) == 0;
  }

  // And return the result
  return CExternalUnicodeFormat(eFormat, bDetectedBOM);
}
// Returns if the given text string can be represented as 7-bit ASCII
// This overload inspects the complete std::string, which is encoded as eFormat
bool StringIs7BitASCII(
  const std::string& sxString,
  EUnicodeFormat eFormat
) {
  const char* sxBytes = sxString.c_str();
  const size_t nNumBytes = sxString.size();
  return StringIs7BitASCII(sxBytes, eFormat, nNumBytes);
}
bool StringIs7BitASCII(
const char* sxString,
EUnicodeFormat eFormat,
size_t nNumBytes
) {
// Ensure the string's size is known
EnsureSizeKnown(sxString, nNumBytes, eFormat);
// Inspect each character
bool bIs7BitASCII = true;
while (nNumBytes > 0 && bIs7BitASCII) {
bIs7BitASCII = *(unsigned char*)sxString < 0x80;
++sxString;
--nNumBytes;
}
// And return the verdict
return bIs7BitASCII;
}
// Returns if the given wide string only holds 7-bit ASCII characters; the
// complete std::wstring is inspected
bool StringIs7BitASCII(
  const std::wstring& sUnicode
) {
  const wchar_t* sChars = sUnicode.c_str();
  const size_t nNumChars = sUnicode.size();
  return StringIs7BitASCII(sChars, nNumChars);
}
// Returns if the first nNumChars wide characters of sUnicode are all below
// 0x80; when nNumChars is g_cnUnicodeSizeUnknown the string must be
// null-terminated and its length is measured.  An empty string yields true.
bool StringIs7BitASCII(
  const wchar_t* sUnicode,
  size_t nNumChars
) {
  // Ensure the string's size is known
  if (nNumChars == g_cnUnicodeSizeUnknown) {
    nNumChars = wcslen(sUnicode);
  }

  // Stop at the first character that falls outside the ASCII range
  for (size_t nCharNr = 0; nCharNr < nNumChars; ++nCharNr) {
    if (!(sUnicode[nCharNr] < 0x80)) {
      return false;
    }
  }

  // All characters passed
  return true;
}

Products
Overview
C_Unicode