The source files for this module are listed below. You can also download the module C_Unicode as a zip archive; this archive contains all the source files and documentation.
The module Unicode contains functions to convert ASCII, UTF8 and UTF16 strings from and to Windows NT Unicode.
The EncodeFromUnicode functions convert a Windows NT Unicode string into a string with another encoding, while the DecodeToUnicode functions perform the opposite action. The GetExternalStringFormat functions try to infer the source string's encoding by looking for a Unicode BOM; when none is found, they assume the encoding is the one you specify.
Each file belonging to this source code module is listed below.
/*******************************************************************************
Version: 2
Author: Carl Colijn, TwoLogs
Contact: c.colijn@twologs.com
Source: https://www.twologs.com/sourcecode
This code is freely distributable, as long as this comment remains intact.
If you find this source useful, you may use this code in your own projects
free of charge, but some acknowledgement to the author of this code is always
appreciated :)
The source is however distributed 'as is' without waranty and/or support, and
may not be fit for each and every application. Use it at your own discretion
and at your own risk.
The source already has undergone testing. This doesn't mean however that all
bugs are removed from this piece of code. If you find one of them, please
contact me about it. I can however not guarantee when and if the bug will be
fixed.
More information about this module can be found in the accompanying HTML file.
*******************************************************************************/
#ifndef INCLUDE_TWOLOGS_COMMON_UNICODE_H
#define INCLUDE_TWOLOGS_COMMON_UNICODE_H
#include <windows.h>
#include <string>
// The text encodings this module can convert to and from
// The numeric value of each enumerator is used as an index into g_boms
enum class TUnicodeFormat {
    ascii   = 0,
    utf8    = 1,
    utf16   = 2,  // Little endian
    utf16BE = 3   // Big endian
};
// BOM's for the different encodings
// Describes the byte order mark (BOM) of a single encoding
struct TBOMInfo {
// The raw BOM bytes; an empty string when the encoding has no BOM
const char* signature;
// The number of bytes in the signature
size_t size;
};
// One entry per encoding, indexed by the numeric value of a TUnicodeFormat
extern const TBOMInfo g_boms[];
// All format names
// The names are listed in the same order as the TUnicodeFormat enumerators
extern const wchar_t* g_unicodeEncodings[];
// Describes how a string is encoded outside of this module: which format it
// uses, and whether it starts with a byte order mark (BOM)
class TExternalUnicodeFormat {
    // The raw-pointer GetExternalStringFormat writes its detection results
    // straight into the private members
    friend TExternalUnicodeFormat GetExternalStringFormat(
        const char* stringRaw,
        TUnicodeFormat defaultFormat,
        size_t numBytes
    );

public:
    // Creates an ASCII format without a BOM
    TExternalUnicodeFormat();

    // Creates the given format, optionally with a BOM
    TExternalUnicodeFormat(TUnicodeFormat format, bool withBOM = false);

    // Switches to the given format, optionally with a BOM
    // This also resets the FormatDeduced flag
    void Set(TUnicodeFormat format, bool withBOM = false);

    // The encoding of the string
    TUnicodeFormat Format() const {
        return m_format;
    }

    // Whether the string starts with a BOM
    bool WithBOM() const {
        return m_withBOM;
    }

    // The number of bytes taken up by the leading BOM; 0 when there is none
    size_t BOMSize() const {
        return m_bomSize;
    }

    // Whether the format was recognized from the string's BOM
    bool FormatDeduced() const {
        return m_formatDeduced;
    }

private:
    // The encoding of the string
    TUnicodeFormat m_format;

    // Whether the string starts with a BOM
    bool m_withBOM;

    // The number of bytes taken up by the leading BOM
    size_t m_bomSize;

    // Whether the format was recognized from the string's BOM
    bool m_formatDeduced;
};
// Unknown string size
// Pass this as a size to indicate that the string is NUL-terminated; its
// length is then determined by scanning for the terminator
extern const size_t g_unicodeSizeUnknown;
// Encodes the given Unicode string to the given format
// The outcome of an encode operation
struct TEncodeFromUnicodeResult {
// Whether the conversion succeeded
bool allOK;
// The encoded bytes, including any requested BOM; empty when the conversion failed
std::string resultRaw;
// Whether the conversion had to substitute characters; this is only
// reported when encoding to ASCII and is false for all other formats
bool unknownCharsReplaced;
};
// Encodes a std::wstring; when withBOM is set the format's BOM is prepended
TEncodeFromUnicodeResult EncodeFromUnicode(
const std::wstring& unicode,
TUnicodeFormat format,
bool withBOM = false
);
// Encodes numChars wide characters; when numChars is left at
// g_unicodeSizeUnknown the string must be NUL-terminated
TEncodeFromUnicodeResult EncodeFromUnicode(
const wchar_t* unicode,
TUnicodeFormat format,
bool withBOM = false,
size_t numChars = g_unicodeSizeUnknown
);
// Encodes a std::wstring, taking the format and the BOM choice from the
// given external format
TEncodeFromUnicodeResult EncodeFromUnicode(
const std::wstring& unicode,
const TExternalUnicodeFormat& format
);
// Encodes numChars wide characters, taking the format and the BOM choice
// from the given external format
TEncodeFromUnicodeResult EncodeFromUnicode(
const wchar_t* unicode,
const TExternalUnicodeFormat& format,
size_t numChars = g_unicodeSizeUnknown
);
// Decodes the given string from the given format
// The outcome of a decode operation
struct TDecodeToUnicodeResult {
// Whether the conversion succeeded
bool allOK;
// The decoded text; empty when the conversion failed
std::wstring result;
};
// Decodes a complete std::string; no BOM is skipped
TDecodeToUnicodeResult DecodeToUnicode(
const std::string& stringRaw,
TUnicodeFormat format
);
// Decodes numBytes bytes; no BOM is skipped
// When numBytes is left at g_unicodeSizeUnknown the string must be
// NUL-terminated in the given format
TDecodeToUnicodeResult DecodeToUnicode(
const char* stringRaw,
TUnicodeFormat format,
size_t numBytes = g_unicodeSizeUnknown
);
// Decodes a complete std::string; the first format.BOMSize() bytes are
// skipped before decoding
TDecodeToUnicodeResult DecodeToUnicode(
const std::string& stringRaw,
const TExternalUnicodeFormat& format
);
// Decodes numBytes bytes; the first format.BOMSize() bytes are skipped
// before decoding
TDecodeToUnicodeResult DecodeToUnicode(
const char* stringRaw,
const TExternalUnicodeFormat& format,
size_t numBytes = g_unicodeSizeUnknown
);
// Tries to detect the format used for the given external string
// The format is inferred from the string itself via a BOM
// If no BOM is found then the given default format is assumed
// FormatDeduced() on the result tells whether a BOM was recognized
TExternalUnicodeFormat GetExternalStringFormat(
const std::string& stringRaw,
TUnicodeFormat defaultFormat
);
// Same, but for a raw buffer of numBytes bytes
TExternalUnicodeFormat GetExternalStringFormat(
const char* stringRaw,
TUnicodeFormat defaultFormat,
size_t numBytes = g_unicodeSizeUnknown
);
// Detects if the given string starts with a BOM for the given
// format, and returns the complete format with/without BOM indication
// Only the BOM of the given format is looked for
TExternalUnicodeFormat DetectBOM(
const std::string& stringRaw,
TUnicodeFormat format
);
// Same, but for a raw buffer of numBytes bytes
TExternalUnicodeFormat DetectBOM(
const char* stringRaw,
TUnicodeFormat format,
size_t numBytes = g_unicodeSizeUnknown
);
// Returns if the given text string can be represented as 7-bit ASCII
// Variant for an encoded std::string in the given format
bool StringIs7BitASCII(
const std::string& stringRaw,
TUnicodeFormat format
);
// Variant for an encoded raw buffer of numBytes bytes; when numBytes is left
// at g_unicodeSizeUnknown the string must be NUL-terminated in the given format
bool StringIs7BitASCII(
const char* stringRaw,
TUnicodeFormat format,
size_t numBytes = g_unicodeSizeUnknown
);
// Variant for a Unicode std::wstring
bool StringIs7BitASCII(
const std::wstring& unicode
);
// Variant for numChars wide characters; when numChars is left at
// g_unicodeSizeUnknown the string must be NUL-terminated
bool StringIs7BitASCII(
const wchar_t* unicode,
size_t numChars = g_unicodeSizeUnknown
);
// UTF8 char size
// Works on all code points 0-10FFFF (full Unicode range)
// Yields the number of bytes (1-4) of the UTF-8 sequence whose lead byte is
// pointed to. The upper four bits of the lead byte select a 2-bit field in
// the constant 0xE5000000; that field holds the sequence length minus one.
// Continuation bytes (0x80-0xBF) yield 1, so a stray one is stepped over
// byte by byte.
// The byte is read as unsigned char so no negative value is ever shifted, and
// the expansion is fully parenthesized so it is safe in any expression.
#define POINTEDTOUTF8CHARSIZE(utf8CharPtr) \
    (((0xE5000000u >> ((static_cast<unsigned char>(*(utf8CharPtr)) >> 3) & 0x1e)) & 3) + 1)

// Returns a pointer to the character following the pointed-to UTF8 character,
// assuming the encoding is not corrupt
inline const char* NextUTF8Char(const char* utf8CharPtr) {
    return utf8CharPtr + POINTEDTOUTF8CHARSIZE(utf8CharPtr);
}
inline char* NextUTF8Char(char* utf8CharPtr) {
    return utf8CharPtr + POINTEDTOUTF8CHARSIZE(utf8CharPtr);
}

// Advances the given UTF8 text pointer one character in place, assuming the
// encoding is not corrupt
inline void AdvanceUTF8Char(const char*& utf8CharPtr) {
    utf8CharPtr += POINTEDTOUTF8CHARSIZE(utf8CharPtr);
}
inline void AdvanceUTF8Char(char*& utf8CharPtr) {
    utf8CharPtr += POINTEDTOUTF8CHARSIZE(utf8CharPtr);
}

// Leave no mess
#undef POINTEDTOUTF8CHARSIZE
#endif // INCLUDE_TWOLOGS_COMMON_UNICODE_H
/*******************************************************************************
Version: 2
Author: Carl Colijn, TwoLogs
Contact: c.colijn@twologs.com
Source: https://www.twologs.com/sourcecode
This code is freely distributable, as long as this comment remains intact.
If you find this source useful, you may use this code in your own projects
free of charge, but some acknowledgement to the author of this code is always
appreciated :)
The source is however distributed 'as is' without waranty and/or support, and
may not be fit for each and every application. Use it at your own discretion
and at your own risk.
The source already has undergone testing. This doesn't mean however that all
bugs are removed from this piece of code. If you find one of them, please
contact me about it. I can however not guarantee when and if the bug will be
fixed.
More information about this module can be found in the accompanying HTML file.
*******************************************************************************/
#include "Unicode.h"
// All format names
// The names are listed in the same order as the TUnicodeFormat enumerators
// (ascii, utf8, utf16, utf16BE)
const wchar_t* g_unicodeEncodings[] = {
L"ASCII",
L"UTF-8",
L"UTF-16",
L"UTF-16 BE"
};
// BOM's for the different encodings
// Indexed by the numeric value of a TUnicodeFormat; each entry holds the raw
// BOM bytes and their count
const TBOMInfo g_boms[] = {
// ascii: no BOM
{"", 0},
// utf8
{"\xEF\xBB\xBF", 3},
// utf16 (little endian)
{"\xFF\xFE", 2},
// utf16BE (big endian)
{"\xFE\xFF", 2}
};
// Sentinel for "size not known": the largest value a size_t can hold
const size_t g_unicodeSizeUnknown = static_cast<size_t>(-1);
// Default constructor: ASCII without a BOM, format not deduced
// Delegates to the two-argument constructor, which yields exactly that state
// when no BOM is requested
TExternalUnicodeFormat::TExternalUnicodeFormat():
    TExternalUnicodeFormat(TUnicodeFormat::ascii, false) {
}
// Creates the given format; the BOM size is looked up in g_boms when a BOM
// is present and is zero otherwise
TExternalUnicodeFormat::TExternalUnicodeFormat(TUnicodeFormat format, bool withBOM):
    m_format(format),
    m_withBOM(withBOM),
    m_bomSize(withBOM ? g_boms[static_cast<long>(format)].size : 0),
    m_formatDeduced(false) {
}
// Replaces the format and BOM indication
// An explicitly set format never counts as deduced
void TExternalUnicodeFormat::Set(TUnicodeFormat format, bool withBOM) {
    m_format = format;
    m_withBOM = withBOM;
    m_formatDeduced = false;

    // Only a format with a BOM occupies leading bytes
    m_bomSize = withBOM ? g_boms[static_cast<long>(format)].size : 0;
}
// Fills in numBytes when it still holds g_unicodeSizeUnknown, by measuring
// the NUL-terminated string in a way that fits its format
// A size that is already known is left untouched
void EnsureSizeKnown(const char* stringRaw, size_t& numBytes, TUnicodeFormat format) {
    // Nothing to do when the caller already supplied the size
    if (numBytes != g_unicodeSizeUnknown) {
        return;
    }

    const bool isByteOriented =
        format == TUnicodeFormat::ascii || format == TUnicodeFormat::utf8;
    const bool isUTF16 =
        format == TUnicodeFormat::utf16 || format == TUnicodeFormat::utf16BE;
    if (isByteOriented) {
        // Terminated by a single NUL byte
        numBytes = strlen(stringRaw);
    } else if (isUTF16) {
        // Terminated by a NUL wide character
        const wchar_t* wideString = reinterpret_cast<const wchar_t*>(stringRaw);
        numBytes = wcslen(wideString) * sizeof(wchar_t);
    }
}
// Copies over the buffer but toggles the endianness
// Every 2-byte unit of sourceRaw is written to destRaw with its two bytes
// swapped.
//   sourceRaw - the bytes to copy
//   destRaw   - receives the swapped bytes; may be the same buffer as
//               sourceRaw for an in-place swap
//   numBytes  - the number of bytes available in both buffers
// Only complete 2-byte units are processed: when numBytes is odd the dangling
// last byte is neither read nor written, so the function never touches memory
// beyond numBytes.
void CopyToggleEndiannes(
    const char* sourceRaw,
    char* destRaw,
    size_t numBytes
) {
    const size_t numPairedBytes = numBytes - (numBytes % 2);
    for (size_t byteNr = 0; byteNr < numPairedBytes; byteNr += 2) {
        // Read the first byte before writing, so an in-place swap works too
        const char firstByte = sourceRaw[byteNr];
        destRaw[byteNr] = sourceRaw[byteNr + 1];
        destRaw[byteNr + 1] = firstByte;
    }
}
// Encodes a std::wstring to the given format
// Convenience overload that hands the string's characters and length to the
// raw-pointer variant
TEncodeFromUnicodeResult EncodeFromUnicode(
    const std::wstring& unicode,
    TUnicodeFormat format,
    bool withBOM
) {
    const wchar_t* const sourceChars = unicode.c_str();
    const size_t numSourceChars = unicode.size();
    return EncodeFromUnicode(sourceChars, format, withBOM, numSourceChars);
}
// Encodes the given Unicode string to the given format
//   unicode  - the wide characters to encode
//   format   - the target encoding
//   withBOM  - whether to prepend the target encoding's BOM
//   numChars - the number of wide characters; when g_unicodeSizeUnknown the
//              string must be NUL-terminated
// Returns the encoded bytes plus status flags. On failure allOK is false and
// resultRaw is empty. unknownCharsReplaced is only reported for ASCII output.
// ASCII output uses the thread's ANSI code page (CP_THREAD_ACP).
TEncodeFromUnicodeResult EncodeFromUnicode(
    const wchar_t* unicode,
    TUnicodeFormat format,
    bool withBOM,
    size_t numChars
) {
    // Determine the size of the source
    if (numChars == g_unicodeSizeUnknown) {
        numChars = wcslen(unicode);
    }

    // Get info on the BOM to add
    const char* bom = "";
    size_t bomSize = 0;
    if (withBOM) {
        const TBOMInfo& bomInfo = g_boms[static_cast<long>(format)];
        bom = bomInfo.signature;
        bomSize = bomInfo.size;
    }

    TEncodeFromUnicodeResult result;
    result.allOK = true;
    result.unknownCharsReplaced = false;

    // Look if to do any conversion
    if (numChars == 0) {
        // No -> we're done with an empty string, apart from any BOM
        result.resultRaw.assign(bom, bomSize);
        return result;
    }

    // Yes -> look what to convert to
    switch (format) {
        case TUnicodeFormat::ascii:
        case TUnicodeFormat::utf8: {
            // To ASCII or UTF8 -> prepare a large enough working buffer
            size_t bufferSize = numChars * 4 + 5;
            result.resultRaw.resize(bufferSize);

            // Add the BOM, if needed
            // The buffer is non-empty here, so element 0 can be addressed
            char* resultRawPos = &result.resultRaw[0];
            if (bomSize > 0) {
                memcpy(resultRawPos, bom, bomSize);
                resultRawPos += bomSize;
                bufferSize -= bomSize;
            }

            // Convert the string itself
            // The flag is initialized up front, since it is read below even
            // when the API fails without writing to it
            const bool toUTF8 = format == TUnicodeFormat::utf8;
            BOOL unknownCharsReplaced = FALSE;
            const int numBytesWritten = WideCharToMultiByte(
                toUTF8 ? CP_UTF8 : CP_THREAD_ACP,
                0,
                unicode,
                static_cast<int>(numChars),
                resultRawPos,
                static_cast<int>(bufferSize),
                nullptr,
                // The used-default-char flag is only requested for ASCII
                toUTF8 ? nullptr : &unknownCharsReplaced
            );
            if (!toUTF8) {
                result.unknownCharsReplaced = unknownCharsReplaced != FALSE;
            }
            result.allOK = numBytesWritten != 0;
            if (result.allOK) {
                // Done -> use the correct string size
                result.resultRaw.resize(numBytesWritten + bomSize);
            } else {
                // Error -> no output
                result.resultRaw.clear();
            }
            break;
        }

        case TUnicodeFormat::utf16:
        case TUnicodeFormat::utf16BE: {
            // UTF16 LE or BE -> prepare the output string
            // This relies on wchar_t holding UTF-16 LE code units, as it
            // does on Windows
            const size_t numBytes = numChars * sizeof(wchar_t);
            result.resultRaw.resize(numBytes + bomSize);

            // Add the BOM, if needed
            char* resultRawPos = &result.resultRaw[0];
            if (bomSize > 0) {
                memcpy(resultRawPos, bom, bomSize);
                resultRawPos += bomSize;
            }

            // And look how to copy over the string itself
            if (format == TUnicodeFormat::utf16) {
                // As-is
                memcpy(resultRawPos, unicode, numBytes);
            } else {
                // Endianness-reversed
                CopyToggleEndiannes(
                    reinterpret_cast<const char*>(unicode),
                    resultRawPos,
                    numBytes
                );
            }
            break;
        }
    }

    // And return the result
    return result;
}
// Encodes a std::wstring, taking the target encoding and the BOM choice from
// the given external format
TEncodeFromUnicodeResult EncodeFromUnicode(
    const std::wstring& unicode,
    const TExternalUnicodeFormat& format
) {
    const TUnicodeFormat targetFormat = format.Format();
    const bool addBOM = format.WithBOM();
    return EncodeFromUnicode(unicode.c_str(), targetFormat, addBOM, unicode.size());
}
// Encodes numChars wide characters, taking the target encoding and the BOM
// choice from the given external format
TEncodeFromUnicodeResult EncodeFromUnicode(
    const wchar_t* unicode,
    const TExternalUnicodeFormat& format,
    size_t numChars
) {
    const TUnicodeFormat targetFormat = format.Format();
    const bool addBOM = format.WithBOM();
    return EncodeFromUnicode(unicode, targetFormat, addBOM, numChars);
}
// Decodes the given string from the given format
// Shared implementation behind the public DecodeToUnicode overloads.
//   stringRaw - the encoded bytes, starting with the BOM when bomSize > 0
//   numBytes  - the total number of bytes including the BOM, or
//               g_unicodeSizeUnknown for a NUL-terminated string
//   format    - the encoding of the string
//   withBOM   - unused; the BOM is fully described by bomSize
//   bomSize   - the number of leading BOM bytes to skip
// Returns the decoded text and a success flag. On failure the text is empty
// and GetLastError() holds the reason.
// ASCII input is decoded with the thread's ANSI code page and fails on invalid
// characters; invalid UTF8 sequences are not rejected (no
// MB_ERR_INVALID_CHARS). For UTF16 input a dangling odd trailing byte is
// ignored.
TDecodeToUnicodeResult DecodeToUnicode(
    const char* stringRaw,
    size_t numBytes,
    TUnicodeFormat format,
    bool withBOM,
    size_t bomSize
) {
    TDecodeToUnicodeResult result;
    result.allOK = true;

    // Get any BOM out of the way
    if (numBytes != g_unicodeSizeUnknown) {
        // The size is known -> it must at least cover the announced BOM;
        // subtracting blindly would wrap around to a huge size
        if (numBytes < bomSize) {
            result.allOK = false;
            SetLastError(ERROR_INVALID_PARAMETER);
            return result;
        }
        stringRaw += bomSize;
        numBytes -= bomSize;
    } else {
        // The size is unknown -> measure what follows the BOM
        stringRaw += bomSize;
        EnsureSizeKnown(stringRaw, numBytes, format);
    }

    // Look if there is anything to convert
    if (numBytes == 0) {
        // No -> we're done with an empty string
        return result;
    }

    // Yes -> look what to convert from
    switch (format) {
        case TUnicodeFormat::ascii:
        case TUnicodeFormat::utf8: {
            // ASCII or UTF8 -> prepare the conversion buffer
            const bool fromUTF8 = format == TUnicodeFormat::utf8;
            const size_t maxNumCharsNeeded = numBytes * 3 + 1;
            result.result.resize(maxNumCharsNeeded);

            // Do the conversion
            const int numCharsWritten = MultiByteToWideChar(
                fromUTF8 ? CP_UTF8 : CP_THREAD_ACP,
                fromUTF8 ? 0 : MB_ERR_INVALID_CHARS,
                stringRaw,
                static_cast<int>(numBytes),
                &result.result[0],
                static_cast<int>(maxNumCharsNeeded)
            );

            // Look if it succeeded; the API signals failure by returning 0
            // The returned count is never an error code, so it must not be
            // compared against one
            result.allOK = numCharsWritten != 0;
            if (result.allOK) {
                // Done -> truncate the buffer
                result.result.resize(numCharsWritten);
            } else {
                // Error -> no result, and keep GetLastError useful by
                // restoring the API's error code after the cleanup
                const DWORD conversionError = GetLastError();
                result.result.clear();
                SetLastError(conversionError);
            }
            break;
        }

        case TUnicodeFormat::utf16:
        case TUnicodeFormat::utf16BE: {
            // UTF16 -> only copy whole code units, so an odd byte count
            // cannot overrun the result buffer
            const size_t numUnits = numBytes / sizeof(wchar_t);
            const size_t numWholeBytes = numUnits * sizeof(wchar_t);
            result.result.resize(numUnits);
            if (numUnits > 0) {
                char* destRaw = reinterpret_cast<char*>(&result.result[0]);
                if (format == TUnicodeFormat::utf16) {
                    // UTF16 LE -> just copy over the string as-is
                    memcpy(destRaw, stringRaw, numWholeBytes);
                } else {
                    // UTF16 BE -> copy over the string endianness-reversed
                    CopyToggleEndiannes(stringRaw, destRaw, numWholeBytes);
                }
            }
            break;
        }
    }

    // And return the result
    return result;
}
// Decodes a complete std::string from the given format
// The string is taken to carry no BOM, so nothing is skipped
TDecodeToUnicodeResult DecodeToUnicode(
    const std::string& stringRaw,
    TUnicodeFormat format
) {
    const bool hasBOM = false;
    const size_t noBOMSize = 0;
    return DecodeToUnicode(stringRaw.c_str(), stringRaw.size(), format, hasBOM, noBOMSize);
}
// Decodes numBytes raw bytes from the given format
// The buffer is taken to carry no BOM, so nothing is skipped
TDecodeToUnicodeResult DecodeToUnicode(
    const char* stringRaw,
    TUnicodeFormat format,
    size_t numBytes
) {
    const bool hasBOM = false;
    const size_t noBOMSize = 0;
    return DecodeToUnicode(stringRaw, numBytes, format, hasBOM, noBOMSize);
}
// Decodes a complete std::string described by the given external format
// The format's BOM size tells how many leading bytes are skipped
TDecodeToUnicodeResult DecodeToUnicode(
    const std::string& stringRaw,
    const TExternalUnicodeFormat& format
) {
    const char* const rawBytes = stringRaw.c_str();
    const size_t numRawBytes = stringRaw.size();
    return DecodeToUnicode(
        rawBytes, numRawBytes, format.Format(), format.WithBOM(), format.BOMSize()
    );
}
// Decodes numBytes raw bytes described by the given external format
// The format's BOM size tells how many leading bytes are skipped
TDecodeToUnicodeResult DecodeToUnicode(
    const char* stringRaw,
    const TExternalUnicodeFormat& format,
    size_t numBytes
) {
    const TUnicodeFormat sourceFormat = format.Format();
    const bool hasBOM = format.WithBOM();
    const size_t numBOMBytes = format.BOMSize();
    return DecodeToUnicode(stringRaw, numBytes, sourceFormat, hasBOM, numBOMBytes);
}
// Detects the format of the given external std::string by its BOM
// Falls back to the given default format when no BOM is recognized
// Convenience overload that forwards the string's bytes and length
TExternalUnicodeFormat GetExternalStringFormat(
    const std::string& stringRaw,
    TUnicodeFormat defaultFormat
) {
    const char* const rawBytes = stringRaw.c_str();
    const size_t numRawBytes = stringRaw.size();
    return GetExternalStringFormat(rawBytes, defaultFormat, numRawBytes);
}
// Tries to detect the format used for the given external string
// The format is inferred from the string itself via a BOM
// If no BOM is found then the given default format is assumed
//   stringRaw     - the raw bytes to inspect
//   defaultFormat - the format to report when no BOM is recognized
//   numBytes      - the number of bytes, or g_unicodeSizeUnknown for a
//                   NUL-terminated string
// Returns the detected format with its BOM size and FormatDeduced() set, or
// the default format without BOM and with FormatDeduced() cleared.
TExternalUnicodeFormat GetExternalStringFormat(
    const char* stringRaw,
    TUnicodeFormat defaultFormat,
    size_t numBytes
) {
    // Assume we won't recognize the format
    TExternalUnicodeFormat result(defaultFormat, false);

    // Look if any of the BOM's match
    const long firstFormat = static_cast<long>(TUnicodeFormat::ascii);
    const long lastFormat = static_cast<long>(TUnicodeFormat::utf16BE);
    for (
        long format = firstFormat;
        format <= lastFormat && !result.m_formatDeduced;
        ++format
    ) {
        // Skip formats without a BOM, and BOM's longer than the string
        const TBOMInfo& bomInfo = g_boms[format];
        if (bomInfo.size == 0 || bomInfo.size > numBytes) {
            continue;
        }

        // Compare byte by byte and stop at the first mismatch
        // No BOM contains a NUL byte, so for a NUL-terminated string of
        // unknown size the comparison ends at the terminator at the latest
        // and never reads beyond the string
        size_t numMatching = 0;
        while (
            numMatching < bomInfo.size &&
            stringRaw[numMatching] == bomInfo.signature[numMatching]
        ) {
            ++numMatching;
        }
        if (numMatching == bomInfo.size) {
            // It matches -> note the format
            result.m_format = static_cast<TUnicodeFormat>(format);
            result.m_withBOM = true;
            result.m_bomSize = bomInfo.size;
            result.m_formatDeduced = true;
        }
    }

    // And return the result
    return result;
}
// Checks whether the given std::string starts with the BOM of the given
// format, and returns that format with the matching BOM indication
// Convenience overload that forwards the string's bytes and length
TExternalUnicodeFormat DetectBOM(
    const std::string& stringRaw,
    TUnicodeFormat format
) {
    const char* const rawBytes = stringRaw.c_str();
    const size_t numRawBytes = stringRaw.size();
    return DetectBOM(rawBytes, format, numRawBytes);
}
// Detects if the given string starts with a BOM for the given
// format, and returns the complete format with/without BOM indication
//   stringRaw - the raw bytes to inspect
//   format    - the format whose BOM is looked for
//   numBytes  - the number of bytes, or g_unicodeSizeUnknown for a
//               NUL-terminated string
TExternalUnicodeFormat DetectBOM(
    const char* stringRaw,
    TUnicodeFormat format,
    size_t numBytes
) {
    // Look if the string is long enough to contain this format's BOM
    const TBOMInfo& bomInfo = g_boms[static_cast<long>(format)];
    bool detectedBOM = false;
    if (bomInfo.size > 0 && bomInfo.size <= numBytes) {
        // Yes -> compare byte by byte and stop at the first mismatch
        // No BOM contains a NUL byte, so for a NUL-terminated string of
        // unknown size the comparison ends at the terminator at the latest
        // and never reads beyond the string
        size_t numMatching = 0;
        while (
            numMatching < bomInfo.size &&
            stringRaw[numMatching] == bomInfo.signature[numMatching]
        ) {
            ++numMatching;
        }
        detectedBOM = numMatching == bomInfo.size;
    }

    // And return the result
    return TExternalUnicodeFormat(format, detectedBOM);
}
// Tells whether the given encoded std::string can be represented as 7-bit ASCII
// Convenience overload that forwards the string's bytes and length
bool StringIs7BitASCII(
    const std::string& stringRaw,
    TUnicodeFormat format
) {
    const char* const rawBytes = stringRaw.c_str();
    const size_t numRawBytes = stringRaw.size();
    return StringIs7BitASCII(rawBytes, format, numRawBytes);
}
// Returns if the given text string can be represented as 7-bit ASCII
//   stringRaw - the encoded bytes, without BOM
//   format    - the encoding of the bytes
//   numBytes  - the number of bytes, or g_unicodeSizeUnknown for a string
//               that is NUL-terminated in the given format
// For ASCII and UTF8 every byte must be below 0x80. For UTF16 every 16-bit
// code unit must be below 0x80; checking the bytes on their own is not
// enough there, as e.g. U+0100 consists of the bytes 0x00 and 0x01.
bool StringIs7BitASCII(
    const char* stringRaw,
    TUnicodeFormat format,
    size_t numBytes
) {
    // Ensure the string's size is known
    EnsureSizeKnown(stringRaw, numBytes, format);

    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(stringRaw);
    const bool isUTF16 =
        format == TUnicodeFormat::utf16 || format == TUnicodeFormat::utf16BE;
    if (!isUTF16) {
        // Byte-oriented format -> inspect each byte
        for (size_t byteNr = 0; byteNr < numBytes; ++byteNr) {
            if (bytes[byteNr] >= 0x80) {
                return false;
            }
        }
        return true;
    }

    // UTF16 -> inspect each code unit; the position of the low and high
    // byte within a unit depends on the endianness
    const size_t lowByteOffset = format == TUnicodeFormat::utf16 ? 0 : 1;
    const size_t highByteOffset = 1 - lowByteOffset;
    size_t byteNr = 0;
    for (; numBytes - byteNr >= 2; byteNr += 2) {
        if (bytes[byteNr + highByteOffset] != 0 || bytes[byteNr + lowByteOffset] >= 0x80) {
            return false;
        }
    }

    // A dangling odd byte cannot form a code unit; it is still required to
    // be below 0x80, as it was before
    if (byteNr < numBytes && bytes[byteNr] >= 0x80) {
        return false;
    }

    // And return the verdict
    return true;
}
// Tells whether the given Unicode std::wstring can be represented as 7-bit ASCII
// Convenience overload that forwards the string's characters and length
bool StringIs7BitASCII(
    const std::wstring& unicode
) {
    const wchar_t* const wideChars = unicode.c_str();
    const size_t numWideChars = unicode.size();
    return StringIs7BitASCII(wideChars, numWideChars);
}
// Tells whether the given wide characters can be represented as 7-bit ASCII
//   unicode  - the wide characters to inspect
//   numChars - the number of characters, or g_unicodeSizeUnknown for a
//              NUL-terminated string
bool StringIs7BitASCII(
    const wchar_t* unicode,
    size_t numChars
) {
    // Measure the string when the caller did not supply its size
    const size_t numCharsToCheck =
        numChars == g_unicodeSizeUnknown ? wcslen(unicode) : numChars;

    // The first character outside the 7-bit range settles the matter
    for (size_t charNr = 0; charNr < numCharsToCheck; ++charNr) {
        if (!(unicode[charNr] < 0x80)) {
            return false;
        }
    }
    return true;
}