Skip navigation links
IT services and product development
Menu
TwoLogs
IT services and product development

Source code for C_Unicode

Download

The source files for this module are listed below.  You can also download the module C_Unicode as a zip archive; this archive contains all the source files and documentation.

Description

The module Unicode contains functions to convert ASCII, UTF8 and UTF16 strings from and to Windows NT Unicode.

Information

The EncodeFromUnicode functions convert a Windows NT Unicode string into a string with another encoding, while the DecodeToUnicode functions perform the opposite action.  GetExternalStringFormat tries to infer an external string's encoding by looking for a Unicode BOM; when no BOM is found, it assumes the encoding is the default one you specify.  DetectBOM only checks whether the string starts with the BOM of one given encoding.

Files

Each file belonging to this source code module is listed below.

Unicode.h

/*******************************************************************************

  Version: 2
  Author:  Carl Colijn, TwoLogs
  Contact: c.colijn@twologs.com
  Source:  http://www.twologs.com/sourcecode

  This code is freely distributable, as long as this comment remains intact.
  If you find this source useful, you may use this code in your own projects
  free of charge, but some acknowledgement to the author of this code is always
  appreciated :)
  The source is however distributed 'as is' without waranty and/or support, and
  may not be fit for each and every application.  Use it at your own discretion
  and at your own risk.
  The source already has undergone testing.  This doesn't mean however that all
  bugs are removed from this piece of code.  If you find one of them, please
  contact me about it.  I can however not guarantee when and if the bug will be
  fixed.

  More information about this module can be found in the accompanying HTML file.

*******************************************************************************/

#ifndef INCLUDE_TWOLOGS_COMMON_UNICODE_H
#define INCLUDE_TWOLOGS_COMMON_UNICODE_H

#include <windows.h>
#include <string>

// All supported unicode formats
// The numeric value of each enumerator is used as an index into g_aoBOMs, so
// the order here has to match the order of that table.
enum class EUnicodeFormat {
  // Converted via the thread's ANSI code page (CP_THREAD_ACP) by the
  // encode/decode functions
  eASCII = 0,
  // UTF-8
  eUTF8,
  // UTF-16 in the byte order of wchar_t (copied as-is to and from wchar_t)
  eUTF16,
  // UTF-16 with the two bytes of every code unit swapped
  eUTF16BE
};

// BOM's for the different encodings
struct CBOMInfo {
  // The raw BOM bytes
  const char* sSignature;
  // The number of bytes in sSignature; 0 means the format has no BOM
  size_t nSize;
};
// One entry per EUnicodeFormat value, indexed by the enumerator's numeric value
extern const CBOMInfo g_aoBOMs[];

// All format names
// Display names of the formats, listed in the same order as the EUnicodeFormat
// enumerators (presumably meant to be indexed by the enumerator value)
extern const wchar_t* g_casUnicodeEncodings[];

// External unicode format
// Describes how an external (byte) string is encoded: the format itself,
// whether the string starts with a BOM, and whether that information was
// deduced from the string's content.
class CExternalUnicodeFormat {
public:
  // Creates a BOM-less ASCII format that is marked as not deduced
  CExternalUnicodeFormat();

  // Creates the given format; when bWithBOM is set the BOM size is looked up
  // for that format.  The result is marked as not deduced.
  CExternalUnicodeFormat(EUnicodeFormat eFormat, bool bWithBOM = false);

  // Replaces the format and BOM indication and resets the deduced flag
  void Set(EUnicodeFormat eFormat, bool bWithBOM = false);

  // The format in use
  EUnicodeFormat Format() const {
    return m_eFormat;
  }

  // Whether the string carries a leading BOM
  bool WithBOM() const {
    return m_bWithBOM;
  }

  // The size of any leading BOM, in bytes (0 without BOM)
  size_t BOMSize() const {
    return m_nBOMSize;
  }

  // Whether the format could be deduced from the string itself
  bool FormatDeduced() const {
    return m_bFormatDeduced;
  }

private:
  // The detection routine writes its findings straight into the members
  friend CExternalUnicodeFormat GetExternalStringFormat(
    const char* sxString,
    EUnicodeFormat eDefaultFormat,
    size_t nNumBytes
  );

  // The format in use
  EUnicodeFormat m_eFormat;

  // Whether a BOM is present
  bool m_bWithBOM;

  // The size of any leading BOM, in bytes
  size_t m_nBOMSize;

  // Whether the format could be deduced
  bool m_bFormatDeduced;
};

// Unknown string size
// Sentinel for the size parameters below: when a function receives this value
// it measures the (then necessarily null-terminated) string itself.
extern const size_t g_cnUnicodeSizeUnknown;

// Encodes the given Unicode string to the given format
// (this struct is the result type of the EncodeFromUnicode overloads)
struct CEncodeFromUnicodeResult {
  // Whether the conversion succeeded
  bool bSuccess;
  // The encoded bytes, including the BOM when one was requested
  std::string sxResult;
  // Whether the conversion reported that it substituted a default character;
  // only filled in for eASCII output, false for the other formats
  bool bUnknownCharsReplaced;
};
// Encodes a std::wstring to the given format, optionally prefixed with a BOM
CEncodeFromUnicodeResult EncodeFromUnicode(
  const std::wstring& sUnicode,
  EUnicodeFormat eFormat,
  bool bWithBOM = false
);
// Encodes nNumChars wchar_t's starting at sUnicode; when nNumChars is left at
// g_cnUnicodeSizeUnknown the string must be null-terminated
CEncodeFromUnicodeResult EncodeFromUnicode(
  const wchar_t* sUnicode,
  EUnicodeFormat eFormat,
  bool bWithBOM = false,
  size_t nNumChars = g_cnUnicodeSizeUnknown
);
// Same as above, but takes the format and the BOM indication from oFormat
CEncodeFromUnicodeResult EncodeFromUnicode(
  const std::wstring& sUnicode,
  const CExternalUnicodeFormat& oFormat
);
CEncodeFromUnicodeResult EncodeFromUnicode(
  const wchar_t* sUnicode,
  const CExternalUnicodeFormat& oFormat,
  size_t nNumChars = g_cnUnicodeSizeUnknown
);

// Decodes the given string from the given format
// (this struct is the result type of the DecodeToUnicode overloads)
struct CDecodeToUnicodeResult {
  // Whether the conversion succeeded
  bool bSuccess;
  // The decoded text; empty when the conversion failed
  std::wstring sResult;
};
// Decodes a std::string holding text in the given format; no BOM is skipped
CDecodeToUnicodeResult DecodeToUnicode(
  const std::string& sxString,
  EUnicodeFormat eFormat
);
// Decodes nNumBytes bytes starting at sxString; when nNumBytes is left at
// g_cnUnicodeSizeUnknown the string must be null-terminated.  No BOM is skipped.
CDecodeToUnicodeResult DecodeToUnicode(
  const char* sxString,
  EUnicodeFormat eFormat,
  size_t nNumBytes = g_cnUnicodeSizeUnknown
);
// Same as above, but takes the format from oFormat and skips oFormat.BOMSize()
// leading bytes before decoding
CDecodeToUnicodeResult DecodeToUnicode(
  const std::string& sxString,
  const CExternalUnicodeFormat& oFormat
);
CDecodeToUnicodeResult DecodeToUnicode(
  const char* sxString,
  const CExternalUnicodeFormat& oFormat,
  size_t nNumBytes = g_cnUnicodeSizeUnknown
);

// Tries to detect the format used for the given external string
// The format is inferred from the string itself via a BOM
// If no BOM is found then the given default format is assumed
// FormatDeduced() on the result tells whether a BOM was recognized
CExternalUnicodeFormat GetExternalStringFormat(
  const std::string& sxString,
  EUnicodeFormat eDefaultFormat
);
// Pointer-based variant; nNumBytes is the size of the string in bytes
CExternalUnicodeFormat GetExternalStringFormat(
  const char* sxString,
  EUnicodeFormat eDefaultFormat,
  size_t nNumBytes = g_cnUnicodeSizeUnknown
);

// Detects if the given string starts with a BOM for the given
// format, and returns the complete format with/without BOM indication
// Only the BOM of eFormat is considered; FormatDeduced() on the result is
// always false.
CExternalUnicodeFormat DetectBOM(
  const std::string& sxString,
  EUnicodeFormat eFormat
);
// Pointer-based variant; nNumBytes is the size of the string in bytes
CExternalUnicodeFormat DetectBOM(
  const char* sxString,
  EUnicodeFormat eFormat,
  size_t nNumBytes = g_cnUnicodeSizeUnknown
);

// Returns if the given text string can be represented as 7-bit ASCII
// External (byte) string in the given format
bool StringIs7BitASCII(
  const std::string& sxString,
  EUnicodeFormat eFormat
);
// Pointer-based variant; when nNumBytes is left at g_cnUnicodeSizeUnknown the
// string must be null-terminated
bool StringIs7BitASCII(
  const char* sxString,
  EUnicodeFormat eFormat,
  size_t nNumBytes = g_cnUnicodeSizeUnknown
);
// Unicode (wchar_t) string
bool StringIs7BitASCII(
  const std::wstring& sUnicode
);
// Pointer-based variant; when nNumChars is left at g_cnUnicodeSizeUnknown the
// string must be null-terminated
bool StringIs7BitASCII(
  const wchar_t* sUnicode,
  size_t nNumChars = g_cnUnicodeSizeUnknown
);

// UTF8 char size
// Works on all code points 0-10FFFF (full Unicode range)
// The size is read from a packed lookup word: the four highest bits of the
// lead byte select a two-bit entry in 0xE5000000 that holds (size - 1).  Lead
// bytes 0xxxxxxx and 10xxxxxx give 1, 110xxxxx gives 2, 1110xxxx gives 3 and
// 1111xxxx gives 4.
// This is a typed inline function rather than a macro, so the argument is
// evaluated once, the expression cannot be split by operator precedence at the
// call site, and the lead byte is read as unsigned (no shift of a negative
// value).
namespace UnicodeDetail {
inline size_t PointedToUTF8CharSize(const char* psu8Char) {
  const unsigned int nLeadByte = static_cast<unsigned char>(*psu8Char);
  return ((0xE5000000u >> ((nLeadByte >> 3) & 0x1e)) & 3) + 1;
}
} // namespace UnicodeDetail

// Advances the given UTF8 text pointer one character, assuming the encoding is not corrupt
// The NextUTF8Char variants return the advanced pointer, the AdvanceUTF8Char
// variants update the pointer in place.  Only the lead byte is inspected, so
// the caller has to make sure the pointer is not at the end of the text.
inline const char* NextUTF8Char(const char* psu8Char) {
  return psu8Char + UnicodeDetail::PointedToUTF8CharSize(psu8Char);
}
inline char* NextUTF8Char(char* psu8Char) {
  return psu8Char + UnicodeDetail::PointedToUTF8CharSize(psu8Char);
}
inline void AdvanceUTF8Char(const char*& psu8Char) {
  psu8Char += UnicodeDetail::PointedToUTF8CharSize(psu8Char);
}
inline void AdvanceUTF8Char(char*& psu8Char) {
  psu8Char += UnicodeDetail::PointedToUTF8CharSize(psu8Char);
}

#endif // INCLUDE_TWOLOGS_COMMON_UNICODE_H

Unicode.cpp

/*******************************************************************************

  Version: 2
  Author:  Carl Colijn, TwoLogs
  Contact: c.colijn@twologs.com
  Source:  http://www.twologs.com/sourcecode

  This code is freely distributable, as long as this comment remains intact.
  If you find this source useful, you may use this code in your own projects
  free of charge, but some acknowledgement to the author of this code is always
  appreciated :)
  The source is however distributed 'as is' without waranty and/or support, and
  may not be fit for each and every application.  Use it at your own discretion
  and at your own risk.
  The source already has undergone testing.  This doesn't mean however that all
  bugs are removed from this piece of code.  If you find one of them, please
  contact me about it.  I can however not guarantee when and if the bug will be
  fixed.

  More information about this module can be found in the accompanying HTML file.

*******************************************************************************/

#include "Unicode.h"

// All format names
// Listed in the same order as the EUnicodeFormat enumerators; keep both in
// sync when adding a format.
const wchar_t* g_casUnicodeEncodings[] = {
  L"ASCII",
  L"UTF-8",
  L"UTF-16",
  L"UTF-16 BE"
};

// BOM's for the different encodings
// Indexed by the numeric value of EUnicodeFormat, so the entries have to stay
// in enumerator order.  A size of 0 means the format has no BOM.
const CBOMInfo g_aoBOMs[] = {
  {"", 0},               // eASCII: no BOM
  {"\xEF\xBB\xBF", 3},   // eUTF8
  {"\xFF\xFE", 2},       // eUTF16
  {"\xFE\xFF", 2}        // eUTF16BE
};

// Unknown string size
// The largest value a size_t can hold serves as the "size not given" sentinel
const size_t g_cnUnicodeSizeUnknown = static_cast<size_t>(-1);

// Constructor
CExternalUnicodeFormat::CExternalUnicodeFormat():
 m_eFormat(EUnicodeFormat::eASCII),
 m_bWithBOM(false),
 m_nBOMSize(0),
 m_bFormatDeduced(false) {
}
// Creates an explicitly specified format; the BOM size is taken from the BOM
// table when a BOM is requested and is 0 otherwise
CExternalUnicodeFormat::CExternalUnicodeFormat(EUnicodeFormat eFormat, bool bWithBOM)
  : m_eFormat(eFormat),
    m_bWithBOM(bWithBOM),
    m_nBOMSize(bWithBOM ? g_aoBOMs[(long)eFormat].nSize : 0),
    m_bFormatDeduced(false) {
}

// Sets the format
// Overwrites the format and BOM indication, recomputes the BOM size from the
// BOM table and marks the format as not deduced.
void CExternalUnicodeFormat::Set(EUnicodeFormat eFormat, bool bWithBOM) {
  m_eFormat = eFormat;
  m_bWithBOM = bWithBOM;
  m_nBOMSize = bWithBOM ? g_aoBOMs[(long)eFormat].nSize : 0;
  m_bFormatDeduced = false;
}

// Makes sure the size of the given string is determined
// When nNumBytes holds g_cnUnicodeSizeUnknown it is replaced by the size in
// bytes of the null-terminated string; a size that is already known is left
// untouched.
void EnsureSizeKnown(const char* sxString, size_t& nNumBytes, EUnicodeFormat eFormat) {
  // Nothing to do when the caller already supplied the size
  if (nNumBytes != g_cnUnicodeSizeUnknown) {
    return;
  }

  const bool bByteTerminated =
    eFormat == EUnicodeFormat::eASCII || eFormat == EUnicodeFormat::eUTF8;
  const bool bWideTerminated =
    eFormat == EUnicodeFormat::eUTF16 || eFormat == EUnicodeFormat::eUTF16BE;
  if (bByteTerminated) {
    // Single null byte terminates the string
    nNumBytes = strlen(sxString);
  } else if (bWideTerminated) {
    // The bytes are measured as a wchar_t string, so the terminator is one
    // whole null wchar_t
    const wchar_t* sWide = reinterpret_cast<const wchar_t*>(sxString);
    nNumBytes = wcslen(sWide) * sizeof(wchar_t);
  }
}


// Copies over the buffer but toggles the endiannes
// Every pair of bytes from sxSource is written to sxDest in swapped order.
//   sxSource  - the bytes to read
//   sxDest    - receives the swapped bytes; may be the same buffer as sxSource
//   nNumBytes - the number of bytes to process
// Only complete byte pairs are handled: with an odd nNumBytes the last byte is
// neither read nor written, so neither buffer is accessed beyond nNumBytes.
void CopyToggleEndiannes(
  const char* sxSource,
  char* sxDest,
  size_t nNumBytes
) {
  const size_t nNumPairedBytes = nNumBytes - (nNumBytes % 2);
  for (size_t nByteNr = 0; nByteNr < nNumPairedBytes; nByteNr += 2) {
    // Read both bytes before writing so in-place toggling works as well
    const char cFirst = sxSource[nByteNr];
    const char cSecond = sxSource[nByteNr + 1];
    sxDest[nByteNr] = cSecond;
    sxDest[nByteNr + 1] = cFirst;
  }
}

// Encodes the given Unicode string to the given format
// Convenience overload for std::wstring: passes the string's buffer together
// with its exact length on to the pointer-based overload.
CEncodeFromUnicodeResult EncodeFromUnicode(
  const std::wstring& sUnicode,
  EUnicodeFormat eFormat,
  bool bWithBOM
) {
  const wchar_t* sSource = sUnicode.c_str();
  const size_t nSourceSize = sUnicode.size();
  return EncodeFromUnicode(sSource, eFormat, bWithBOM, nSourceSize);
}
// Encodes the given Unicode string to the given format.
//   sUnicode  - the source text; has to be null-terminated when nNumChars is
//               g_cnUnicodeSizeUnknown
//   eFormat   - the target format; eASCII converts via the thread's ANSI code
//               page (CP_THREAD_ACP)
//   bWithBOM  - whether to prefix the output with the target format's BOM
//   nNumChars - the number of wchar_t's to convert, or g_cnUnicodeSizeUnknown
// Returns the encoded bytes.  When the conversion fails bSuccess is false and
// sxResult is empty.  bUnknownCharsReplaced is only reported for eASCII and is
// false in all other cases, including failure.
CEncodeFromUnicodeResult EncodeFromUnicode(
  const wchar_t* sUnicode,
  EUnicodeFormat eFormat,
  bool bWithBOM,
  size_t nNumChars
) {
  // Determine the size of the source
  if (nNumChars == g_cnUnicodeSizeUnknown) {
    nNumChars = wcslen(sUnicode);
  }

  // Get info on the BOM to add
  const char* sBOM = "";
  size_t nBOMSize = 0;
  if (bWithBOM) {
    const CBOMInfo& oBOMInfo = g_aoBOMs[static_cast<size_t>(eFormat)];
    sBOM = oBOMInfo.sSignature;
    nBOMSize = oBOMInfo.nSize;
  }

  // Look if to do any conversion
  CEncodeFromUnicodeResult oResult;
  oResult.bSuccess = true;
  oResult.bUnknownCharsReplaced = false;
  if (nNumChars == 0) {
    // No -> we're done with an empty string, apart from any BOM
    oResult.sxResult.assign(sBOM, nBOMSize);
    return oResult;
  }

  // Yes -> look what to convert to
  switch (eFormat) {
    case EUnicodeFormat::eASCII:
    case EUnicodeFormat::eUTF8: {
      // To ASCII or UTF8 -> prepare a large enough working buffer
      size_t nBufferSize = nNumChars * 4 + 5;
      oResult.sxResult.resize(nBufferSize);

      // Add the BOM, if needed
      char* sxResultPos = &oResult.sxResult[0];
      if (nBOMSize > 0) {
        memcpy(sxResultPos, sBOM, nBOMSize);
        sxResultPos += nBOMSize;
        nBufferSize -= nBOMSize;
      }

      // Convert the string itself; the "default char used" flag may only be
      // requested for non-UTF8 code pages.  It is pre-set to FALSE so it has a
      // defined value even when the call fails before writing to it.
      const bool bToUTF8 = eFormat == EUnicodeFormat::eUTF8;
      BOOL bUnknownCharsReplaced = FALSE;
      const int nNumBytesWritten = WideCharToMultiByte(
        bToUTF8? CP_UTF8: CP_THREAD_ACP,
        0,
        sUnicode,
        static_cast<int>(nNumChars),
        sxResultPos,
        static_cast<int>(nBufferSize),
        nullptr,
        bToUTF8? nullptr: &bUnknownCharsReplaced
      );
      if (!bToUTF8) {
        oResult.bUnknownCharsReplaced = bUnknownCharsReplaced != FALSE;
      }
      oResult.bSuccess = nNumBytesWritten != 0;
      if (oResult.bSuccess) {
        // Done -> use the correct string size
        oResult.sxResult.resize(nNumBytesWritten + nBOMSize);
      } else {
        // Error -> no output
        oResult.sxResult.clear();
      }
      break;
    }
    case EUnicodeFormat::eUTF16:
    case EUnicodeFormat::eUTF16BE: {
      // UTF16 LE or BE -> prepare the output string
      const size_t nNumBytes = nNumChars * sizeof(wchar_t);
      oResult.sxResult.resize(nNumBytes + nBOMSize);

      // Add the BOM, if needed
      char* sxResultPos = &oResult.sxResult[0];
      if (nBOMSize > 0) {
        memcpy(sxResultPos, sBOM, nBOMSize);
        sxResultPos += nBOMSize;
      }

      // And look how to copy over the string itself
      if (eFormat == EUnicodeFormat::eUTF16) {
        // As-is
        memcpy(sxResultPos, sUnicode, nNumBytes);
      } else {
        // Endianness-reversed
        CopyToggleEndiannes(
          reinterpret_cast<const char*>(sUnicode),
          sxResultPos,
          nNumBytes
        );
      }
      break;
    }
  }

  // And return the result
  return oResult;
}
// Encodes a std::wstring using the format and BOM indication stored in oFormat
CEncodeFromUnicodeResult EncodeFromUnicode(
  const std::wstring& sUnicode,
  const CExternalUnicodeFormat& oFormat
) {
  const EUnicodeFormat eTargetFormat = oFormat.Format();
  const bool bAddBOM = oFormat.WithBOM();
  return EncodeFromUnicode(sUnicode.c_str(), eTargetFormat, bAddBOM, sUnicode.size());
}
// Encodes a wchar_t buffer using the format and BOM indication stored in
// oFormat; nNumChars is passed on unchanged
CEncodeFromUnicodeResult EncodeFromUnicode(
  const wchar_t* sUnicode,
  const CExternalUnicodeFormat& oFormat,
  size_t nNumChars
) {
  const EUnicodeFormat eTargetFormat = oFormat.Format();
  const bool bAddBOM = oFormat.WithBOM();
  return EncodeFromUnicode(sUnicode, eTargetFormat, bAddBOM, nNumChars);
}

// Decodes the given string from the given format
// Worker behind all public DecodeToUnicode overloads.
//   sxString  - the external string, starting at any BOM
//   nNumBytes - the size of the string in bytes including the BOM, or
//               g_cnUnicodeSizeUnknown for a null-terminated string
//   eFormat   - the format the string is encoded in; eASCII converts via the
//               thread's ANSI code page (CP_THREAD_ACP)
//   bWithBOM  - not used; the BOM is described completely by nBOMSize
//   nBOMSize  - the number of leading bytes to skip
// Returns the decoded text.  On failure bSuccess is false and sResult is
// empty; GetLastError then holds the conversion API's error code, or
// ERROR_INVALID_PARAMETER when the string is shorter than the BOM to skip.
// For UTF16 input a trailing incomplete code unit (odd byte count) is ignored.
CDecodeToUnicodeResult DecodeToUnicode(
  const char* sxString,
  size_t nNumBytes,
  EUnicodeFormat eFormat,
  bool bWithBOM,
  size_t nBOMSize
) {
  (void)bWithBOM;

  CDecodeToUnicodeResult oResult;
  oResult.bSuccess = true;

  // Look if the string's size is known
  if (nNumBytes != g_cnUnicodeSizeUnknown) {
    // Yes -> the string has to be able to hold the BOM; subtracting a larger
    // BOM size would wrap around to a huge byte count
    if (nNumBytes < nBOMSize) {
      oResult.bSuccess = false;
      SetLastError(ERROR_INVALID_PARAMETER);
      return oResult;
    }

    // Get the BOM out of the way
    sxString += nBOMSize;
    nNumBytes -= nBOMSize;
  } else {
    // No -> get the BOM out of the way and measure what follows it
    sxString += nBOMSize;
    EnsureSizeKnown(sxString, nNumBytes, eFormat);
  }

  // Look if there is anything to convert
  if (nNumBytes == 0) {
    // No -> we're done with an empty string
    return oResult;
  }

  // Yes -> look what to convert from
  switch (eFormat) {
    case EUnicodeFormat::eASCII:
    case EUnicodeFormat::eUTF8: {
      // ASCII or UTF8 -> prepare the conversion buffer
      const bool bFromUTF8 = eFormat == EUnicodeFormat::eUTF8;
      const size_t nMaxNumCharsNeeded = nNumBytes * 3 + 1;
      oResult.sResult.resize(nMaxNumCharsNeeded);

      // Do the conversion
      const int nNumCharsWritten = MultiByteToWideChar(
        bFromUTF8? CP_UTF8: CP_THREAD_ACP,
        bFromUTF8? 0: MB_ERR_INVALID_CHARS,
        sxString,
        static_cast<int>(nNumBytes),
        &oResult.sResult[0],
        static_cast<int>(nMaxNumCharsNeeded)
      );

      // Look if it succeeded; the return value is a character count and 0
      // is the only failure indication (the reason is left in GetLastError)
      oResult.bSuccess = nNumCharsWritten != 0;
      if (oResult.bSuccess) {
        // Done -> truncate the buffer
        oResult.sResult.resize(nNumCharsWritten);
      } else {
        // Error -> no result
        oResult.sResult.clear();
      }
      break;
    }
    case EUnicodeFormat::eUTF16:
    case EUnicodeFormat::eUTF16BE: {
      // UTF16 -> only whole code units can be taken over
      const size_t nNumChars = nNumBytes / sizeof(wchar_t);
      const size_t nNumWholeBytes = nNumChars * sizeof(wchar_t);
      oResult.sResult.resize(nNumChars);
      if (nNumChars > 0) {
        char* sxDest = reinterpret_cast<char*>(&oResult.sResult[0]);
        if (eFormat == EUnicodeFormat::eUTF16) {
          // UTF16 LE -> just copy over the string as-is
          memcpy(sxDest, sxString, nNumWholeBytes);
        } else {
          // UTF16 BE -> copy over the string, but endianness-reversed
          CopyToggleEndiannes(sxString, sxDest, nNumWholeBytes);
        }
      }
      break;
    }
  }

  // And return the result
  return oResult;
}
// Decodes a std::string in the given format; no BOM is assumed or skipped
CDecodeToUnicodeResult DecodeToUnicode(
  const std::string& sxString,
  EUnicodeFormat eFormat
) {
  const char* sxSource = sxString.c_str();
  const size_t nSourceSize = sxString.size();
  return DecodeToUnicode(sxSource, nSourceSize, eFormat, false, 0);
}
// Decodes a byte buffer in the given format; no BOM is assumed or skipped and
// nNumBytes is passed on unchanged
CDecodeToUnicodeResult DecodeToUnicode(
  const char* sxString,
  EUnicodeFormat eFormat,
  size_t nNumBytes
) {
  const bool bWithBOM = false;
  const size_t nBOMSize = 0;
  return DecodeToUnicode(sxString, nNumBytes, eFormat, bWithBOM, nBOMSize);
}
// Decodes a std::string using the format, BOM indication and BOM size stored
// in oFormat
CDecodeToUnicodeResult DecodeToUnicode(
  const std::string& sxString,
  const CExternalUnicodeFormat& oFormat
) {
  const EUnicodeFormat eSourceFormat = oFormat.Format();
  const bool bHasBOM = oFormat.WithBOM();
  const size_t nSkipBytes = oFormat.BOMSize();
  return DecodeToUnicode(
    sxString.c_str(), sxString.size(), eSourceFormat, bHasBOM, nSkipBytes
  );
}
// Decodes a byte buffer using the format, BOM indication and BOM size stored
// in oFormat; nNumBytes is passed on unchanged
CDecodeToUnicodeResult DecodeToUnicode(
  const char* sxString,
  const CExternalUnicodeFormat& oFormat,
  size_t nNumBytes
) {
  const EUnicodeFormat eSourceFormat = oFormat.Format();
  const bool bHasBOM = oFormat.WithBOM();
  const size_t nSkipBytes = oFormat.BOMSize();
  return DecodeToUnicode(sxString, nNumBytes, eSourceFormat, bHasBOM, nSkipBytes);
}

// Tries to detect the format used for the given external string
// The format is inferred from the string itself via a BOM
// If no BOM is found then the given default format is assumed
// Convenience overload for std::string: passes the string's buffer together
// with its exact size on to the pointer-based overload.
CExternalUnicodeFormat GetExternalStringFormat(
  const std::string& sxString,
  EUnicodeFormat eDefaultFormat
) {
  const char* sxSource = sxString.c_str();
  const size_t nSourceSize = sxString.size();
  return GetExternalStringFormat(sxSource, eDefaultFormat, nSourceSize);
}
// Tries to detect the format of the given external string from a leading BOM.
//   sxString       - the external string
//   eDefaultFormat - the format to report when no BOM is recognized
//   nNumBytes      - the size of the string in bytes, or
//                    g_cnUnicodeSizeUnknown for a null-terminated string
// Returns the detected format with BOM indication, BOM size and
// FormatDeduced() set; without a recognized BOM the default format is
// returned without BOM and with FormatDeduced() false.
CExternalUnicodeFormat GetExternalStringFormat(
  const char* sxString,
  EUnicodeFormat eDefaultFormat,
  size_t nNumBytes
) {
  // Assume we won't recognize the format
  CExternalUnicodeFormat oResult(eDefaultFormat, false);

  // Look if any of the BOM's match
  const size_t nNumFormats = static_cast<size_t>(EUnicodeFormat::eUTF16BE) + 1;
  for (
    size_t nFormatNr = 0;
    nFormatNr < nNumFormats && !oResult.m_bFormatDeduced;
    ++nFormatNr
  ) {
    // Look if the string is long enough to contain this format's BOM
    const CBOMInfo& oBOMInfo = g_aoBOMs[nFormatNr];
    if (oBOMInfo.nSize > 0 && oBOMInfo.nSize <= nNumBytes) {
      // Yes -> compare byte by byte and stop at the first difference.  No BOM
      // contains a null byte, so for a null-terminated string of unknown size
      // the comparison ends at the terminator at the latest and never reads
      // beyond the string.
      size_t nNumMatched = 0;
      while (
        nNumMatched < oBOMInfo.nSize &&
        sxString[nNumMatched] == oBOMInfo.sSignature[nNumMatched]
      ) {
        ++nNumMatched;
      }
      if (nNumMatched == oBOMInfo.nSize) {
        // It matches -> note the format
        oResult.m_bFormatDeduced = true;
        oResult.m_bWithBOM = true;
        oResult.m_eFormat = static_cast<EUnicodeFormat>(nFormatNr);
        oResult.m_nBOMSize = oBOMInfo.nSize;
      }
    }
  }

  // And return the result
  return oResult;
}

// Detects if the given string starts with a BOM for the given
// format, and returns the complete format with/without BOM indication
// Convenience overload for std::string: passes the string's buffer together
// with its exact size on to the pointer-based overload.
CExternalUnicodeFormat DetectBOM(
  const std::string& sxString,
  EUnicodeFormat eFormat
) {
  const char* sxSource = sxString.c_str();
  const size_t nSourceSize = sxString.size();
  return DetectBOM(sxSource, eFormat, nSourceSize);
}
// Detects if the given string starts with the BOM of the given format.
//   sxString  - the external string
//   eFormat   - the format whose BOM to look for
//   nNumBytes - the size of the string in bytes, or g_cnUnicodeSizeUnknown
//               for a null-terminated string
// Returns eFormat combined with the BOM indication; FormatDeduced() is false
// on the result.
CExternalUnicodeFormat DetectBOM(
  const char* sxString,
  EUnicodeFormat eFormat,
  size_t nNumBytes
) {
  // Look if the string is long enough to contain this format's BOM
  const CBOMInfo& oBOMInfo = g_aoBOMs[static_cast<size_t>(eFormat)];
  bool bDetectedBOM = false;
  if (oBOMInfo.nSize > 0 && oBOMInfo.nSize <= nNumBytes) {
    // Yes -> compare byte by byte and stop at the first difference.  No BOM
    // contains a null byte, so for a null-terminated string of unknown size
    // the comparison ends at the terminator at the latest and never reads
    // beyond the string.
    size_t nNumMatched = 0;
    while (
      nNumMatched < oBOMInfo.nSize &&
      sxString[nNumMatched] == oBOMInfo.sSignature[nNumMatched]
    ) {
      ++nNumMatched;
    }
    bDetectedBOM = nNumMatched == oBOMInfo.nSize;
  }

  // And return the result
  return CExternalUnicodeFormat(eFormat, bDetectedBOM);
}

// Returns if the given text string can be represented as 7-bit ASCII
// Convenience overload for std::string: passes the string's buffer together
// with its exact size on to the pointer-based overload.
bool StringIs7BitASCII(
  const std::string& sxString,
  EUnicodeFormat eFormat
) {
  const char* sxSource = sxString.c_str();
  const size_t nSourceSize = sxString.size();
  return StringIs7BitASCII(sxSource, eFormat, nSourceSize);
}
// Returns if the given external string can be represented as 7-bit ASCII.
//   sxString  - the external string
//   eFormat   - the format the string is encoded in
//   nNumBytes - the size of the string in bytes, or g_cnUnicodeSizeUnknown
//               for a null-terminated string
// ASCII and UTF8 strings are checked per byte.  UTF16 strings are checked per
// 16-bit code unit, so a unit such as U+0100 (bytes 00 and 01) is correctly
// reported as non-ASCII.  A leading BOM is not skipped and therefore counts
// as non-ASCII content.
bool StringIs7BitASCII(
  const char* sxString,
  EUnicodeFormat eFormat,
  size_t nNumBytes
) {
  // Ensure the string's size is known
  EnsureSizeKnown(sxString, nNumBytes, eFormat);

  const unsigned char* pcNext = reinterpret_cast<const unsigned char*>(sxString);
  const bool bIsUTF16 =
    eFormat == EUnicodeFormat::eUTF16 || eFormat == EUnicodeFormat::eUTF16BE;
  if (bIsUTF16) {
    // Inspect each complete code unit: the high byte has to be zero and the
    // low byte has to be in the ASCII range.  eUTF16 stores the low byte
    // first, eUTF16BE stores it second.
    const size_t nLowByteOffset = eFormat == EUnicodeFormat::eUTF16? 0: 1;
    const size_t nHighByteOffset = 1 - nLowByteOffset;
    while (nNumBytes >= 2) {
      if (pcNext[nHighByteOffset] != 0 || pcNext[nLowByteOffset] >= 0x80) {
        return false;
      }
      pcNext += 2;
      nNumBytes -= 2;
    }
  }

  // Inspect each remaining byte: all bytes for ASCII and UTF8, or the single
  // left-over byte of a UTF16 string with an odd size
  while (nNumBytes > 0) {
    if (*pcNext >= 0x80) {
      return false;
    }
    ++pcNext;
    --nNumBytes;
  }

  // Nothing outside the ASCII range was found
  return true;
}
// Returns if the given Unicode string can be represented as 7-bit ASCII
// Convenience overload for std::wstring: passes the string's buffer together
// with its exact length on to the pointer-based overload.
bool StringIs7BitASCII(
  const std::wstring& sUnicode
) {
  const wchar_t* sSource = sUnicode.c_str();
  const size_t nSourceSize = sUnicode.size();
  return StringIs7BitASCII(sSource, nSourceSize);
}
// Returns if every character of the given Unicode string lies below 0x80.
//   sUnicode  - the text to inspect
//   nNumChars - the number of wchar_t's to inspect, or g_cnUnicodeSizeUnknown
//               for a null-terminated string
bool StringIs7BitASCII(
  const wchar_t* sUnicode,
  size_t nNumChars
) {
  // Measure the string when the caller did not supply its length
  const size_t nLength =
    nNumChars == g_cnUnicodeSizeUnknown? wcslen(sUnicode): nNumChars;

  // Stop at the first character outside the ASCII range
  for (size_t nCharNr = 0; nCharNr < nLength; ++nCharNr) {
    if (!(sUnicode[nCharNr] < 0x80)) {
      return false;
    }
  }
  return true;
}