Source code for C_Unicode

Download

The source files for this module are listed below. You can also download the module C_Unicode as a zip archive; this archive contains all the source files and documentation.

Description

The module Unicode contains functions to convert ASCII, UTF8 and UTF16 strings from and to Windows NT Unicode.

Information

The EncodeFromUnicode functions convert a Windows NT Unicode string into a string with another encoding, while the DecodeToUnicode functions perform the opposite action. The GetExternalStringFormat functions try to infer the source string's encoding by looking for a Unicode BOM; when none is found, they assume the encoding is the one you specify.

Files

Each file belonging to this source code module is listed below.

Unicode.h

/*******************************************************************************

  Version: 2
  Author:  Carl Colijn, TwoLogs
  Contact: c.colijn@twologs.com
  Source:  https://www.twologs.com/sourcecode

  This code is freely distributable, as long as this comment remains intact.
  If you find this source useful, you may use this code in your own projects
  free of charge, but some acknowledgement to the author of this code is always
  appreciated :)
  The source is however distributed 'as is' without waranty and/or support, and
  may not be fit for each and every application.  Use it at your own discretion
  and at your own risk.
  The source already has undergone testing.  This doesn't mean however that all
  bugs are removed from this piece of code.  If you find one of them, please
  contact me about it.  I can however not guarantee when and if the bug will be
  fixed.

  More information about this module can be found in the accompanying HTML file.

*******************************************************************************/

#ifndef INCLUDE_TWOLOGS_COMMON_UNICODE_H
#define INCLUDE_TWOLOGS_COMMON_UNICODE_H

#include <windows.h>
#include <string>

// All supported unicode formats.
// The numeric value of each enumerator is used as an index into g_boms, so
// the enumerators must stay in the same order as the entries of that table.
enum class TUnicodeFormat {
  ascii = 0,  // converted via the thread's ANSI code page (CP_THREAD_ACP)
  utf8,
  utf16,      // copied byte-for-byte from/to wchar_t data
  utf16BE     // as utf16, but with the two bytes of every unit swapped
};

// BOM's for the different encodings
struct TBOMInfo {
  // The raw bytes that make up the BOM
  const char* signature;
  // The number of bytes in signature
  size_t size;
};
// One entry per TUnicodeFormat, indexed by the numeric value of the format
extern const TBOMInfo g_boms[];

// All format names (display strings), listed in TUnicodeFormat order
extern const wchar_t* g_unicodeEncodings[];

// External unicode format: an encoding together with the information whether
// the encoded data starts with that encoding's BOM
class TExternalUnicodeFormat {
  // GetExternalStringFormat fills in the private members directly
  friend TExternalUnicodeFormat GetExternalStringFormat(
    const char* stringRaw,
    TUnicodeFormat defaultFormat,
    size_t numBytes
  );

public:
  // Creates an ASCII format without BOM
  TExternalUnicodeFormat();

  // Creates the given format, optionally marked as carrying a BOM
  TExternalUnicodeFormat(TUnicodeFormat format, bool withBOM = false);

  // Replaces the format and BOM indication; FormatDeduced is reset to false
  void Set(TUnicodeFormat format, bool withBOM = false);

  // The format in use
  TUnicodeFormat Format() const { return m_format; }

  // Whether the data is preceded by a BOM
  bool WithBOM() const { return m_withBOM; }

  // The size of any leading BOM, in bytes (0 when there is no BOM)
  size_t BOMSize() const { return m_bomSize; }

  // Whether the format could be deduced from the data itself
  bool FormatDeduced() const { return m_formatDeduced; }

private:
  TUnicodeFormat m_format;  // the format in use
  bool m_withBOM;           // whether a BOM is present
  size_t m_bomSize;         // size of the leading BOM, in bytes
  bool m_formatDeduced;     // whether the format was deduced from a BOM
};

// Unknown string size: pass this as a size/length argument to have the
// function measure the zero-terminated input itself
extern const size_t g_unicodeSizeUnknown;

// Encodes the given Unicode string to the given format
// - unicode:  the wchar_t text to encode
// - format:   the target encoding (a TExternalUnicodeFormat also carries the
//             BOM indication)
// - withBOM:  whether to prefix the output with the target encoding's BOM
// - numChars: number of wchar_t units in unicode; when left at
//             g_unicodeSizeUnknown the text is measured with wcslen
struct TEncodeFromUnicodeResult {
  // false when the conversion failed; resultRaw is empty in that case
  bool allOK;
  // The encoded bytes, including any requested BOM
  std::string resultRaw;
  // Only reported for ASCII output: whether characters that have no
  // representation in the target code page were replaced
  bool unknownCharsReplaced;
};
TEncodeFromUnicodeResult EncodeFromUnicode(
  const std::wstring& unicode,
  TUnicodeFormat format,
  bool withBOM = false
);
TEncodeFromUnicodeResult EncodeFromUnicode(
  const wchar_t* unicode,
  TUnicodeFormat format,
  bool withBOM = false,
  size_t numChars = g_unicodeSizeUnknown
);
TEncodeFromUnicodeResult EncodeFromUnicode(
  const std::wstring& unicode,
  const TExternalUnicodeFormat& format
);
TEncodeFromUnicodeResult EncodeFromUnicode(
  const wchar_t* unicode,
  const TExternalUnicodeFormat& format,
  size_t numChars = g_unicodeSizeUnknown
);

// Decodes the given string from the given format
// - stringRaw: the encoded bytes
// - format:    the encoding of stringRaw; the TUnicodeFormat overloads skip no
//              BOM, the TExternalUnicodeFormat overloads skip BOMSize() bytes
// - numBytes:  size of stringRaw in bytes; when left at g_unicodeSizeUnknown
//              the zero-terminated input is measured instead
struct TDecodeToUnicodeResult {
  // false when the conversion failed; result is empty in that case
  bool allOK;
  // The decoded text
  std::wstring result;
};
TDecodeToUnicodeResult DecodeToUnicode(
  const std::string& stringRaw,
  TUnicodeFormat format
);
TDecodeToUnicodeResult DecodeToUnicode(
  const char* stringRaw,
  TUnicodeFormat format,
  size_t numBytes = g_unicodeSizeUnknown
);
TDecodeToUnicodeResult DecodeToUnicode(
  const std::string& stringRaw,
  const TExternalUnicodeFormat& format
);
TDecodeToUnicodeResult DecodeToUnicode(
  const char* stringRaw,
  const TExternalUnicodeFormat& format,
  size_t numBytes = g_unicodeSizeUnknown
);

// Tries to detect the format used for the given external string
// The format is inferred from the string itself via a BOM
// If no BOM is found then the given default format is assumed; in that case
// the returned format reports no BOM and FormatDeduced() is false
TExternalUnicodeFormat GetExternalStringFormat(
  const std::string& stringRaw,
  TUnicodeFormat defaultFormat
);
TExternalUnicodeFormat GetExternalStringFormat(
  const char* stringRaw,
  TUnicodeFormat defaultFormat,
  size_t numBytes = g_unicodeSizeUnknown
);

// Detects if the given string starts with a BOM for the given
// format, and returns the complete format with/without BOM indication
// Only the BOM of the given format is looked for; the format itself is taken
// as-is from the caller
TExternalUnicodeFormat DetectBOM(
  const std::string& stringRaw,
  TUnicodeFormat format
);
TExternalUnicodeFormat DetectBOM(
  const char* stringRaw,
  TUnicodeFormat format,
  size_t numBytes = g_unicodeSizeUnknown
);

// Returns if the given text string can be represented as 7-bit ASCII
// The raw-string overloads take the encoding of the raw data; the wide-string
// overloads inspect wchar_t text directly.  A size left at
// g_unicodeSizeUnknown is measured up to the terminating zero.
bool StringIs7BitASCII(
  const std::string& stringRaw,
  TUnicodeFormat format
);
bool StringIs7BitASCII(
  const char* stringRaw,
  TUnicodeFormat format,
  size_t numBytes = g_unicodeSizeUnknown
);
bool StringIs7BitASCII(
  const std::wstring& unicode
);
bool StringIs7BitASCII(
  const wchar_t* unicode,
  size_t numChars = g_unicodeSizeUnknown
);

// UTF8 char size, in bytes, of the character starting at the pointed-to byte
// Works on all code points 0-10FFFF (full Unicode range)
// The top four bits of the lead byte select a 2-bit field from the constant
// 0xE5000000: 0xxx/10xx -> 0, 110x -> 1, 1110 -> 2, 1111 -> 3; adding one
// gives the sequence length.  The byte is read as unsigned char so that no
// negative value is ever shifted, and both the argument and the complete
// expression are parenthesized so the macro is safe in any expression.
#define POINTEDTOUTF8CHARSIZE(utf8CharPtr) \
  (((0xE5000000u >> ((static_cast<unsigned char>(*(utf8CharPtr)) >> 3) & 0x1Eu)) & 3u) + 1u)

// Returns a pointer to the character following the one pointed to, assuming
// the encoding is not corrupt (the lead byte alone determines the step size;
// a continuation byte is stepped over as a single byte)
inline const char* NextUTF8Char(const char* utf8CharPtr) {
  return utf8CharPtr + POINTEDTOUTF8CHARSIZE(utf8CharPtr);
}
inline char* NextUTF8Char(char* utf8CharPtr) {
  return utf8CharPtr + POINTEDTOUTF8CHARSIZE(utf8CharPtr);
}

// Advances the given UTF8 text pointer one character in place, assuming the
// encoding is not corrupt
inline void AdvanceUTF8Char(const char*& utf8CharPtr) {
  utf8CharPtr += POINTEDTOUTF8CHARSIZE(utf8CharPtr);
}
inline void AdvanceUTF8Char(char*& utf8CharPtr) {
  utf8CharPtr += POINTEDTOUTF8CHARSIZE(utf8CharPtr);
}

// Leave no mess
#undef POINTEDTOUTF8CHARSIZE

#endif // INCLUDE_TWOLOGS_COMMON_UNICODE_H

Unicode.cpp

/*******************************************************************************

  Version: 2
  Author:  Carl Colijn, TwoLogs
  Contact: c.colijn@twologs.com
  Source:  https://www.twologs.com/sourcecode

  This code is freely distributable, as long as this comment remains intact.
  If you find this source useful, you may use this code in your own projects
  free of charge, but some acknowledgement to the author of this code is always
  appreciated :)
  The source is however distributed 'as is' without waranty and/or support, and
  may not be fit for each and every application.  Use it at your own discretion
  and at your own risk.
  The source already has undergone testing.  This doesn't mean however that all
  bugs are removed from this piece of code.  If you find one of them, please
  contact me about it.  I can however not guarantee when and if the bug will be
  fixed.

  More information about this module can be found in the accompanying HTML file.

*******************************************************************************/

#include "Unicode.h"

// All format names (display strings), listed in TUnicodeFormat order
const wchar_t* g_unicodeEncodings[] = {
  L"ASCII",
  L"UTF-8",
  L"UTF-16",
  L"UTF-16 BE"
};

// BOM's for the different encodings, indexed by the numeric value of
// TUnicodeFormat; each entry holds the signature bytes and their count
const TBOMInfo g_boms[] = {
  {"", 0},              // ascii: no BOM
  {"\xEF\xBB\xBF", 3},  // utf8
  {"\xFF\xFE", 2},      // utf16
  {"\xFE\xFF", 2}       // utf16BE
};

// Sentinel for "size not given": the largest value a size_t can hold
const size_t g_unicodeSizeUnknown = static_cast<size_t>(-1);

// Default constructor: ASCII without BOM, format not deduced
TExternalUnicodeFormat::TExternalUnicodeFormat()
  : m_format(TUnicodeFormat::ascii),
    m_withBOM(false),
    m_bomSize(0),
    m_formatDeduced(false) {
}

// Constructs the given format; the BOM size is taken from the BOM table when
// a BOM is indicated and is zero otherwise
TExternalUnicodeFormat::TExternalUnicodeFormat(TUnicodeFormat format, bool withBOM)
  : m_format(format),
    m_withBOM(withBOM),
    m_bomSize(withBOM ? g_boms[static_cast<long>(format)].size : 0),
    m_formatDeduced(false) {
}

// Replaces the format and BOM indication.  An explicitly set format is never
// considered deduced, and the BOM size follows from the BOM table.
void TExternalUnicodeFormat::Set(TUnicodeFormat format, bool withBOM) {
  m_bomSize = withBOM ? g_boms[static_cast<long>(format)].size : 0;
  m_formatDeduced = false;
  m_withBOM = withBOM;
  m_format = format;
}

// Makes sure the size of the given string is determined: when numBytes holds
// g_unicodeSizeUnknown it is replaced by the byte length of the
// zero-terminated string, measured in units that match the format
void EnsureSizeKnown(const char* stringRaw, size_t& numBytes, TUnicodeFormat format) {
  // Nothing to do when the caller already supplied the size
  if (numBytes != g_unicodeSizeUnknown) {
    return;
  }

  const bool byteOriented =
    format == TUnicodeFormat::ascii || format == TUnicodeFormat::utf8;
  const bool wideOriented =
    format == TUnicodeFormat::utf16 || format == TUnicodeFormat::utf16BE;
  if (byteOriented) {
    // Terminated by a single zero byte
    numBytes = strlen(stringRaw);
  } else if (wideOriented) {
    // Terminated by a zero wchar_t; convert the unit count to bytes
    const wchar_t* asWide = reinterpret_cast<const wchar_t*>(stringRaw);
    numBytes = wcslen(asWide) * sizeof(wchar_t);
  }
}


// Copies over the buffer but toggles the endianness: every complete pair of
// bytes in sourceRaw is written to destRaw with its two bytes swapped.
// - sourceRaw: the bytes to read; at least numBytes bytes
// - destRaw:   the buffer to write; at least numBytes bytes
// - numBytes:  number of bytes to process; when odd, the dangling last byte
//              is not part of a complete pair and is left alone, so neither
//              buffer is ever accessed beyond numBytes
void CopyToggleEndiannes(
  const char* sourceRaw,
  char* destRaw,
  size_t numBytes
) {
  // Only whole 2-byte units can be swapped
  const size_t numPairedBytes = numBytes - (numBytes % 2);
  for (size_t byteNr = 0; byteNr < numPairedBytes; byteNr += 2) {
    // Read both bytes before writing, so swapping in place also works
    const char first = sourceRaw[byteNr];
    const char second = sourceRaw[byteNr + 1];
    destRaw[byteNr] = second;
    destRaw[byteNr + 1] = first;
  }
}

// Encodes the given std::wstring to the given format, optionally preceded by
// that format's BOM; the string's own length is used, so embedded zero
// characters are passed along to the raw-pointer overload
TEncodeFromUnicodeResult EncodeFromUnicode(
  const std::wstring& unicode,
  TUnicodeFormat format,
  bool withBOM
) {
  const wchar_t* const source = unicode.c_str();
  const size_t numChars = unicode.size();
  return EncodeFromUnicode(source, format, withBOM, numChars);
}
// Encodes numChars wchar_t units from unicode to the given format.
// - unicode:  the text to encode
// - format:   the target encoding
// - withBOM:  whether to prefix the output with the target encoding's BOM
// - numChars: number of wchar_t units; g_unicodeSizeUnknown means "measure
//             the zero-terminated text"
// Returns the encoded bytes.  On failure allOK is false and resultRaw is
// empty; GetLastError then holds the reason.  unknownCharsReplaced is only
// ever set for a successful ASCII conversion.
TEncodeFromUnicodeResult EncodeFromUnicode(
  const wchar_t* unicode,
  TUnicodeFormat format,
  bool withBOM,
  size_t numChars
) {
  // Determine the size of the source
  if (numChars == g_unicodeSizeUnknown) {
    numChars = wcslen(unicode);
  }

  // Get info on the BOM to add
  const char* bom = "";
  size_t bomSize = 0;
  if (withBOM) {
    const TBOMInfo& bomInfo = g_boms[static_cast<long>(format)];
    bom = bomInfo.signature;
    bomSize = bomInfo.size;
  }

  TEncodeFromUnicodeResult result;
  result.allOK = true;
  result.unknownCharsReplaced = false;

  // An empty source needs no conversion; the output is just the BOM (if any)
  if (numChars == 0) {
    result.resultRaw.assign(bom, bomSize);
    return result;
  }

  switch (format) {
    case TUnicodeFormat::ascii:
    case TUnicodeFormat::utf8: {
      // WideCharToMultiByte takes its sizes as int; refuse input whose
      // working buffer (4 bytes per unit + 5) would not fit, instead of
      // silently passing truncated sizes
      const size_t maxIntSize = 0x7FFFFFFF;
      if (numChars > (maxIntSize - 5) / 4) {
        result.allOK = false;
        SetLastError(ERROR_INVALID_PARAMETER);
        break;
      }

      // Prepare a large enough working buffer
      size_t bufferSize = numChars * 4 + 5;
      result.resultRaw.resize(bufferSize);

      // Add the BOM, if needed (the buffer is non-empty here, so taking the
      // address of its first element gives writable storage)
      char* resultRawPos = &result.resultRaw[0];
      if (bomSize > 0) {
        memcpy(resultRawPos, bom, bomSize);
        resultRawPos += bomSize;
        bufferSize -= bomSize;
      }

      // Convert the string itself; the "default char used" flag may only be
      // requested for non-UTF8 code pages
      const bool toUTF8 = format == TUnicodeFormat::utf8;
      BOOL unknownCharsReplaced = FALSE;
      const int numBytesWritten = WideCharToMultiByte(
        toUTF8? CP_UTF8: CP_THREAD_ACP,
        0,
        unicode,
        static_cast<int>(numChars),
        resultRawPos,
        static_cast<int>(bufferSize),
        nullptr,
        toUTF8? nullptr: &unknownCharsReplaced
      );
      result.allOK = numBytesWritten != 0;
      if (result.allOK) {
        // Done -> use the correct string size
        result.unknownCharsReplaced = !toUTF8 && unknownCharsReplaced != FALSE;
        result.resultRaw.resize(numBytesWritten + bomSize);
      } else {
        // Error -> no output
        result.resultRaw.clear();
      }
      break;
    }
    case TUnicodeFormat::utf16:
    case TUnicodeFormat::utf16BE: {
      // UTF16 LE or BE -> prepare the output string
      const size_t numBytes = numChars * sizeof(wchar_t);
      result.resultRaw.resize(numBytes + bomSize);

      // Add the BOM, if needed
      char* resultRawPos = &result.resultRaw[0];
      if (bomSize > 0) {
        memcpy(resultRawPos, bom, bomSize);
        resultRawPos += bomSize;
      }

      // And look how to copy over the string itself
      if (format == TUnicodeFormat::utf16) {
        // As-is
        memcpy(resultRawPos, unicode, numBytes);
      } else {
        // Endianness-reversed
        CopyToggleEndiannes(
          reinterpret_cast<const char*>(unicode),
          resultRawPos,
          numBytes
        );
      }
      break;
    }
  }

  // And return the result
  return result;
}
// Encodes the given std::wstring as described by the external format (target
// encoding plus BOM indication)
TEncodeFromUnicodeResult EncodeFromUnicode(
  const std::wstring& unicode,
  const TExternalUnicodeFormat& format
) {
  const TUnicodeFormat targetFormat = format.Format();
  const bool addBOM = format.WithBOM();
  return EncodeFromUnicode(unicode.c_str(), targetFormat, addBOM, unicode.size());
}
// Encodes numChars wchar_t units as described by the external format (target
// encoding plus BOM indication)
TEncodeFromUnicodeResult EncodeFromUnicode(
  const wchar_t* unicode,
  const TExternalUnicodeFormat& format,
  size_t numChars
) {
  const TUnicodeFormat targetFormat = format.Format();
  const bool addBOM = format.WithBOM();
  return EncodeFromUnicode(unicode, targetFormat, addBOM, numChars);
}

// Decodes the given string from the given format (shared implementation
// behind the public DecodeToUnicode overloads).
// - stringRaw: the encoded bytes, starting at any BOM
// - numBytes:  size of stringRaw in bytes including any BOM, or
//              g_unicodeSizeUnknown to measure the zero-terminated data that
//              follows the BOM
// - format:    the encoding of stringRaw
// - withBOM:   not consulted; the BOM is described completely by bomSize
// - bomSize:   number of leading BOM bytes to skip (0 when there is none)
// Returns the decoded text.  On failure allOK is false, result is empty and
// GetLastError holds the reason.  For the UTF16 formats only complete 2-byte
// units are decoded; a dangling trailing byte is ignored.
// Note: UTF8 is decoded with flags 0, so the API itself decides how to treat
// invalid sequences rather than failing on them.
TDecodeToUnicodeResult DecodeToUnicode(
  const char* stringRaw,
  size_t numBytes,
  TUnicodeFormat format,
  bool withBOM,
  size_t bomSize
) {
  TDecodeToUnicodeResult result;
  result.allOK = true;

  // Get any BOM out of the way
  if (numBytes != g_unicodeSizeUnknown) {
    // The size is known -> the BOM has to fit inside it; without this check
    // the subtraction below would wrap around to a huge size
    if (numBytes < bomSize) {
      result.allOK = false;
      SetLastError(ERROR_INVALID_PARAMETER);
      return result;
    }
    stringRaw += bomSize;
    numBytes -= bomSize;
  } else {
    // The size is unknown -> measure what follows the BOM
    stringRaw += bomSize;
    EnsureSizeKnown(stringRaw, numBytes, format);
  }

  // Nothing to convert -> we're done with an empty string
  if (numBytes == 0) {
    return result;
  }

  switch (format) {
    case TUnicodeFormat::ascii:
    case TUnicodeFormat::utf8: {
      // MultiByteToWideChar takes its sizes as int; refuse input whose
      // working buffer (3 units per byte + 1) would not fit, instead of
      // silently passing truncated sizes
      const size_t maxIntSize = 0x7FFFFFFF;
      if (numBytes > (maxIntSize - 1) / 3) {
        result.allOK = false;
        SetLastError(ERROR_INVALID_PARAMETER);
        break;
      }

      // Prepare the conversion buffer
      const size_t maxNumCharsNeeded = numBytes * 3 + 1;
      result.result.resize(maxNumCharsNeeded);

      // Do the conversion
      const bool fromUTF8 = format == TUnicodeFormat::utf8;
      const int numCharsWritten = MultiByteToWideChar(
        fromUTF8? CP_UTF8: CP_THREAD_ACP,
        fromUTF8? 0: MB_ERR_INVALID_CHARS,
        stringRaw,
        static_cast<int>(numBytes),
        &result.result[0],
        static_cast<int>(maxNumCharsNeeded)
      );

      // Failure is signalled solely by a zero return value; error codes such
      // as ERROR_NO_UNICODE_TRANSLATION are delivered through GetLastError
      // and must not be compared with the returned character count (doing so
      // would reject every input that decodes to exactly that many chars)
      result.allOK = numCharsWritten != 0;
      if (result.allOK) {
        // Done -> truncate the buffer
        result.result.resize(numCharsWritten);
      } else {
        // Error -> no result, but keep GetLastError useful for the caller
        const DWORD conversionError = GetLastError();
        result.result.clear();
        SetLastError(conversionError);
      }
      break;
    }
    case TUnicodeFormat::utf16:
    case TUnicodeFormat::utf16BE: {
      // Only whole units fit in the output; copying numBytes as given would
      // write past the end of the string for an odd byte count
      const size_t numChars = numBytes / sizeof(wchar_t);
      const size_t numWholeBytes = numChars * sizeof(wchar_t);
      if (numChars > 0) {
        result.result.resize(numChars);
        char* destRaw = reinterpret_cast<char*>(&result.result[0]);
        if (format == TUnicodeFormat::utf16) {
          // UTF16 LE -> just copy over the string as-is
          memcpy(destRaw, stringRaw, numWholeBytes);
        } else {
          // UTF16 BE -> copy over the string, but endianness-reversed
          CopyToggleEndiannes(stringRaw, destRaw, numWholeBytes);
        }
      }
      break;
    }
  }

  // And return the result
  return result;
}
// Decodes the given std::string from the given format; a bare TUnicodeFormat
// carries no BOM information, so no leading bytes are skipped
TDecodeToUnicodeResult DecodeToUnicode(
  const std::string& stringRaw,
  TUnicodeFormat format
) {
  const bool withBOM = false;
  const size_t bomSize = 0;
  return DecodeToUnicode(stringRaw.c_str(), stringRaw.size(), format, withBOM, bomSize);
}
// Decodes numBytes raw bytes from the given format; a bare TUnicodeFormat
// carries no BOM information, so no leading bytes are skipped
TDecodeToUnicodeResult DecodeToUnicode(
  const char* stringRaw,
  TUnicodeFormat format,
  size_t numBytes
) {
  const bool withBOM = false;
  const size_t bomSize = 0;
  return DecodeToUnicode(stringRaw, numBytes, format, withBOM, bomSize);
}
// Decodes the given std::string as described by the external format; the BOM
// size recorded in the format is passed along so the BOM gets skipped
TDecodeToUnicodeResult DecodeToUnicode(
  const std::string& stringRaw,
  const TExternalUnicodeFormat& format
) {
  const TUnicodeFormat sourceFormat = format.Format();
  const bool withBOM = format.WithBOM();
  const size_t bomSize = format.BOMSize();
  return DecodeToUnicode(stringRaw.c_str(), stringRaw.size(), sourceFormat, withBOM, bomSize);
}
// Decodes numBytes raw bytes as described by the external format; the BOM
// size recorded in the format is passed along so the BOM gets skipped
TDecodeToUnicodeResult DecodeToUnicode(
  const char* stringRaw,
  const TExternalUnicodeFormat& format,
  size_t numBytes
) {
  const TUnicodeFormat sourceFormat = format.Format();
  const bool withBOM = format.WithBOM();
  const size_t bomSize = format.BOMSize();
  return DecodeToUnicode(stringRaw, numBytes, sourceFormat, withBOM, bomSize);
}

// Tries to detect the format used for the given external std::string
// The format is inferred from the string itself via a BOM
// If no BOM is found then the given default format is assumed
TExternalUnicodeFormat GetExternalStringFormat(
  const std::string& stringRaw,
  TUnicodeFormat defaultFormat
) {
  const char* const rawBytes = stringRaw.c_str();
  const size_t numBytes = stringRaw.size();
  return GetExternalStringFormat(rawBytes, defaultFormat, numBytes);
}
// Tries to detect the format used for the given raw string by matching its
// first bytes against the BOM of every supported format.
// - stringRaw:     the encoded bytes
// - defaultFormat: the format to report when no BOM matches
// - numBytes:      size of stringRaw in bytes; when left at
//                  g_unicodeSizeUnknown, stringRaw must be zero-terminated
// Returns the matching format with WithBOM() and FormatDeduced() set, or the
// default format without BOM (FormatDeduced() false) when nothing matches.
TExternalUnicodeFormat GetExternalStringFormat(
  const char* stringRaw,
  TUnicodeFormat defaultFormat,
  size_t numBytes
) {
  // Assume we won't recognize the format
  TExternalUnicodeFormat result(defaultFormat, false);
  if (stringRaw == nullptr) {
    return result;
  }

  // Look if any of the BOM's match
  const long firstFormat = static_cast<long>(TUnicodeFormat::ascii);
  const long lastFormat = static_cast<long>(TUnicodeFormat::utf16BE);
  for (long format = firstFormat; format <= lastFormat; ++format) {
    // Skip formats without BOM and BOM's that cannot fit in the string
    const TBOMInfo& bomInfo = g_boms[format];
    if (bomInfo.size == 0 || bomInfo.size > numBytes) {
      continue;
    }

    // strncmp stops comparing at a terminating zero, so a zero-terminated
    // string of unknown size that is shorter than the BOM is never read past
    // its terminator.  No BOM contains a zero byte, so for sized input the
    // outcome is the same as a plain byte comparison.
    if (strncmp(stringRaw, bomInfo.signature, bomInfo.size) == 0) {
      // Match -> note the format
      result.m_format = static_cast<TUnicodeFormat>(format);
      result.m_withBOM = true;
      result.m_bomSize = bomInfo.size;
      result.m_formatDeduced = true;
      break;
    }
  }

  // And return the result
  return result;
}

// Detects if the given std::string starts with a BOM for the given
// format, and returns the complete format with/without BOM indication
TExternalUnicodeFormat DetectBOM(
  const std::string& stringRaw,
  TUnicodeFormat format
) {
  const char* const rawBytes = stringRaw.c_str();
  const size_t numBytes = stringRaw.size();
  return DetectBOM(rawBytes, format, numBytes);
}
// Detects if the given raw string starts with the BOM of the given format.
// - stringRaw: the encoded bytes
// - format:    the format whose BOM to look for
// - numBytes:  size of stringRaw in bytes; when left at
//              g_unicodeSizeUnknown, stringRaw must be zero-terminated
// Returns the given format, marked with BOM only when the BOM was found.
TExternalUnicodeFormat DetectBOM(
  const char* stringRaw,
  TUnicodeFormat format,
  size_t numBytes
) {
  // Look if the string can contain this format's BOM at all
  const TBOMInfo& bomInfo = g_boms[static_cast<long>(format)];
  const bool canHoldBOM =
    stringRaw != nullptr &&
    bomInfo.size > 0 &&
    bomInfo.size <= numBytes;

  // strncmp stops comparing at a terminating zero, so a zero-terminated
  // string of unknown size that is shorter than the BOM is never read past
  // its terminator.  No BOM contains a zero byte, so for sized input the
  // outcome is the same as a plain byte comparison.
  const bool detectedBOM =
    canHoldBOM &&
    strncmp(stringRaw, bomInfo.signature, bomInfo.size) == 0;

  // And return the result
  return TExternalUnicodeFormat(format, detectedBOM);
}

// Returns if the given encoded std::string can be represented as 7-bit ASCII;
// forwards to the raw-pointer overload with the string's own size
bool StringIs7BitASCII(
  const std::string& stringRaw,
  TUnicodeFormat format
) {
  const char* const rawBytes = stringRaw.c_str();
  const size_t numBytes = stringRaw.size();
  return StringIs7BitASCII(rawBytes, format, numBytes);
}
// Returns if the given encoded text can be represented as 7-bit ASCII.
// - stringRaw: the encoded bytes (any BOM counts as text and is not ASCII)
// - format:    the encoding of stringRaw
// - numBytes:  size of stringRaw in bytes, or g_unicodeSizeUnknown to measure
//              the zero-terminated data
// For ASCII and UTF8 every byte has to be below 0x80.  For the UTF16 formats
// every 2-byte unit has to be below 0x80, i.e. its high byte must be zero as
// well; checking the bytes one by one would wrongly accept a character such
// as U+0100, whose bytes (00 01) are both below 0x80.  UTF16 data with a
// dangling odd byte is not valid text and yields false.
bool StringIs7BitASCII(
  const char* stringRaw,
  TUnicodeFormat format,
  size_t numBytes
) {
  // Ensure the string's size is known
  EnsureSizeKnown(stringRaw, numBytes, format);
  const unsigned char* bytes = reinterpret_cast<const unsigned char*>(stringRaw);

  const bool isUTF16 =
    format == TUnicodeFormat::utf16 || format == TUnicodeFormat::utf16BE;
  if (!isUTF16) {
    // Byte oriented -> inspect each byte
    for (size_t byteNr = 0; byteNr < numBytes; ++byteNr) {
      if (bytes[byteNr] >= 0x80) {
        return false;
      }
    }
    return true;
  }

  // UTF16 -> inspect each complete unit; little endian stores the high byte
  // second, big endian stores it first
  if (numBytes % 2 != 0) {
    return false;
  }
  const size_t highByteOffset = format == TUnicodeFormat::utf16? 1: 0;
  const size_t lowByteOffset = 1 - highByteOffset;
  for (size_t byteNr = 0; byteNr < numBytes; byteNr += 2) {
    const bool unitIsASCII =
      bytes[byteNr + highByteOffset] == 0 &&
      bytes[byteNr + lowByteOffset] < 0x80;
    if (!unitIsASCII) {
      return false;
    }
  }
  return true;
}
// Returns if the given std::wstring consists of 7-bit ASCII characters only;
// forwards to the raw-pointer overload with the string's own length
bool StringIs7BitASCII(
  const std::wstring& unicode
) {
  const wchar_t* const text = unicode.c_str();
  const size_t numChars = unicode.size();
  return StringIs7BitASCII(text, numChars);
}
// Returns if the first numChars characters of the given wide text are all
// below 0x80; numChars left at g_unicodeSizeUnknown means "measure the
// zero-terminated text".  Empty text yields true.
bool StringIs7BitASCII(
  const wchar_t* unicode,
  size_t numChars
) {
  // Ensure the string's size is known
  const size_t length =
    numChars == g_unicodeSizeUnknown? wcslen(unicode): numChars;

  // Stop at the first character outside the 7-bit range
  for (size_t charNr = 0; charNr < length; ++charNr) {
    if (!(unicode[charNr] < 0x80)) {
      return false;
    }
  }
  return true;
}

Sitemap

Information