/*
Basic Unicode string class for Irrlicht.
Copyright (c) 2009-2011 John Norman
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any
damages arising from the use of this software.
Permission is granted to anyone to use this software for any
purpose, including commercial applications, and to alter it and
redistribute it freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you
must not claim that you wrote the original software. If you use
this software in a product, an acknowledgment in the product
documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and
must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source
distribution.
The original version of this class can be located at:
http://irrlicht.suckerfreegames.com/
John Norman
john@suckerfreegames.com
*/
#ifndef __IRR_USTRING_H_INCLUDED__
#define __IRR_USTRING_H_INCLUDED__
// Feature detection for C++0x/C++11.
// USTRING_CPP0X is defined when the compiler reports a __cplusplus newer than C++98,
// is MSVC with _MSC_VER >= 1600, or is GCC in its experimental C++0x mode.
// Further down it controls whether <utility> is included.
#if (__cplusplus > 199711L) || (_MSC_VER >= 1600) || defined(__GXX_EXPERIMENTAL_CXX0X__)
# define USTRING_CPP0X
// USTRING_CPP0X_NEWLITERALS is only defined for GCC 4.5 or newer in C++0x mode.
// It selects char32_t/char16_t as the character typedefs below.
# if defined(__GXX_EXPERIMENTAL_CXX0X__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))
# define USTRING_CPP0X_NEWLITERALS
# endif
#endif
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#ifdef USTRING_CPP0X
# include <utility>
#endif
#ifndef USTRING_NO_STL
# include <string>
# include <iterator>
# include <ostream>
#endif
#include "irrTypes.h"
#include "irrAllocator.h"
#include "irrArray.h"
#include "irrMath.h"
#include "irrString.h"
#include "path.h"
//! UTF-16 surrogate start values.
//! UTF16_HI_SURROGATE is the first value of the high surrogate range,
//! UTF16_LO_SURROGATE the first value of the low surrogate range.
static const irr::u16 UTF16_HI_SURROGATE = 0xD800;
static const irr::u16 UTF16_LO_SURROGATE = 0xDC00;
//! Is a UTF-16 code unit a surrogate?
//! UTF16_IS_SURROGATE matches 0xD800-0xDFFF (either half of a pair).
//! UTF16_IS_SURROGATE_HI matches 0xD800-0xDBFF (first half of a pair).
//! UTF16_IS_SURROGATE_LO matches 0xDC00-0xDFFF (second half of a pair).
//! Each macro evaluates its argument exactly once.
#define UTF16_IS_SURROGATE(c) (((c) & 0xF800) == 0xD800)
#define UTF16_IS_SURROGATE_HI(c) (((c) & 0xFC00) == 0xD800)
#define UTF16_IS_SURROGATE_LO(c) (((c) & 0xFC00) == 0xDC00)
namespace irr
{
// Define our character types.
// When USTRING_CPP0X_NEWLITERALS is set, the built-in C++0x character types are used;
// otherwise the Irrlicht integer types u32/u16/u8 stand in for them.
#ifdef USTRING_CPP0X_NEWLITERALS // C++0x
typedef char32_t uchar32_t; // One UTF-32 code unit.
typedef char16_t uchar16_t; // One UTF-16 code unit.
// NOTE(review): plain char may be signed, unlike u8 in the other branch (presumably unsigned) - confirm callers do not depend on that.
typedef char uchar8_t; // One UTF-8 code unit.
#else
typedef u32 uchar32_t; // One UTF-32 code unit.
typedef u16 uchar16_t; // One UTF-16 code unit.
typedef u8 uchar8_t; // One UTF-8 code unit.
#endif
namespace core
{
namespace unicode
{
//! The unicode replacement character (U+FFFD). Used to replace invalid characters.
//! It fits in a single 16-bit value, so it needs no surrogate pair.
const irr::u16 UTF_REPLACEMENT_CHARACTER = 0xFFFD;
//! Combines the two halves of a UTF-16 surrogate pair into one UTF-32 character.
//! The inputs are not validated; only the relevant bits of each half are used.
//! \param high The first (high) half of the pair.
//! \param low The second (low) half of the pair.
//! \return The UTF-32 character that the pair encodes.
inline uchar32_t toUTF32(uchar16_t high, uchar16_t low)
{
	// Lower 16 bits of the result: the bottom 6 bits of the high half
	// sit above the bottom 10 bits of the low half.
	uchar32_t lowerBits = ((high & 0x003F) << 10) | (low & 0x03FF);
	// Bits 6..10 of the high half, plus one, form everything above bit 15.
	uchar32_t upperBits = ((high >> 6) & 0x001F) + 1;
	return (upperBits << 16) | lowerBits;
}
//! Reverses the byte order of a 16-bit value.
//! \param c The value whose two bytes are exchanged.
//! \return The value with its upper and lower byte swapped.
inline uchar16_t swapEndian16(const uchar16_t& c)
{
	// The former upper byte becomes the lower byte, and vice versa.
	const uchar16_t newLower = (c >> 8) & 0x00FF;
	const uchar16_t newUpper = (c << 8) & 0xFF00;
	return newLower | newUpper;
}
//! Reverses the byte order of a 32-bit value.
//! \param c The value whose four bytes are mirrored.
//! \return The value with byte 0 and byte 3 exchanged, and byte 1 and byte 2 exchanged.
inline uchar32_t swapEndian32(const uchar32_t& c)
{
	// Move each byte to its mirrored position, then merge the four results.
	const uchar32_t byte3ToByte0 = (c >> 24) & 0x000000FF;
	const uchar32_t byte2ToByte1 = (c >> 8) & 0x0000FF00;
	const uchar32_t byte1ToByte2 = (c << 8) & 0x00FF0000;
	const uchar32_t byte0ToByte3 = (c << 24) & 0xFF000000;
	return byte3ToByte0 | byte2ToByte1 | byte1ToByte2 | byte0ToByte3;
}
//! The Unicode byte order mark (code point U+FEFF).
const u16 BOM = 0xFEFF;
//! The size of the Unicode byte order mark in terms of the Unicode character size.
//! That is: 3 UTF-8 code units, 1 UTF-16 code unit, 1 UTF-32 code unit.
const u8 BOM_UTF8_LEN = 3;
const u8 BOM_UTF16_LEN = 1;
const u8 BOM_UTF32_LEN = 1;
//! Unicode byte order marks for file operations.
//! Each array is the raw byte sequence of the mark for one encoding and byte order
//! (BE = big-endian, LE = little-endian).
//! Note that the UTF-32 LE mark starts with the same two bytes as the UTF-16 LE mark.
const u8 BOM_ENCODE_UTF8[3] = { 0xEF, 0xBB, 0xBF };
const u8 BOM_ENCODE_UTF16_BE[2] = { 0xFE, 0xFF };
const u8 BOM_ENCODE_UTF16_LE[2] = { 0xFF, 0xFE };
const u8 BOM_ENCODE_UTF32_BE[4] = { 0x00, 0x00, 0xFE, 0xFF };
const u8 BOM_ENCODE_UTF32_LE[4] = { 0xFF, 0xFE, 0x00, 0x00 };
//! The size in bytes of the Unicode byte marks for file operations.
//! These match the lengths of the BOM_ENCODE_* arrays above.
const u8 BOM_ENCODE_UTF8_LEN = 3;
const u8 BOM_ENCODE_UTF16_LEN = 2;
const u8 BOM_ENCODE_UTF32_LEN = 4;
//! Unicode encoding type.
//! The variants without an _LE/_BE suffix are resolved to the native byte order
//! by getUnicodeBOM() (big-endian when __BIG_ENDIAN__ is defined, little-endian otherwise).
enum EUTF_ENCODE
{
EUTFE_NONE = 0, //!< No encoding; also returned by determineUnicodeBOM() when no mark is found.
EUTFE_UTF8, //!< UTF-8.
EUTFE_UTF16, //!< UTF-16 in native byte order.
EUTFE_UTF16_LE, //!< UTF-16, little-endian.
EUTFE_UTF16_BE, //!< UTF-16, big-endian.
EUTFE_UTF32, //!< UTF-32 in native byte order.
EUTFE_UTF32_LE, //!< UTF-32, little-endian.
EUTFE_UTF32_BE //!< UTF-32, big-endian.
};
//! Unicode endianness.
//! NOTE(review): no use of this enum is visible in this part of the file; the
//! per-value descriptions below are taken from the names - confirm against the callers.
enum EUTF_ENDIAN
{
EUTFEE_NATIVE = 0, //!< Presumably the byte order of the host system.
EUTFEE_LITTLE, //!< Little-endian byte order.
EUTFEE_BIG //!< Big-endian byte order.
};
//! Builds the byte order mark that belongs to a Unicode encoding.
//! A byte order mark is the short byte sequence at the start of a text file that signifies its encoding.
/** \param mode The Unicode encoding whose byte order mark is wanted.
For EUTFE_UTF16 and EUTFE_UTF32 the big-endian mark is used when __BIG_ENDIAN__
is defined and the little-endian mark otherwise. **/
//! \return An array holding the bytes of the mark. It is empty for a mode that has no mark (such as EUTFE_NONE).
inline core::array<u8> getUnicodeBOM(EUTF_ENCODE mode)
{
	// Resolve the marks used by the endian-neutral modes once, up front.
#ifdef __BIG_ENDIAN__
	const u8* const nativeUtf16 = BOM_ENCODE_UTF16_BE;
	const u8* const nativeUtf32 = BOM_ENCODE_UTF32_BE;
#else
	const u8* const nativeUtf16 = BOM_ENCODE_UTF16_LE;
	const u8* const nativeUtf32 = BOM_ENCODE_UTF32_LE;
#endif

	// Select the source bytes and their count; both stay zero when there is no mark.
	const u8* bytes = 0;
	u32 count = 0;
	switch (mode)
	{
	case EUTFE_UTF8:
		bytes = BOM_ENCODE_UTF8;
		count = BOM_ENCODE_UTF8_LEN;
		break;
	case EUTFE_UTF16:
		bytes = nativeUtf16;
		count = BOM_ENCODE_UTF16_LEN;
		break;
	case EUTFE_UTF16_BE:
		bytes = BOM_ENCODE_UTF16_BE;
		count = BOM_ENCODE_UTF16_LEN;
		break;
	case EUTFE_UTF16_LE:
		bytes = BOM_ENCODE_UTF16_LE;
		count = BOM_ENCODE_UTF16_LEN;
		break;
	case EUTFE_UTF32:
		bytes = nativeUtf32;
		count = BOM_ENCODE_UTF32_LEN;
		break;
	case EUTFE_UTF32_BE:
		bytes = BOM_ENCODE_UTF32_BE;
		count = BOM_ENCODE_UTF32_LEN;
		break;
	case EUTFE_UTF32_LE:
		bytes = BOM_ENCODE_UTF32_LE;
		count = BOM_ENCODE_UTF32_LEN;
		break;
	default:
		// No mark for this mode: the array is returned empty.
		break;
	}

	// Room for the longest mark (4 bytes) is requested before anything is copied.
	core::array<u8> ret(4);
	if (bytes)
	{
		memcpy(ret.pointer(), bytes, count);
		ret.set_used(count);
	}
	return ret;
}
//! Detects if the given data stream starts with a unicode BOM.
/** The 4-byte UTF-32 marks are tested before the 2-byte UTF-16 marks. The UTF-32 LE
mark (FF FE 00 00) begins with the UTF-16 LE mark (FF FE), so testing UTF-16 first
would report every UTF-32 LE stream as UTF-16 LE.
The remaining ambiguity is inherent to the marks themselves: a UTF-16 LE stream whose
first character after the mark is U+0000 is reported as UTF-32 LE.
No length is passed in, and up to 4 bytes are compared, so the caller must supply a
buffer with at least 4 readable bytes. **/
//! \param data The data stream to check. A null pointer is treated as having no BOM.
//! \return The unicode BOM associated with the data stream, or EUTFE_NONE if none was found.
inline EUTF_ENCODE determineUnicodeBOM(const char* data)
{
	// Nothing to inspect.
	if (!data) return EUTFE_NONE;

	if (memcmp(data, BOM_ENCODE_UTF8, BOM_ENCODE_UTF8_LEN) == 0) return EUTFE_UTF8;

	// Longest marks first, so a shorter mark that is a prefix of a longer one cannot shadow it.
	if (memcmp(data, BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN) == 0) return EUTFE_UTF32_BE;
	if (memcmp(data, BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN) == 0) return EUTFE_UTF32_LE;

	if (memcmp(data, BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN) == 0) return EUTFE_UTF16_BE;
	if (memcmp(data, BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN) == 0) return EUTFE_UTF16_LE;

	return EUTFE_NONE;
}
} // end namespace unicode
//! UTF-16 string class.
template <typename TAlloc = irrAllocator<uchar16_t> >
class ustring16
{
public:
///------------------///
/// iterator classes ///
///------------------///
//! Access an element in a unicode string, allowing one to change it.
class _ustring16_iterator_access
{
public:
//! Creates an accessor for one position of a unicode string.
//! \param s The string to access; stored in ref.
//! \param p The position inside the string; stored in pos.
_ustring16_iterator_access(const ustring16<TAlloc>* s, u32 p) : ref(s), pos(p) {}
//! Allow the class to be interpreted as a single UTF-32 character.
//! \return The value obtained from _get() (presumably the character at the stored position; _get() is defined outside this view).
operator uchar32_t() const
{
return _get();
}
//! Allow one to change the character in the unicode string.
//! The new character is handed to _set(), which is defined outside this view.
//! \param c The new character to use.
//! \return A reference to this accessor, so assignments can be chained.
_ustring16_iterator_access& operator=(const uchar32_t c)
{
_set(c);
return *this;
}
//! Increments the value by 1 (pre-increment).
//! Reads the current character with _get() and writes back that value plus one with _set().
//! \return A reference to this accessor.
_ustring16_iterator_access& operator++()
{
_set(_get() + 1);
return *this;
}
//! Post-increment: raises the accessed character by 1 and hands back what it was before.
//! \return The unicode character as it was prior to the increment.
uchar32_t operator++(int)
{
	// Capture the current value first, since it is what the caller receives.
	uchar32_t previous = _get();
	_set(previous + 1);
	return previous;
}
//! Decrements the value by 1 (pre-decrement).
//! Reads the current character with _get() and writes back that value minus one with _set().
//! \return A reference to this accessor.
_ustring16_iterator_access& operator--()
{
_set(_get() - 1);
return *this;
}
//! Post-decrement: lowers the accessed character by 1 and hands back what it was before.
//! \return The unicode character as it was prior to the decrement.
uchar32_t operator--(int)
{
	// Capture the current value first, since it is what the caller receives.
	uchar32_t previous = _get();
	_set(previous - 1);
	return previous;
}
//! Adds to the value by a specified amount.
//! \param val The amount to add to this character.
//! \return Myself.
|