summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author sfan5 <sfan5@live.de> 2021-01-29 13:09:17 +0100
committer sfan5 <sfan5@live.de> 2021-02-02 20:46:08 +0100
commit 5e392cf34f8e062dd0533619921223656e32598a (patch)
tree f7cbd5f6d5290545adb4ce0beb07a94352043818
parent 7ebd5da9cd4a227dcdc140a495f264a97277b3a3 (diff)
download minetest-5e392cf34f8e062dd0533619921223656e32598a.tar.gz
minetest-5e392cf34f8e062dd0533619921223656e32598a.tar.bz2
minetest-5e392cf34f8e062dd0533619921223656e32598a.zip
Refactor utf8_to_wide/wide_to_utf8 functions
-rw-r--r-- src/unittest/test_utilities.cpp 15
-rw-r--r-- src/util/string.cpp 57
-rw-r--r-- src/util/string.h 6
3 files changed, 40 insertions, 38 deletions
diff --git a/src/unittest/test_utilities.cpp b/src/unittest/test_utilities.cpp
index 447b591e1..5559cdbf2 100644
--- a/src/unittest/test_utilities.cpp
+++ b/src/unittest/test_utilities.cpp
@@ -302,9 +302,18 @@ void TestUtilities::testAsciiPrintableHelper()
void TestUtilities::testUTF8()
{
- UASSERT(wide_to_utf8(utf8_to_wide("")) == "");
- UASSERT(wide_to_utf8(utf8_to_wide("the shovel dug a crumbly node!"))
- == "the shovel dug a crumbly node!");
+ UASSERT(utf8_to_wide("¤") == L"¤");
+
+ UASSERT(wide_to_utf8(L"¤") == "¤");
+
+ UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("")), "");
+ UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("the shovel dug a crumbly node!")),
+ "the shovel dug a crumbly node!");
+ UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("-ä-")),
+ "-ä-");
+ UASSERTEQ(std::string, wide_to_utf8(utf8_to_wide("-\xF0\xA0\x80\x8B-")),
+ "-\xF0\xA0\x80\x8B-");
+
}
void TestUtilities::testRemoveEscapes()
diff --git a/src/util/string.cpp b/src/util/string.cpp
index 3ac3b8cf0..7e6d6d3b3 100644
--- a/src/util/string.cpp
+++ b/src/util/string.cpp
@@ -50,8 +50,8 @@ static bool parseNamedColorString(const std::string &value, video::SColor &color
#ifndef _WIN32
-bool convert(const char *to, const char *from, char *outbuf,
- size_t outbuf_size, char *inbuf, size_t inbuf_size)
+static bool convert(const char *to, const char *from, char *outbuf,
+ size_t *outbuf_size, char *inbuf, size_t inbuf_size)
{
iconv_t cd = iconv_open(to, from);
@@ -60,15 +60,14 @@ bool convert(const char *to, const char *from, char *outbuf,
#else
char *inbuf_ptr = inbuf;
#endif
-
char *outbuf_ptr = outbuf;
size_t *inbuf_left_ptr = &inbuf_size;
- size_t *outbuf_left_ptr = &outbuf_size;
+ const size_t old_outbuf_size = *outbuf_size;
size_t old_size = inbuf_size;
while (inbuf_size > 0) {
- iconv(cd, &inbuf_ptr, inbuf_left_ptr, &outbuf_ptr, outbuf_left_ptr);
+ iconv(cd, &inbuf_ptr, inbuf_left_ptr, &outbuf_ptr, outbuf_size);
if (inbuf_size == old_size) {
iconv_close(cd);
return false;
@@ -77,11 +76,12 @@ bool convert(const char *to, const char *from, char *outbuf,
}
iconv_close(cd);
+ *outbuf_size = old_outbuf_size - *outbuf_size;
return true;
}
#ifdef __ANDROID__
-// Android need manual caring to support the full character set possible with wchar_t
+// On Android iconv disagrees how big a wchar_t is for whatever reason
const char *DEFAULT_ENCODING = "UTF-32LE";
#else
const char *DEFAULT_ENCODING = "WCHAR_T";
@@ -89,58 +89,52 @@ const char *DEFAULT_ENCODING = "WCHAR_T";
std::wstring utf8_to_wide(const std::string &input)
{
- size_t inbuf_size = input.length() + 1;
+ const size_t inbuf_size = input.length();
// maximum possible size, every character is sizeof(wchar_t) bytes
- size_t outbuf_size = (input.length() + 1) * sizeof(wchar_t);
+ size_t outbuf_size = input.length() * sizeof(wchar_t);
- char *inbuf = new char[inbuf_size];
+ char *inbuf = new char[inbuf_size]; // intentionally NOT null-terminated
memcpy(inbuf, input.c_str(), inbuf_size);
- char *outbuf = new char[outbuf_size];
- memset(outbuf, 0, outbuf_size);
+ std::wstring out;
+ out.resize(outbuf_size / sizeof(wchar_t));
#ifdef __ANDROID__
- // Android need manual caring to support the full character set possible with wchar_t
SANITY_CHECK(sizeof(wchar_t) == 4);
#endif
- if (!convert(DEFAULT_ENCODING, "UTF-8", outbuf, outbuf_size, inbuf, inbuf_size)) {
+ char *outbuf = reinterpret_cast<char*>(&out[0]);
+ if (!convert(DEFAULT_ENCODING, "UTF-8", outbuf, &outbuf_size, inbuf, inbuf_size)) {
infostream << "Couldn't convert UTF-8 string 0x" << hex_encode(input)
<< " into wstring" << std::endl;
delete[] inbuf;
- delete[] outbuf;
return L"<invalid UTF-8 string>";
}
- std::wstring out((wchar_t *)outbuf);
-
delete[] inbuf;
- delete[] outbuf;
+ out.resize(outbuf_size / sizeof(wchar_t));
return out;
}
std::string wide_to_utf8(const std::wstring &input)
{
- size_t inbuf_size = (input.length() + 1) * sizeof(wchar_t);
- // maximum possible size: utf-8 encodes codepoints using 1 up to 6 bytes
- size_t outbuf_size = (input.length() + 1) * 6;
+ const size_t inbuf_size = input.length() * sizeof(wchar_t);
+ // maximum possible size: utf-8 encodes codepoints using 1 up to 4 bytes
+ size_t outbuf_size = input.length() * 4;
- char *inbuf = new char[inbuf_size];
+ char *inbuf = new char[inbuf_size]; // intentionally NOT null-terminated
memcpy(inbuf, input.c_str(), inbuf_size);
- char *outbuf = new char[outbuf_size];
- memset(outbuf, 0, outbuf_size);
+ std::string out;
+ out.resize(outbuf_size);
- if (!convert("UTF-8", DEFAULT_ENCODING, outbuf, outbuf_size, inbuf, inbuf_size)) {
+ if (!convert("UTF-8", DEFAULT_ENCODING, &out[0], &outbuf_size, inbuf, inbuf_size)) {
infostream << "Couldn't convert wstring 0x" << hex_encode(inbuf, inbuf_size)
<< " into UTF-8 string" << std::endl;
delete[] inbuf;
- delete[] outbuf;
- return "<invalid wstring>";
+ return "<invalid wide string>";
}
- std::string out(outbuf);
-
delete[] inbuf;
- delete[] outbuf;
+ out.resize(outbuf_size);
return out;
}
@@ -172,15 +166,12 @@ std::string wide_to_utf8(const std::wstring &input)
#endif // _WIN32
-// You must free the returned string!
-// The returned string is allocated using new
wchar_t *utf8_to_wide_c(const char *str)
{
std::wstring ret = utf8_to_wide(std::string(str));
size_t len = ret.length();
wchar_t *ret_c = new wchar_t[len + 1];
- memset(ret_c, 0, (len + 1) * sizeof(wchar_t));
- memcpy(ret_c, ret.c_str(), len * sizeof(wchar_t));
+ memcpy(ret_c, ret.c_str(), (len + 1) * sizeof(wchar_t));
return ret_c;
}
diff --git a/src/util/string.h b/src/util/string.h
index 6fd11fadc..ec14e9a2d 100644
--- a/src/util/string.h
+++ b/src/util/string.h
@@ -64,11 +64,13 @@ struct FlagDesc {
u32 flag;
};
-// try not to convert between wide/utf8 encodings; this can result in data loss
-// try to only convert between them when you need to input/output stuff via Irrlicht
+// Try to avoid converting between wide and UTF-8 unless you need to
+// input/output stuff via Irrlicht
std::wstring utf8_to_wide(const std::string &input);
std::string wide_to_utf8(const std::wstring &input);
+// You must free the returned string!
+// The returned string is allocated using new[]
wchar_t *utf8_to_wide_c(const char *str);
// NEVER use those two functions unless you have a VERY GOOD reason to