From 3b65a6a36c3e910359c69cd3e3e3fd89e50ba23e Mon Sep 17 00:00:00 2001 From: figec Date: Thu, 18 Jun 2015 21:34:17 +0300 Subject: Fix wrap_rows at inner byte of multibyte sequence Also fix UTF-8 inner byte bounds and make unittest for case this fixes. --- src/unittest/test_utilities.cpp | 24 +++++++++++++++++------- src/util/string.h | 14 +++++++++++--- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/unittest/test_utilities.cpp b/src/unittest/test_utilities.cpp index 9678a81eb..df90d37bd 100644 --- a/src/unittest/test_utilities.cpp +++ b/src/unittest/test_utilities.cpp @@ -243,13 +243,23 @@ void TestUtilities::testWrapRows() { UASSERT(wrap_rows("12345678",4) == "1234\n5678"); // test that wrap_rows doesn't wrap inside multibyte sequences - const unsigned char s[] = { - 0x2f, 0x68, 0x6f, 0x6d, 0x65, 0x2f, 0x72, 0x61, 0x70, 0x74, 0x6f, - 0x72, 0x2f, 0xd1, 0x82, 0xd0, 0xb5, 0xd1, 0x81, 0xd1, 0x82, 0x2f, - 0x6d, 0x69, 0x6e, 0x65, 0x74, 0x65, 0x73, 0x74, 0x2f, 0x62, 0x69, - 0x6e, 0x2f, 0x2e, 0x2e, 0}; - std::string str((char *)s); - UASSERT(utf8_to_wide(wrap_rows(str, 20)) != L""); + { + const unsigned char s[] = { + 0x2f, 0x68, 0x6f, 0x6d, 0x65, 0x2f, 0x72, 0x61, 0x70, 0x74, 0x6f, + 0x72, 0x2f, 0xd1, 0x82, 0xd0, 0xb5, 0xd1, 0x81, 0xd1, 0x82, 0x2f, + 0x6d, 0x69, 0x6e, 0x65, 0x74, 0x65, 0x73, 0x74, 0x2f, 0x62, 0x69, + 0x6e, 0x2f, 0x2e, 0x2e, 0}; + std::string str((char *)s); + UASSERT(utf8_to_wide(wrap_rows(str, 20)) != L""); + }; + { + const unsigned char s[] = { + 0x74, 0x65, 0x73, 0x74, 0x20, 0xd1, 0x82, 0xd0, 0xb5, 0xd1, 0x81, + 0xd1, 0x82, 0x20, 0xd1, 0x82, 0xd0, 0xb5, 0xd1, 0x81, 0xd1, 0x82, + 0x20, 0xd1, 0x82, 0xd0, 0xb5, 0xd1, 0x81, 0xd1, 0x82, 0}; + std::string str((char *)s); + UASSERT(utf8_to_wide(wrap_rows(str, 8)) != L""); + } } diff --git a/src/util/string.h b/src/util/string.h index 72d3c6075..b4ce5743d 100644 --- a/src/util/string.h +++ b/src/util/string.h @@ -33,7 +33,7 @@ with this program; if not, write to the Free Software Foundation, Inc., #define TOSTRING(x) STRINGIFY(x) // Checks whether a byte is an inner byte for an utf-8 multibyte sequence -#define IS_UTF8_MULTB_INNER(x) (((unsigned char)x >= 0x80) && ((unsigned char)x <= 0xc0)) +#define IS_UTF8_MULTB_INNER(x) (((unsigned char)x >= 0x80) && ((unsigned char)x < 0xc0)) typedef std::map StringMap; @@ -426,12 +426,20 @@ inline std::string wrap_rows(const std::string &from, { std::string to; + bool need_to_wrap = false; + size_t character_idx = 0; for (size_t i = 0; i < from.size(); i++) { if (character_idx > 0 && character_idx % row_len == 0) - to += '\n'; - if (!IS_UTF8_MULTB_INNER(from[i])) + need_to_wrap = true; + if (!IS_UTF8_MULTB_INNER(from[i])) { + // Wrap string if needed before next char started + if (need_to_wrap) { + to += '\n'; + need_to_wrap = false; + } character_idx++; + } to += from[i]; } -- cgit v1.2.3