Skip to content

Commit 40393b8

Browse files
committed
feat(utf8): add UTF-8 string conversion and validation functions
1 parent 5e041b4 commit 40393b8

6 files changed

Lines changed: 240 additions & 32 deletions

File tree

Core/GameEngine/Source/Common/System/AsciiString.cpp

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#include "PreRTS.h" // This must go first in EVERY cpp file in the GameEngine
4646

4747
#include "Common/CriticalSection.h"
48+
#include "utf8.h"
4849

4950

5051
// -----------------------------------------------------
@@ -137,8 +138,8 @@ void AsciiString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveData
137138
// TheSuperHackers @fix Mauller 04/04/2025 Replace strcpy with safer memmove as memory regions can overlap when part of string is copied to itself
138139
DEBUG_ASSERTCRASH(usableNumChars <= strlen(strToCopy), ("strToCopy is too small"));
139140
memmove(m_data->peek(), strToCopy, usableNumChars);
140-
m_data->peek()[usableNumChars] = 0;
141141
}
142+
m_data->peek()[usableNumChars] = 0;
142143
if (strToCat)
143144
strcat(m_data->peek(), strToCat);
144145
return;
@@ -166,8 +167,8 @@ void AsciiString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveData
166167
{
167168
DEBUG_ASSERTCRASH(usableNumChars <= strlen(strToCopy), ("strToCopy is too small"));
168169
strncpy(newData->peek(), strToCopy, usableNumChars);
169-
newData->peek()[usableNumChars] = 0;
170170
}
171+
newData->peek()[usableNumChars] = 0;
171172
if (strToCat)
172173
strcat(newData->peek(), strToCat);
173174

@@ -272,11 +273,19 @@ char* AsciiString::getBufferForRead(Int len)
272273
void AsciiString::translate(const UnicodeString& stringSrc)
273274
{
274275
validate();
275-
/// @todo srj put in a real translation here; this will only work for 7-bit ascii
276-
clear();
277-
Int len = stringSrc.getLength();
278-
for (Int i = 0; i < len; i++)
279-
concat((char)stringSrc.getCharAt(i));
276+
// TheSuperHackers @fix bobtista 02/04/2026 Implement UTF-8 conversion replacing 7-bit ASCII only implementation
277+
const WideChar* src = stringSrc.str();
278+
size_t srcLen = wcslen(src);
279+
size_t size = Get_Utf8_Size(src, srcLen);
280+
if (size == 0)
281+
{
282+
clear();
283+
return;
284+
}
285+
ensureUniqueBufferOfSize((Int)size + 1, false, nullptr, nullptr);
286+
char* buf = peek();
287+
if (!Unicode_To_Utf8(buf, src, srcLen, size))
288+
clear();
280289
validate();
281290
}
282291

Core/GameEngine/Source/Common/System/UnicodeString.cpp

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#include "PreRTS.h" // This must go first in EVERY cpp file in the GameEngine
4646

4747
#include "Common/CriticalSection.h"
48+
#include "utf8.h"
4849

4950

5051
// -----------------------------------------------------
@@ -88,8 +89,8 @@ void UnicodeString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveDa
8889
// TheSuperHackers @fix Mauller 04/04/2025 Replace wcscpy with safer memmove as memory regions can overlap when part of string is copied to itself
8990
DEBUG_ASSERTCRASH(usableNumChars <= wcslen(strToCopy), ("strToCopy is too small"));
9091
memmove(m_data->peek(), strToCopy, usableNumChars * sizeof(WideChar));
91-
m_data->peek()[usableNumChars] = 0;
9292
}
93+
m_data->peek()[usableNumChars] = 0;
9394
if (strToCat)
9495
wcscat(m_data->peek(), strToCat);
9596
return;
@@ -117,8 +118,8 @@ void UnicodeString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveDa
117118
{
118119
DEBUG_ASSERTCRASH(usableNumChars <= wcslen(strToCopy), ("strToCopy is too small"));
119120
wcsncpy(newData->peek(), strToCopy, usableNumChars);
120-
newData->peek()[usableNumChars] = 0;
121121
}
122+
newData->peek()[usableNumChars] = 0;
122123
if (strToCat)
123124
wcscat(newData->peek(), strToCat);
124125

@@ -221,11 +222,19 @@ WideChar* UnicodeString::getBufferForRead(Int len)
221222
void UnicodeString::translate(const AsciiString& stringSrc)
222223
{
223224
validate();
224-
/// @todo srj put in a real translation here; this will only work for 7-bit ascii
225-
clear();
226-
Int len = stringSrc.getLength();
227-
for (Int i = 0; i < len; i++)
228-
concat((WideChar)stringSrc.getCharAt(i));
225+
// TheSuperHackers @fix bobtista 02/04/2026 Implement UTF-8 conversion replacing 7-bit ASCII only implementation
226+
const char* src = stringSrc.str();
227+
size_t srcLen = strlen(src);
228+
size_t size = Get_Unicode_Size(src, srcLen);
229+
if (size == 0)
230+
{
231+
clear();
232+
return;
233+
}
234+
ensureUniqueBufferOfSize((Int)size + 1, false, nullptr, nullptr);
235+
WideChar* buf = peek();
236+
if (!Utf8_To_Unicode(buf, src, srcLen, size))
237+
clear();
229238
validate();
230239
}
231240

Core/GameEngine/Source/GameNetwork/GameSpy/Thread/ThreadUtils.cpp

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -28,18 +28,24 @@
2828

2929
#include "PreRTS.h" // This must go first in EVERY cpp file in the GameEngine
3030

31+
#include "utf8.h"
32+
3133
//-------------------------------------------------------------------------
3234

35+
// TheSuperHackers @refactor bobtista 02/04/2026 Use WWLib UTF-8 functions instead of raw Win32 API calls
3336
std::wstring MultiByteToWideCharSingleLine( const char *orig )
3437
{
35-
Int len = strlen(orig);
36-
WideChar *dest = NEW WideChar[len+1];
37-
38-
MultiByteToWideChar(CP_UTF8, 0, orig, -1, dest, len);
38+
size_t srcLen = strlen(orig);
39+
size_t size = Get_Unicode_Size(orig, srcLen);
40+
if (size == 0)
41+
return std::wstring();
42+
std::wstring ret;
43+
ret.resize(size);
44+
Utf8_To_Unicode(&ret[0], orig, srcLen, size);
3945
WideChar *c = nullptr;
4046
do
4147
{
42-
c = wcschr(dest, L'\n');
48+
c = wcschr(&ret[0], L'\n');
4349
if (c)
4450
{
4551
*c = L' ';
@@ -48,32 +54,26 @@ std::wstring MultiByteToWideCharSingleLine( const char *orig )
4854
while ( c != nullptr );
4955
do
5056
{
51-
c = wcschr(dest, L'\r');
57+
c = wcschr(&ret[0], L'\r');
5258
if (c)
5359
{
5460
*c = L' ';
5561
}
5662
}
5763
while ( c != nullptr );
5864

59-
dest[len] = 0;
60-
std::wstring ret = dest;
61-
delete[] dest;
6265
return ret;
6366
}
6467

6568
std::string WideCharStringToMultiByte( const WideChar *orig )
6669
{
70+
size_t srcLen = wcslen(orig);
71+
size_t size = Get_Utf8_Size(orig, srcLen);
72+
if (size == 0)
73+
return std::string();
6774
std::string ret;
68-
Int len = WideCharToMultiByte( CP_UTF8, 0, orig, wcslen(orig), nullptr, 0, nullptr, nullptr ) + 1;
69-
if (len > 0)
70-
{
71-
char *dest = NEW char[len];
72-
WideCharToMultiByte( CP_UTF8, 0, orig, -1, dest, len, nullptr, nullptr );
73-
dest[len-1] = 0;
74-
ret = dest;
75-
delete[] dest;
76-
}
75+
ret.resize(size);
76+
Unicode_To_Utf8(&ret[0], orig, srcLen, size);
7777
return ret;
7878
}
7979

Core/Libraries/Source/WWVegas/WWLib/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,8 @@ set(WWLIB_SRC
133133
trim.cpp
134134
trim.h
135135
uarray.h
136+
utf8.cpp
137+
utf8.h
136138
vector.cpp
137139
Vector.h
138140
visualc.h
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
/*
2+
** Command & Conquer Generals Zero Hour(tm)
3+
** Copyright 2026 TheSuperHackers
4+
**
5+
** This program is free software: you can redistribute it and/or modify
6+
** it under the terms of the GNU General Public License as published by
7+
** the Free Software Foundation, either version 3 of the License, or
8+
** (at your option) any later version.
9+
**
10+
** This program is distributed in the hope that it will be useful,
11+
** but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
** GNU General Public License for more details.
14+
**
15+
** You should have received a copy of the GNU General Public License
16+
** along with this program. If not, see <http://www.gnu.org/licenses/>.
17+
*/
18+
19+
#include "always.h"
20+
#include "utf8.h"
21+
22+
#include <string.h>
23+
24+
#ifdef _WIN32
25+
#include <windows.h>
26+
27+
// Reports whether c is a UTF-8 continuation ("trail") byte, i.e. a byte whose
// two most significant bits are 10 (values 0x80..0xBF).
static bool Is_Trail_Byte(char c)
{
	// Compare the unsigned value so the range test does not depend on char signedness.
	const unsigned char byte = static_cast<unsigned char>(c);
	return byte >= 0x80 && byte <= 0xBF;
}
31+
32+
// Returns the total length in bytes (1 to 4) of the UTF-8 sequence introduced
// by the given lead byte, or 0 when the byte cannot start a valid sequence.
//
// Besides continuation bytes (0x80..0xBF) and the 5+ byte patterns (0xF8..0xFF),
// this also rejects the lead bytes that RFC 3629 never allows:
//   0xC0, 0xC1   - could only introduce an overlong encoding of an ASCII character
//   0xF5..0xF7   - could only introduce a codepoint above U+10FFFF
size_t Utf8_Num_Bytes(char lead)
{
	// Use the unsigned value so the range comparisons are independent of char signedness.
	const unsigned char byte = static_cast<unsigned char>(lead);

	if (byte < 0x80) return 1; // 0x00..0x7F: single byte (ASCII)
	if (byte < 0xC2) return 0; // 0x80..0xBF: continuation byte; 0xC0..0xC1: overlong only
	if (byte < 0xE0) return 2; // 0xC2..0xDF
	if (byte < 0xF0) return 3; // 0xE0..0xEF
	if (byte < 0xF5) return 4; // 0xF0..0xF4
	return 0;                  // 0xF5..0xFF: beyond U+10FFFF or not a UTF-8 lead at all
}
40+
41+
size_t Utf8_Trailing_Invalid_Bytes(const char* str, size_t length)
42+
{
43+
if (length == 0)
44+
return 0;
45+
46+
size_t i = length;
47+
while (i > 0 && Is_Trail_Byte(str[i - 1]))
48+
--i;
49+
50+
if (i == 0)
51+
return length;
52+
53+
size_t claimed = Utf8_Num_Bytes(str[i - 1]);
54+
size_t actual = length - (i - 1);
55+
56+
if (claimed == 0 || claimed != actual)
57+
return actual;
58+
59+
return 0;
60+
}
61+
62+
bool Utf8_Validate(const char* str)
63+
{
64+
return Utf8_Validate(str, strlen(str));
65+
}
66+
67+
// Returns true if the first length bytes of str form well-formed UTF-8 as
// defined by RFC 3629, false otherwise. An empty range is valid. Bytes with
// value 0 inside the range are treated as ordinary single-byte characters.
//
// Rejected input:
//   - continuation bytes without a lead byte and lead bytes that are never valid
//     (0xC0, 0xC1, 0xF5..0xFF)
//   - sequences cut short by the end of the range
//   - overlong encodings (0xE0 0x80..0x9F, 0xF0 0x80..0x8F)
//   - UTF-16 surrogates U+D800..U+DFFF (0xED 0xA0..0xBF)
//   - codepoints above U+10FFFF (0xF4 0x90..0xBF)
bool Utf8_Validate(const char* str, size_t length)
{
	const unsigned char* s = (const unsigned char*)str;
	size_t i = 0;
	while (i < length)
	{
		const unsigned char lead = s[i];

		// Single-byte (ASCII) character.
		if (lead < 0x80)
		{
			++i;
			continue;
		}

		// Determine the sequence length and the range allowed for the second
		// byte. The second byte is where overlong forms, surrogates and
		// out-of-range codepoints are detected; it defaults to any trail byte.
		size_t bytes = 0;
		unsigned char minSecond = 0x80;
		unsigned char maxSecond = 0xBF;

		if (lead >= 0xC2 && lead <= 0xDF)
		{
			bytes = 2;
		}
		else if (lead >= 0xE0 && lead <= 0xEF)
		{
			bytes = 3;
			if (lead == 0xE0)
				minSecond = 0xA0; // below this the value would fit in two bytes (overlong)
			else if (lead == 0xED)
				maxSecond = 0x9F; // above this lies the surrogate range U+D800..U+DFFF
		}
		else if (lead >= 0xF0 && lead <= 0xF4)
		{
			bytes = 4;
			if (lead == 0xF0)
				minSecond = 0x90; // below this the value would fit in three bytes (overlong)
			else if (lead == 0xF4)
				maxSecond = 0x8F; // above this the codepoint exceeds U+10FFFF
		}
		else
		{
			// Stray continuation byte or a byte that can never start a sequence.
			return false;
		}

		// Sequence must fit in the remaining bytes. Written as a subtraction so
		// the check cannot wrap around for lengths close to the size_t maximum.
		if (bytes > length - i)
			return false;

		if (s[i + 1] < minSecond || s[i + 1] > maxSecond)
			return false;

		// Remaining bytes only need to be plain continuation bytes (10xxxxxx).
		for (size_t j = 2; j < bytes; ++j)
		{
			if ((s[i + j] & 0xC0) != 0x80)
				return false;
		}

		i += bytes;
	}
	return true;
}
99+
100+
size_t Get_Utf8_Size(const wchar_t* src, size_t srcLen)
101+
{
102+
int bytes = WideCharToMultiByte(CP_UTF8, 0, src, (int)srcLen, nullptr, 0, nullptr, nullptr);
103+
return (bytes > 0) ? (size_t)bytes : 0;
104+
}
105+
106+
size_t Get_Unicode_Size(const char* src, size_t srcLen)
107+
{
108+
int wchars = MultiByteToWideChar(CP_UTF8, 0, src, (int)srcLen, nullptr, 0);
109+
return (wchars > 0) ? (size_t)wchars : 0;
110+
}
111+
112+
bool Unicode_To_Utf8(char* dest, const wchar_t* src, size_t srcLen, size_t destSize)
113+
{
114+
if (destSize == 0)
115+
return false;
116+
int result = WideCharToMultiByte(CP_UTF8, 0, src, (int)srcLen, dest, (int)destSize, nullptr, nullptr);
117+
if (result == 0)
118+
dest[0] = '\0';
119+
return result != 0;
120+
}
121+
122+
bool Utf8_To_Unicode(wchar_t* dest, const char* src, size_t srcLen, size_t destSize)
123+
{
124+
if (destSize == 0)
125+
return false;
126+
int result = MultiByteToWideChar(CP_UTF8, 0, src, (int)srcLen, dest, (int)destSize);
127+
if (result == 0)
128+
dest[0] = L'\0';
129+
return result != 0;
130+
}
131+
132+
#else
133+
#error "Not implemented"
134+
#endif
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/*
2+
** Command & Conquer Generals Zero Hour(tm)
3+
** Copyright 2026 TheSuperHackers
4+
**
5+
** This program is free software: you can redistribute it and/or modify
6+
** it under the terms of the GNU General Public License as published by
7+
** the Free Software Foundation, either version 3 of the License, or
8+
** (at your option) any later version.
9+
**
10+
** This program is distributed in the hope that it will be useful,
11+
** but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
** GNU General Public License for more details.
14+
**
15+
** You should have received a copy of the GNU General Public License
16+
** along with this program. If not, see <http://www.gnu.org/licenses/>.
17+
*/
18+
19+
#pragma once
20+
21+
#include <stddef.h>
22+
#include <wchar.h>
23+
24+
// Returns the number of bytes in a UTF-8 character based on its lead byte.
25+
// Returns 0 if the lead byte is invalid.
26+
size_t Utf8_Num_Bytes(char lead);
27+
28+
// Returns the number of invalid bytes at the end of the string due to an
29+
// incomplete multi-byte sequence. Returns 0 if the string ends on a complete sequence.
30+
size_t Utf8_Trailing_Invalid_Bytes(const char* str, size_t length);
31+
32+
// Returns true if the string is valid UTF-8, false otherwise. The first overload reads a null-terminated string; the second checks exactly length bytes.
33+
bool Utf8_Validate(const char* str);
34+
bool Utf8_Validate(const char* str, size_t length);
35+
36+
// Returns the number of bytes in the UTF-8 representation of srcLen wide characters
37+
// from src. Returns 0 on failure or if srcLen is 0.
38+
size_t Get_Utf8_Size(const wchar_t* src, size_t srcLen);
39+
40+
// Returns the number of wchar_t elements in the wide character representation of
41+
// srcLen bytes from the UTF-8 string src. Returns 0 on failure or if srcLen is 0.
42+
size_t Get_Unicode_Size(const char* src, size_t srcLen);
43+
44+
// Converts srcLen wide characters from src to UTF-8. destSize is in bytes.
45+
// Does not write a null terminator. Caller must allocate destSize + 1 and
46+
// write the terminator if one is needed. Returns true on success, false on failure.
47+
// On failure, dest[0] is set to '\0'.
48+
bool Unicode_To_Utf8(char* dest, const wchar_t* src, size_t srcLen, size_t destSize);
49+
50+
// Converts srcLen bytes from the UTF-8 string src to wide characters. destSize is in wchar_t elements.
51+
// Does not write a null terminator. Caller must allocate destSize + 1 and
52+
// write the terminator if one is needed. Returns true on success, false on failure.
53+
// On failure, dest[0] is set to L'\0'.
54+
bool Utf8_To_Unicode(wchar_t* dest, const char* src, size_t srcLen, size_t destSize);

0 commit comments

Comments
 (0)