feat(utf8): add UTF-8 string conversion and validation functions

bobtista · bobtista · commit 40393b877dac · 2026-04-06T16:01:02.000-04:00
diff --git a/Core/GameEngine/Source/Common/System/AsciiString.cpp b/Core/GameEngine/Source/Common/System/AsciiString.cpp
@@ -45,6 +45,7 @@
 #include "PreRTS.h"	// This must go first in EVERY cpp file in the GameEngine
 
 #include "Common/CriticalSection.h"
+#include "utf8.h"
 
 
 // -----------------------------------------------------
@@ -137,8 +138,8 @@ void AsciiString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveData
 			// TheSuperHackers @fix Mauller 04/04/2025 Replace strcpy with safer memmove as memory regions can overlap when part of string is copied to itself
 			DEBUG_ASSERTCRASH(usableNumChars <= strlen(strToCopy), ("strToCopy is too small"));
 			memmove(m_data->peek(), strToCopy, usableNumChars);
-			m_data->peek()[usableNumChars] = 0;
 		}
+		m_data->peek()[usableNumChars] = 0;
 		if (strToCat)
 			strcat(m_data->peek(), strToCat);
 		return;
@@ -166,8 +167,8 @@ void AsciiString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveData
 	{
 		DEBUG_ASSERTCRASH(usableNumChars <= strlen(strToCopy), ("strToCopy is too small"));
 		strncpy(newData->peek(), strToCopy, usableNumChars);
-		newData->peek()[usableNumChars] = 0;
 	}
+	newData->peek()[usableNumChars] = 0;
 	if (strToCat)
 		strcat(newData->peek(), strToCat);
 
@@ -272,11 +273,19 @@ char*  AsciiString::getBufferForRead(Int len)
 void AsciiString::translate(const UnicodeString& stringSrc)
 {
 	validate();
-	/// @todo srj put in a real translation here; this will only work for 7-bit ascii
-	clear();
-	Int len = stringSrc.getLength();
-	for (Int i = 0; i < len; i++)
-		concat((char)stringSrc.getCharAt(i));
+	// TheSuperHackers @fix bobtista 02/04/2026 Implement UTF-8 conversion replacing 7-bit ASCII only implementation
+	const WideChar* src = stringSrc.str();
+	size_t srcLen = wcslen(src);
+	size_t size = Get_Utf8_Size(src, srcLen); // UTF-8 byte count of the source; 0 means empty input or a failed size query
+	if (size == 0)
+	{
+		clear();
+		return;
+	}
+	ensureUniqueBufferOfSize((Int)size + 1, false, nullptr, nullptr); // +1 for the terminator, which Unicode_To_Utf8 does not write itself
+	char* buf = peek();
+	if (!Unicode_To_Utf8(buf, src, srcLen, size))
+		clear(); // conversion failed: end up with an empty string rather than partial output
 	validate();
 }
 
diff --git a/Core/GameEngine/Source/Common/System/UnicodeString.cpp b/Core/GameEngine/Source/Common/System/UnicodeString.cpp
@@ -45,6 +45,7 @@
 #include "PreRTS.h"	// This must go first in EVERY cpp file in the GameEngine
 
 #include "Common/CriticalSection.h"
+#include "utf8.h"
 
 
 // -----------------------------------------------------
@@ -88,8 +89,8 @@ void UnicodeString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveDa
 			// TheSuperHackers @fix Mauller 04/04/2025 Replace wcscpy with safer memmove as memory regions can overlap when part of string is copied to itself
 			DEBUG_ASSERTCRASH(usableNumChars <= wcslen(strToCopy), ("strToCopy is too small"));
 			memmove(m_data->peek(), strToCopy, usableNumChars * sizeof(WideChar));
-			m_data->peek()[usableNumChars] = 0;
 		}
+		m_data->peek()[usableNumChars] = 0;
 		if (strToCat)
 			wcscat(m_data->peek(), strToCat);
 		return;
@@ -117,8 +118,8 @@ void UnicodeString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveDa
 	{
 		DEBUG_ASSERTCRASH(usableNumChars <= wcslen(strToCopy), ("strToCopy is too small"));
 		wcsncpy(newData->peek(), strToCopy, usableNumChars);
-		newData->peek()[usableNumChars] = 0;
 	}
+	newData->peek()[usableNumChars] = 0;
 	if (strToCat)
 		wcscat(newData->peek(), strToCat);
 
@@ -221,11 +222,19 @@ WideChar* UnicodeString::getBufferForRead(Int len)
 void UnicodeString::translate(const AsciiString& stringSrc)
 {
 	validate();
-	/// @todo srj put in a real translation here; this will only work for 7-bit ascii
-	clear();
-	Int len = stringSrc.getLength();
-	for (Int i = 0; i < len; i++)
-		concat((WideChar)stringSrc.getCharAt(i));
+	// TheSuperHackers @fix bobtista 02/04/2026 Implement UTF-8 conversion replacing 7-bit ASCII only implementation
+	const char* src = stringSrc.str();
+	size_t srcLen = strlen(src);
+	size_t size = Get_Unicode_Size(src, srcLen); // wchar_t count of the decoded source; 0 means empty input or a failed size query
+	if (size == 0)
+	{
+		clear();
+		return;
+	}
+	ensureUniqueBufferOfSize((Int)size + 1, false, nullptr, nullptr); // +1 for the terminator, which Utf8_To_Unicode does not write itself
+	WideChar* buf = peek();
+	if (!Utf8_To_Unicode(buf, src, srcLen, size))
+		clear(); // conversion failed: end up with an empty string rather than partial output
 	validate();
 }
 
diff --git a/Core/GameEngine/Source/GameNetwork/GameSpy/Thread/ThreadUtils.cpp b/Core/GameEngine/Source/GameNetwork/GameSpy/Thread/ThreadUtils.cpp
@@ -28,18 +28,24 @@
 
 #include "PreRTS.h"	// This must go first in EVERY cpp file in the GameEngine
 
+#include "utf8.h"
+
 //-------------------------------------------------------------------------
 
+// TheSuperHackers @refactor bobtista 02/04/2026 Use WWLib UTF-8 functions instead of raw Win32 API calls
 std::wstring MultiByteToWideCharSingleLine( const char *orig )
 {
-	Int len = strlen(orig);
-	WideChar *dest = NEW WideChar[len+1];
-
-	MultiByteToWideChar(CP_UTF8, 0, orig, -1, dest, len);
+	size_t srcLen = strlen(orig);
+	size_t size = Get_Unicode_Size(orig, srcLen);
+	if (size == 0)
+		return std::wstring();
+	std::wstring ret;
+	ret.resize(size);
+	Utf8_To_Unicode(&ret[0], orig, srcLen, size);
 	WideChar *c = nullptr;
 	do
 	{
-		c = wcschr(dest, L'\n');
+		c = wcschr(&ret[0], L'\n');
 		if (c)
 		{
 			*c = L' ';
@@ -48,32 +54,26 @@ std::wstring MultiByteToWideCharSingleLine( const char *orig )
 	while ( c != nullptr );
 	do
 	{
-		c = wcschr(dest, L'\r');
+		c = wcschr(&ret[0], L'\r');
 		if (c)
 		{
 			*c = L' ';
 		}
 	}
 	while ( c != nullptr );
 
-	dest[len] = 0;
-	std::wstring ret = dest;
-	delete[] dest;
 	return ret;
 }
 
 std::string WideCharStringToMultiByte( const WideChar *orig )
 {
+	size_t srcLen = wcslen(orig); // orig must be a null-terminated wide string
+	size_t size = Get_Utf8_Size(orig, srcLen); // UTF-8 byte count; 0 for empty input or a failed size query
+	if (size == 0)
+		return std::string();
 	std::string ret;
-	Int len = WideCharToMultiByte( CP_UTF8, 0, orig, wcslen(orig), nullptr, 0, nullptr, nullptr ) + 1;
-	if (len > 0)
-	{
-		char *dest = NEW char[len];
-		WideCharToMultiByte( CP_UTF8, 0, orig, -1, dest, len, nullptr, nullptr );
-		dest[len-1] = 0;
-		ret = dest;
-		delete[] dest;
-	}
+	ret.resize(size);
+	if (!Unicode_To_Utf8(&ret[0], orig, srcLen, size)) ret.clear(); // on failure return "" instead of a size-byte buffer of junk/NUL data
 	return ret;
 }
 
diff --git a/Core/Libraries/Source/WWVegas/WWLib/CMakeLists.txt b/Core/Libraries/Source/WWVegas/WWLib/CMakeLists.txt
@@ -133,6 +133,8 @@ set(WWLIB_SRC
     trim.cpp
     trim.h
     uarray.h
+    utf8.cpp
+    utf8.h
     vector.cpp
     Vector.h
     visualc.h
diff --git a/Core/Libraries/Source/WWVegas/WWLib/utf8.cpp b/Core/Libraries/Source/WWVegas/WWLib/utf8.cpp
@@ -0,0 +1,134 @@
+/*
+**	Command & Conquer Generals Zero Hour(tm)
+**	Copyright 2026 TheSuperHackers
+**
+**	This program is free software: you can redistribute it and/or modify
+**	it under the terms of the GNU General Public License as published by
+**	the Free Software Foundation, either version 3 of the License, or
+**	(at your option) any later version.
+**
+**	This program is distributed in the hope that it will be useful,
+**	but WITHOUT ANY WARRANTY; without even the implied warranty of
+**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+**	GNU General Public License for more details.
+**
+**	You should have received a copy of the GNU General Public License
+**	along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "always.h"
+#include "utf8.h"
+
+#include <string.h>
+
+#ifdef _WIN32
+#include <windows.h>
+
+static bool Is_Trail_Byte(char c)
+{
+	return ((unsigned char)c >> 6) == 0x02; // UTF-8 continuation bytes have the bit pattern 10xxxxxx
+}
+
+// Sequence length implied by a UTF-8 lead byte; 0 when the byte cannot start a sequence.
+size_t Utf8_Num_Bytes(char lead)
+{
+	const unsigned char b = (unsigned char)lead;
+	if (b < 0x80) return 1; // 0xxxxxxx: single-byte character
+	if (b < 0xC0) return 0; // 10xxxxxx: continuation byte, not a lead
+	return (b < 0xE0) ? 2 : (b < 0xF0) ? 3 : (b < 0xF8) ? 4 : 0;
+}
+
+// Counts the bytes at the end of str[0..length) that do not belong to a complete UTF-8
+// sequence: a truncated final sequence, or continuation bytes that no lead byte accounts for.
+size_t Utf8_Trailing_Invalid_Bytes(const char* str, size_t length)
+{
+	// Walk back over continuation bytes to the byte that should lead the final sequence.
+	size_t i = length;
+	while (i > 0 && Is_Trail_Byte(str[i - 1]))
+		--i;
+
+	if (i == 0)
+		return length; // empty input (0), or nothing but continuation bytes
+
+	const size_t claimed = Utf8_Num_Bytes(str[i - 1]);
+	const size_t actual = length - (i - 1);
+
+	if (claimed == 0 || actual < claimed)
+		return actual; // invalid lead byte, or the final sequence is cut short
+
+	return actual - claimed; // stray continuation bytes after a complete sequence; 0 when it ends exactly
+}
+
+// Validates a null-terminated string by forwarding to the length-based overload.
+bool Utf8_Validate(const char* str)
+{
+	return Utf8_Validate(str, strlen(str));
+}
+
+// Returns true when str[0..length) is well-formed UTF-8 per RFC 3629; an empty range is valid.
+bool Utf8_Validate(const char* str, size_t length)
+{
+	const unsigned char* s = (const unsigned char*)str;
+	size_t i = 0;
+	while (i < length)
+	{
+		const size_t bytes = Utf8_Num_Bytes(str[i]);
+		if (bytes == 0 || bytes > length - i)
+			return false; // invalid lead byte, or the sequence runs past the end of the buffer
+		for (size_t j = 1; j < bytes; ++j)
+		{
+			if (!Is_Trail_Byte(str[i + j]))
+				return false;
+		}
+		// Reject overlong encodings
+		if (bytes == 2 && s[i] < 0xC2)
+			return false;
+		if (bytes == 3 && s[i] == 0xE0 && s[i + 1] < 0xA0)
+			return false;
+		if (bytes == 4 && s[i] == 0xF0 && s[i + 1] < 0x90)
+			return false;
+		// Reject UTF-16 surrogates U+D800..U+DFFF (ED A0..BF xx) and codepoints above U+10FFFF
+		if (bytes == 3 && s[i] == 0xED && s[i + 1] > 0x9F)
+			return false;
+		if (bytes == 4 && (s[i] > 0xF4 || (s[i] == 0xF4 && s[i + 1] > 0x8F)))
+			return false;
+		i += bytes;
+	}
+	return true;
+}
+
+size_t Get_Utf8_Size(const wchar_t* src, size_t srcLen)
+{
+	const int needed = WideCharToMultiByte(CP_UTF8, 0, src, (int)srcLen, nullptr, 0, nullptr, nullptr); // size query: no output buffer supplied
+	return (needed <= 0) ? 0 : (size_t)needed;
+}
+
+size_t Get_Unicode_Size(const char* src, size_t srcLen)
+{
+	const int needed = MultiByteToWideChar(CP_UTF8, 0, src, (int)srcLen, nullptr, 0); // size query: no output buffer supplied
+	return (needed <= 0) ? 0 : (size_t)needed;
+}
+
+bool Unicode_To_Utf8(char* dest, const wchar_t* src, size_t srcLen, size_t destSize)
+{
+	if (destSize == 0) // with a zero-sized destination the API call below would act as a size query
+		return false;
+	const bool converted = WideCharToMultiByte(CP_UTF8, 0, src, (int)srcLen, dest, (int)destSize, nullptr, nullptr) != 0;
+	if (!converted)
+		dest[0] = '\0'; // leave an empty string behind on failure
+	return converted;
+}
+
+bool Utf8_To_Unicode(wchar_t* dest, const char* src, size_t srcLen, size_t destSize)
+{
+	if (destSize == 0) // with a zero-sized destination the API call below would act as a size query
+		return false;
+	const bool converted = MultiByteToWideChar(CP_UTF8, 0, src, (int)srcLen, dest, (int)destSize) != 0;
+	if (!converted)
+		dest[0] = L'\0'; // leave an empty string behind on failure
+	return converted;
+}
+
+#else
+#error "Not implemented"
+#endif
diff --git a/Core/Libraries/Source/WWVegas/WWLib/utf8.h b/Core/Libraries/Source/WWVegas/WWLib/utf8.h
@@ -0,0 +1,54 @@
+/*
+**	Command & Conquer Generals Zero Hour(tm)
+**	Copyright 2026 TheSuperHackers
+**
+**	This program is free software: you can redistribute it and/or modify
+**	it under the terms of the GNU General Public License as published by
+**	the Free Software Foundation, either version 3 of the License, or
+**	(at your option) any later version.
+**
+**	This program is distributed in the hope that it will be useful,
+**	but WITHOUT ANY WARRANTY; without even the implied warranty of
+**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+**	GNU General Public License for more details.
+**
+**	You should have received a copy of the GNU General Public License
+**	along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <stddef.h>
+#include <wchar.h>
+
+// Returns the number of bytes in a UTF-8 character based on its lead byte.
+// Returns 0 if the lead byte is invalid.
+size_t Utf8_Num_Bytes(char lead);
+
+// Returns the number of invalid bytes at the end of the string due to an
+// incomplete multi-byte sequence. Returns 0 if the string ends on a complete sequence.
+size_t Utf8_Trailing_Invalid_Bytes(const char* str, size_t length);
+
+// Returns true if str (null-terminated, or its first length bytes for the second overload) is valid UTF-8, false otherwise.
+bool Utf8_Validate(const char* str);
+bool Utf8_Validate(const char* str, size_t length);
+
+// Returns the number of bytes in the UTF-8 representation of srcLen wide characters
+// from src. Returns 0 on failure or if srcLen is 0.
+size_t Get_Utf8_Size(const wchar_t* src, size_t srcLen);
+
+// Returns the number of wchar_t elements in the wide character representation of
+// srcLen bytes from the UTF-8 string src. Returns 0 on failure or if srcLen is 0.
+size_t Get_Unicode_Size(const char* src, size_t srcLen);
+
+// Converts srcLen wide characters from src to UTF-8. destSize is in bytes.
+// Does not write a null terminator. Caller must allocate destSize + 1 and
+// write the terminator if one is needed. Returns true on success, false on failure.
+// On failure, dest[0] is set to '\0'.
+bool Unicode_To_Utf8(char* dest, const wchar_t* src, size_t srcLen, size_t destSize);
+
+// Converts srcLen bytes from the UTF-8 string src to wide characters. destSize is in wchar_t elements.
+// Does not write a null terminator. Caller must allocate destSize + 1 and
+// write the terminator if one is needed. Returns true on success, false on failure.
+// On failure, dest[0] is set to L'\0'.
+bool Utf8_To_Unicode(wchar_t* dest, const char* src, size_t srcLen, size_t destSize);