From 256fc540f5a5f0e8619616c3f8641f1d02fd2bee Mon Sep 17 00:00:00 2001 From: Joachim Rosskopf Date: Fri, 26 Jun 2026 23:16:46 +0200 Subject: [PATCH] fix: Correct REST JSON serialization of UUID/HUGEINT/BLOB/BIT (#89) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Type-coverage audit follow-up. Several scalar types were mis-routed in the REST serializer: - UUID -> read via the VARCHAR path (as a string pointer) but is physically a 128-bit int, causing a SEGFAULT on any UUID column. Now formatted as the canonical 8-4-4-4-12 hex string. - HUGEINT/UHUGEINT -> read as 64-bit ints, truncating values and mis-striding multi-row chunks (e.g. MAX::HUGEINT -> -1). Now emitted as exact decimal strings (lossless; JSON numbers lose precision above 2^53). - BLOB -> emitted raw bytes (invalid UTF-8 / invalid JSON). Now uses DuckDB's blob string form (printable as-is, others as \xNN). - BIT -> emitted raw bit storage as garbage. Now emitted as its 0/1 string. All honor row validity (NULL -> null). Adds [scalar_types] regression tests. VARINT/BIGNUM, GEOMETRY and VARIANT remain serialized as null (internal/extension encodings not safely convertible at the vector level) — documented as a known limitation. Found via type audit + codex review. --- src/include/query_executor.hpp | 5 ++ src/query_executor.cpp | 147 +++++++++++++++++++++++++++++-- test/cpp/query_executor_test.cpp | 84 ++++++++++++++++++ 3 files changed, 231 insertions(+), 5 deletions(-) diff --git a/src/include/query_executor.hpp b/src/include/query_executor.hpp index 0ca23d9..e748149 100644 --- a/src/include/query_executor.hpp +++ b/src/include/query_executor.hpp @@ -55,6 +55,11 @@ class QueryResult { static crow::json::wvalue convertVectorTimeToJson(const duckdb_vector &vector, const idx_t row_idx); static crow::json::wvalue convertVectorIntervalToJson(const duckdb_vector &vector, const idx_t row_idx); static crow::json::wvalue convertVectorEnumToJson(const duckdb_vector &vector, const idx_t row_idx); + static crow::json::wvalue convertVectorUuidToJson(const duckdb_vector &vector, const idx_t row_idx); + static crow::json::wvalue convertVectorHugeintToJson(const duckdb_vector &vector, const idx_t row_idx); + static crow::json::wvalue convertVectorUhugeintToJson(const duckdb_vector &vector, const idx_t row_idx); + static crow::json::wvalue convertVectorBlobToJson(const duckdb_vector &vector, const idx_t row_idx); + static crow::json::wvalue convertVectorBitToJson(const duckdb_vector &vector, const idx_t row_idx); static crow::json::wvalue convertVectorListToJson(const duckdb_vector &vector, const idx_t row_idx); static crow::json::wvalue convertVectorArrayToJson(const duckdb_vector &vector, const idx_t row_idx); static crow::json::wvalue convertVectorStructToJson(const duckdb_vector &vector, const idx_t row_idx); diff --git a/src/query_executor.cpp b/src/query_executor.cpp index c24dbb5..3dc5f1e 100644 --- a/src/query_executor.cpp +++ b/src/query_executor.cpp @@ -234,7 +234,7 @@ crow::json::wvalue QueryResult::convertVectorEntryToJson(const duckdb_vector &ve case DUCKDB_TYPE_BIGINT: return convertVectorEntryToJson(vector, row_idx); case DUCKDB_TYPE_HUGEINT: - return convertVectorEntryToJson(vector, row_idx); + return convertVectorHugeintToJson(vector, row_idx); case DUCKDB_TYPE_FLOAT: return convertVectorEntryToJson(vector, row_idx); case DUCKDB_TYPE_DOUBLE: @@ -278,13 +278,13 @@ crow::json::wvalue QueryResult::convertVectorEntryToJson(const duckdb_vector &ve case DUCKDB_TYPE_UBIGINT: return convertVectorEntryToJson(vector, row_idx); case DUCKDB_TYPE_UHUGEINT: - return convertVectorEntryToJson(vector, row_idx); // Treat as uint64 + return convertVectorUhugeintToJson(vector, row_idx); case DUCKDB_TYPE_BLOB: - return convertVectorVarcharToJson(vector, row_idx); // Treat as string for JSON + return convertVectorBlobToJson(vector, row_idx); case DUCKDB_TYPE_UUID: - return convertVectorVarcharToJson(vector, row_idx); // Treat as string for JSON + return convertVectorUuidToJson(vector, row_idx); case DUCKDB_TYPE_BIT: - return convertVectorVarcharToJson(vector, row_idx); // Treat as string for JSON + return convertVectorBitToJson(vector, row_idx); case DUCKDB_TYPE_MAP: return convertVectorMapToJson(vector, row_idx); case DUCKDB_TYPE_ARRAY: @@ -422,6 +422,143 @@ crow::json::wvalue QueryResult::convertVectorEnumToJson(const duckdb_vector &vec return crow::json::wvalue(str_val); } +crow::json::wvalue QueryResult::convertVectorUuidToJson(const duckdb_vector &vector, const idx_t row_idx) { + auto validity = duckdb_vector_get_validity(vector); + if (!duckdb_validity_row_is_valid(validity, row_idx)) { + return crow::json::wvalue(nullptr); + } + + // UUID is physically a 128-bit integer (hugeint), NOT a string — reading + // it via the VARCHAR path dereferences garbage and crashes (issue #89). + // Format it the way DuckDB's UUID::ToString does: flip the stored MSB + // back, then emit the canonical 8-4-4-4-12 lowercase hex form. + auto data = static_cast(duckdb_vector_get_data(vector)); + auto value = data[row_idx]; + uint64_t upper = static_cast(value.upper) ^ (uint64_t(1) << 63); + uint64_t lower = value.lower; + + static const char *HEX = "0123456789abcdef"; + char buf[36]; + idx_t pos = 0; + auto byte_to_hex = [&](uint64_t byte_val) { + buf[pos++] = HEX[(byte_val >> 4) & 0xf]; + buf[pos++] = HEX[byte_val & 0xf]; + }; + byte_to_hex((upper >> 56) & 0xFF); + byte_to_hex((upper >> 48) & 0xFF); + byte_to_hex((upper >> 40) & 0xFF); + byte_to_hex((upper >> 32) & 0xFF); + buf[pos++] = '-'; + byte_to_hex((upper >> 24) & 0xFF); + byte_to_hex((upper >> 16) & 0xFF); + buf[pos++] = '-'; + byte_to_hex((upper >> 8) & 0xFF); + byte_to_hex(upper & 0xFF); + buf[pos++] = '-'; + byte_to_hex((lower >> 56) & 0xFF); + byte_to_hex((lower >> 48) & 0xFF); + buf[pos++] = '-'; + byte_to_hex((lower >> 40) & 0xFF); + byte_to_hex((lower >> 32) & 0xFF); + byte_to_hex((lower >> 24) & 0xFF); + byte_to_hex((lower >> 16) & 0xFF); + byte_to_hex((lower >> 8) & 0xFF); + byte_to_hex(lower & 0xFF); + + return crow::json::wvalue(std::string(buf, pos)); +} + +crow::json::wvalue QueryResult::convertVectorHugeintToJson(const duckdb_vector &vector, const idx_t row_idx) { + auto validity = duckdb_vector_get_validity(vector); + if (!duckdb_validity_row_is_valid(validity, row_idx)) { + return crow::json::wvalue(nullptr); + } + + // HUGEINT is 128-bit: reading it as int64 truncates and mis-strides for + // multi-row chunks (issue #89). Emit the exact decimal as a JSON string so + // values beyond 2^63 survive (JSON consumers lose precision above 2^53). + auto data = static_cast(duckdb_vector_get_data(vector)); + auto value = duckdb_create_hugeint(data[row_idx]); + DuckDBString str(duckdb_value_to_string(value)); + duckdb_destroy_value(&value); + return crow::json::wvalue(str.to_string()); +} + +crow::json::wvalue QueryResult::convertVectorUhugeintToJson(const duckdb_vector &vector, const idx_t row_idx) { + auto validity = duckdb_vector_get_validity(vector); + if (!duckdb_validity_row_is_valid(validity, row_idx)) { + return crow::json::wvalue(nullptr); + } + + // UHUGEINT is unsigned 128-bit; emit as a decimal string for the same + // precision reasons as HUGEINT. + auto data = static_cast(duckdb_vector_get_data(vector)); + auto value = duckdb_create_uhugeint(data[row_idx]); + DuckDBString str(duckdb_value_to_string(value)); + duckdb_destroy_value(&value); + return crow::json::wvalue(str.to_string()); +} + +crow::json::wvalue QueryResult::convertVectorBlobToJson(const duckdb_vector &vector, const idx_t row_idx) { + auto validity = duckdb_vector_get_validity(vector); + if (!duckdb_validity_row_is_valid(validity, row_idx)) { + return crow::json::wvalue(nullptr); + } + + // A BLOB's bytes are arbitrary binary; emitting them raw via the VARCHAR + // path can produce invalid UTF-8 / invalid JSON. Render DuckDB's own blob + // string form (printable bytes as-is, others as \xNN), matching to_json / + // CAST AS VARCHAR. (duckdb_value_to_string would add a '...'::BLOB wrapper.) + auto data = static_cast(duckdb_vector_get_data(vector)); + auto length = duckdb_string_is_inlined(data[row_idx]) + ? data[row_idx].value.inlined.length + : data[row_idx].value.pointer.length; + auto bytes = duckdb_string_is_inlined(data[row_idx]) + ? reinterpret_cast(data[row_idx].value.inlined.inlined) + : reinterpret_cast(data[row_idx].value.pointer.ptr); + + static const char *HEX = "0123456789ABCDEF"; + std::string out; + out.reserve(length); + for (idx_t i = 0; i < length; i++) { + uint8_t c = bytes[i]; + bool regular = c >= 32 && c <= 126 && c != '\\' && c != '\'' && c != '"'; + if (regular) { + out.push_back(static_cast(c)); + } else { + out.push_back('\\'); + out.push_back('x'); + out.push_back(HEX[(c >> 4) & 0xF]); + out.push_back(HEX[c & 0xF]); + } + } + return crow::json::wvalue(out); +} + +crow::json::wvalue QueryResult::convertVectorBitToJson(const duckdb_vector &vector, const idx_t row_idx) { + auto validity = duckdb_vector_get_validity(vector); + if (!duckdb_validity_row_is_valid(validity, row_idx)) { + return crow::json::wvalue(nullptr); + } + + // BIT is stored as its internal bitstring blob; the VARCHAR path renders + // that raw storage (padding byte + bytes) as garbage. Round-trip through a + // BIT value so it serializes as a "0101" string, matching to_json. + auto data = static_cast(duckdb_vector_get_data(vector)); + auto length = duckdb_string_is_inlined(data[row_idx]) + ? data[row_idx].value.inlined.length + : data[row_idx].value.pointer.length; + auto bytes = duckdb_string_is_inlined(data[row_idx]) + ? reinterpret_cast(data[row_idx].value.inlined.inlined) + : reinterpret_cast(data[row_idx].value.pointer.ptr); + + duckdb_bit bit { const_cast(bytes), length }; + auto value = duckdb_create_bit(bit); + DuckDBString str(duckdb_value_to_string(value)); + duckdb_destroy_value(&value); + return crow::json::wvalue(str.to_string()); +} + crow::json::wvalue QueryResult::convertVectorListToJson(const duckdb_vector &vector, const idx_t row_idx) { auto validity = duckdb_vector_get_validity(vector); if (!duckdb_validity_row_is_valid(validity, row_idx)) { diff --git a/test/cpp/query_executor_test.cpp b/test/cpp/query_executor_test.cpp index 1b0e3c7..8522daa 100644 --- a/test/cpp/query_executor_test.cpp +++ b/test/cpp/query_executor_test.cpp @@ -199,6 +199,90 @@ TEST_CASE("QueryExecutor type coverage", "[query_executor]") { duckdb_close(&database); } +TEST_CASE("QueryExecutor scalar type coverage", "[query_executor][scalar_types]") { + // Regression for issue #89 type audit: UUID previously crashed (read as a + // string pointer), HUGEINT/UHUGEINT silently truncated, BLOB/BIT emitted + // garbage. All now match DuckDB's own string/JSON representation. + duckdb_database database; + REQUIRE(duckdb_open(NULL, &database) == DuckDBSuccess); + + SECTION("UUID serializes as canonical string (no crash)") { + QueryExecutor executor(database); + executor.execute("SELECT '12345678-1234-5678-1234-567812345678'::UUID AS v"); + auto doc = crow::json::load(executor.toJson().dump()); + REQUIRE(doc.size() == 1); + REQUIRE(doc[0]["v"].t() == crow::json::type::String); + REQUIRE(doc[0]["v"].s() == "12345678-1234-5678-1234-567812345678"); + } + + SECTION("multi-row UUID keeps each row's own value") { + QueryExecutor executor(database); + executor.execute(R"SQL( + SELECT * FROM (VALUES + ('11111111-1111-1111-1111-111111111111'::UUID), + ('22222222-2222-2222-2222-222222222222'::UUID) + ) t(v) ORDER BY v + )SQL"); + auto doc = crow::json::load(executor.toJson().dump()); + REQUIRE(doc.size() == 2); + REQUIRE(doc[0]["v"].s() == "11111111-1111-1111-1111-111111111111"); + REQUIRE(doc[1]["v"].s() == "22222222-2222-2222-2222-222222222222"); + } + + SECTION("HUGEINT emits exact decimal string") { + QueryExecutor executor(database); + executor.execute(R"SQL( + SELECT * FROM (VALUES + (1, 1::HUGEINT), + (2, 170141183460469231731687303715884105727::HUGEINT), + (3, (-9999999999999999999)::HUGEINT) + ) t(id, v) ORDER BY id + )SQL"); + auto doc = crow::json::load(executor.toJson().dump()); + REQUIRE(doc.size() == 3); + REQUIRE(doc[0]["v"].s() == "1"); + REQUIRE(doc[1]["v"].s() == "170141183460469231731687303715884105727"); + REQUIRE(doc[2]["v"].s() == "-9999999999999999999"); + } + + SECTION("UHUGEINT emits exact decimal string") { + QueryExecutor executor(database); + executor.execute("SELECT 340282366920938463463374607431768211455::UHUGEINT AS v"); + auto doc = crow::json::load(executor.toJson().dump()); + REQUIRE(doc[0]["v"].s() == "340282366920938463463374607431768211455"); + } + + SECTION("BLOB emits DuckDB blob string, not garbled bytes") { + QueryExecutor executor(database); + executor.execute("SELECT '\\xAA\\xBB'::BLOB AS v"); + auto doc = crow::json::load(executor.toJson().dump()); + REQUIRE(doc[0]["v"].t() == crow::json::type::String); + REQUIRE(doc[0]["v"].s() == "\\xAA\\xBB"); + } + + SECTION("BIT emits its 0/1 string") { + QueryExecutor executor(database); + executor.execute("SELECT '101010'::BIT AS v"); + auto doc = crow::json::load(executor.toJson().dump()); + REQUIRE(doc[0]["v"].s() == "101010"); + } + + SECTION("NULL values of these types stay null") { + QueryExecutor executor(database); + executor.execute(R"SQL( + SELECT CAST(NULL AS UUID) AS u, CAST(NULL AS HUGEINT) AS h, + CAST(NULL AS BLOB) AS b, CAST(NULL AS BIT) AS t + )SQL"); + auto doc = crow::json::load(executor.toJson().dump()); + REQUIRE(doc[0]["u"].t() == crow::json::type::Null); + REQUIRE(doc[0]["h"].t() == crow::json::type::Null); + REQUIRE(doc[0]["b"].t() == crow::json::type::Null); + REQUIRE(doc[0]["t"].t() == crow::json::type::Null); + } + + duckdb_close(&database); +} + TEST_CASE("QueryExecutor chunk experiment", "[query_executor]") { duckdb_database database; REQUIRE(duckdb_open(NULL, &database) == DuckDBSuccess);