diff --git a/_typos.toml b/_typos.toml index dae8089bf..6ed685629 100644 --- a/_typos.toml +++ b/_typos.toml @@ -1,5 +1,6 @@ [default.extend-words] # Intentional partial strings in test data and example error messages. +caf = "caf" hel = "hel" leve = "leve" # CSS class prefix for pipeline nodes in the dashboard. diff --git a/crates/logfwd-arrow/src/scanner.rs b/crates/logfwd-arrow/src/scanner.rs index ea9742a2c..cc05282eb 100644 --- a/crates/logfwd-arrow/src/scanner.rs +++ b/crates/logfwd-arrow/src/scanner.rs @@ -70,6 +70,10 @@ impl ScanBuilder for StreamingBuilder { self.append_str_by_idx(idx, v); } #[inline(always)] + fn append_decoded_str_by_idx(&mut self, idx: usize, v: &[u8]) { + self.append_decoded_str_by_idx(idx, v); + } + #[inline(always)] fn append_int_by_idx(&mut self, idx: usize, v: &[u8]) { self.append_int_by_idx(idx, v); } diff --git a/crates/logfwd-arrow/src/streaming_builder.rs b/crates/logfwd-arrow/src/streaming_builder.rs index 673320c0a..6b81dc8ef 100644 --- a/crates/logfwd-arrow/src/streaming_builder.rs +++ b/crates/logfwd-arrow/src/streaming_builder.rs @@ -27,7 +27,10 @@ use crate::check_dup_bits; struct FieldColumns { name: Vec, - /// String values: (row, offset_in_buffer, len). Views into the shared buffer. + /// String values: (row, offset_in_buffer, len). Views into the shared + /// buffer. Offsets `< buf.len()` reference the original input buffer; + /// offsets `>= buf.len()` reference the decoded-strings buffer at + /// `offset - buf.len()`. See `StreamingBuilder::decoded_buf`. str_views: Vec<(u32, u32, u32)>, /// Int values: (row, parsed_value). int_values: Vec<(u32, i64)>, @@ -99,6 +102,10 @@ pub struct StreamingBuilder { /// Reference-counted buffer. Stored here to compute offsets safely /// and shared with Arrow StringViewArrays in finish_batch. buf: bytes::Bytes, + /// Secondary buffer for decoded string values (JSON escape sequences). + /// String views with offsets `>= buf.len()` reference this buffer at + /// `offset - buf.len()`. 
Allocated lazily; empty when no escapes are decoded. + decoded_buf: Vec, /// When true, `append_raw` stores (offset, len) views for the `_raw` column. keep_raw: bool, /// Raw line views: (offset_in_buf, len) per row, in row order. @@ -123,6 +130,7 @@ impl StreamingBuilder { row_count: 0, written_bits: 0, buf: bytes::Bytes::new(), + decoded_buf: Vec::new(), keep_raw, raw_views: Vec::new(), state: BuilderState::Idle, @@ -148,6 +156,7 @@ impl StreamingBuilder { buf.len() ); self.buf = buf; + self.decoded_buf.clear(); self.row_count = 0; // Only clear the slots that were active in the previous batch. // This preserves the inner-Vec capacity of each FieldColumns for @@ -263,6 +272,36 @@ impl StreamingBuilder { .push((self.row_count, offset, value.len() as u32)); } + /// Append a decoded string value that is NOT a subslice of the input + /// buffer. Used for strings whose JSON escape sequences have been decoded + /// (see issue #410). Appends the bytes to `decoded_buf` and records a + /// view in the same `str_views` vector as regular strings, with the + /// offset shifted by `buf.len()` so that `finish_batch` can create a + /// combined Arrow buffer. + #[inline(always)] + pub fn append_decoded_str_by_idx(&mut self, idx: usize, value: &[u8]) { + debug_assert_eq!( + self.state, + BuilderState::InRow, + "append_decoded_str_by_idx called outside of a row" + ); + if check_dup_bits(&mut self.written_bits, idx) { + return; + } + if std::str::from_utf8(value).is_err() { + return; + } + let decoded_offset = self.decoded_buf.len() as u32; + self.decoded_buf.extend_from_slice(value); + // Offset into the combined buffer: original buf bytes come first, + // decoded bytes follow at buf.len() + decoded_offset. 
+ let combined_offset = self.buf.len() as u32 + decoded_offset; + let fc = &mut self.fields[idx]; + fc.has_str = true; + fc.str_views + .push((self.row_count, combined_offset, value.len() as u32)); + } + #[inline(always)] pub fn append_int_by_idx(&mut self, idx: usize, value: &[u8]) { debug_assert_eq!( @@ -328,8 +367,11 @@ impl StreamingBuilder { /// Build a RecordBatch with zero-copy StringViewArrays. /// - /// The resulting RecordBatch shares the input buffer via Bytes reference - /// counting -- no string data is copied. + /// When no JSON escape sequences were decoded, the resulting RecordBatch + /// shares the input buffer via Bytes reference counting (zero-copy). + /// When decoded strings exist, a combined buffer is built that appends + /// decoded bytes after the original input so that all str_views offsets + /// resolve into a single contiguous Arrow buffer. pub fn finish_batch(&mut self) -> Result { debug_assert_eq!( self.state, @@ -337,7 +379,22 @@ impl StreamingBuilder { "finish_batch called outside of a batch (call begin_batch first, and ensure all rows are closed with end_row)" ); let num_rows = self.row_count as usize; - let arrow_buf = Buffer::from(self.buf.clone()); + + // Build the Arrow buffer. When no decoded strings exist, this is + // zero-copy via Bytes refcount. When decoded strings are present, + // we concatenate the original buffer with the decoded buffer so that + // str_views offsets >= buf.len() resolve correctly. + let arrow_buf = if self.decoded_buf.is_empty() { + Buffer::from(self.buf.clone()) + } else { + let mut combined = Vec::with_capacity(self.buf.len() + self.decoded_buf.len()); + combined.extend_from_slice(&self.buf); + combined.extend_from_slice(&self.decoded_buf); + Buffer::from(combined) + }; + // Separate zero-copy buffer for _raw views (always into the original + // input buffer, never into decoded_buf). 
+ let raw_arrow_buf = Buffer::from(self.buf.clone()); let mut schema_fields: Vec = Vec::with_capacity(self.num_active); let mut arrays: Vec = Vec::with_capacity(self.num_active); @@ -510,7 +567,7 @@ impl StreamingBuilder { num_rows ); let mut builder = StringViewBuilder::new(); - let block = builder.append_block(arrow_buf.clone()); + let block = builder.append_block(raw_arrow_buf); for row in 0..num_rows { if row < self.raw_views.len() { diff --git a/crates/logfwd-core/src/json_scanner.rs b/crates/logfwd-core/src/json_scanner.rs index 9deeb798e..9a521b620 100644 --- a/crates/logfwd-core/src/json_scanner.rs +++ b/crates/logfwd-core/src/json_scanner.rs @@ -28,6 +28,11 @@ struct StoredBitmasks<'a> { /// structural positions are extracted from 64-byte block bitmasks /// consumed on the fly. /// +/// JSON escape sequences in string values (`\"`, `\\`, `\/`, `\b`, `\f`, +/// `\n`, `\r`, `\t`, `\uXXXX`) are decoded to their UTF-8 representation +/// during extraction. This prevents double-escaping when values are +/// re-serialized downstream (see issue #410). +/// /// # Preconditions /// - The caller must have already invoked `begin_batch` on the builder before /// this call (see [`ScanBuilder`] for the initialization contract). @@ -103,9 +108,13 @@ pub fn scan_streaming(buf: &[u8], config: &ScanConfig, builder: close_bracket: &close_bracket, }; + // Scratch buffer for decoding JSON escape sequences in string values. + // Allocated once, reused across lines via clear() — no per-line allocation. + let mut scratch = alloc::vec::Vec::new(); + // Phase 2: Scan each line using stored bitmasks for quote/nested lookups. 
for (start, end) in line_ranges { - scan_line(buf, start, end, &bitmasks, config, builder); + scan_line(buf, start, end, &bitmasks, config, builder, &mut scratch); } } @@ -117,6 +126,7 @@ fn scan_line( blocks: &StoredBitmasks<'_>, config: &ScanConfig, builder: &mut B, + scratch: &mut alloc::vec::Vec, ) { builder.begin_row(); if config.keep_raw { @@ -166,7 +176,7 @@ fn scan_line( let wanted = config.is_wanted(key); match buf[pos] { b'"' => { - // String value + // String value — decode JSON escape sequences (#410) let val_start = pos + 1; let val_end = match next_quote(pos + 1, end, blocks) { Some(p) => p, @@ -174,7 +184,13 @@ fn scan_line( }; if wanted { let idx = builder.resolve_field(key); - builder.append_str_by_idx(idx, &buf[val_start..val_end]); + let raw = &buf[val_start..val_end]; + if memchr::memchr(b'\\', raw).is_some() { + decode_json_escapes(raw, scratch); + builder.append_decoded_str_by_idx(idx, scratch); + } else { + builder.append_str_by_idx(idx, raw); + } } pos = val_end + 1; } @@ -394,6 +410,151 @@ fn skip_bare_value(buf: &[u8], mut pos: usize, end: usize) -> usize { pos } +// --------------------------------------------------------------------------- +// JSON string escape decoding (#410) +// --------------------------------------------------------------------------- + +/// Decode JSON escape sequences from `input` into `out`. +/// +/// Handles all RFC 8259 §7 escapes: `\"` `\\` `\/` `\b` `\f` `\n` `\r` `\t` +/// and `\uXXXX` (including surrogate pairs for supplementary code points). +/// +/// Invalid or truncated escape sequences are passed through unchanged +/// to avoid data loss on malformed input. +fn decode_json_escapes(input: &[u8], out: &mut alloc::vec::Vec) { + out.clear(); + // Decoded output is always ≤ input length (decoding shrinks escapes, never expands them). 
+ out.reserve(input.len()); + + let mut i = 0; + while i < input.len() { + if input[i] != b'\\' || i + 1 >= input.len() { + out.push(input[i]); + i += 1; + continue; + } + + match input[i + 1] { + b'"' => { + out.push(b'"'); + i += 2; + } + b'\\' => { + out.push(b'\\'); + i += 2; + } + b'/' => { + out.push(b'/'); + i += 2; + } + b'b' => { + out.push(0x08); + i += 2; + } + b'f' => { + out.push(0x0C); + i += 2; + } + b'n' => { + out.push(b'\n'); + i += 2; + } + b'r' => { + out.push(b'\r'); + i += 2; + } + b't' => { + out.push(b'\t'); + i += 2; + } + b'u' => { + i = decode_unicode_escape(input, i, out); + } + _ => { + // Unknown escape — pass through unchanged + out.push(b'\\'); + i += 1; + } + } + } +} + +/// Decode a `\uXXXX` escape (possibly a surrogate pair) starting at `pos`. +/// Appends the decoded UTF-8 bytes to `out` and returns the new position. +fn decode_unicode_escape(input: &[u8], pos: usize, out: &mut alloc::vec::Vec) -> usize { + // Need at least 6 bytes: \uXXXX + if pos + 6 > input.len() { + out.push(b'\\'); + return pos + 1; + } + let cp = match parse_hex4(&input[pos + 2..pos + 6]) { + Some(v) => v, + None => { + out.push(b'\\'); + return pos + 1; + } + }; + + // High surrogate — expect a following \uXXXX low surrogate + if (0xD800..=0xDBFF).contains(&cp) { + if pos + 12 <= input.len() && input[pos + 6] == b'\\' && input[pos + 7] == b'u' { + if let Some(lo) = parse_hex4(&input[pos + 8..pos + 12]) { + if (0xDC00..=0xDFFF).contains(&lo) { + let full = 0x10000 + ((cp as u32 - 0xD800) << 10) + (lo as u32 - 0xDC00); + if let Some(c) = char::from_u32(full) { + let mut utf8 = [0u8; 4]; + let s = c.encode_utf8(&mut utf8); + out.extend_from_slice(s.as_bytes()); + return pos + 12; + } + } + } + } + // Unpaired high surrogate — pass through raw + out.extend_from_slice(&input[pos..pos + 6]); + return pos + 6; + } + + // Lone low surrogate — pass through raw + if (0xDC00..=0xDFFF).contains(&cp) { + out.extend_from_slice(&input[pos..pos + 6]); + return pos + 6; + 
} + + // BMP code point + if let Some(c) = char::from_u32(cp as u32) { + let mut utf8 = [0u8; 4]; + let s = c.encode_utf8(&mut utf8); + out.extend_from_slice(s.as_bytes()); + pos + 6 + } else { + // Invalid code point — pass through raw + out.extend_from_slice(&input[pos..pos + 6]); + pos + 6 + } +} + +/// Parse 4 ASCII hex digits into a `u16`. +#[inline] +fn parse_hex4(bytes: &[u8]) -> Option { + if bytes.len() < 4 { + return None; + } + let mut val: u16 = 0; + let mut j = 0; + while j < 4 { + let digit = match bytes[j] { + b'0'..=b'9' => bytes[j] - b'0', + b'a'..=b'f' => bytes[j] - b'a' + 10, + b'A'..=b'F' => bytes[j] - b'A' + 10, + _ => return None, + }; + val = (val << 4) | digit as u16; + j += 1; + } + Some(val) +} + // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- @@ -553,6 +714,7 @@ mod tests { #[test] fn escaped_quotes_in_value() { + // After #410 fix: scanner decodes \" to " in string values. let buf = br#"{"msg":"said \"hello\""}"#; let config = ScanConfig::default(); let mut builder = TestBuilder::new(); @@ -562,10 +724,179 @@ mod tests { let row = &builder.rows[0]; assert!( row.iter() - .any(|(k, v)| k == "msg" && v == r#"said \"hello\""#) + .any(|(k, v)| k == "msg" && v == r#"said "hello""#) ); } + // --- Escape decoding tests (#410) --- + + #[test] + fn unicode_escape_u0041() { + // \u0041 is 'A' — must be decoded, not double-escaped. + let buf = br#"{"a":"\u0041"}"#; + let config = ScanConfig::default(); + let mut builder = TestBuilder::new(); + scan_streaming(buf, &config, &mut builder); + + assert_eq!(builder.rows.len(), 1); + let row = &builder.rows[0]; + assert!(row.iter().any(|(k, v)| k == "a" && v == "A")); + } + + #[test] + fn unicode_escape_e_acute() { + // \u00e9 is 'é' — must be decoded to UTF-8. 
+ let buf = br#"{"msg":"caf\u00e9"}"#; + let config = ScanConfig::default(); + let mut builder = TestBuilder::new(); + scan_streaming(buf, &config, &mut builder); + + assert_eq!(builder.rows.len(), 1); + let row = &builder.rows[0]; + assert!(row.iter().any(|(k, v)| k == "msg" && v == "café")); + } + + #[test] + fn unicode_surrogate_pair() { + // \uD83D\uDE00 is U+1F600 (😀) — surrogate pair decoded to UTF-8. + let buf = br#"{"e":"\uD83D\uDE00"}"#; + let config = ScanConfig::default(); + let mut builder = TestBuilder::new(); + scan_streaming(buf, &config, &mut builder); + + assert_eq!(builder.rows.len(), 1); + let row = &builder.rows[0]; + assert!(row.iter().any(|(k, v)| k == "e" && v == "😀")); + } + + #[test] + fn escape_newline_tab_cr() { + let buf = br#"{"a":"line1\nline2\ttab\rret"}"#; + let config = ScanConfig::default(); + let mut builder = TestBuilder::new(); + scan_streaming(buf, &config, &mut builder); + + assert_eq!(builder.rows.len(), 1); + let row = &builder.rows[0]; + assert!( + row.iter() + .any(|(k, v)| k == "a" && v == "line1\nline2\ttab\rret") + ); + } + + #[test] + fn escape_backslash() { + // \\\\ in raw bytes is two JSON-escaped backslashes → two literal backslashes + let buf = b"{\"a\":\"c:\\\\path\\\\file\"}"; + let config = ScanConfig::default(); + let mut builder = TestBuilder::new(); + scan_streaming(buf, &config, &mut builder); + + assert_eq!(builder.rows.len(), 1); + let row = &builder.rows[0]; + assert!(row.iter().any(|(k, v)| k == "a" && v == "c:\\path\\file")); + } + + #[test] + fn escape_solidus() { + let buf = br#"{"url":"http:\/\/example.com"}"#; + let config = ScanConfig::default(); + let mut builder = TestBuilder::new(); + scan_streaming(buf, &config, &mut builder); + + assert_eq!(builder.rows.len(), 1); + let row = &builder.rows[0]; + assert!( + row.iter() + .any(|(k, v)| k == "url" && v == "http://example.com") + ); + } + + #[test] + fn no_escape_passthrough() { + // Strings without backslashes pass through unchanged (fast path). 
+ let buf = br#"{"x":"hello world"}"#; + let config = ScanConfig::default(); + let mut builder = TestBuilder::new(); + scan_streaming(buf, &config, &mut builder); + + assert_eq!(builder.rows.len(), 1); + let row = &builder.rows[0]; + assert!(row.iter().any(|(k, v)| k == "x" && v == "hello world")); + } + + #[test] + fn mixed_escapes_and_plain_text() { + let buf = br#"{"m":"start\n\tmiddle \u0041 end"}"#; + let config = ScanConfig::default(); + let mut builder = TestBuilder::new(); + scan_streaming(buf, &config, &mut builder); + + assert_eq!(builder.rows.len(), 1); + let row = &builder.rows[0]; + assert!( + row.iter() + .any(|(k, v)| k == "m" && v == "start\n\tmiddle A end") + ); + } + + #[test] + fn decode_json_escapes_unit() { + let mut out = Vec::new(); + + // Simple escapes + decode_json_escapes(br#"hello"#, &mut out); + assert_eq!(&out, b"hello"); + + decode_json_escapes(br#"say \"hi\""#, &mut out); + assert_eq!(&out, b"say \"hi\""); + + decode_json_escapes(br#"a\\b"#, &mut out); + assert_eq!(&out, b"a\\b"); + + decode_json_escapes(br#"a\/b"#, &mut out); + assert_eq!(&out, b"a/b"); + + decode_json_escapes(br#"a\nb\tc"#, &mut out); + assert_eq!(&out, b"a\nb\tc"); + + decode_json_escapes(br#"\b\f"#, &mut out); + assert_eq!(&out, &[0x08, 0x0C]); + + // Unicode escape + decode_json_escapes(br#"\u0041"#, &mut out); + assert_eq!(&out, b"A"); + + // Multi-byte unicode + decode_json_escapes(br#"\u00e9"#, &mut out); + assert_eq!(&out, "é".as_bytes()); + + // Surrogate pair + decode_json_escapes(br#"\uD83D\uDE00"#, &mut out); + assert_eq!(&out, "😀".as_bytes()); + + // Truncated escape at end — pass through + decode_json_escapes(br#"abc\"#, &mut out); + assert_eq!(&out, br#"abc\"#); + + // Unknown escape letter — pass through backslash + decode_json_escapes(br#"\x"#, &mut out); + assert_eq!(&out, br#"\x"#); + } + + #[test] + fn parse_hex4_unit() { + assert_eq!(parse_hex4(b"0041"), Some(0x0041)); + assert_eq!(parse_hex4(b"00e9"), Some(0x00e9)); + 
assert_eq!(parse_hex4(b"D83D"), Some(0xD83D)); + assert_eq!(parse_hex4(b"DE00"), Some(0xDE00)); + assert_eq!(parse_hex4(b"FFFF"), Some(0xFFFF)); + assert_eq!(parse_hex4(b"0000"), Some(0x0000)); + assert_eq!(parse_hex4(b"abcf"), Some(0xABCF)); + assert_eq!(parse_hex4(b"ZZZZ"), None); + assert_eq!(parse_hex4(b"00g0"), None); + } + #[test] fn array_value() { let buf = br#"{"tags":["a","b"],"x":1}"#; diff --git a/crates/logfwd-core/src/scanner.rs b/crates/logfwd-core/src/scanner.rs index bb0e31695..f0344bbd7 100644 --- a/crates/logfwd-core/src/scanner.rs +++ b/crates/logfwd-core/src/scanner.rs @@ -82,6 +82,16 @@ pub trait ScanBuilder { fn resolve_field(&mut self, key: &[u8]) -> usize; /// Append a string value at the given column index. fn append_str_by_idx(&mut self, idx: usize, value: &[u8]); + /// Append a decoded string value at the given column index. + /// + /// Called when the value has been decoded from JSON escape sequences + /// and may not be a direct subslice of the input buffer. Builders that + /// use zero-copy views (e.g., `StreamingBuilder`) should override this + /// to handle non-buffer bytes. The default delegates to + /// [`append_str_by_idx`](Self::append_str_by_idx). + fn append_decoded_str_by_idx(&mut self, idx: usize, value: &[u8]) { + self.append_str_by_idx(idx, value); + } /// Append an integer value (as raw ASCII digits) at the given column index. fn append_int_by_idx(&mut self, idx: usize, value: &[u8]); /// Append a float value (as raw ASCII) at the given column index. diff --git a/crates/logfwd-core/tests/compliance_data.rs b/crates/logfwd-core/tests/compliance_data.rs index 72ea3c0d6..5d8389e3f 100644 --- a/crates/logfwd-core/tests/compliance_data.rs +++ b/crates/logfwd-core/tests/compliance_data.rs @@ -227,17 +227,16 @@ fn compliance_huge_line() { #[test] fn compliance_special_chars_in_values() { // JSON strings with escape sequences for newlines, tabs, quotes, backslashes, unicode. 
- // The scanner stores raw byte content between quotes (including escape sequences). + // After #410: scanner decodes escape sequences to their actual characters. let input = br#"{"msg":"line1\nline2\ttab\"quote\\back"}"#; let input_nl = [input.as_slice(), b"\n"].concat(); assert_both_scanners(&input_nl, |batch| { assert_eq!(batch.num_rows(), 1); let val = get_str(batch, "msg", 0).expect("msg should exist"); - // Scanner stores raw escape sequences, so the value should contain - // the literal backslash-n, backslash-t etc. - assert!(val.contains(r"\n"), "missing \\n in: {val}"); - assert!(val.contains(r"\t"), "missing \\t in: {val}"); - assert!(val.contains(r"\\"), "missing \\\\ in: {val}"); + // Scanner decodes escape sequences: \n → newline, \t → tab, \\ → backslash + assert!(val.contains('\n'), "missing decoded newline in: {val:?}"); + assert!(val.contains('\t'), "missing decoded tab in: {val:?}"); + assert!(val.contains('\\'), "missing decoded backslash in: {val:?}"); }); } @@ -391,10 +390,10 @@ fn compliance_escaped_quotes_in_strings() { assert_both_scanners(&input_nl, |batch| { assert_eq!(batch.num_rows(), 1); let val = get_str(batch, "msg", 0).expect("msg should exist"); - // Scanner stores raw content between outer quotes, preserving escape sequences. + // After #410: scanner decodes \" to literal quote characters. assert!( - val.contains("\\\"hello\\\""), - "escaped quotes not preserved: {val}" + val.contains("\"hello\""), + "decoded quotes not found: {val:?}" ); }); } diff --git a/crates/logfwd-core/tests/scanner_conformance.proptest-regressions b/crates/logfwd-core/tests/scanner_conformance.proptest-regressions new file mode 100644 index 000000000..52cf7a4e0 --- /dev/null +++ b/crates/logfwd-core/tests/scanner_conformance.proptest-regressions @@ -0,0 +1,7 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. 
+# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. +cc bcb9ee033397b2124e7f987ab308beba899a893fa6e66753038223307c7d0328 # shrinks to buf = [123, 34, 65, 34, 58, 34, 34, 44, 34, 65, 97, 34, 58, 34, 34, 44, 34, 65, 98, 34, 58, 34, 34, 44, 34, 65, 65, 34, 58, 34, 34, 44, 34, 65, 48, 34, 58, 34, 34, 44, 34, 95, 48, 34, 58, 34, 34, 44, 34, 65, 99, 34, 58, 34, 34, 44, 34, 95, 97, 34, 58, 34, 34, 44, 34, 65, 66, 34, 58, 34, 34, 44, 34, 65, 49, 34, 58, 34, 34, 44, 34, 65, 67, 34, 58, 34, 34, 44, 34, 97, 34, 58, 34, 34, 125, 10, 123, 34, 97, 65, 34, 58, 34, 34, 44, 34, 65, 100, 34, 58, 34, 34, 44, 34, 97, 95, 34, 58, 34, 34, 44, 34, 97, 97, 34, 58, 34, 34, 44, 34, 97, 48, 34, 58, 34, 34, 44, 34, 66, 34, 58, 34, 34, 44, 34, 65, 50, 34, 58, 34, 34, 44, 34, 95, 49, 34, 58, 34, 34, 44, 34, 65, 95, 34, 58, 34, 34, 44, 34, 65, 51, 34, 58, 34, 34, 44, 34, 95, 95, 34, 58, 34, 34, 125, 10, 123, 34, 65, 101, 34, 58, 34, 34, 44, 34, 97, 98, 34, 58, 34, 34, 44, 34, 97, 99, 34, 58, 34, 34, 44, 34, 65, 102, 34, 58, 34, 34, 44, 34, 65, 52, 34, 58, 34, 34, 44, 34, 95, 65, 34, 58, 34, 34, 44, 34, 98, 34, 58, 34, 34, 44, 34, 65, 68, 34, 58, 34, 34, 44, 34, 65, 103, 34, 58, 34, 34, 125, 10, 123, 34, 65, 69, 34, 58, 34, 34, 44, 34, 97, 66, 34, 58, 34, 34, 44, 34, 97, 95, 95, 34, 58, 34, 34, 44, 34, 97, 100, 34, 58, 34, 34, 44, 34, 95, 66, 34, 58, 34, 34, 44, 34, 65, 104, 34, 58, 34, 34, 44, 34, 97, 67, 34, 58, 34, 34, 125, 10, 123, 34, 65, 70, 34, 58, 34, 34, 44, 34, 65, 65, 65, 34, 58, 34, 34, 44, 34, 65, 71, 34, 58, 34, 34, 44, 34, 65, 72, 34, 58, 34, 34, 44, 34, 95, 98, 34, 58, 34, 34, 44, 34, 97, 68, 34, 58, 34, 34, 44, 34, 65, 73, 34, 58, 34, 34, 44, 34, 97, 95, 97, 34, 58, 34, 34, 44, 34, 95, 99, 34, 58, 34, 34, 44, 34, 95, 67, 34, 58, 34, 34, 125, 10, 123, 34, 65, 65, 95, 34, 58, 34, 34, 44, 34, 65, 74, 34, 58, 34, 34, 44, 34, 65, 75, 34, 58, 34, 34, 44, 34, 95, 50, 34, 58, 34, 34, 44, 34, 97, 69, 34, 
58, 34, 34, 44, 34, 97, 101, 34, 58, 34, 34, 44, 34, 97, 49, 34, 58, 34, 34, 44, 34, 97, 102, 34, 58, 34, 34, 44, 34, 95, 48, 97, 34, 58, 34, 34, 44, 34, 65, 95, 48, 34, 58, 34, 34, 44, 34, 65, 48, 65, 34, 58, 34, 34, 125, 10, 123, 34, 97, 103, 34, 58, 34, 34, 44, 34, 65, 76, 34, 58, 34, 34, 44, 34, 65, 95, 65, 34, 58, 34, 34, 125, 10, 123, 34, 67, 34, 58, 34, 34, 44, 34, 95, 34, 58, 34, 92, 34, 34, 44, 34, 95, 34, 58, 102, 97, 108, 115, 101, 125, 10] diff --git a/crates/logfwd-core/tests/scanner_conformance.rs b/crates/logfwd-core/tests/scanner_conformance.rs index c157268de..b4a337df5 100644 --- a/crates/logfwd-core/tests/scanner_conformance.rs +++ b/crates/logfwd-core/tests/scanner_conformance.rs @@ -132,17 +132,14 @@ fn assert_values_correct(input: &[u8]) { }); if !arr.is_null(row) { let actual = arr.value(row); - // Our scanner preserves escape sequences (raw bytes), - // sonic-rs unescapes them. For strings without escapes - // the values must be identical. - if !actual.contains('\\') { - assert_eq!( - actual, - expected, - "String value mismatch at '{key_str}'[{row}].\nExpected: {expected:?}\nActual: {actual:?}\nInput: {:?}", - String::from_utf8_lossy(line) - ); - } + // After #410: scanner decodes JSON escape sequences, + // so values match sonic-rs output for all strings. 
+ assert_eq!( + actual, + expected, + "String value mismatch at '{key_str}'[{row}].\nExpected: {expected:?}\nActual: {actual:?}\nInput: {:?}", + String::from_utf8_lossy(line) + ); } } else if val.is_i64() { let expected = val.as_i64().unwrap(); diff --git a/crates/logfwd-test-utils/src/json.rs b/crates/logfwd-test-utils/src/json.rs index 5dc1b82e8..9c75a008f 100644 --- a/crates/logfwd-test-utils/src/json.rs +++ b/crates/logfwd-test-utils/src/json.rs @@ -18,6 +18,11 @@ pub fn arb_json_string() -> impl Strategy { Just("\\n".to_string()), Just("\\t".to_string()), Just("\\r".to_string()), + Just("\\/".to_string()), + Just("\\b".to_string()), + Just("\\f".to_string()), + // Unicode escape sequences (\uXXXX) — BMP code points + (0x0020u16..0xD800).prop_map(|cp| format!("\\u{cp:04X}")), ], 0..20, )