diff --git a/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java
index 6c451f10a..843323822 100644
--- a/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java
+++ b/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java
@@ -71,7 +71,9 @@ public BaseLargeVariableWidthVector(Field field, final BufferAllocator allocator
     lastValueCapacity = INITIAL_VALUE_ALLOCATION - 1;
     valueCount = 0;
     lastSet = -1;
-    offsetBuffer = allocator.getEmpty();
+    // Allocate offset buffer with at least OFFSET_WIDTH capacity to ensure
+    // offset[0] is always available according to Arrow spec.
+    offsetBuffer = allocateOffsetBuffer(OFFSET_WIDTH);
     validityBuffer = allocator.getEmpty();
     valueBuffer = allocator.getEmpty();
   }
@@ -373,14 +375,29 @@ private void setReaderAndWriterIndex() {
     valueBuffer.readerIndex(0);
     if (valueCount == 0) {
       validityBuffer.writerIndex(0);
-      offsetBuffer.writerIndex(0);
       valueBuffer.writerIndex(0);
     } else {
       final long lastDataOffset = getStartOffset(valueCount);
       validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
-      offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
       valueBuffer.writerIndex(lastDataOffset);
     }
+    // IPC serializer will determine readable bytes based on `readerIndex` and `writerIndex`.
+    // Both are set to 0 means 0 bytes are written to the IPC stream which will crash IPC readers
+    // in other libraries. According to Arrow spec, we should still output the offset buffer which
+    // is [0].
+    final long requiredOffsetBufferSize = (long) (valueCount + 1) * OFFSET_WIDTH;
+    if (offsetBuffer.capacity() < requiredOffsetBufferSize) {
+      // Allocate a new buffer with sufficient capacity. This can happen when vector
+      // was loaded via loadFieldBuffers() with an empty offset buffer.
+      ArrowBuf newOffsetBuffer = allocateOffsetBuffer(requiredOffsetBufferSize);
+      // Copy existing data if any
+      if (offsetBuffer.capacity() > 0) {
+        newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity());
+      }
+      offsetBuffer.getReferenceManager().release();
+      offsetBuffer = newOffsetBuffer;
+    }
+    offsetBuffer.writerIndex(requiredOffsetBufferSize);
   }
 
   /** Same as {@link #allocateNewSafe()}. */
@@ -492,7 +509,9 @@ private void allocateBytes(final long valueBufferSize, final int valueCount) {
 
   /* allocate offset buffer */
   private ArrowBuf allocateOffsetBuffer(final long size) {
-    ArrowBuf offsetBuffer = allocator.buffer(size);
+    // Ensure at least OFFSET_WIDTH capacity according to Arrow spec
+    final long curSize = Math.max(size, OFFSET_WIDTH);
+    ArrowBuf offsetBuffer = allocator.buffer(curSize);
     offsetBuffer.readerIndex(0);
     offsetBuffer.setZero(0, offsetBuffer.capacity());
     return offsetBuffer;
diff --git a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
index 96e2afbd2..58b5dd11a 100644
--- a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
+++ b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
@@ -69,7 +69,9 @@ public BaseVariableWidthVector(Field field, final BufferAllocator allocator) {
     lastValueCapacity = INITIAL_VALUE_ALLOCATION - 1;
     valueCount = 0;
     lastSet = -1;
-    offsetBuffer = allocator.getEmpty();
+    // Allocate offset buffer with at least OFFSET_WIDTH capacity to ensure
+    // offset[0] is always available according to Arrow spec.
+    offsetBuffer = allocateOffsetBuffer(OFFSET_WIDTH);
     validityBuffer = allocator.getEmpty();
     valueBuffer = allocator.getEmpty();
   }
@@ -389,14 +391,29 @@ private void setReaderAndWriterIndex() {
     valueBuffer.readerIndex(0);
     if (valueCount == 0) {
       validityBuffer.writerIndex(0);
-      offsetBuffer.writerIndex(0);
       valueBuffer.writerIndex(0);
     } else {
       final int lastDataOffset = getStartOffset(valueCount);
       validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
-      offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
       valueBuffer.writerIndex(lastDataOffset);
     }
+    // IPC serializer will determine readable bytes based on `readerIndex` and `writerIndex`.
+    // Both are set to 0 means 0 bytes are written to the IPC stream which will crash IPC readers
+    // in other libraries. According to Arrow spec, we should still output the offset buffer which
+    // is [0].
+    final long requiredOffsetBufferSize = (long) (valueCount + 1) * OFFSET_WIDTH;
+    if (offsetBuffer.capacity() < requiredOffsetBufferSize) {
+      // Allocate a new buffer with sufficient capacity. This can happen when vector
+      // was loaded via loadFieldBuffers() with an empty offset buffer.
+      ArrowBuf newOffsetBuffer = allocateOffsetBuffer(requiredOffsetBufferSize);
+      // Copy existing data if any
+      if (offsetBuffer.capacity() > 0) {
+        newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity());
+      }
+      offsetBuffer.getReferenceManager().release();
+      offsetBuffer = newOffsetBuffer;
+    }
+    offsetBuffer.writerIndex(requiredOffsetBufferSize);
   }
 
   /** Same as {@link #allocateNewSafe()}. */
@@ -509,7 +526,8 @@ private void allocateBytes(final long valueBufferSize, final int valueCount) {
 
   /* allocate offset buffer */
   private ArrowBuf allocateOffsetBuffer(final long size) {
-    final int curSize = (int) size;
+    // Ensure at least OFFSET_WIDTH capacity according to Arrow spec
+    final int curSize = (int) Math.max(size, OFFSET_WIDTH);
     ArrowBuf offsetBuffer = allocator.buffer(curSize);
     offsetBuffer.readerIndex(0);
     offsetBuffer.setZero(0, offsetBuffer.capacity());
diff --git a/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
index df42d04e6..22c93b0cb 100644
--- a/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
+++ b/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
@@ -3940,4 +3940,42 @@ public void testVectorLoadUnloadOnNonVariadicVectors() {
       }
     }
   }
+
+  @Test
+  public void testEmptyVarCharOffsetBuffer() {
+    // Validates that offset buffer has at least OFFSET_WIDTH bytes (for offset[0]=0)
+    // even when valueCount is 0, per Arrow specification.
+    try (VarCharVector vector = newVarCharVector("varchar", allocator)) {
+      vector.allocateNew();
+      vector.setValueCount(0);
+
+      List<ArrowBuf> buffers = vector.getFieldBuffers();
+      // buffers: [validity, offset, data]
+      assertTrue(
+          buffers.get(1).readableBytes() >= BaseVariableWidthVector.OFFSET_WIDTH,
+          "Offset buffer should have at least "
+              + BaseVariableWidthVector.OFFSET_WIDTH
+              + " bytes for offset[0]");
+      assertEquals(0, vector.getOffsetBuffer().getInt(0));
+    }
+  }
+
+  @Test
+  public void testEmptyLargeVarCharOffsetBuffer() {
+    // Validates that offset buffer has at least OFFSET_WIDTH bytes (for offset[0]=0)
+    // even when valueCount is 0, per Arrow specification.
+    try (LargeVarCharVector vector = new LargeVarCharVector("largevarchar", allocator)) {
+      vector.allocateNew();
+      vector.setValueCount(0);
+
+      List<ArrowBuf> buffers = vector.getFieldBuffers();
+      // buffers: [validity, offset, data]
+      assertTrue(
+          buffers.get(1).readableBytes() >= BaseLargeVariableWidthVector.OFFSET_WIDTH,
+          "Offset buffer should have at least "
+              + BaseLargeVariableWidthVector.OFFSET_WIDTH
+              + " bytes for offset[0]");
+      assertEquals(0, vector.getOffsetBuffer().getLong(0));
+    }
+  }
 }