diff --git a/.github/patches/enable-binaryview-integration-tests.patch b/.github/patches/enable-binaryview-integration-tests.patch new file mode 100644 index 00000000..ac5c17e1 --- /dev/null +++ b/.github/patches/enable-binaryview-integration-tests.patch @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Regenerate the FlatBuffers helper files used by arrow-js.
#
# Requirements:
#   - the Arrow format schemas (*.fbs). They are read from ${ARROW_FORMAT_DIR}
#     when that variable is set, otherwise from a sibling checkout of
#     apache/arrow (../arrow/format relative to this repository).
#   - a working flatc on PATH.
#
# The selected generated TypeScript files are copied into src/fb/.

set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
FORMAT_DIR="${ARROW_FORMAT_DIR:-${PROJECT_ROOT}/../arrow/format}"

if [[ ! -d "${FORMAT_DIR}" ]]; then
  echo "error: expected FlatBuffers schemas in ${FORMAT_DIR}" >&2
  exit 1
fi

if ! command -v flatc >/dev/null 2>&1; then
  echo "error: flatc not found on PATH" >&2
  exit 1
fi

# Scratch directory for the rewritten schemas and flatc output. It is not
# named TMPDIR on purpose: assigning TMPDIR would change where mktemp and any
# child process place their own temporary files.
WORK_DIR="$(mktemp -d "${PROJECT_ROOT}/.flatc.XXXXXX")"
cleanup() {
  rm -rf "${WORK_DIR}"
}
trap cleanup EXIT

schemas=(File Schema Message Tensor SparseTensor)
staged_schemas=()

for schema in "${schemas[@]}"; do
  staged="${WORK_DIR}/${schema}.fbs"
  # Drop the org.apache.arrow.flatbuf namespace declaration and qualifiers.
  # The schema is streamed through sed instead of edited in place because
  # `sed -i ''` is BSD syntax; GNU sed treats the empty string as a file name
  # and fails.
  sed \
    -e 's/namespace org\.apache\.arrow\.flatbuf;//g' \
    -e 's/org\.apache\.arrow\.flatbuf\.//g' \
    "${FORMAT_DIR}/${schema}.fbs" > "${staged}"
  staged_schemas+=("${staged}")
done

flatc --ts --ts-flat-files --ts-omit-entrypoint \
  -o "${WORK_DIR}" \
  "${staged_schemas[@]}"

rm -f "${staged_schemas[@]}"

generated_files=(
  binary-view.ts
  list-view.ts
  large-list-view.ts
  message.ts
  record-batch.ts
  schema.ts
  type.ts
  utf8-view.ts
)

# Check every expected output before copying anything, so a missing file can
# never leave src/fb partially updated.
for file in "${generated_files[@]}"; do
  if [[ ! -f "${WORK_DIR}/${file}" ]]; then
    echo "error: expected generated file ${file} not found" >&2
    exit 1
  fi
done

for file in "${generated_files[@]}"; do
  install -m 0644 "${WORK_DIR}/${file}" "${PROJECT_ROOT}/src/fb/${file}"
done
a/src/Arrow.ts b/src/Arrow.ts index 8321026f..73edbd42 100644 --- a/src/Arrow.ts +++ b/src/Arrow.ts @@ -37,14 +37,14 @@ export { Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, LargeUtf8, - Binary, LargeBinary, + Utf8, LargeUtf8, Utf8View, + Binary, LargeBinary, BinaryView, FixedSizeBinary, Date_, DateDay, DateMillisecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Decimal, - List, + List, ListView, LargeListView, Struct, Union, DenseUnion, SparseUnion, Dictionary, @@ -79,10 +79,13 @@ export { TimestampBuilder, TimestampSecondBuilder, TimestampMillisecondBuilder, export { IntervalBuilder, IntervalDayTimeBuilder, IntervalYearMonthBuilder, IntervalMonthDayNanoBuilder } from './builder/interval.js'; export { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from './builder/duration.js'; export { Utf8Builder } from './builder/utf8.js'; +export { Utf8ViewBuilder } from './builder/utf8view.js'; export { LargeUtf8Builder } from './builder/largeutf8.js'; export { BinaryBuilder } from './builder/binary.js'; +export { BinaryViewBuilder } from './builder/binaryview.js'; export { LargeBinaryBuilder } from './builder/largebinary.js'; export { ListBuilder } from './builder/list.js'; +export { ListViewBuilder, LargeListViewBuilder } from './builder/listview.js'; export { FixedSizeListBuilder } from './builder/fixedsizelist.js'; export { MapBuilder } from './builder/map.js'; export { StructBuilder } from './builder/struct.js'; diff --git a/src/builder/binaryview.ts b/src/builder/binaryview.ts new file mode 100644 index 00000000..31addf8e --- /dev/null +++ b/src/builder/binaryview.ts @@ -0,0 +1,170 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
import { BinaryView } from '../type.js';
import { Builder, BuilderOptions } from '../builder.js';
import { BufferBuilder } from './buffer.js';
import { toUint8Array } from '../util/buffer.js';
import { makeData } from '../data.js';

/**
 * Builder for BinaryView columns.
 *
 * Every element owns one fixed-width view struct of `BinaryView.ELEMENT_WIDTH`
 * bytes inside `_views`. The struct starts with the value length; values of at
 * most `BinaryView.INLINE_CAPACITY` bytes are stored inside the struct itself,
 * longer values store a 4-byte prefix plus a (buffer index, offset) reference
 * into one of the out-of-line ("variadic") data buffers.
 *
 * @ignore
 */
export class BinaryViewBuilder<TNull = any> extends Builder<BinaryView, TNull> {
    /** Packed view structs, `BinaryView.ELEMENT_WIDTH` bytes per element. */
    protected _views: BufferBuilder<Uint8Array>;
    /** Completed out-of-line data buffers, in the order views refer to them. */
    protected _variadicBuffers: Uint8Array[] = [];
    /** Out-of-line buffer currently being filled; null until a long value arrives. */
    protected _currentBuffer: BufferBuilder<Uint8Array> | null = null;
    /** Index that `_currentBuffer` will have in the flushed variadic buffer list. */
    protected _currentBufferIndex = 0;
    /** Number of bytes already written into `_currentBuffer`. */
    protected _currentBufferOffset = 0;
    protected readonly _bufferSize = 32 * 1024 * 1024; // 32MB per buffer as per spec recommendation

    constructor(opts: BuilderOptions<BinaryView, TNull>) {
        super(opts);
        this._views = new BufferBuilder(Uint8Array);
    }

    /** Bytes currently held by the views, the validity bitmap and all data buffers. */
    public get byteLength(): number {
        let size = 0;
        this._views && (size += this._views.byteLength);
        this._nulls && (size += this._nulls.byteLength);
        size += this._variadicBuffers.reduce((total, buffer) => total + buffer.byteLength, 0);
        this._currentBuffer && (size += this._currentBuffer.byteLength);
        return size;
    }

    /**
     * Writes the view struct for `index` and, for values longer than
     * `BinaryView.INLINE_CAPACITY`, appends the bytes to the current
     * out-of-line buffer.
     * @param index Element slot to write.
     * @param value Bytes to store (passed through `toUint8Array`).
     * @returns This builder.
     */
    public setValue(index: number, value: Uint8Array) {
        const data = toUint8Array(value);
        const length = data.length;

        this._ensureViewCapacity(index);

        // Read the backing array only after reserving, as the original code
        // does (reserve() is assumed to be able to swap in a larger array).
        const viewBuffer = this._views.buffer;
        const viewOffset = index * BinaryView.ELEMENT_WIDTH;
        const view = new DataView(viewBuffer.buffer, viewBuffer.byteOffset + viewOffset, BinaryView.ELEMENT_WIDTH);

        // Length field, little-endian.
        view.setInt32(BinaryView.LENGTH_OFFSET, length, true);

        if (length <= BinaryView.INLINE_CAPACITY) {
            // Inline value: copy the bytes into the struct and zero the unused tail.
            const inlineStart = viewOffset + BinaryView.INLINE_OFFSET;
            viewBuffer.set(data, inlineStart);
            viewBuffer.fill(0, inlineStart + length, inlineStart + BinaryView.INLINE_CAPACITY);
            return this;
        }

        // Out-of-line value: the struct keeps the first 4 bytes as a prefix.
        const prefix = new DataView(data.buffer, data.byteOffset, Math.min(4, length));
        view.setUint32(BinaryView.INLINE_OFFSET, prefix.getUint32(0, true), true);

        // Start a new data buffer when there is none yet or the value would
        // push the current one past _bufferSize.
        let chunk = this._currentBuffer;
        if (!chunk || this._currentBufferOffset + length > this._bufferSize) {
            if (chunk) {
                this._variadicBuffers.push(chunk.buffer.slice(0, this._currentBufferOffset));
            }
            chunk = this._currentBuffer = new BufferBuilder(Uint8Array);
            this._currentBufferIndex = this._variadicBuffers.length;
            this._currentBufferOffset = 0;
        }

        chunk.reserve(length).buffer.set(data, this._currentBufferOffset);

        // Record where the bytes live.
        view.setInt32(BinaryView.BUFFER_INDEX_OFFSET, this._currentBufferIndex, true);
        view.setInt32(BinaryView.BUFFER_OFFSET_OFFSET, this._currentBufferOffset, true);

        this._currentBufferOffset += length;
        return this;
    }

    /**
     * Records validity for `index`; a null slot gets an all-zero view struct.
     * @returns The value returned by the base implementation.
     */
    public setValid(index: number, isValid: boolean) {
        this._ensureViewCapacity(index);

        const result = super.setValid(index, isValid);
        if (!result) {
            const start = index * BinaryView.ELEMENT_WIDTH;
            this._views.buffer.fill(0, start, start + BinaryView.ELEMENT_WIDTH);
        }
        return result;
    }

    /** Drops all buffered views and data and resets the out-of-line cursor. */
    public clear() {
        this._variadicBuffers = [];
        this._currentBuffer = null;
        this._currentBufferIndex = 0;
        this._currentBufferOffset = 0;
        this._views.clear();
        return super.clear();
    }

    /** Emits the buffered elements as a Data chunk and resets the builder. */
    public flush() {
        const { type, length, nullCount, _views, _nulls } = this;

        // Seal the partially filled data buffer, if it holds anything.
        if (this._currentBuffer && this._currentBufferOffset > 0) {
            this._variadicBuffers.push(this._currentBuffer.buffer.slice(0, this._currentBufferOffset));
        }

        const views = _views.flush(length * BinaryView.ELEMENT_WIDTH);
        const nullBitmap = nullCount > 0 ? _nulls.flush(length) : undefined;
        const variadicBuffers = this._variadicBuffers;

        // clear() installs a fresh _variadicBuffers array and resets the
        // cursor, so the array captured above is no longer shared.
        this.clear();

        return makeData({
            type,
            length,
            nullCount,
            nullBitmap,
            ['views']: views,
            ['variadicBuffers']: variadicBuffers
        });
    }

    /** Marks the builder as finished. */
    public finish() {
        this.finished = true;
        return this;
    }

    /** Grows `_views` so the view struct for `index` is addressable. */
    protected _ensureViewCapacity(index: number) {
        const bytesNeeded = (index + 1) * BinaryView.ELEMENT_WIDTH;
        const currentBytes = this._views.length;
        if (bytesNeeded > currentBytes) {
            this._views.reserve(bytesNeeded - currentBytes);
        }
    }
}
import { Field } from '../schema.js';
import { Vector } from '../vector.js';
import { DataType, ListView, LargeListView } from '../type.js';
import { DataBufferBuilder } from './buffer.js';
import { Builder, BuilderOptions } from '../builder.js';
import { makeData } from '../data.js';

/**
 * Builder for ListView columns (Int32 offsets and sizes).
 *
 * `setValue` only records the list for a slot; the child builder and the
 * offsets/sizes buffers are written when pending entries are flushed, in
 * ascending slot order.
 *
 * @ignore
 */
export class ListViewBuilder<T extends DataType = any, TNull = any> extends Builder<ListView<T>, TNull> {
    /** Start position of each list inside the child values. */
    protected _offsets: DataBufferBuilder<Int32Array>;
    /** Element count of each list. */
    protected _sizes: DataBufferBuilder<Int32Array>;
    /** Slots set since the last flush; `undefined` marks a null slot. */
    protected _pending: Map<number, T['TValue'] | undefined> | undefined;
    /** Next free position in the child builder. */
    protected _writeIndex = 0;

    constructor(opts: BuilderOptions<ListView<T>, TNull>) {
        super(opts);
        this._offsets = new DataBufferBuilder(Int32Array, 0);
        this._sizes = new DataBufferBuilder(Int32Array, 0);
    }

    /**
     * Attaches the single child builder and derives this builder's type from it.
     * @throws Error when a child has already been added.
     */
    public addChild(child: Builder<T>, name = '0') {
        if (this.numChildren > 0) {
            throw new Error('ListViewBuilder can only have one child.');
        }
        this.children[this.numChildren] = child;
        this.type = new ListView(new Field(name, child.type, true));
        return this.numChildren - 1;
    }

    /** Remembers `value` for `index`; nothing reaches the child until the next flush. */
    public setValue(index: number, value: T['TValue']) {
        const pending = this._pending || (this._pending = new Map());
        pending.set(index, value);
    }

    /** Records validity; a null slot is queued so it still receives an offset and a zero size. */
    public setValid(index: number, isValid: boolean) {
        if (!super.setValid(index, isValid)) {
            (this._pending || (this._pending = new Map())).set(index, undefined);
            return false;
        }
        return true;
    }

    /** Discards pending entries, offsets and sizes and rewinds the child write position. */
    public clear() {
        this._pending = undefined;
        this._writeIndex = 0;
        this._offsets.clear();
        this._sizes.clear();
        return super.clear();
    }

    /** Writes pending entries, emits a Data chunk and resets the builder. */
    public flush() {
        this._flush();

        // Capture everything before clear() resets the builder state.
        const type = this.type;
        const length = this.length;
        const nullCount = this.nullCount;

        const valueOffsets = this._offsets.flush(length);
        const valueSizes = this._sizes.flush(length);
        const nullBitmap = nullCount > 0 ? this._nulls.flush(length) : undefined;
        const children = this.children.map((child) => child.flush());

        this.clear();

        return makeData({
            type,
            length,
            nullCount,
            nullBitmap,
            valueOffsets,
            valueSizes,
            child: children[0]
        });
    }

    /** Writes pending entries, then defers to the base implementation. */
    public finish() {
        this._flush();
        return super.finish();
    }

    /** Hands the pending map (if it has entries) to `_flushPending` and forgets it. */
    protected _flush() {
        const pending = this._pending;
        this._pending = undefined;
        if (pending && pending.size > 0) {
            this._flushPending(pending);
        }
    }

    /** Appends each pending list to the child builder and records its offset and size. */
    protected _flushPending(pending: Map<number, T['TValue'] | undefined>) {
        const offsets = this._offsets;
        const sizes = this._sizes;
        const [child] = this.children;

        // Process slots in ascending order regardless of insertion order.
        const entries = [...pending.entries()].sort((a, b) => a[0] - b[0]);
        for (const [index, value] of entries) {
            const offset = this._writeIndex;
            offsets.set(index, offset);

            if (typeof value === 'undefined') {
                // Null slot: keeps the current offset, size 0, no child values.
                sizes.set(index, 0);
                continue;
            }

            const listValues = value as T['TValue'];
            const length = Array.isArray(listValues)
                ? listValues.length
                : (listValues as Vector).length;
            sizes.set(index, length);

            for (let i = 0; i < length; i++) {
                const element = Array.isArray(listValues)
                    ? listValues[i]
                    : (listValues as Vector).get(i);
                if (element == null) {
                    child.setValid(offset + i, false);
                } else {
                    child.set(offset + i, element as any);
                }
            }

            this._writeIndex += length;
        }
    }
}

/**
 * Builder for LargeListView columns. Same behaviour as `ListViewBuilder`, but
 * offsets and sizes are stored as BigInt64 values.
 *
 * @ignore
 */
export class LargeListViewBuilder<T extends DataType = any, TNull = any> extends Builder<LargeListView<T>, TNull> {
    /** Start position of each list inside the child values. */
    protected _offsets: DataBufferBuilder<BigInt64Array>;
    /** Element count of each list. */
    protected _sizes: DataBufferBuilder<BigInt64Array>;
    /** Slots set since the last flush; `undefined` marks a null slot. */
    protected _pending: Map<number, T['TValue'] | undefined> | undefined;
    /** Next free position in the child builder, kept as a BigInt. */
    protected _writeIndex = BigInt(0);

    constructor(opts: BuilderOptions<LargeListView<T>, TNull>) {
        super(opts);
        this._offsets = new DataBufferBuilder(BigInt64Array, 0);
        this._sizes = new DataBufferBuilder(BigInt64Array, 0);
    }

    /**
     * Attaches the single child builder and derives this builder's type from it.
     * @throws Error when a child has already been added.
     */
    public addChild(child: Builder<T>, name = '0') {
        if (this.numChildren > 0) {
            throw new Error('LargeListViewBuilder can only have one child.');
        }
        this.children[this.numChildren] = child;
        this.type = new LargeListView(new Field(name, child.type, true));
        return this.numChildren - 1;
    }

    /** Remembers `value` for `index`; nothing reaches the child until the next flush. */
    public setValue(index: number, value: T['TValue']) {
        const pending = this._pending || (this._pending = new Map());
        pending.set(index, value);
    }

    /** Records validity; a null slot is queued so it still receives an offset and a zero size. */
    public setValid(index: number, isValid: boolean) {
        if (!super.setValid(index, isValid)) {
            (this._pending || (this._pending = new Map())).set(index, undefined);
            return false;
        }
        return true;
    }

    /** Discards pending entries, offsets and sizes and rewinds the child write position. */
    public clear() {
        this._pending = undefined;
        this._writeIndex = BigInt(0);
        this._offsets.clear();
        this._sizes.clear();
        return super.clear();
    }

    /** Writes pending entries, emits a Data chunk and resets the builder. */
    public flush() {
        this._flush();

        // Capture everything before clear() resets the builder state.
        const type = this.type;
        const length = this.length;
        const nullCount = this.nullCount;

        const valueOffsets = this._offsets.flush(length);
        const valueSizes = this._sizes.flush(length);
        const nullBitmap = nullCount > 0 ? this._nulls.flush(length) : undefined;
        const children = this.children.map((child) => child.flush());

        this.clear();

        return makeData({
            type,
            length,
            nullCount,
            nullBitmap,
            valueOffsets,
            valueSizes,
            child: children[0]
        });
    }

    /** Writes pending entries, then defers to the base implementation. */
    public finish() {
        this._flush();
        return super.finish();
    }

    /** Hands the pending map (if it has entries) to `_flushPending` and forgets it. */
    protected _flushPending(pending: Map<number, T['TValue'] | undefined>) {
        const offsets = this._offsets;
        const sizes = this._sizes;
        const [child] = this.children;

        // Process slots in ascending order regardless of insertion order.
        const entries = [...pending.entries()].sort((a, b) => a[0] - b[0]);
        for (const [index, value] of entries) {
            const offset = this._writeIndex;
            offsets.set(index, offset);

            if (typeof value === 'undefined') {
                // Null slot: keeps the current offset, size 0, no child values.
                sizes.set(index, BigInt(0));
                continue;
            }

            const listValues = value as T['TValue'];
            const numericLength = Array.isArray(listValues)
                ? listValues.length
                : (listValues as Vector).length;
            const length = BigInt(numericLength);
            sizes.set(index, length);

            // The child builder is addressed with plain numbers; convert the
            // BigInt offset once instead of once per element.
            const childBase = Number(offset);
            for (let i = 0; i < numericLength; i++) {
                const element = Array.isArray(listValues)
                    ? listValues[i]
                    : (listValues as Vector).get(i);
                if (element == null) {
                    child.setValid(childBase + i, false);
                } else {
                    child.set(childBase + i, element as any);
                }
            }

            this._writeIndex += length;
        }
    }

    /** Flushes pending entries, if any, into the offsets, sizes and child builder. */
    protected _flush() {
        const pending = this._pending;
        this._pending = undefined;
        if (pending && pending.size > 0) {
            this._flushPending(pending);
        }
    }
}
import { Utf8View, BinaryView } from '../type.js';
import { encodeUtf8 } from '../util/utf8.js';
import { BuilderOptions, Builder } from '../builder.js';
import { BufferBuilder } from './buffer.js';
import { makeData } from '../data.js';

/**
 * Builder for Utf8View columns.
 *
 * Strings are encoded with `encodeUtf8` and stored with the view-struct layout
 * described by the `BinaryView` constants: a length field followed by either
 * the inline bytes (at most `BinaryView.INLINE_CAPACITY`) or a 4-byte prefix
 * plus a (buffer index, offset) reference into an out-of-line data buffer.
 *
 * @ignore
 */
export class Utf8ViewBuilder<TNull = any> extends Builder<Utf8View, TNull> {
    /** Packed view structs, `BinaryView.ELEMENT_WIDTH` bytes per element. */
    protected _views: BufferBuilder<Uint8Array>;
    /** Completed out-of-line data buffers, in the order views refer to them. */
    protected _variadicBuffers: Uint8Array[] = [];
    /** Out-of-line buffer currently being filled; null until a long value arrives. */
    protected _currentBuffer: BufferBuilder<Uint8Array> | null = null;
    /** Index that `_currentBuffer` will have in the flushed variadic buffer list. */
    protected _currentBufferIndex = 0;
    /** Number of bytes already written into `_currentBuffer`. */
    protected _currentBufferOffset = 0;
    /** Size limit (32 * 1024 * 1024 bytes) after which a new data buffer is started. */
    protected readonly _bufferSize = 32 * 1024 * 1024;

    constructor(opts: BuilderOptions<Utf8View, TNull>) {
        super(opts);
        this._views = new BufferBuilder(Uint8Array);
    }

    /** Bytes currently held by the views, the validity bitmap and all data buffers. */
    public get byteLength(): number {
        let size = 0;
        this._views && (size += this._views.byteLength);
        this._nulls && (size += this._nulls.byteLength);
        for (const buffer of this._variadicBuffers) {
            size += buffer.byteLength;
        }
        this._currentBuffer && (size += this._currentBuffer.byteLength);
        return size;
    }

    /**
     * Encodes `value` and writes its view struct for `index`; encodings longer
     * than `BinaryView.INLINE_CAPACITY` bytes are appended to the current
     * out-of-line buffer.
     * @param index Element slot to write.
     * @param value String to store.
     * @returns This builder.
     */
    public setValue(index: number, value: string) {
        const data = encodeUtf8(value);
        const length = data.length;

        this._ensureViewCapacity(index);

        // Read the backing array only after reserving, as the original code
        // does (reserve() is assumed to be able to swap in a larger array).
        const viewBuffer = this._views.buffer;
        const viewOffset = index * BinaryView.ELEMENT_WIDTH;
        const view = new DataView(viewBuffer.buffer, viewBuffer.byteOffset + viewOffset, BinaryView.ELEMENT_WIDTH);

        // Length field (in encoded bytes), little-endian.
        view.setInt32(BinaryView.LENGTH_OFFSET, length, true);

        if (length <= BinaryView.INLINE_CAPACITY) {
            // Inline value: copy the bytes into the struct and zero the unused tail.
            const inlineStart = viewOffset + BinaryView.INLINE_OFFSET;
            viewBuffer.set(data, inlineStart);
            viewBuffer.fill(0, inlineStart + length, inlineStart + BinaryView.INLINE_CAPACITY);
            return this;
        }

        // Out-of-line value: the struct keeps the first 4 bytes as a prefix.
        const prefix = new DataView(data.buffer, data.byteOffset, Math.min(4, length));
        view.setUint32(BinaryView.INLINE_OFFSET, prefix.getUint32(0, true), true);

        // Start a new data buffer when there is none yet or the value would
        // push the current one past _bufferSize.
        let chunk = this._currentBuffer;
        if (!chunk || this._currentBufferOffset + length > this._bufferSize) {
            if (chunk) {
                this._variadicBuffers.push(chunk.buffer.slice(0, this._currentBufferOffset));
            }
            chunk = this._currentBuffer = new BufferBuilder(Uint8Array);
            this._currentBufferIndex = this._variadicBuffers.length;
            this._currentBufferOffset = 0;
        }

        chunk.reserve(length).buffer.set(data, this._currentBufferOffset);

        // Record where the bytes live.
        view.setInt32(BinaryView.BUFFER_INDEX_OFFSET, this._currentBufferIndex, true);
        view.setInt32(BinaryView.BUFFER_OFFSET_OFFSET, this._currentBufferOffset, true);

        this._currentBufferOffset += length;
        return this;
    }

    /**
     * Records validity for `index`; a null slot gets an all-zero view struct.
     * @returns The value returned by the base implementation.
     */
    public setValid(index: number, isValid: boolean) {
        this._ensureViewCapacity(index);

        const result = super.setValid(index, isValid);
        if (!result) {
            const start = index * BinaryView.ELEMENT_WIDTH;
            this._views.buffer.fill(0, start, start + BinaryView.ELEMENT_WIDTH);
        }
        return result;
    }

    /** Drops all buffered views and data and resets the out-of-line cursor. */
    public clear() {
        this._variadicBuffers = [];
        this._currentBuffer = null;
        this._currentBufferIndex = 0;
        this._currentBufferOffset = 0;
        this._views.clear();
        return super.clear();
    }

    /** Emits the buffered elements as a Data chunk and resets the builder. */
    public flush() {
        const { type, length, nullCount, _views, _nulls } = this;

        // Seal the partially filled data buffer, if it holds anything.
        if (this._currentBuffer && this._currentBufferOffset > 0) {
            this._variadicBuffers.push(this._currentBuffer.buffer.slice(0, this._currentBufferOffset));
        }

        const views = _views.flush(length * BinaryView.ELEMENT_WIDTH);
        const nullBitmap = nullCount > 0 ? _nulls.flush(length) : undefined;
        const variadicBuffers = this._variadicBuffers;

        // clear() installs a fresh _variadicBuffers array and resets the
        // cursor, so the array captured above is no longer shared.
        this.clear();

        return makeData({
            type,
            length,
            nullCount,
            nullBitmap,
            ['views']: views,
            ['variadicBuffers']: variadicBuffers
        });
    }

    /** Marks the builder as finished. */
    public finish() {
        this.finished = true;
        return this;
    }

    /** Grows `_views` so the view struct for `index` is addressable. */
    protected _ensureViewCapacity(index: number) {
        const bytesNeeded = (index + 1) * BinaryView.ELEMENT_WIDTH;
        const currentBytes = this._views.length;
        if (bytesNeeded > currentBytes) {
            this._views.reserve(bytesNeeded - currentBytes);
        }
    }
}
0), 0); return this.children.reduce((byteLength, child) => byteLength + child.byteLength, byteLength); } @@ -117,7 +126,16 @@ export class Data { return nullCount; } - constructor(type: T, offset: number, length: number, nullCount?: number, buffers?: Partial> | Data, children: Data[] = [], dictionary?: Vector) { + constructor( + type: T, + offset: number, + length: number, + nullCount?: number, + buffers?: Partial> | Data, + children: Data[] = [], + dictionary?: Vector, + variadicBuffers: ReadonlyArray = [] + ) { this.type = type; this.children = children; this.dictionary = dictionary; @@ -131,6 +149,7 @@ export class Data { this.typeIds = buffers.typeIds; this.nullBitmap = buffers.nullBitmap; this.valueOffsets = buffers.valueOffsets; + this.variadicBuffers = buffers.variadicBuffers; } else { this.stride = strideForType(type); if (buffers) { @@ -139,15 +158,22 @@ export class Data { (buffer = (buffers as Buffers)[2]) && (this.nullBitmap = buffer); (buffer = (buffers as Buffers)[3]) && (this.typeIds = buffer); } + this.variadicBuffers = variadicBuffers; } + this.variadicBuffers ??= []; } public getValid(index: number): boolean { const { type } = this; if (DataType.isUnion(type)) { const union = (type as Union); - const child = this.children[union.typeIdToChildIndex[this.typeIds[index]]]; - const indexInChild = union.mode === UnionMode.Dense ? this.valueOffsets[index] : index; + const typeId = this.typeIds[index]; + const childIndex = union.typeIdToChildIndex[typeId]; + const child = this.children[childIndex]; + const valueOffsets = this.valueOffsets as Int32Array | BigInt64Array | undefined; + const indexInChild = union.mode === UnionMode.Dense && valueOffsets + ? 
Number(valueOffsets[index]) + : index; return child.getValid(indexInChild); } if (this.nullable && this.nullCount > 0) { @@ -163,8 +189,13 @@ export class Data { const { type } = this; if (DataType.isUnion(type)) { const union = (type as Union); - const child = this.children[union.typeIdToChildIndex[this.typeIds[index]]]; - const indexInChild = union.mode === UnionMode.Dense ? this.valueOffsets[index] : index; + const typeId = this.typeIds[index]; + const childIndex = union.typeIdToChildIndex[typeId]; + const child = this.children[childIndex]; + const valueOffsets = this.valueOffsets as Int32Array | BigInt64Array | undefined; + const indexInChild = union.mode === UnionMode.Dense && valueOffsets + ? Number(valueOffsets[index]) + : index; prev = child.getValid(indexInChild); child.setValid(indexInChild, value); } else { @@ -200,8 +231,16 @@ export class Data { return value; } - public clone(type: R = this.type as any, offset = this.offset, length = this.length, nullCount = this._nullCount, buffers: Buffers = this, children: Data[] = this.children) { - return new Data(type, offset, length, nullCount, buffers, children, this.dictionary); + public clone( + type: R = this.type as any, + offset = this.offset, + length = this.length, + nullCount = this._nullCount, + buffers: Buffers = this, + children: Data[] = this.children, + variadicBuffers: ReadonlyArray = this.variadicBuffers + ) { + return new Data(type, offset, length, nullCount, buffers, children, this.dictionary, variadicBuffers); } public slice(offset: number, length: number): Data { @@ -214,12 +253,13 @@ export class Data { const buffers = this._sliceBuffers(offset, length, stride, typeId); return this.clone(this.type, this.offset + offset, length, nullCount, buffers, // Don't slice children if we have value offsets (the variable-width types) - (children.length === 0 || this.valueOffsets) ? 
children : this._sliceChildren(children, childStride * offset, childStride * length)); + (children.length === 0 || this.valueOffsets) ? children : this._sliceChildren(children, childStride * offset, childStride * length), + this.variadicBuffers); } public _changeLengthAndBackfillNullBitmap(newLength: number): Data { if (this.typeId === Type.Null) { - return this.clone(this.type, 0, newLength, 0); + return this.clone(this.type, 0, newLength, 0, this.buffers, this.children, this.variadicBuffers); } const { length, nullCount } = this; // start initialized with 0s (nulls), then fill from 0 to length with 1s (not null) @@ -232,7 +272,7 @@ export class Data { } const buffers = this.buffers; buffers[BufferType.VALIDITY] = bitmap; - return this.clone(this.type, 0, newLength, nullCount + (newLength - length), buffers); + return this.clone(this.type, 0, newLength, nullCount + (newLength - length), buffers, this.children, this.variadicBuffers); } protected _sliceBuffers(offset: number, length: number, stride: number, typeId: T['TType']): Buffers { @@ -240,10 +280,18 @@ export class Data { const { buffers } = this; // If typeIds exist, slice the typeIds buffer (arr = buffers[BufferType.TYPE]) && (buffers[BufferType.TYPE] = arr.subarray(offset, offset + length)); - // If offsets exist, only slice the offsets buffer - (arr = buffers[BufferType.OFFSET]) && (buffers[BufferType.OFFSET] = arr.subarray(offset, offset + length + 1)) || - // Otherwise if no offsets, slice the data buffer. Don't slice the data vector for Booleans, since the offset goes by bits not bytes - (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = typeId === 6 ? 
arr : arr.subarray(stride * offset, stride * (offset + length))); + if (DataType.isBinaryView(this.type) || DataType.isUtf8View(this.type)) { + const width = BinaryView.ELEMENT_WIDTH; + (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = arr.subarray(offset * width, (offset + length) * width)); + } else if (DataType.isListView(this.type) || DataType.isLargeListView(this.type)) { + (arr = buffers[BufferType.OFFSET]) && (buffers[BufferType.OFFSET] = arr.subarray(offset, offset + length)); + (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = arr.subarray(offset, offset + length)); + } else { + // If offsets exist, only slice the offsets buffer + (arr = buffers[BufferType.OFFSET]) && (buffers[BufferType.OFFSET] = arr.subarray(offset, offset + length + 1)) || + // Otherwise if no offsets, slice the data buffer. Don't slice the data vector for Booleans, since the offset goes by bits not bytes + (arr = buffers[BufferType.DATA]) && (buffers[BufferType.DATA] = typeId === 6 ? arr : arr.subarray(stride * offset, stride * (offset + length))); + } return buffers; } @@ -256,7 +304,7 @@ export class Data { import { Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, ListView, LargeListView, FixedSizeList, Map_, Struct, Float, Int, Date_, @@ -311,6 +359,15 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? 
-1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); } + public visitUtf8View(props: Utf8ViewDataProps) { + const { ['type']: type, ['offset']: offset = 0 } = props; + const views = toArrayBufferView(type.ArrayType, props['views']); + const nullBitmap = toUint8Array(props['nullBitmap']); + const variadicBuffers = (props['variadicBuffers'] || []).map((buffer) => toUint8Array(buffer)); + const length = props['length'] ?? Math.trunc(views.length / Utf8View.ELEMENT_WIDTH); + const nullCount = props['nullBitmap'] ? -1 : 0; + return new Data(type, offset, length, nullCount, [undefined, views, nullBitmap], [], undefined, variadicBuffers); + } public visitLargeUtf8(props: LargeUtf8DataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; const data = toUint8Array(props['data']); @@ -327,6 +384,15 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, data, nullBitmap]); } + public visitBinaryView(props: BinaryViewDataProps) { + const { ['type']: type, ['offset']: offset = 0 } = props; + const views = toArrayBufferView(type.ArrayType, props['views']); + const nullBitmap = toUint8Array(props['nullBitmap']); + const variadicBuffers = (props['variadicBuffers'] || []).map((buffer) => toUint8Array(buffer)); + const length = props['length'] ?? Math.trunc(views.length / BinaryView.ELEMENT_WIDTH); + const nullCount = props['nullBitmap'] ? 
-1 : 0; + return new Data(type, offset, length, nullCount, [undefined, views, nullBitmap], [], undefined, variadicBuffers); + } public visitLargeBinary(props: LargeBinaryDataProps) { const { ['type']: type, ['offset']: offset = 0 } = props; const data = toUint8Array(props['data']); @@ -377,6 +443,24 @@ class MakeDataVisitor extends Visitor { const { ['length']: length = valueOffsets.length - 1, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; return new Data(type, offset, length, nullCount, [valueOffsets, undefined, nullBitmap], [child]); } + public visitListView(props: ListViewDataProps) { + const { ['type']: type, ['offset']: offset = 0, ['child']: child } = props; + const nullBitmap = toUint8Array(props['nullBitmap']); + const valueOffsets = toInt32Array(props['valueOffsets']); + const sizesSource = props['valueSizes'] ?? props['sizes']; + const valueSizes = toInt32Array(sizesSource); + const { ['length']: length = valueSizes.length, ['nullCount']: nullCount = props['nullBitmap'] ? -1 : 0 } = props; + return new Data(type, offset, length, nullCount, [valueOffsets, valueSizes, nullBitmap], [child]); + } + public visitLargeListView(props: LargeListViewDataProps) { + const { ['type']: type, ['offset']: offset = 0, ['child']: child } = props; + const nullBitmap = toUint8Array(props['nullBitmap']); + const valueOffsets = toBigInt64Array(props['valueOffsets']); + const sizesSource = props['valueSizes'] ?? props['sizes']; + const valueSizes = toBigInt64Array(sizesSource); + const { ['length']: length = Number(valueSizes.length), ['nullCount']: nullCount = props['nullBitmap'] ? 
-1 : 0 } = props; + return new Data(type, offset, length, nullCount, [valueOffsets, valueSizes, nullBitmap], [child]); + } public visitStruct(props: StructDataProps) { const { ['type']: type, ['offset']: offset = 0, ['children']: children = [] } = props; const nullBitmap = toUint8Array(props['nullBitmap']); @@ -455,10 +539,14 @@ interface IntervalDataProps extends DataProps_ { data?: D interface DurationDataProps extends DataProps_ { data?: DataBuffer } interface FixedSizeBinaryDataProps extends DataProps_ { data?: DataBuffer } interface BinaryDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } +interface BinaryViewDataProps extends DataProps_ { views: DataBuffer; variadicBuffers?: ReadonlyArray | Iterable | Uint8Array>; data?: DataBuffer } interface LargeBinaryDataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface Utf8DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; data?: DataBuffer } +interface Utf8ViewDataProps extends DataProps_ { views: DataBuffer; variadicBuffers?: ReadonlyArray | Iterable | Uint8Array>; data?: DataBuffer } interface LargeUtf8DataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; data?: DataBuffer } interface ListDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } +interface ListViewDataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; valueSizes: ValueOffsetsBuffer; child: Data } +interface LargeListViewDataProps extends DataProps_ { valueOffsets: LargeValueOffsetsBuffer | ValueOffsetsBuffer; valueSizes: LargeValueOffsetsBuffer | ValueOffsetsBuffer; child: Data } interface FixedSizeListDataProps extends DataProps_ { child: Data } interface StructDataProps extends DataProps_ { children: Data[] } interface Map_DataProps extends DataProps_ { valueOffsets: ValueOffsetsBuffer; child: Data } @@ -481,9 +569,13 @@ export type DataProps = ( T extends 
FixedSizeBinary /* */ ? FixedSizeBinaryDataProps : T extends Binary /* */ ? BinaryDataProps : T extends LargeBinary /* */ ? LargeBinaryDataProps : + T extends BinaryView /* */ ? BinaryViewDataProps : T extends Utf8 /* */ ? Utf8DataProps : T extends LargeUtf8 /* */ ? LargeUtf8DataProps : + T extends Utf8View /* */ ? Utf8ViewDataProps : T extends List /* */ ? ListDataProps : + T extends ListView /* */ ? ListViewDataProps : + T extends LargeListView /* */ ? LargeListViewDataProps : T extends FixedSizeList /* */ ? FixedSizeListDataProps : T extends Struct /* */ ? StructDataProps : T extends Map_ /* */ ? Map_DataProps : @@ -507,11 +599,15 @@ export function makeData(props: TimestampDataProps): Dat export function makeData(props: IntervalDataProps): Data; export function makeData(props: DurationDataProps): Data; export function makeData(props: FixedSizeBinaryDataProps): Data; +export function makeData(props: BinaryViewDataProps): Data; export function makeData(props: BinaryDataProps): Data; export function makeData(props: LargeBinaryDataProps): Data; export function makeData(props: Utf8DataProps): Data; export function makeData(props: LargeUtf8DataProps): Data; +export function makeData(props: Utf8ViewDataProps): Data; export function makeData(props: ListDataProps): Data; +export function makeData(props: ListViewDataProps): Data; +export function makeData(props: LargeListViewDataProps): Data; export function makeData(props: FixedSizeListDataProps): Data; export function makeData(props: StructDataProps): Data; export function makeData(props: Map_DataProps): Data; diff --git a/src/enum.ts b/src/enum.ts index 73d95538..facb2184 100644 --- a/src/enum.ts +++ b/src/enum.ts @@ -70,6 +70,10 @@ export enum Type { Duration = 18, /** Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds */ LargeBinary = 19, /** Large variable-length bytes (no guarantee of UTF8-ness) */ LargeUtf8 = 20, /** Large variable-length string as List */ + BinaryView = 23, /** 
Variable-length binary values backed by inline-or-referenced views */ + Utf8View = 24, /** Variable-length UTF8 string values backed by inline-or-referenced views */ + ListView = 25, /** Variable-length list values backed by entry views */ + LargeListView = 26, /** Large variable-length list values backed by entry views */ Dictionary = -1, /** Dictionary aka Category type */ Int8 = -2, diff --git a/src/fb/File.ts b/src/fb/File.ts new file mode 100644 index 00000000..12c6f822 --- /dev/null +++ b/src/fb/File.ts @@ -0,0 +1,47 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +export { Binary } from './binary.js'; +export { BinaryView } from './binary-view.js'; +export { Block } from './block.js'; +export { Bool } from './bool.js'; +export { Buffer } from './buffer.js'; +export { Date } from './date.js'; +export { DateUnit } from './date-unit.js'; +export { Decimal } from './decimal.js'; +export { DictionaryEncoding } from './dictionary-encoding.js'; +export { DictionaryKind } from './dictionary-kind.js'; +export { Duration } from './duration.js'; +export { Endianness } from './endianness.js'; +export { Feature } from './feature.js'; +export { Field } from './field.js'; +export { FixedSizeBinary } from './fixed-size-binary.js'; +export { FixedSizeList } from './fixed-size-list.js'; +export { FloatingPoint } from './floating-point.js'; +export { Footer } from './footer.js'; +export { Int } from './int.js'; +export { Interval } from './interval.js'; +export { IntervalUnit } from './interval-unit.js'; +export { KeyValue } from './key-value.js'; +export { LargeBinary } from './large-binary.js'; +export { LargeList } from './large-list.js'; +export { LargeListView } from './large-list-view.js'; +export { LargeUtf8 } from './large-utf8.js'; +export { List } from './list.js'; +export { ListView } from 
'./list-view.js'; +export { Map } from './map.js'; +export { MetadataVersion } from './metadata-version.js'; +export { Null } from './null.js'; +export { Precision } from './precision.js'; +export { RunEndEncoded } from './run-end-encoded.js'; +export { Schema } from './schema.js'; +export { Struct_ } from './struct-.js'; +export { Time } from './time.js'; +export { TimeUnit } from './time-unit.js'; +export { Timestamp } from './timestamp.js'; +export { Type } from './type.js'; +export { Union } from './union.js'; +export { UnionMode } from './union-mode.js'; +export { Utf8 } from './utf8.js'; +export { Utf8View } from './utf8-view.js'; diff --git a/src/fb/binary-view.ts b/src/fb/binary-view.ts new file mode 100644 index 00000000..f91f910f --- /dev/null +++ b/src/fb/binary-view.ts @@ -0,0 +1,47 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Logically the same as Binary, but the internal representation uses a view + * struct that contains the string length and either the string's entire data + * inline (for small strings) or an inlined prefix, an index of another buffer, + * and an offset pointing to a slice in that buffer (for non-small strings). + * + * Since it uses a variable number of data buffers, each Field with this type + * must have a corresponding entry in `variadicBufferCounts`. 
+ */ +export class BinaryView { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):BinaryView { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsBinaryView(bb:flatbuffers.ByteBuffer, obj?:BinaryView):BinaryView { + return (obj || new BinaryView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsBinaryView(bb:flatbuffers.ByteBuffer, obj?:BinaryView):BinaryView { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new BinaryView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startBinaryView(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endBinaryView(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createBinaryView(builder:flatbuffers.Builder):flatbuffers.Offset { + BinaryView.startBinaryView(builder); + return BinaryView.endBinaryView(builder); +} +} diff --git a/src/fb/large-list-view.ts b/src/fb/large-list-view.ts new file mode 100644 index 00000000..5785cd3f --- /dev/null +++ b/src/fb/large-list-view.ts @@ -0,0 +1,42 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Same as ListView, but with 64-bit offsets and sizes, allowing to represent + * extremely large data values. 
+ */ +export class LargeListView { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):LargeListView { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsLargeListView(bb:flatbuffers.ByteBuffer, obj?:LargeListView):LargeListView { + return (obj || new LargeListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsLargeListView(bb:flatbuffers.ByteBuffer, obj?:LargeListView):LargeListView { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new LargeListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startLargeListView(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endLargeListView(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createLargeListView(builder:flatbuffers.Builder):flatbuffers.Offset { + LargeListView.startLargeListView(builder); + return LargeListView.endLargeListView(builder); +} +} diff --git a/src/fb/list-view.ts b/src/fb/list-view.ts new file mode 100644 index 00000000..f9afae01 --- /dev/null +++ b/src/fb/list-view.ts @@ -0,0 +1,43 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Represents the same logical types that List can, but contains offsets and + * sizes allowing for writes in any order and sharing of child values among + * list values. 
+ */ +export class ListView { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):ListView { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsListView(bb:flatbuffers.ByteBuffer, obj?:ListView):ListView { + return (obj || new ListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsListView(bb:flatbuffers.ByteBuffer, obj?:ListView):ListView { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new ListView()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startListView(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endListView(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createListView(builder:flatbuffers.Builder):flatbuffers.Offset { + ListView.startListView(builder); + return ListView.endListView(builder); +} +} diff --git a/src/fb/message.ts b/src/fb/message.ts index d752b91b..d3518599 100644 --- a/src/fb/message.ts +++ b/src/fb/message.ts @@ -1,5 +1,7 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import * as flatbuffers from 'flatbuffers'; import { KeyValue } from './key-value.js'; diff --git a/src/fb/record-batch.ts b/src/fb/record-batch.ts index 00681999..e6f41d02 100644 --- a/src/fb/record-batch.ts +++ b/src/fb/record-batch.ts @@ -1,5 +1,7 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import * as flatbuffers from 'flatbuffers'; import { BodyCompression } from './body-compression.js'; @@ -78,8 +80,34 @@ compression(obj?:BodyCompression):BodyCompression|null { return offset ? 
(obj || new BodyCompression()).__init(this.bb!.__indirect(this.bb_pos + offset), this.bb!) : null; } +/** + * Some types such as Utf8View are represented using a variable number of buffers. + * For each such Field in the pre-ordered flattened logical schema, there will be + * an entry in variadicBufferCounts to indicate the number of number of variadic + * buffers which belong to that Field in the current RecordBatch. + * + * For example, the schema + * col1: Struct + * col2: Utf8View + * contains two Fields with variadic buffers so variadicBufferCounts will have + * two entries, the first counting the variadic buffers of `col1.beta` and the + * second counting `col2`'s. + * + * This field may be omitted if and only if the schema contains no Fields with + * a variable number of buffers, such as BinaryView and Utf8View. + */ +variadicBufferCounts(index: number):bigint|null { + const offset = this.bb!.__offset(this.bb_pos, 12); + return offset ? this.bb!.readInt64(this.bb!.__vector(this.bb_pos + offset) + index * 8) : BigInt(0); +} + +variadicBufferCountsLength():number { + const offset = this.bb!.__offset(this.bb_pos, 12); + return offset ? 
this.bb!.__vector_len(this.bb_pos + offset) : 0; +} + static startRecordBatch(builder:flatbuffers.Builder) { - builder.startObject(4); + builder.startObject(5); } static addLength(builder:flatbuffers.Builder, length:bigint) { @@ -106,6 +134,22 @@ static addCompression(builder:flatbuffers.Builder, compressionOffset:flatbuffers builder.addFieldOffset(3, compressionOffset, 0); } +static addVariadicBufferCounts(builder:flatbuffers.Builder, variadicBufferCountsOffset:flatbuffers.Offset) { + builder.addFieldOffset(4, variadicBufferCountsOffset, 0); +} + +static createVariadicBufferCountsVector(builder:flatbuffers.Builder, data:bigint[]):flatbuffers.Offset { + builder.startVector(8, data.length, 8); + for (let i = data.length - 1; i >= 0; i--) { + builder.addInt64(data[i]!); + } + return builder.endVector(); +} + +static startVariadicBufferCountsVector(builder:flatbuffers.Builder, numElems:number) { + builder.startVector(8, numElems, 8); +} + static endRecordBatch(builder:flatbuffers.Builder):flatbuffers.Offset { const offset = builder.endObject(); return offset; diff --git a/src/fb/schema.ts b/src/fb/schema.ts index 394883eb..daae447e 100644 --- a/src/fb/schema.ts +++ b/src/fb/schema.ts @@ -1,5 +1,7 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import * as flatbuffers from 'flatbuffers'; import { Endianness } from './endianness.js'; @@ -133,14 +135,6 @@ static endSchema(builder:flatbuffers.Builder):flatbuffers.Offset { return offset; } -static finishSchemaBuffer(builder:flatbuffers.Builder, offset:flatbuffers.Offset) { - builder.finish(offset); -} - -static finishSizePrefixedSchemaBuffer(builder:flatbuffers.Builder, offset:flatbuffers.Offset) { - builder.finish(offset, undefined, true); -} - static createSchema(builder:flatbuffers.Builder, endianness:Endianness, fieldsOffset:flatbuffers.Offset, 
customMetadataOffset:flatbuffers.Offset, featuresOffset:flatbuffers.Offset):flatbuffers.Offset { Schema.startSchema(builder); Schema.addEndianness(builder, endianness); diff --git a/src/fb/type.ts b/src/fb/type.ts index 8eb87042..8f913d01 100644 --- a/src/fb/type.ts +++ b/src/fb/type.ts @@ -1,6 +1,9 @@ // automatically generated by the FlatBuffers compiler, do not modify +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + import { Binary } from './binary.js'; +import { BinaryView } from './binary-view.js'; import { Bool } from './bool.js'; import { Date } from './date.js'; import { Decimal } from './decimal.js'; @@ -12,8 +15,10 @@ import { Int } from './int.js'; import { Interval } from './interval.js'; import { LargeBinary } from './large-binary.js'; import { LargeList } from './large-list.js'; +import { LargeListView } from './large-list-view.js'; import { LargeUtf8 } from './large-utf8.js'; import { List } from './list.js'; +import { ListView } from './list-view.js'; import { Map } from './map.js'; import { Null } from './null.js'; import { RunEndEncoded } from './run-end-encoded.js'; @@ -22,6 +27,7 @@ import { Time } from './time.js'; import { Timestamp } from './timestamp.js'; import { Union } from './union.js'; import { Utf8 } from './utf8.js'; +import { Utf8View } from './utf8-view.js'; /** @@ -52,15 +58,19 @@ export enum Type { LargeBinary = 19, LargeUtf8 = 20, LargeList = 21, - RunEndEncoded = 22 + RunEndEncoded = 22, + BinaryView = 23, + Utf8View = 24, + ListView = 25, + LargeListView = 26 } export function unionToType( type: Type, - accessor: (obj:Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8) => 
Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null -): Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null { + accessor: (obj:Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View) => Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null +): Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null { switch(Type[type]) { - case 'NONE': return null; + case 'NONE': return null; case 'Null': return accessor(new Null())! as Null; case 'Int': return accessor(new Int())! as Int; case 'FloatingPoint': return accessor(new FloatingPoint())! as FloatingPoint; @@ -83,17 +93,21 @@ export function unionToType( case 'LargeUtf8': return accessor(new LargeUtf8())! as LargeUtf8; case 'LargeList': return accessor(new LargeList())! as LargeList; case 'RunEndEncoded': return accessor(new RunEndEncoded())! as RunEndEncoded; + case 'BinaryView': return accessor(new BinaryView())! as BinaryView; + case 'Utf8View': return accessor(new Utf8View())! as Utf8View; + case 'ListView': return accessor(new ListView())! as ListView; + case 'LargeListView': return accessor(new LargeListView())! 
as LargeListView; default: return null; } } export function unionListToType( - type: Type, - accessor: (index: number, obj:Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8) => Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null, + type: Type, + accessor: (index: number, obj:Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View) => Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null, index: number -): Binary|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeUtf8|List|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|null { +): Binary|BinaryView|Bool|Date|Decimal|Duration|FixedSizeBinary|FixedSizeList|FloatingPoint|Int|Interval|LargeBinary|LargeList|LargeListView|LargeUtf8|List|ListView|Map|Null|RunEndEncoded|Struct_|Time|Timestamp|Union|Utf8|Utf8View|null { switch(Type[type]) { - case 'NONE': return null; + case 'NONE': return null; case 'Null': return accessor(index, new Null())! as Null; case 'Int': return accessor(index, new Int())! as Int; case 'FloatingPoint': return accessor(index, new FloatingPoint())! as FloatingPoint; @@ -116,6 +130,10 @@ export function unionListToType( case 'LargeUtf8': return accessor(index, new LargeUtf8())! as LargeUtf8; case 'LargeList': return accessor(index, new LargeList())! 
as LargeList; case 'RunEndEncoded': return accessor(index, new RunEndEncoded())! as RunEndEncoded; + case 'BinaryView': return accessor(index, new BinaryView())! as BinaryView; + case 'Utf8View': return accessor(index, new Utf8View())! as Utf8View; + case 'ListView': return accessor(index, new ListView())! as ListView; + case 'LargeListView': return accessor(index, new LargeListView())! as LargeListView; default: return null; } } diff --git a/src/fb/utf8-view.ts b/src/fb/utf8-view.ts new file mode 100644 index 00000000..886a9df7 --- /dev/null +++ b/src/fb/utf8-view.ts @@ -0,0 +1,47 @@ +// automatically generated by the FlatBuffers compiler, do not modify + +/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/no-explicit-any, @typescript-eslint/no-non-null-assertion */ + +import * as flatbuffers from 'flatbuffers'; + +/** + * Logically the same as Utf8, but the internal representation uses a view + * struct that contains the string length and either the string's entire data + * inline (for small strings) or an inlined prefix, an index of another buffer, + * and an offset pointing to a slice in that buffer (for non-small strings). + * + * Since it uses a variable number of data buffers, each Field with this type + * must have a corresponding entry in `variadicBufferCounts`. 
+ */ +export class Utf8View { + bb: flatbuffers.ByteBuffer|null = null; + bb_pos = 0; + __init(i:number, bb:flatbuffers.ByteBuffer):Utf8View { + this.bb_pos = i; + this.bb = bb; + return this; +} + +static getRootAsUtf8View(bb:flatbuffers.ByteBuffer, obj?:Utf8View):Utf8View { + return (obj || new Utf8View()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static getSizePrefixedRootAsUtf8View(bb:flatbuffers.ByteBuffer, obj?:Utf8View):Utf8View { + bb.setPosition(bb.position() + flatbuffers.SIZE_PREFIX_LENGTH); + return (obj || new Utf8View()).__init(bb.readInt32(bb.position()) + bb.position(), bb); +} + +static startUtf8View(builder:flatbuffers.Builder) { + builder.startObject(0); +} + +static endUtf8View(builder:flatbuffers.Builder):flatbuffers.Offset { + const offset = builder.endObject(); + return offset; +} + +static createUtf8View(builder:flatbuffers.Builder):flatbuffers.Offset { + Utf8View.startUtf8View(builder); + return Utf8View.endUtf8View(builder); +} +} diff --git a/src/interfaces.ts b/src/interfaces.ts index 0645753b..2d81222c 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -37,6 +37,7 @@ import type { LargeUtf8Builder } from './builder/largeutf8.js'; import type { BinaryBuilder } from './builder/binary.js'; import type { LargeBinaryBuilder } from './builder/largebinary.js'; import type { ListBuilder } from './builder/list.js'; +import type { ListViewBuilder, LargeListViewBuilder } from './builder/listview.js'; import type { FixedSizeListBuilder } from './builder/fixedsizelist.js'; import type { MapBuilder } from './builder/map.js'; import type { StructBuilder } from './builder/struct.js'; @@ -212,6 +213,7 @@ export type TypeToDataType = { [Type.LargeUtf8]: type.LargeUtf8; [Type.Binary]: type.Binary; [Type.LargeBinary]: type.LargeBinary; + [Type.BinaryView]: type.BinaryView; [Type.FixedSizeBinary]: type.FixedSizeBinary; [Type.Date]: type.Date_; [Type.DateDay]: type.DateDay; @@ -240,10 +242,13 @@ export type TypeToDataType = { 
[Type.DurationMicrosecond]: type.DurationMicrosecond; [Type.DurationNanosecond]: type.DurationNanosecond; [Type.Map]: type.Map_; + [Type.ListView]: type.ListView; + [Type.LargeListView]: type.LargeListView; [Type.List]: type.List; [Type.Struct]: type.Struct; [Type.Dictionary]: type.Dictionary; [Type.FixedSizeList]: type.FixedSizeList; + [Type.Utf8View]: type.Utf8View; }[T]; /** @ignore */ @@ -268,6 +273,7 @@ type TypeToBuilder = { [Type.LargeUtf8]: LargeUtf8Builder; [Type.Binary]: BinaryBuilder; [Type.LargeBinary]: LargeBinaryBuilder; + [Type.BinaryView]: Builder; [Type.FixedSizeBinary]: FixedSizeBinaryBuilder; [Type.Date]: DateBuilder; [Type.DateDay]: DateDayBuilder; @@ -297,9 +303,12 @@ type TypeToBuilder = { [Type.DurationNanosecond]: DurationNanosecondBuilder; [Type.Map]: MapBuilder; [Type.List]: ListBuilder; + [Type.ListView]: ListViewBuilder; + [Type.LargeListView]: LargeListViewBuilder; [Type.Struct]: StructBuilder; [Type.Dictionary]: DictionaryBuilder; [Type.FixedSizeList]: FixedSizeListBuilder; + [Type.Utf8View]: Builder; }[T]; /** @ignore */ @@ -324,6 +333,7 @@ type DataTypeToBuilder = { [Type.LargeUtf8]: T extends type.LargeUtf8 ? LargeUtf8Builder : never; [Type.Binary]: T extends type.Binary ? BinaryBuilder : never; [Type.LargeBinary]: T extends type.LargeBinary ? LargeBinaryBuilder : never; + [Type.BinaryView]: T extends type.BinaryView ? Builder : never; [Type.FixedSizeBinary]: T extends type.FixedSizeBinary ? FixedSizeBinaryBuilder : never; [Type.Date]: T extends type.Date_ ? DateBuilder : never; [Type.DateDay]: T extends type.DateDay ? DateDayBuilder : never; @@ -353,7 +363,10 @@ type DataTypeToBuilder = { [Type.DurationNanosecond]: T extends type.DurationNanosecond ? DurationNanosecondBuilder : never; [Type.Map]: T extends type.Map_ ? MapBuilder : never; [Type.List]: T extends type.List ? ListBuilder : never; + [Type.ListView]: T extends type.ListView ? ListViewBuilder : never; + [Type.LargeListView]: T extends type.LargeListView ? 
LargeListViewBuilder : never; [Type.Struct]: T extends type.Struct ? StructBuilder : never; [Type.Dictionary]: T extends type.Dictionary ? DictionaryBuilder : never; [Type.FixedSizeList]: T extends type.FixedSizeList ? FixedSizeListBuilder : never; + [Type.Utf8View]: T extends type.Utf8View ? Builder : never; }[T['TType']]; diff --git a/src/ipc/message.ts b/src/ipc/message.ts index 3dc86252..2cd329dc 100644 --- a/src/ipc/message.ts +++ b/src/ipc/message.ts @@ -203,7 +203,10 @@ export class JSONMessageReader extends MessageReader { ...(column['VALIDITY'] && [column['VALIDITY']] || []), ...(column['TYPE_ID'] && [column['TYPE_ID']] || []), ...(column['OFFSET'] && [column['OFFSET']] || []), + ...(column['SIZE'] && [column['SIZE']] || []), ...(column['DATA'] && [column['DATA']] || []), + ...(column['VIEWS'] && [column['VIEWS']] || []), + ...(column['VARIADIC_DATA_BUFFERS'] || []), ...flattenDataSources(column['children']) ], [] as any[][]); } diff --git a/src/ipc/metadata/json.ts b/src/ipc/metadata/json.ts index 15f87189..ae930eb8 100644 --- a/src/ipc/metadata/json.ts +++ b/src/ipc/metadata/json.ts @@ -18,8 +18,8 @@ import { Schema, Field } from '../../schema.js'; import { DataType, Dictionary, TimeBitWidth, - Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, - List, FixedSizeList, Map_, Struct, Union, + Utf8, LargeUtf8, Binary, LargeBinary, BinaryView, Utf8View, Decimal, FixedSizeBinary, + List, ListView, LargeListView, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -41,7 +41,8 @@ export function recordBatchFromJSON(b: any) { b['count'], fieldNodesFromJSON(b['columns']), buffersFromJSON(b['columns']), - null + null, + variadicBufferCountsFromJSON(b['columns']) ); } @@ -82,7 +83,15 @@ function buffersFromJSON(xs: any[], buffers: BufferRegion[] = []): BufferRegion[ column['VALIDITY'] && buffers.push(new BufferRegion(buffers.length, 
column['VALIDITY'].length)); column['TYPE_ID'] && buffers.push(new BufferRegion(buffers.length, column['TYPE_ID'].length)); column['OFFSET'] && buffers.push(new BufferRegion(buffers.length, column['OFFSET'].length)); + column['SIZE'] && buffers.push(new BufferRegion(buffers.length, column['SIZE'].length)); column['DATA'] && buffers.push(new BufferRegion(buffers.length, column['DATA'].length)); + column['VIEWS'] && buffers.push(new BufferRegion(buffers.length, column['VIEWS'].length)); + // Handle variadic buffers for view types (BinaryView, Utf8View) + if (column['VARIADIC_DATA_BUFFERS']) { + for (const buf of column['VARIADIC_DATA_BUFFERS']) { + buffers.push(new BufferRegion(buffers.length, buf.length)); + } + } buffers = buffersFromJSON(column['children'], buffers); } return buffers; @@ -93,6 +102,15 @@ function nullCountFromJSON(validity: number[]) { return (validity || []).reduce((sum, val) => sum + +(val === 0), 0); } +/** @ignore */ +function variadicBufferCountsFromJSON(xs: any[]): number[] { + return (xs || []).reduce((counts, column: any) => [ + ...counts, + ...(column['VARIADIC_DATA_BUFFERS'] ? 
[column['VARIADIC_DATA_BUFFERS'].length] : []), + ...variadicBufferCountsFromJSON(column['children']) + ], [] as number[]); +} + /** @ignore */ export function fieldFromJSON(_field: any, dictionaries?: Map) { @@ -149,10 +167,14 @@ function typeFromJSON(f: any, children?: Field[]): DataType { case 'null': return new Null(); case 'binary': return new Binary(); case 'largebinary': return new LargeBinary(); + case 'binaryview': return new BinaryView(); case 'utf8': return new Utf8(); case 'largeutf8': return new LargeUtf8(); + case 'utf8view': return new Utf8View(); case 'bool': return new Bool(); case 'list': return new List((children || [])[0]); + case 'listview': return new ListView((children || [])[0]); + case 'largelistview': return new LargeListView((children || [])[0]); case 'struct': return new Struct(children || []); case 'struct_': return new Struct(children || []); } diff --git a/src/ipc/metadata/message.ts b/src/ipc/metadata/message.ts index 17e8897b..1abb1c1f 100644 --- a/src/ipc/metadata/message.ts +++ b/src/ipc/metadata/message.ts @@ -57,8 +57,8 @@ import ByteBuffer = flatbuffers.ByteBuffer; import { DataType, Dictionary, TimeBitWidth, - Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, - List, FixedSizeList, Map_, Struct, Union, + Utf8, LargeUtf8, Binary, LargeBinary, BinaryView, Utf8View, Decimal, FixedSizeBinary, + List, ListView, LargeListView, FixedSizeList, Map_, Struct, Union, Bool, Null, Int, Float, Date_, Time, Interval, Timestamp, IntBitWidth, Int32, TKeys, Duration, } from '../../type.js'; @@ -156,20 +156,24 @@ export class RecordBatch { protected _nodes: FieldNode[]; protected _buffers: BufferRegion[]; protected _compression: BodyCompression | null; + protected _variadicBufferCounts: number[]; public get nodes() { return this._nodes; } public get length() { return this._length; } public get buffers() { return this._buffers; } public get compression() { return this._compression; } + public get variadicBufferCounts() { return 
this._variadicBufferCounts; } constructor( length: bigint | number, nodes: FieldNode[], buffers: BufferRegion[], - compression: BodyCompression | null + compression: BodyCompression | null, + variadicBufferCounts: number[] = [] ) { this._nodes = nodes; this._buffers = buffers; this._length = bigIntToNumber(length); this._compression = compression; + this._variadicBufferCounts = variadicBufferCounts; } } @@ -334,7 +338,8 @@ function decodeRecordBatch(batch: _RecordBatch, version = MetadataVersion.V5) { batch.length(), decodeFieldNodes(batch), decodeBuffers(batch, version), - decodeBodyCompression(batch.compression()) + decodeBodyCompression(batch.compression()), + decodeVariadicBufferCounts(batch) ); return recordBatch; } @@ -382,6 +387,16 @@ function decodeBuffers(batch: _RecordBatch, version: MetadataVersion) { return bufferRegions; } +/** @ignore */ +function decodeVariadicBufferCounts(batch: _RecordBatch) { + const counts = [] as number[]; + const length = Math.trunc(batch.variadicBufferCountsLength()); + for (let i = 0; i < length; ++i) { + counts.push(bigIntToNumber(batch.variadicBufferCounts(i)!)); + } + return counts; +} + /** @ignore */ function decodeSchemaFields(schema: _Schema, dictionaries?: Map) { const fields = [] as Field[]; @@ -468,10 +483,14 @@ function decodeFieldType(f: _Field, children?: Field[]): DataType { case Type['Null']: return new Null(); case Type['Binary']: return new Binary(); case Type['LargeBinary']: return new LargeBinary(); + case Type['BinaryView']: return new BinaryView(); case Type['Utf8']: return new Utf8(); case Type['LargeUtf8']: return new LargeUtf8(); + case Type['Utf8View']: return new Utf8View(); case Type['Bool']: return new Bool(); case Type['List']: return new List((children || [])[0]); + case Type['ListView']: return new ListView((children || [])[0]); + case Type['LargeListView']: return new LargeListView((children || [])[0]); case Type['Struct_']: return new Struct(children || []); } @@ -614,6 +633,7 @@ function 
encodeRecordBatch(b: Builder, recordBatch: RecordBatch) { const nodes = recordBatch.nodes || []; const buffers = recordBatch.buffers || []; + const variadicBufferCounts = recordBatch.variadicBufferCounts || []; _RecordBatch.startNodesVector(b, nodes.length); for (const n of nodes.slice().reverse()) FieldNode.encode(b, n); @@ -630,6 +650,11 @@ function encodeRecordBatch(b: Builder, recordBatch: RecordBatch) { bodyCompressionOffset = encodeBodyCompression(b, recordBatch.compression); } + let variadicBufferCountsOffset = -1; + if (variadicBufferCounts.length > 0) { + variadicBufferCountsOffset = _RecordBatch.createVariadicBufferCountsVector(b, variadicBufferCounts.map(BigInt)); + } + _RecordBatch.startRecordBatch(b); _RecordBatch.addLength(b, BigInt(recordBatch.length)); _RecordBatch.addNodes(b, nodesVectorOffset); @@ -637,6 +662,9 @@ function encodeRecordBatch(b: Builder, recordBatch: RecordBatch) { if (recordBatch.compression !== null && bodyCompressionOffset) { _RecordBatch.addCompression(b, bodyCompressionOffset); } + if (variadicBufferCountsOffset !== -1) { + _RecordBatch.addVariadicBufferCounts(b, variadicBufferCountsOffset); + } return _RecordBatch.endRecordBatch(b); } diff --git a/src/ipc/reader.ts b/src/ipc/reader.ts index e36eeb52..af49f372 100644 --- a/src/ipc/reader.ts +++ b/src/ipc/reader.ts @@ -397,7 +397,8 @@ abstract class RecordBatchReaderImpl implements RecordB header.data.length, header.data.nodes, buffers, - null + null, + header.data.variadicBufferCounts ), id, isDelta) } else { throw new Error('Dictionary batch is compressed but codec not found'); @@ -412,11 +413,11 @@ abstract class RecordBatchReaderImpl implements RecordB } protected _loadVectors(header: metadata.RecordBatch, body: Uint8Array, types: (Field | DataType)[]) { - return new VectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion).visitMany(types); + return new VectorLoader(body, header.nodes, header.buffers, this.dictionaries, 
this.schema.metadataVersion, header.variadicBufferCounts).visitMany(types); } protected _loadCompressedVectors(header: metadata.RecordBatch, body: Uint8Array[], types: (Field | DataType)[]) { - return new CompressedVectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion).visitMany(types); + return new CompressedVectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion, header.variadicBufferCounts).visitMany(types); } private _decompressBuffers(header: metadata.RecordBatch, body: Uint8Array, codec: Codec): { decommpressedBody: Uint8Array[]; buffers: metadata.BufferRegion[] } { @@ -757,7 +758,7 @@ class RecordBatchJSONReaderImpl extends RecordBatchStre super(source, dictionaries); } protected _loadVectors(header: metadata.RecordBatch, body: any, types: (Field | DataType)[]) { - return new JSONVectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion).visitMany(types); + return new JSONVectorLoader(body, header.nodes, header.buffers, this.dictionaries, this.schema.metadataVersion, header.variadicBufferCounts).visitMany(types); } } diff --git a/src/ipc/writer.ts b/src/ipc/writer.ts index 17c8f0b6..0b13fdfc 100644 --- a/src/ipc/writer.ts +++ b/src/ipc/writer.ts @@ -274,8 +274,8 @@ export class RecordBatchWriter extends ReadableInterop< } protected _writeRecordBatch(batch: RecordBatch) { - const { byteLength, nodes, bufferRegions, buffers } = this._assembleRecordBatch(batch); - const recordBatch = new metadata.RecordBatch(batch.numRows, nodes, bufferRegions, this._compression); + const { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts } = this._assembleRecordBatch(batch); + const recordBatch = new metadata.RecordBatch(batch.numRows, nodes, bufferRegions, this._compression, variadicBufferCounts); const message = Message.from(recordBatch, byteLength); return this ._writeDictionaries(batch) @@ -284,11 +284,11 @@ export class RecordBatchWriter 
extends ReadableInterop< } protected _assembleRecordBatch(batch: RecordBatch | Vector) { - let { byteLength, nodes, bufferRegions, buffers } = VectorAssembler.assemble(batch); + let { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts } = VectorAssembler.assemble(batch); if (this._compression != null) { ({ byteLength, bufferRegions, buffers } = this._compressBodyBuffers(buffers)); } - return { byteLength, nodes, bufferRegions, buffers }; + return { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts }; } protected _compressBodyBuffers(buffers: ArrayBufferView[]) { @@ -337,8 +337,8 @@ export class RecordBatchWriter extends ReadableInterop< } protected _writeDictionaryBatch(dictionary: Data, id: number, isDelta = false) { - const { byteLength, nodes, bufferRegions, buffers } = this._assembleRecordBatch(new Vector([dictionary])); - const recordBatch = new metadata.RecordBatch(dictionary.length, nodes, bufferRegions, this._compression); + const { byteLength, nodes, bufferRegions, buffers, variadicBufferCounts } = this._assembleRecordBatch(new Vector([dictionary])); + const recordBatch = new metadata.RecordBatch(dictionary.length, nodes, bufferRegions, this._compression, variadicBufferCounts); const dictionaryBatch = new metadata.DictionaryBatch(recordBatch, id, isDelta); const message = Message.from(dictionaryBatch, byteLength); return this diff --git a/src/type.ts b/src/type.ts index ea5e24fa..c72917e5 100644 --- a/src/type.ts +++ b/src/type.ts @@ -58,8 +58,10 @@ export abstract class DataType { })(Binary.prototype); } +/** @ignore */ +export interface BinaryView extends DataType { + TArray: Uint8Array; + TValue: Uint8Array; + ArrayType: TypedArrayConstructor; +} +/** @ignore */ +export class BinaryView extends DataType { + public static readonly ELEMENT_WIDTH = 16; + public static readonly INLINE_CAPACITY = 12; + public static readonly LENGTH_OFFSET = 0; + public static readonly INLINE_OFFSET = 4; + public static readonly BUFFER_INDEX_OFFSET 
= 8; + public static readonly BUFFER_OFFSET_OFFSET = 12; + constructor() { + super(Type.BinaryView); + } + public toString() { return `BinaryView`; } + protected static [Symbol.toStringTag] = ((proto: BinaryView) => { + (proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'BinaryView'; + })(BinaryView.prototype); +} + /** @ignore */ export interface LargeBinary extends DataType { TArray: Uint8Array; TOffsetArray: BigInt64Array; TValue: Uint8Array; ArrayType: TypedArrayConstructor; OffsetArrayType: BigIntArrayConstructor } /** @ignore */ @@ -269,6 +297,7 @@ export class LargeBinary extends DataType { })(LargeBinary.prototype); } +/** @ignore */ /** @ignore */ export interface Utf8 extends DataType { TArray: Uint8Array; TOffsetArray: Int32Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: TypedArrayConstructor } /** @ignore */ @@ -283,6 +312,26 @@ export class Utf8 extends DataType { })(Utf8.prototype); } +/** @ignore */ +export interface Utf8View extends DataType { + TArray: Uint8Array; + TValue: string; + ArrayType: TypedArrayConstructor; +} +/** @ignore */ +export class Utf8View extends DataType { + public static readonly ELEMENT_WIDTH = BinaryView.ELEMENT_WIDTH; + public static readonly INLINE_CAPACITY = BinaryView.INLINE_CAPACITY; + constructor() { + super(Type.Utf8View); + } + public toString() { return `Utf8View`; } + protected static [Symbol.toStringTag] = ((proto: Utf8View) => { + (proto).ArrayType = Uint8Array; + return proto[Symbol.toStringTag] = 'Utf8View'; + })(Utf8View.prototype); +} + /** @ignore */ export interface LargeUtf8 extends DataType { TArray: Uint8Array; TOffsetArray: BigInt64Array; TValue: string; ArrayType: TypedArrayConstructor; OffsetArrayType: BigIntArrayConstructor } /** @ignore */ @@ -298,6 +347,7 @@ export class LargeUtf8 extends DataType { })(LargeUtf8.prototype); } +/** @ignore */ /** @ignore */ export interface Bool extends DataType { TArray: Uint8Array; TValue: boolean; ArrayType: 
TypedArrayConstructor } /** @ignore */ @@ -539,13 +589,65 @@ export class List extends DataType`; } public get valueType(): T { return this.children[0].type as T; } public get valueField(): Field { return this.children[0] as Field; } - public get ArrayType(): T['ArrayType'] { return this.valueType.ArrayType; } + public get ArrayType(): TypedArrayConstructor { return Int32Array; } protected static [Symbol.toStringTag] = ((proto: List) => { (proto).children = null; return proto[Symbol.toStringTag] = 'List'; })(List.prototype); } +/** @ignore */ +export interface ListView extends DataType { + TArray: Vector[]; + TValue: Vector; + ArrayType: TypedArrayConstructor; + OffsetArrayType: TypedArrayConstructor; +} + +/** @ignore */ +export class ListView extends DataType { + constructor(child: Field) { + super(Type.ListView); + this.children = [child]; + } + public declare readonly children: Field[]; + public toString() { return `ListView<${this.valueType}>`; } + public get valueType(): T { return this.children[0].type as T; } + public get valueField(): Field { return this.children[0] as Field; } + public get ArrayType(): TypedArrayConstructor { return Int32Array; } + public get OffsetArrayType(): TypedArrayConstructor { return Int32Array; } + protected static [Symbol.toStringTag] = ((proto: ListView) => { + (proto).children = null; + return proto[Symbol.toStringTag] = 'ListView'; + })(ListView.prototype); +} + +/** @ignore */ +export interface LargeListView extends DataType { + TArray: Vector[]; + TValue: Vector; + ArrayType: BigIntArrayConstructor; + OffsetArrayType: BigIntArrayConstructor; +} + +/** @ignore */ +export class LargeListView extends DataType { + constructor(child: Field) { + super(Type.LargeListView); + this.children = [child]; + } + public declare readonly children: Field[]; + public toString() { return `LargeListView<${this.valueType}>`; } + public get valueType(): T { return this.children[0].type as T; } + public get valueField(): Field { return 
this.children[0] as Field; } + public get ArrayType(): T['ArrayType'] { return this.valueType.ArrayType; } + public get OffsetArrayType(): BigIntArrayConstructor { return BigInt64Array; } + protected static [Symbol.toStringTag] = ((proto: LargeListView) => { + (proto).children = null; + return proto[Symbol.toStringTag] = 'LargeListView'; + })(LargeListView.prototype); +} + /** @ignore */ export interface Struct extends DataType { TArray: Array>; @@ -759,6 +861,8 @@ export function strideForType(type: DataType) { } // case Type.Int: return 1 + +((t as Int_).bitWidth > 32); // case Type.Time: return 1 + +((t as Time_).bitWidth > 32); + case Type.BinaryView: + case Type.Utf8View: return 16; case Type.FixedSizeList: return (t as FixedSizeList).listSize; case Type.FixedSizeBinary: return (t as FixedSizeBinary).byteWidth; default: return 1; diff --git a/src/visitor.ts b/src/visitor.ts index 977e0a4e..177384ba 100644 --- a/src/visitor.ts +++ b/src/visitor.ts @@ -37,8 +37,10 @@ export abstract class Visitor { public visitFloat(_node: any, ..._args: any[]): any { return null; } public visitUtf8(_node: any, ..._args: any[]): any { return null; } public visitLargeUtf8(_node: any, ..._args: any[]): any { return null; } + public visitUtf8View(_node: any, ..._args: any[]): any { return null; } public visitBinary(_node: any, ..._args: any[]): any { return null; } public visitLargeBinary(_node: any, ..._args: any[]): any { return null; } + public visitBinaryView(_node: any, ..._args: any[]): any { return null; } public visitFixedSizeBinary(_node: any, ..._args: any[]): any { return null; } public visitDate(_node: any, ..._args: any[]): any { return null; } public visitTimestamp(_node: any, ..._args: any[]): any { return null; } @@ -52,6 +54,8 @@ export abstract class Visitor { public visitDuration(_node: any, ..._args: any[]): any { return null; } public visitFixedSizeList(_node: any, ..._args: any[]): any { return null; } public visitMap(_node: any, ..._args: any[]): any { return 
null; } + public visitListView(_node: any, ..._args: any[]): any { return null; } + public visitLargeListView(_node: any, ..._args: any[]): any { return null; } } /** @ignore */ @@ -92,8 +96,10 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.Float64: fn = visitor.visitFloat64 || visitor.visitFloat; break; case Type.Utf8: fn = visitor.visitUtf8; break; case Type.LargeUtf8: fn = visitor.visitLargeUtf8; break; + case Type.Utf8View: fn = visitor.visitUtf8View || visitor.visitUtf8; break; case Type.Binary: fn = visitor.visitBinary; break; case Type.LargeBinary: fn = visitor.visitLargeBinary; break; + case Type.BinaryView: fn = visitor.visitBinaryView || visitor.visitBinary; break; case Type.FixedSizeBinary: fn = visitor.visitFixedSizeBinary; break; case Type.Date: fn = visitor.visitDate; break; case Type.DateDay: fn = visitor.visitDateDay || visitor.visitDate; break; @@ -126,6 +132,8 @@ function getVisitFnByTypeId(visitor: Visitor, dtype: Type, throwIfNotFound = tru case Type.DurationNanosecond: fn = visitor.visitDurationNanosecond || visitor.visitDuration; break; case Type.FixedSizeList: fn = visitor.visitFixedSizeList; break; case Type.Map: fn = visitor.visitMap; break; + case Type.ListView: fn = visitor.visitListView; break; + case Type.LargeListView: fn = visitor.visitLargeListView; break; } if (typeof fn === 'function') return fn; if (!throwIfNotFound) return () => null; @@ -157,8 +165,10 @@ function inferDType(type: T): Type { return Type.Float; case Type.Binary: return Type.Binary; case Type.LargeBinary: return Type.LargeBinary; + case Type.BinaryView: return Type.BinaryView; case Type.Utf8: return Type.Utf8; case Type.LargeUtf8: return Type.LargeUtf8; + case Type.Utf8View: return Type.Utf8View; case Type.Bool: return Type.Bool; case Type.Decimal: return Type.Decimal; case Type.Time: @@ -216,6 +226,8 @@ function inferDType(type: T): Type { case Type.FixedSizeBinary: return Type.FixedSizeBinary; case 
Type.FixedSizeList: return Type.FixedSizeList; case Type.Dictionary: return Type.Dictionary; + case Type.ListView: return Type.ListView; + case Type.LargeListView: return Type.LargeListView; } throw new Error(`Unrecognized type '${Type[type.typeId]}'`); } @@ -272,6 +284,8 @@ export interface Visitor { visitDurationNanosecond(node: any, ...args: any[]): any; visitFixedSizeList(node: any, ...args: any[]): any; visitMap(node: any, ...args: any[]): any; + visitListView(node: any, ...args: any[]): any; + visitLargeListView(node: any, ...args: any[]): any; } // Add these here so they're picked up by the externs creator diff --git a/src/visitor/builderctor.ts b/src/visitor/builderctor.ts index 791576b0..eda77abb 100644 --- a/src/visitor/builderctor.ts +++ b/src/visitor/builderctor.ts @@ -34,6 +34,7 @@ import { IntervalBuilder, IntervalDayTimeBuilder, IntervalMonthDayNanoBuilder, I import { DurationBuilder, DurationSecondBuilder, DurationMillisecondBuilder, DurationMicrosecondBuilder, DurationNanosecondBuilder } from '../builder/duration.js'; import { IntBuilder, Int8Builder, Int16Builder, Int32Builder, Int64Builder, Uint8Builder, Uint16Builder, Uint32Builder, Uint64Builder } from '../builder/int.js'; import { ListBuilder } from '../builder/list.js'; +import { ListViewBuilder, LargeListViewBuilder } from '../builder/listview.js'; import { MapBuilder } from '../builder/map.js'; import { NullBuilder } from '../builder/null.js'; import { StructBuilder } from '../builder/struct.js'; @@ -42,6 +43,8 @@ import { TimeBuilder, TimeSecondBuilder, TimeMillisecondBuilder, TimeMicrosecond import { UnionBuilder, DenseUnionBuilder, SparseUnionBuilder } from '../builder/union.js'; import { Utf8Builder } from '../builder/utf8.js'; import { LargeUtf8Builder } from '../builder/largeutf8.js'; +import { BinaryViewBuilder } from '../builder/binaryview.js'; +import { Utf8ViewBuilder } from '../builder/utf8view.js'; /** @ignore */ export interface GetBuilderCtor extends Visitor { @@ -88,6 +91,8 
@@ export class GetBuilderCtor extends Visitor { public visitTimeNanosecond() { return TimeNanosecondBuilder; } public visitDecimal() { return DecimalBuilder; } public visitList() { return ListBuilder; } + public visitListView() { return ListViewBuilder; } + public visitLargeListView() { return LargeListViewBuilder; } public visitStruct() { return StructBuilder; } public visitUnion() { return UnionBuilder; } public visitDenseUnion() { return DenseUnionBuilder; } @@ -104,6 +109,8 @@ export class GetBuilderCtor extends Visitor { public visitDurationNanosecond() { return DurationNanosecondBuilder; } public visitFixedSizeList() { return FixedSizeListBuilder; } public visitMap() { return MapBuilder; } + public visitBinaryView() { return BinaryViewBuilder; } + public visitUtf8View() { return Utf8ViewBuilder; } } /** @ignore */ diff --git a/src/visitor/get.ts b/src/visitor/get.ts index a5502dd3..69e37759 100644 --- a/src/visitor/get.ts +++ b/src/visitor/get.ts @@ -28,7 +28,7 @@ import { uint16ToFloat64 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, ListView, LargeListView, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -63,8 +63,10 @@ export interface GetVisitor extends Visitor { visitFloat64(data: Data, index: number): T['TValue'] | null; visitUtf8(data: Data, index: number): T['TValue'] | null; visitLargeUtf8(data: Data, index: number): T['TValue'] | null; + visitUtf8View(data: Data, index: number): T['TValue'] | null; visitBinary(data: Data, index: number): T['TValue'] | null; visitLargeBinary(data: Data, index: number): T['TValue'] | null; + 
visitBinaryView(data: Data, index: number): T['TValue'] | null; visitFixedSizeBinary(data: Data, index: number): T['TValue'] | null; visitDate(data: Data, index: number): T['TValue'] | null; visitDateDay(data: Data, index: number): T['TValue'] | null; @@ -81,6 +83,8 @@ export interface GetVisitor extends Visitor { visitTimeNanosecond(data: Data, index: number): T['TValue'] | null; visitDecimal(data: Data, index: number): T['TValue'] | null; visitList(data: Data, index: number): T['TValue'] | null; + visitListView(data: Data, index: number): T['TValue'] | null; + visitLargeListView(data: Data, index: number): T['TValue'] | null; visitStruct(data: Data, index: number): T['TValue'] | null; visitUnion(data: Data, index: number): T['TValue'] | null; visitDenseUnion(data: Data, index: number): T['TValue'] | null; @@ -109,6 +113,9 @@ function wrapGet(fn: (data: Data, _1: any) => any) { /** @ignore */const epochDaysToMs = (data: Int32Array, index: number) => 86400000 * data[index]; +const BINARY_VIEW_SIZE = 16; +const BINARY_VIEW_INLINE_CAPACITY = 12; + /** @ignore */ const getNull = (_data: Data, _index: number): T['TValue'] => null; /** @ignore */ @@ -149,10 +156,52 @@ const getFixedSizeBinary = ({ stride, values }: Data< /** @ignore */ const getBinary = ({ values, valueOffsets }: Data, index: number): T['TValue'] => getVariableWidthBytes(values, valueOffsets, index); /** @ignore */ +const getBinaryViewBytes = (data: Data, index: number): Uint8Array => { + const values = data.values as Uint8Array; + if (!values) { + throw new Error('BinaryView data is missing view buffer'); + } + const viewOffset = index * BINARY_VIEW_SIZE; + const end = viewOffset + BINARY_VIEW_SIZE; + if (viewOffset < 0 || end > values.length) { + throw new Error(`BinaryView data buffer is too short: expected ${BINARY_VIEW_SIZE} bytes, got ${Math.max(0, values.length - viewOffset)}`); + } + // Get the 16-byte view struct from the values array + const viewStruct = values.subarray(viewOffset, end); + if 
(viewStruct.length < BINARY_VIEW_SIZE) { + throw new Error(`BinaryView data buffer is too short: expected ${BINARY_VIEW_SIZE} bytes, got ${viewStruct.length}`); + } + const view = new DataView(values.buffer, viewStruct.byteOffset, BINARY_VIEW_SIZE); + const size = view.getInt32(0, true); + if (size <= 0) { + return new Uint8Array(0); + } + if (size <= BINARY_VIEW_INLINE_CAPACITY) { + // Inline data is in bytes 4-15 of the view struct + return viewStruct.subarray(4, 4 + size); + } + const bufferIndex = view.getInt32(8, true); + const offset = view.getInt32(12, true); + const variadicBuffer = data.variadicBuffers?.[bufferIndex]; + if (!variadicBuffer) { + throw new Error(`BinaryView variadic buffer ${bufferIndex} is missing`); + } + return variadicBuffer.subarray(offset, offset + size); +}; +/** @ignore */ +const getBinaryViewValue = (data: Data, index: number): T['TValue'] => { + return getBinaryViewBytes(data, index) as T['TValue']; +}; +/** @ignore */ const getUtf8 = ({ values, valueOffsets }: Data, index: number): T['TValue'] => { const bytes = getVariableWidthBytes(values, valueOffsets, index); return bytes !== null ? 
decodeUtf8(bytes) : null as any; }; +/** @ignore */ +const getUtf8ViewValue = (data: Data, index: number): T['TValue'] => { + const bytes = getBinaryViewBytes(data, index); + return decodeUtf8(bytes); +}; /* istanbul ignore next */ /** @ignore */ @@ -222,6 +271,26 @@ const getList = (data: Data, index: number): T['TValue'] => { return new Vector([slice]) as T['TValue']; }; +/** @ignore */ +const getListView = (data: Data, index: number): T['TValue'] => { + const { valueOffsets, values: sizes, children } = data; + const offset = bigIntToNumber(valueOffsets[index]); + const size = bigIntToNumber(sizes[index]); + const child: Data = children[0]; + const slice = child.slice(offset, size); + return new Vector([slice]) as T['TValue']; +}; + +/** @ignore */ +const getLargeListView = (data: Data, index: number): T['TValue'] => { + const { valueOffsets, values: sizes, children } = data; + const offset = bigIntToNumber(valueOffsets[index]); + const size = bigIntToNumber(sizes[index]); + const child: Data = children[0]; + const slice = child.slice(offset, size); + return new Vector([slice]) as T['TValue']; +}; + /** @ignore */ const getMap = (data: Data, index: number): T['TValue'] => { const { valueOffsets, children } = data; @@ -332,8 +401,10 @@ GetVisitor.prototype.visitFloat32 = wrapGet(getNumeric); GetVisitor.prototype.visitFloat64 = wrapGet(getNumeric); GetVisitor.prototype.visitUtf8 = wrapGet(getUtf8); GetVisitor.prototype.visitLargeUtf8 = wrapGet(getUtf8); +GetVisitor.prototype.visitUtf8View = wrapGet(getUtf8ViewValue); GetVisitor.prototype.visitBinary = wrapGet(getBinary); GetVisitor.prototype.visitLargeBinary = wrapGet(getBinary); +GetVisitor.prototype.visitBinaryView = wrapGet(getBinaryViewValue); GetVisitor.prototype.visitFixedSizeBinary = wrapGet(getFixedSizeBinary); GetVisitor.prototype.visitDate = wrapGet(getDate); GetVisitor.prototype.visitDateDay = wrapGet(getDateDay); @@ -350,6 +421,8 @@ GetVisitor.prototype.visitTimeMicrosecond = 
wrapGet(getTimeMicrosecond); GetVisitor.prototype.visitTimeNanosecond = wrapGet(getTimeNanosecond); GetVisitor.prototype.visitDecimal = wrapGet(getDecimal); GetVisitor.prototype.visitList = wrapGet(getList); +GetVisitor.prototype.visitListView = wrapGet(getListView); +GetVisitor.prototype.visitLargeListView = wrapGet(getLargeListView); GetVisitor.prototype.visitStruct = wrapGet(getStruct); GetVisitor.prototype.visitUnion = wrapGet(getUnion); GetVisitor.prototype.visitDenseUnion = wrapGet(getDenseUnion); diff --git a/src/visitor/indexof.ts b/src/visitor/indexof.ts index 3a4d1171..5a2a3cc3 100644 --- a/src/visitor/indexof.ts +++ b/src/visitor/indexof.ts @@ -24,7 +24,7 @@ import { getBool, BitIterator } from '../util/bit.js'; import { createElementComparator } from '../util/vector.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, ListView, LargeListView, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -59,8 +59,10 @@ export interface IndexOfVisitor extends Visitor { visitFloat64(data: Data, value: T['TValue'] | null, index?: number): number; visitUtf8(data: Data, value: T['TValue'] | null, index?: number): number; visitLargeUtf8(data: Data, value: T['TValue'] | null, index?: number): number; + visitUtf8View(data: Data, value: T['TValue'] | null, index?: number): number; visitBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitLargeBinary(data: Data, value: T['TValue'] | null, index?: number): number; + visitBinaryView(data: Data, value: T['TValue'] | null, index?: number): number; visitFixedSizeBinary(data: Data, value: T['TValue'] | null, index?: number): number; visitDate(data: Data, value: T['TValue'] | null, index?: number): 
number; visitDateDay(data: Data, value: T['TValue'] | null, index?: number): number; @@ -77,6 +79,8 @@ export interface IndexOfVisitor extends Visitor { visitTimeNanosecond(data: Data, value: T['TValue'] | null, index?: number): number; visitDecimal(data: Data, value: T['TValue'] | null, index?: number): number; visitList(data: Data, value: T['TValue'] | null, index?: number): number; + visitListView(data: Data, value: T['TValue'] | null, index?: number): number; + visitLargeListView(data: Data, value: T['TValue'] | null, index?: number): number; visitStruct(data: Data, value: T['TValue'] | null, index?: number): number; visitUnion(data: Data, value: T['TValue'] | null, index?: number): number; visitDenseUnion(data: Data, value: T['TValue'] | null, index?: number): number; @@ -177,8 +181,10 @@ IndexOfVisitor.prototype.visitFloat32 = indexOfValue; IndexOfVisitor.prototype.visitFloat64 = indexOfValue; IndexOfVisitor.prototype.visitUtf8 = indexOfValue; IndexOfVisitor.prototype.visitLargeUtf8 = indexOfValue; +IndexOfVisitor.prototype.visitUtf8View = indexOfValue; IndexOfVisitor.prototype.visitBinary = indexOfValue; IndexOfVisitor.prototype.visitLargeBinary = indexOfValue; +IndexOfVisitor.prototype.visitBinaryView = indexOfValue; IndexOfVisitor.prototype.visitFixedSizeBinary = indexOfValue; IndexOfVisitor.prototype.visitDate = indexOfValue; IndexOfVisitor.prototype.visitDateDay = indexOfValue; @@ -195,6 +201,8 @@ IndexOfVisitor.prototype.visitTimeMicrosecond = indexOfValue; IndexOfVisitor.prototype.visitTimeNanosecond = indexOfValue; IndexOfVisitor.prototype.visitDecimal = indexOfValue; IndexOfVisitor.prototype.visitList = indexOfValue; +IndexOfVisitor.prototype.visitListView = indexOfValue; +IndexOfVisitor.prototype.visitLargeListView = indexOfValue; IndexOfVisitor.prototype.visitStruct = indexOfValue; IndexOfVisitor.prototype.visitUnion = indexOfValue; IndexOfVisitor.prototype.visitDenseUnion = indexOfUnion; diff --git a/src/visitor/iterator.ts 
b/src/visitor/iterator.ts index 9f2844b3..9c3dce39 100644 --- a/src/visitor/iterator.ts +++ b/src/visitor/iterator.ts @@ -21,7 +21,7 @@ import { Type, Precision } from '../enum.js'; import { TypeToDataType } from '../interfaces.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, ListView, LargeListView, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -57,8 +57,10 @@ export interface IteratorVisitor extends Visitor { visitFloat64(vector: Vector): IterableIterator; visitUtf8(vector: Vector): IterableIterator; visitLargeUtf8(vector: Vector): IterableIterator; + visitUtf8View(vector: Vector): IterableIterator; visitBinary(vector: Vector): IterableIterator; visitLargeBinary(vector: Vector): IterableIterator; + visitBinaryView(vector: Vector): IterableIterator; visitFixedSizeBinary(vector: Vector): IterableIterator; visitDate(vector: Vector): IterableIterator; visitDateDay(vector: Vector): IterableIterator; @@ -75,6 +77,8 @@ export interface IteratorVisitor extends Visitor { visitTimeNanosecond(vector: Vector): IterableIterator; visitDecimal(vector: Vector): IterableIterator; visitList(vector: Vector): IterableIterator; + visitListView(vector: Vector): IterableIterator; + visitLargeListView(vector: Vector): IterableIterator; visitStruct(vector: Vector): IterableIterator; visitUnion(vector: Vector): IterableIterator; visitDenseUnion(vector: Vector): IterableIterator; @@ -164,8 +168,10 @@ IteratorVisitor.prototype.visitFloat32 = vectorIterator; IteratorVisitor.prototype.visitFloat64 = vectorIterator; IteratorVisitor.prototype.visitUtf8 = vectorIterator; IteratorVisitor.prototype.visitLargeUtf8 = vectorIterator; +IteratorVisitor.prototype.visitUtf8View = 
vectorIterator; IteratorVisitor.prototype.visitBinary = vectorIterator; IteratorVisitor.prototype.visitLargeBinary = vectorIterator; +IteratorVisitor.prototype.visitBinaryView = vectorIterator; IteratorVisitor.prototype.visitFixedSizeBinary = vectorIterator; IteratorVisitor.prototype.visitDate = vectorIterator; IteratorVisitor.prototype.visitDateDay = vectorIterator; @@ -182,6 +188,8 @@ IteratorVisitor.prototype.visitTimeMicrosecond = vectorIterator; IteratorVisitor.prototype.visitTimeNanosecond = vectorIterator; IteratorVisitor.prototype.visitDecimal = vectorIterator; IteratorVisitor.prototype.visitList = vectorIterator; +IteratorVisitor.prototype.visitListView = vectorIterator; +IteratorVisitor.prototype.visitLargeListView = vectorIterator; IteratorVisitor.prototype.visitStruct = vectorIterator; IteratorVisitor.prototype.visitUnion = vectorIterator; IteratorVisitor.prototype.visitDenseUnion = vectorIterator; diff --git a/src/visitor/jsontypeassembler.ts b/src/visitor/jsontypeassembler.ts index 823b1dea..96ef1b93 100644 --- a/src/visitor/jsontypeassembler.ts +++ b/src/visitor/jsontypeassembler.ts @@ -45,6 +45,9 @@ export class JSONTypeAssembler extends Visitor { public visitLargeBinary({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } + public visitBinaryView({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } public visitBool({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } @@ -54,6 +57,9 @@ export class JSONTypeAssembler extends Visitor { public visitLargeUtf8({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } + public visitUtf8View({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } public visitDecimal({ typeId, scale, precision, bitWidth }: T) { return { 'name': ArrowType[typeId].toLowerCase(), 'scale': scale, 'precision': precision, 'bitWidth': bitWidth }; } @@ -75,6 +81,12 @@ export class JSONTypeAssembler extends Visitor { public visitList({ typeId 
}: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } + public visitListView({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } + public visitLargeListView({ typeId }: T) { + return { 'name': ArrowType[typeId].toLowerCase() }; + } public visitStruct({ typeId }: T) { return { 'name': ArrowType[typeId].toLowerCase() }; } diff --git a/src/visitor/jsonvectorassembler.ts b/src/visitor/jsonvectorassembler.ts index 6841b39d..eaf41297 100644 --- a/src/visitor/jsonvectorassembler.ts +++ b/src/visitor/jsonvectorassembler.ts @@ -28,7 +28,8 @@ import { toIntervalDayTimeObjects, toIntervalMonthDayNanoObjects } from '../util import { DataType, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, + Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, BinaryView, Utf8View, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, IntArray, + ListView, LargeListView, } from '../type.js'; /** @ignore */ @@ -46,12 +47,16 @@ export interface JSONVectorAssembler extends Visitor { visitLargeUtf8(data: Data): { DATA: string[]; OFFSET: string[] }; visitBinary(data: Data): { DATA: string[]; OFFSET: number[] }; visitLargeBinary(data: Data): { DATA: string[]; OFFSET: string[] }; + visitBinaryView(data: Data): { VIEWS: any[]; VARIADIC_DATA_BUFFERS: string[] }; + visitUtf8View(data: Data): { VIEWS: any[]; VARIADIC_DATA_BUFFERS: string[] }; visitFixedSizeBinary(data: Data): { DATA: string[] }; visitDate(data: Data): { DATA: number[] }; visitTimestamp(data: Data): { DATA: string[] }; visitTime(data: Data): { DATA: number[] }; visitDecimal(data: Data): { DATA: string[] }; visitList(data: Data): { children: any[]; OFFSET: number[] }; + visitListView(data: Data): { children: any[]; OFFSET: number[]; SIZE: number[] }; + visitLargeListView(data: Data): { children: any[]; OFFSET: string[]; SIZE: string[] }; visitStruct(data: Data): { 
children: any[] }; visitUnion(data: Data): { children: any[]; TYPE_ID: number[] }; visitInterval(data: Data): { DATA: number[] }; @@ -112,6 +117,15 @@ export class JSONVectorAssembler extends Visitor { public visitLargeBinary(data: Data) { return { 'DATA': [...binaryToString(new Vector([data]))], 'OFFSET': [...bigNumsToStrings(data.valueOffsets, 2)] }; } + public visitBinaryView(data: Data) { + return binaryViewDataToJSON(data, (bytes) => Array.from(bytes) + .map(b => ('0' + (b & 0xFF).toString(16)).slice(-2)) + .join('') + .toUpperCase()); + } + public visitUtf8View(data: Data) { + return binaryViewDataToJSON(data, (bytes) => Array.from(bytes).map(b => String.fromCodePoint(b)).join('')); + } public visitFixedSizeBinary(data: Data) { return { 'DATA': [...binaryToString(new Vector([data]))] }; } @@ -141,6 +155,20 @@ export class JSONVectorAssembler extends Visitor { 'children': this.visitMany(data.type.children, data.children) }; } + public visitListView(data: Data) { + return { + 'OFFSET': [...data.valueOffsets], + 'SIZE': [...data.values], + 'children': this.visitMany(data.type.children, data.children) + }; + } + public visitLargeListView(data: Data) { + return { + 'OFFSET': [...bigNumsToStrings(data.valueOffsets, 2)], + 'SIZE': [...bigNumsToStrings(data.values, 2)], + 'children': this.visitMany(data.type.children, data.children) + }; + } public visitStruct(data: Data) { return { 'children': this.visitMany(data.type.children, data.children) @@ -195,3 +223,46 @@ function* bigNumsToStrings(values: BigUint64Array | BigInt64Array | Uint32Array yield `${BN.new(u32s.subarray((i + 0) * stride, (i + 1) * stride), false)}`; } } + +/** @ignore */ +function binaryViewDataToJSON(data: Data | Data, formatInlined: (bytes: Uint8Array) => string) { + const INLINE_SIZE = 12; + const viewsData = data.values; + const dataView = new DataView(viewsData.buffer, viewsData.byteOffset, viewsData.byteLength); + const numViews = viewsData.byteLength / 16; + const bytesToHex = (bytes: 
Uint8Array) => + Array.from(bytes) + .map(b => ('0' + (b & 0xFF).toString(16)).slice(-2)) + .join('') + .toUpperCase(); + const parsedViews = Array.from({ length: numViews }, (_, i) => { + const offset = i * 16; + const size = dataView.getInt32(offset, true); + return [offset, size]; + }).map(([offset, size]) => (size > INLINE_SIZE) ? { + 'SIZE': size, + 'PREFIX_HEX': bytesToHex(viewsData.subarray(offset + 4, offset + 8)), + 'BUFFER_INDEX': dataView.getInt32(offset + 8, true), + 'OFFSET': dataView.getInt32(offset + 12, true) + } : { + 'SIZE': size, + 'INLINED': formatInlined(viewsData.subarray(offset + 4, offset + 4 + size)) + }); + const uniqueBufferIndices = [...new Set( + parsedViews + .map(v => v['BUFFER_INDEX']) + .filter((idx): idx is number => idx !== undefined) + )]; + const variadicBuffers = uniqueBufferIndices.map(bufferIndex => + bytesToHex(data.variadicBuffers[bufferIndex]) + ); + const bufferIndexMap = new Map( + uniqueBufferIndices.map((bufferIndex, outputIndex) => [bufferIndex, outputIndex]) + ); + // Remap buffer indices in views + const views = parsedViews.map(v => v['BUFFER_INDEX'] !== undefined + ? 
{ ...v, 'BUFFER_INDEX': bufferIndexMap.get(v['BUFFER_INDEX']) } + : v + ); + return { 'VIEWS': views, 'VARIADIC_DATA_BUFFERS': variadicBuffers }; +} diff --git a/src/visitor/set.ts b/src/visitor/set.ts index 4bf632ba..f4fb1ae1 100644 --- a/src/visitor/set.ts +++ b/src/visitor/set.ts @@ -26,7 +26,7 @@ import { float64ToUint16 } from '../util/math.js'; import { Type, UnionMode, Precision, DateUnit, TimeUnit, IntervalUnit } from '../enum.js'; import { DataType, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, ListView, LargeListView, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -61,8 +61,10 @@ export interface SetVisitor extends Visitor { visitFloat64(data: Data, index: number, value: T['TValue']): void; visitUtf8(data: Data, index: number, value: T['TValue']): void; visitLargeUtf8(data: Data, index: number, value: T['TValue']): void; + visitUtf8View(data: Data, index: number, value: T['TValue']): void; visitBinary(data: Data, index: number, value: T['TValue']): void; visitLargeBinary(data: Data, index: number, value: T['TValue']): void; + visitBinaryView(data: Data, index: number, value: T['TValue']): void; visitFixedSizeBinary(data: Data, index: number, value: T['TValue']): void; visitDate(data: Data, index: number, value: T['TValue']): void; visitDateDay(data: Data, index: number, value: T['TValue']): void; @@ -79,6 +81,8 @@ export interface SetVisitor extends Visitor { visitTimeNanosecond(data: Data, index: number, value: T['TValue']): void; visitDecimal(data: Data, index: number, value: T['TValue']): void; visitList(data: Data, index: number, value: T['TValue']): void; + visitListView(data: Data, index: number, value: T['TValue']): void; + visitLargeListView(data: Data, 
index: number, value: T['TValue']): void; visitStruct(data: Data, index: number, value: T['TValue']): void; visitUnion(data: Data, index: number, value: T['TValue']): void; visitDenseUnion(data: Data, index: number, value: T['TValue']): void; @@ -121,6 +125,8 @@ export const setVariableWidthBytes = (valu } }; +const toNumber = (value: number | bigint) => typeof value === 'bigint' ? Number(value) : value; + /** @ignore */ const setBool = ({ offset, values }: Data, index: number, val: boolean) => { const idx = offset + index; @@ -155,7 +161,63 @@ export const setFixedSizeBinary = ({ stride, values } /** @ignore */ const setBinary = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, value); /** @ignore */ +const ensureWritableVariadicBuffers = (data: Data): Uint8Array[] => { + let buffers = data.variadicBuffers as unknown as Uint8Array[]; + if (!Array.isArray(buffers) || Object.isFrozen(buffers)) { + buffers = Array.from(buffers) as Uint8Array[]; + (data as any).variadicBuffers = buffers; + } + return buffers; +}; +/** @ignore */ +const setBinaryViewBytes = (data: Data, index: number, bytes: Uint8Array) => { + const views = data.values as Uint8Array | undefined; + if (!views) { + throw new Error('BinaryView data is missing view buffer'); + } + const elementWidth = BinaryView.ELEMENT_WIDTH; + const viewOffset = index * elementWidth; + const end = viewOffset + elementWidth; + if (viewOffset < 0 || end > views.length) { + throw new RangeError(`BinaryView index ${index} out of bounds`); + } + + views.fill(0, viewOffset, end); + + const view = new DataView(views.buffer, views.byteOffset + viewOffset, elementWidth); + const length = bytes.length; + view.setInt32(BinaryView.LENGTH_OFFSET, length, true); + + if (length <= BinaryView.INLINE_CAPACITY) { + views.set(bytes, viewOffset + BinaryView.INLINE_OFFSET); + return; + } + + const prefix = + (bytes[0] ?? 0) | + ((bytes[1] ?? 0) << 8) | + ((bytes[2] ?? 
0) << 16) | + ((bytes[3] ?? 0) << 24); + view.setUint32(BinaryView.INLINE_OFFSET, prefix >>> 0, true); + + const buffers = ensureWritableVariadicBuffers(data); + const copy = bytes.slice(); + const bufferIndex = buffers.push(copy) - 1; + view.setInt32(BinaryView.BUFFER_INDEX_OFFSET, bufferIndex, true); + view.setInt32(BinaryView.BUFFER_OFFSET_OFFSET, 0, true); +}; +/** @ignore */ +const setBinaryView = (data: Data, index: number, value: T['TValue']) => { + const bytes = value instanceof Uint8Array ? value : new Uint8Array(value); + setBinaryViewBytes(data as unknown as Data, index, bytes); +}; +/** @ignore */ const setUtf8 = ({ values, valueOffsets }: Data, index: number, value: T['TValue']) => setVariableWidthBytes(values, valueOffsets, index, encodeUtf8(value)); +/** @ignore */ +const setUtf8View = (data: Data, index: number, value: T['TValue']) => { + const bytes = encodeUtf8(value); + setBinaryViewBytes(data as unknown as Data, index, bytes); +}; /* istanbul ignore next */ export const setDate = (data: Data, index: number, value: T['TValue']): void => { @@ -221,6 +283,44 @@ const setList = (data: Data, index: number, value: T['TValue' } }; +const setListView = (data: Data, index: number, value: T['TValue']) => { + const child = data.children[0]; + const offsets = data.valueOffsets; + const sizes = data.valueSizes!; + const set = instance.getVisitFn(child); + const start = toNumber(offsets[index]); + const length = toNumber(sizes[index]); + + if (value instanceof Vector) { + for (let i = 0; i < length; i++) { + set(child, start + i, value.get(i)); + } + } else { + for (let i = 0; i < length; i++) { + set(child, start + i, (value as any)[i]); + } + } +}; + +const setLargeListView = (data: Data, index: number, value: T['TValue']) => { + const child = data.children[0]; + const offsets = data.valueOffsets; + const sizes = data.valueSizes!; + const set = instance.getVisitFn(child); + const start = bigIntToNumber(offsets[index]); + const length = 
bigIntToNumber(sizes[index]); + + if (value instanceof Vector) { + for (let i = 0; i < length; i++) { + set(child, start + i, value.get(i)); + } + } else { + for (let i = 0; i < length; i++) { + set(child, start + i, (value as any)[i]); + } + } +}; + /** @ignore */ const setMap = (data: Data, index: number, value: T['TValue']) => { const values = data.children[0]; @@ -359,8 +459,10 @@ SetVisitor.prototype.visitFloat32 = wrapSet(setFloat); SetVisitor.prototype.visitFloat64 = wrapSet(setFloat); SetVisitor.prototype.visitUtf8 = wrapSet(setUtf8); SetVisitor.prototype.visitLargeUtf8 = wrapSet(setUtf8); +SetVisitor.prototype.visitUtf8View = wrapSet(setUtf8View); SetVisitor.prototype.visitBinary = wrapSet(setBinary); SetVisitor.prototype.visitLargeBinary = wrapSet(setBinary); +SetVisitor.prototype.visitBinaryView = wrapSet(setBinaryView); SetVisitor.prototype.visitFixedSizeBinary = wrapSet(setFixedSizeBinary); SetVisitor.prototype.visitDate = wrapSet(setDate); SetVisitor.prototype.visitDateDay = wrapSet(setDateDay); @@ -377,6 +479,8 @@ SetVisitor.prototype.visitTimeMicrosecond = wrapSet(setTimeMicrosecond); SetVisitor.prototype.visitTimeNanosecond = wrapSet(setTimeNanosecond); SetVisitor.prototype.visitDecimal = wrapSet(setDecimal); SetVisitor.prototype.visitList = wrapSet(setList); +SetVisitor.prototype.visitListView = wrapSet(setListView); +SetVisitor.prototype.visitLargeListView = wrapSet(setLargeListView); SetVisitor.prototype.visitStruct = wrapSet(setStruct); SetVisitor.prototype.visitUnion = wrapSet(setUnion); SetVisitor.prototype.visitDenseUnion = wrapSet(setDenseUnion); diff --git a/src/visitor/typeassembler.ts b/src/visitor/typeassembler.ts index 169f3627..066d65e1 100644 --- a/src/visitor/typeassembler.ts +++ b/src/visitor/typeassembler.ts @@ -25,9 +25,11 @@ import { Null } from '../fb/null.js'; import { Int } from '../fb/int.js'; import { FloatingPoint } from '../fb/floating-point.js'; import { Binary } from '../fb/binary.js'; +import { BinaryView } from 
'../fb/binary-view.js'; import { LargeBinary } from '../fb/large-binary.js'; import { Bool } from '../fb/bool.js'; import { Utf8 } from '../fb/utf8.js'; +import { Utf8View } from '../fb/utf8-view.js'; import { LargeUtf8 } from '../fb/large-utf8.js'; import { Decimal } from '../fb/decimal.js'; import { Date } from '../fb/date.js'; @@ -36,6 +38,8 @@ import { Timestamp } from '../fb/timestamp.js'; import { Interval } from '../fb/interval.js'; import { Duration } from '../fb/duration.js'; import { List } from '../fb/list.js'; +import { ListView } from '../fb/list-view.js'; +import { LargeListView } from '../fb/large-list-view.js'; import { Struct_ as Struct } from '../fb/struct-.js'; import { Union } from '../fb/union.js'; import { DictionaryEncoding } from '../fb/dictionary-encoding.js'; @@ -72,6 +76,10 @@ export class TypeAssembler extends Visitor { Binary.startBinary(b); return Binary.endBinary(b); } + public visitBinaryView(_node: T, b: Builder) { + BinaryView.startBinaryView(b); + return BinaryView.endBinaryView(b); + } public visitLargeBinary(_node: T, b: Builder) { LargeBinary.startLargeBinary(b); return LargeBinary.endLargeBinary(b); @@ -84,6 +92,10 @@ export class TypeAssembler extends Visitor { Utf8.startUtf8(b); return Utf8.endUtf8(b); } + public visitUtf8View(_node: T, b: Builder) { + Utf8View.startUtf8View(b); + return Utf8View.endUtf8View(b); + } public visitLargeUtf8(_node: T, b: Builder) { LargeUtf8.startLargeUtf8(b); return LargeUtf8.endLargeUtf8(b); @@ -129,6 +141,14 @@ export class TypeAssembler extends Visitor { List.startList(b); return List.endList(b); } + public visitListView(_node: T, b: Builder) { + ListView.startListView(b); + return ListView.endListView(b); + } + public visitLargeListView(_node: T, b: Builder) { + LargeListView.startLargeListView(b); + return LargeListView.endLargeListView(b); + } public visitStruct(_node: T, b: Builder) { Struct.startStruct_(b); return Struct.endStruct_(b); diff --git a/src/visitor/typecomparator.ts 
b/src/visitor/typecomparator.ts index 65413ccd..7ba64b52 100644 --- a/src/visitor/typecomparator.ts +++ b/src/visitor/typecomparator.ts @@ -21,7 +21,7 @@ import { Visitor } from '../visitor.js'; import { Schema, Field } from '../schema.js'; import { DataType, TypeMap, Dictionary, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, ListView, LargeListView, FixedSizeList, Map_, Struct, Float, Float16, Float32, Float64, Int, Uint8, Uint16, Uint32, Uint64, Int8, Int16, Int32, Int64, Date_, DateDay, DateMillisecond, @@ -55,8 +55,10 @@ export interface TypeComparator extends Visitor { visitFloat64(type: T, other?: DataType | null): other is T; visitUtf8(type: T, other?: DataType | null): other is T; visitLargeUtf8(type: T, other?: DataType | null): other is T; + visitUtf8View(type: T, other?: DataType | null): other is T; visitBinary(type: T, other?: DataType | null): other is T; visitLargeBinary(type: T, other?: DataType | null): other is T; + visitBinaryView(type: T, other?: DataType | null): other is T; visitFixedSizeBinary(type: T, other?: DataType | null): other is T; visitDate(type: T, other?: DataType | null): other is T; visitDateDay(type: T, other?: DataType | null): other is T; @@ -73,6 +75,8 @@ export interface TypeComparator extends Visitor { visitTimeNanosecond(type: T, other?: DataType | null): other is T; visitDecimal(type: T, other?: DataType | null): other is T; visitList(type: T, other?: DataType | null): other is T; + visitListView(type: T, other?: DataType | null): other is T; + visitLargeListView(type: T, other?: DataType | null): other is T; visitStruct(type: T, other?: DataType | null): other is T; visitUnion(type: T, other?: DataType | null): other is T; visitDenseUnion(type: T, other?: DataType | null): other is T; @@ -178,6 +182,14 @@ function compareList(type: T, other?: DataType 
| null): other is ); } +function compareListView(type: T, other?: DataType | null): other is T { + return (type === other) || ( + compareConstructor(type, other) && + type.children.length === other.children.length && + instance.compareManyFields(type.children, other.children) + ); +} + function compareStruct(type: T, other?: DataType | null): other is T { return (type === other) || ( compareConstructor(type, other) && @@ -254,8 +266,10 @@ TypeComparator.prototype.visitFloat32 = compareFloat; TypeComparator.prototype.visitFloat64 = compareFloat; TypeComparator.prototype.visitUtf8 = compareAny; TypeComparator.prototype.visitLargeUtf8 = compareAny; +TypeComparator.prototype.visitUtf8View = compareAny; TypeComparator.prototype.visitBinary = compareAny; TypeComparator.prototype.visitLargeBinary = compareAny; +TypeComparator.prototype.visitBinaryView = compareAny; TypeComparator.prototype.visitFixedSizeBinary = compareFixedSizeBinary; TypeComparator.prototype.visitDate = compareDate; TypeComparator.prototype.visitDateDay = compareDate; @@ -272,6 +286,8 @@ TypeComparator.prototype.visitTimeMicrosecond = compareTime; TypeComparator.prototype.visitTimeNanosecond = compareTime; TypeComparator.prototype.visitDecimal = compareAny; TypeComparator.prototype.visitList = compareList; +TypeComparator.prototype.visitListView = compareListView; +TypeComparator.prototype.visitLargeListView = compareListView; TypeComparator.prototype.visitStruct = compareStruct; TypeComparator.prototype.visitUnion = compareUnion; TypeComparator.prototype.visitDenseUnion = compareUnion; diff --git a/src/visitor/typector.ts b/src/visitor/typector.ts index 2aab6d3d..7fc45b3e 100644 --- a/src/visitor/typector.ts +++ b/src/visitor/typector.ts @@ -84,6 +84,10 @@ export class GetDataTypeConstructor extends Visitor { public visitDurationNanosecond() { return type.DurationNanosecond; } public visitFixedSizeList() { return type.FixedSizeList; } public visitMap() { return type.Map_; } + public visitBinaryView() { 
return type.BinaryView; } + public visitUtf8View() { return type.Utf8View; } + public visitListView() { return type.ListView; } + public visitLargeListView() { return type.LargeListView; } } /** @ignore */ diff --git a/src/visitor/vectorassembler.ts b/src/visitor/vectorassembler.ts index 7dc36955..decb485f 100644 --- a/src/visitor/vectorassembler.ts +++ b/src/visitor/vectorassembler.ts @@ -27,7 +27,7 @@ import { BufferRegion, FieldNode } from '../ipc/metadata/message.js'; import { DataType, Dictionary, Float, Int, Date_, Interval, Time, Timestamp, Union, Duration, - Bool, Null, Utf8, LargeUtf8, Binary, LargeBinary, Decimal, FixedSizeBinary, List, FixedSizeList, Map_, Struct, + Bool, Null, Utf8, Utf8View, LargeUtf8, Binary, BinaryView, LargeBinary, Decimal, FixedSizeBinary, List, ListView, LargeListView, FixedSizeList, Map_, Struct, } from '../type.js'; import { bigIntToNumber } from '../util/bigint.js'; @@ -51,6 +51,8 @@ export interface VectorAssembler extends Visitor { visitTime(data: Data): this; visitDecimal(data: Data): this; visitList(data: Data): this; + visitListView(data: Data): this; + visitLargeListView(data: Data): this; visitStruct(data: Data): this; visitUnion(data: Data): this; visitInterval(data: Data): this; @@ -115,11 +117,13 @@ export class VectorAssembler extends Visitor { public get buffers() { return this._buffers; } public get byteLength() { return this._byteLength; } public get bufferRegions() { return this._bufferRegions; } + public get variadicBufferCounts() { return this._variadicBufferCounts; } protected _byteLength = 0; protected _nodes: FieldNode[] = []; protected _buffers: ArrayBufferView[] = []; protected _bufferRegions: BufferRegion[] = []; + protected _variadicBufferCounts: number[] = []; } /** @ignore */ @@ -215,6 +219,22 @@ function assembleFlatListVector(this: VectorAssembler, data: Data) { + const { offset, length, stride, values, variadicBuffers = [] } = data; + if (!values) { + throw new Error('BinaryView data is missing view 
buffer'); + } + const start = offset * stride; + const end = start + length * stride; + addBuffer.call(this, values.subarray(start, end)); + for (const buffer of variadicBuffers) { + addBuffer.call(this, buffer); + } + this._variadicBufferCounts.push(variadicBuffers.length); + return this; +} + /** @ignore */ function assembleListVector(this: VectorAssembler, data: Data) { const { length, valueOffsets } = data; @@ -229,6 +249,63 @@ function assembleListVector(this: VectorA return this.visit(data.children[0]); } +function assembleListViewVector(this: VectorAssembler, data: Data) { + const length = data['length']; + const valueOffsets = data['valueOffsets']; + const valueSizes = data['valueSizes']; + const children = data['children']; + if (!valueSizes) { + throw new Error('ListView data is missing size buffer'); + } + + if (length === 0) { + addBuffer.call(this, valueOffsets.subarray(0, 0)); + addBuffer.call(this, valueSizes.subarray(0, 0)); + return this.visit(children[0].slice(0, 0)); + } + + let minOffset = Number.POSITIVE_INFINITY; + let maxEnd = 0; + for (let i = 0; i < length; i++) { + const start = bigIntToNumber(valueOffsets[i]); + const size = bigIntToNumber(valueSizes[i]); + if (start < minOffset) { + minOffset = start; + } + const end = start + size; + if (end > maxEnd) { + maxEnd = end; + } + } + if (!Number.isFinite(minOffset)) { + minOffset = 0; + } + + if (typeof valueOffsets[0] === 'bigint') { + const base = BigInt(minOffset); + const OffsetArrayType = valueOffsets.constructor as typeof BigInt64Array; + const rebasedOffsets = new OffsetArrayType(length); + for (let i = 0; i < length; i++) { + rebasedOffsets[i] = (valueOffsets[i] as bigint) - base; + } + addBuffer.call(this, rebasedOffsets); + } else { + const base = minOffset; + const OffsetArrayType = valueOffsets.constructor as typeof Int32Array; + const rebasedOffsets = new OffsetArrayType(length); + for (let i = 0; i < length; i++) { + rebasedOffsets[i] = (valueOffsets[i] as number) - base; + } + 
addBuffer.call(this, rebasedOffsets); + } + + addBuffer.call(this, valueSizes.subarray(0, length)); + + const child = children[0].slice(minOffset, maxEnd - minOffset); + this.visit(child); + return this; +} + /** @ignore */ function assembleNestedVector(this: VectorAssembler, data: Data) { return this.visitMany(data.type.children.map((_, i) => data.children[i]).filter(Boolean))[0]; @@ -239,14 +316,18 @@ VectorAssembler.prototype.visitInt = assembleFlatVector; VectorAssembler.prototype.visitFloat = assembleFlatVector; VectorAssembler.prototype.visitUtf8 = assembleFlatListVector; VectorAssembler.prototype.visitLargeUtf8 = assembleFlatListVector; +VectorAssembler.prototype.visitUtf8View = assembleBinaryViewVector; VectorAssembler.prototype.visitBinary = assembleFlatListVector; VectorAssembler.prototype.visitLargeBinary = assembleFlatListVector; +VectorAssembler.prototype.visitBinaryView = assembleBinaryViewVector; VectorAssembler.prototype.visitFixedSizeBinary = assembleFlatVector; VectorAssembler.prototype.visitDate = assembleFlatVector; VectorAssembler.prototype.visitTimestamp = assembleFlatVector; VectorAssembler.prototype.visitTime = assembleFlatVector; VectorAssembler.prototype.visitDecimal = assembleFlatVector; VectorAssembler.prototype.visitList = assembleListVector; +VectorAssembler.prototype.visitListView = assembleListViewVector; +VectorAssembler.prototype.visitLargeListView = assembleListViewVector; VectorAssembler.prototype.visitStruct = assembleNestedVector; VectorAssembler.prototype.visitUnion = assembleUnion; VectorAssembler.prototype.visitInterval = assembleFlatVector; diff --git a/src/visitor/vectorloader.ts b/src/visitor/vectorloader.ts index 7c82e7ab..8f842643 100644 --- a/src/visitor/vectorloader.ts +++ b/src/visitor/vectorloader.ts @@ -44,13 +44,16 @@ export class VectorLoader extends Visitor { protected buffersIndex = -1; private dictionaries: Map>; private readonly metadataVersion: MetadataVersion; - constructor(bytes: Uint8Array, nodes: 
FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion = MetadataVersion.V5) { + private variadicBufferCounts: number[]; + private variadicBufferIndex = -1; + constructor(bytes: Uint8Array, nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion = MetadataVersion.V5, variadicBufferCounts: number[] = []) { super(); this.bytes = bytes; this.nodes = nodes; this.buffers = buffers; this.dictionaries = dictionaries; this.metadataVersion = metadataVersion; + this.variadicBufferCounts = variadicBufferCounts; } public visit(node: Field | T): Data { @@ -75,12 +78,38 @@ export class VectorLoader extends Visitor { public visitLargeUtf8(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } + public visitUtf8View(type: T, { length, nullCount } = this.nextFieldNode()) { + const nullBitmap = this.readNullBitmap(type, nullCount); + const views = this.readData(type); + const variadicBuffers = this.readVariadicBuffers(this.nextVariadicBufferCount()); + return makeData({ + type, + length, + nullCount, + nullBitmap, + ['views']: views, + ['variadicBuffers']: variadicBuffers + }); + } public visitBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } public visitLargeBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), data: this.readData(type) }); } + public visitBinaryView(type: T, { length, nullCount } = this.nextFieldNode()) { + const nullBitmap = this.readNullBitmap(type, nullCount); + const views = this.readData(type); + const variadicBuffers = 
this.readVariadicBuffers(this.nextVariadicBufferCount()); + return makeData({ + type, + length, + nullCount, + nullBitmap, + ['views']: views, + ['variadicBuffers']: variadicBuffers + }); + } public visitFixedSizeBinary(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), data: this.readData(type) }); } @@ -99,6 +128,36 @@ export class VectorLoader extends Visitor { public visitList(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), valueOffsets: this.readOffsets(type), 'child': this.visit(type.children[0]) }); } + public visitListView(type: T, { length, nullCount } = this.nextFieldNode()) { + const nullBitmap = this.readNullBitmap(type, nullCount); + const valueOffsets = this.readOffsets(type); + const valueSizes = this.readOffsets(type); + const child = this.visit(type.children[0]); + return makeData({ + type, + length, + nullCount, + nullBitmap, + valueOffsets, + valueSizes, + 'child': child + }); + } + public visitLargeListView(type: T, { length, nullCount } = this.nextFieldNode()) { + const nullBitmap = this.readNullBitmap(type, nullCount); + const valueOffsets = this.readOffsets(type); + const valueSizes = this.readOffsets(type); + const child = this.visit(type.children[0]); + return makeData({ + type, + length, + nullCount, + nullBitmap, + valueOffsets, + valueSizes, + 'child': child + }); + } public visitStruct(type: T, { length, nullCount } = this.nextFieldNode()) { return makeData({ type, length, nullCount, nullBitmap: this.readNullBitmap(type, nullCount), children: this.visitMany(type.children) }); } @@ -142,6 +201,12 @@ export class VectorLoader extends Visitor { protected readData(_type: T, { length, offset } = this.nextBufferRange()) { return this.bytes.subarray(offset, offset + length); } + protected readVariadicBuffers(length: number) { + return 
Array.from({ length }, () => this.readData(null as any)); + } + protected nextVariadicBufferCount() { + return this.variadicBufferCounts[++this.variadicBufferIndex] ?? 0; + } protected readDictionary(type: T): Vector { return this.dictionaries.get(type.id)!; } @@ -150,8 +215,8 @@ export class VectorLoader extends Visitor { /** @ignore */ export class JSONVectorLoader extends VectorLoader { private sources: any[][]; - constructor(sources: any[][], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion: MetadataVersion) { - super(new Uint8Array(0), nodes, buffers, dictionaries, metadataVersion); + constructor(sources: any[][], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion: MetadataVersion, variadicBufferCounts: number[] = []) { + super(new Uint8Array(0), nodes, buffers, dictionaries, metadataVersion, variadicBufferCounts); this.sources = sources; } protected readNullBitmap(_type: T, nullCount: number, { offset } = this.nextBufferRange()) { @@ -175,6 +240,10 @@ export class JSONVectorLoader extends VectorLoader { return toArrayBufferView(Uint8Array, Int128.convertArray(sources[offset] as string[])); } else if (DataType.isBinary(type) || DataType.isLargeBinary(type) || DataType.isFixedSizeBinary(type)) { return binaryDataFromJSON(sources[offset] as string[]); + } else if (DataType.isBinaryView(type)) { + return binaryViewDataFromJSON(sources[offset] as any[]); + } else if (DataType.isUtf8View(type)) { + return utf8ViewDataFromJSON(sources[offset] as any[]); } else if (DataType.isBool(type)) { return packBools(sources[offset] as number[]); } else if (DataType.isUtf8(type) || DataType.isLargeUtf8(type)) { @@ -191,25 +260,107 @@ export class JSONVectorLoader extends VectorLoader { } return toArrayBufferView(Uint8Array, toArrayBufferView(type.ArrayType, sources[offset].map((x) => +x))); } + protected readVariadicBuffers(length: number) { + // Per Arrow C++ reference implementation (cpp/src/arrow/ipc/reader.cc), + 
// each variadic buffer is stored as a separate buffer region, matching + // the IPC format where each is accessed via separate GetBuffer() calls. + // VARIADIC_DATA_BUFFERS in JSON is an array, but flattenDataSources spreads + // it so each hex string gets its own sources entry, maintaining 1:1 + // correspondence with BufferRegion entries. + const buffers: Uint8Array[] = []; + for (let i = 0; i < length; i++) { + const { offset } = this.nextBufferRange(); + // sources[offset] is 'any[]' but for variadic buffers it's actually a string + // after spreading in flattenDataSources. Cast necessary due to heterogeneous + // sources array structure (most fields are arrays, variadic elements are strings). + const hexString = this.sources[offset] as unknown as string; + buffers.push(hexStringToBytes(hexString)); + } + return buffers; + } } /** @ignore */ -function binaryDataFromJSON(values: string[]) { - // "DATA": ["49BC7D5B6C47D2","3F5FB6D9322026"] - // There are definitely more efficient ways to do this... but it gets the - // job done. - const joined = values.join(''); - const data = new Uint8Array(joined.length / 2); - for (let i = 0; i < joined.length; i += 2) { - data[i >> 1] = Number.parseInt(joined.slice(i, i + 2), 16); +function hexStringToBytes(hexString: string): Uint8Array { + // Parse hex string per Arrow JSON integration format (uppercase hex encoding). + // Used for: VARIADIC_DATA_BUFFERS elements, Binary DATA (after join), + // BinaryView PREFIX_HEX and INLINED fields. + const data = new Uint8Array(hexString.length / 2); + for (let i = 0; i < hexString.length; i += 2) { + data[i >> 1] = Number.parseInt(hexString.slice(i, i + 2), 16); } return data; } +/** @ignore */ +function binaryDataFromJSON(values: string[]): Uint8Array { + // Arrow JSON Binary/LargeBinary/FixedSizeBinary format: + // "DATA": ["49BC7D5B6C47D2","3F5FB6D9322026"] (array of hex strings, one per value) + // Join all values into one continuous hex string, then parse to bytes. 
+ return hexStringToBytes(values.join('')); +} + +/** @ignore */ +function parseViewDataFromJSON(views: any[], parseInlined: (inlined: string) => Uint8Array) { + // Each view is a 16-byte struct: [length: i32, prefix/inlined: 12 bytes, buffer_index: i32, offset: i32] + const data = new Uint8Array(views.length * 16); + const dataView = new DataView(data.buffer); + + for (const [i, view] of views.entries()) { + const offset = i * 16; + const size = view['SIZE']; + + // Write size (int32 at byte 0) + dataView.setInt32(offset, size, true); + + if (view['INLINED'] !== undefined) { + // Inline view: parse INLINED field using provided callback + const bytes = parseInlined(view['INLINED']); + for (let j = 0; j < bytes.length && j < 12; j++) { + data[offset + 4 + j] = bytes[j]; + } + } else { + // Out-of-line view: write prefix, buffer_index, offset + const prefix = view['PREFIX_HEX']; + // Write 4-byte prefix at bytes 4-7 + for (let j = 0; j < 8 && j < prefix.length; j += 2) { + data[offset + 4 + (j >> 1)] = Number.parseInt(prefix.slice(j, j + 2), 16); + } + // Write buffer_index (int32 at byte 8) + dataView.setInt32(offset + 8, view['BUFFER_INDEX'], true); + // Write offset (int32 at byte 12) + dataView.setInt32(offset + 12, view['OFFSET'], true); + } + } + + return data; +} + +/** @ignore */ +function binaryViewDataFromJSON(views: any[]) { + return parseViewDataFromJSON(views, (inlined: string) => { + // BinaryView: INLINED is hex-encoded string + const bytes = new Uint8Array(inlined.length / 2); + for (let i = 0; i < inlined.length; i += 2) { + bytes[i >> 1] = Number.parseInt(inlined.slice(i, i + 2), 16); + } + return bytes; + }); +} + +/** @ignore */ +function utf8ViewDataFromJSON(views: any[]) { + return parseViewDataFromJSON(views, (inlined: string) => { + // Utf8View: INLINED is UTF-8 string - encode to bytes + const encoder = new TextEncoder(); + return encoder.encode(inlined); + }); +} + export class CompressedVectorLoader extends VectorLoader { private 
bodyChunks: Uint8Array[]; - constructor(bodyChunks: Uint8Array[], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion: MetadataVersion) { - super(new Uint8Array(0), nodes, buffers, dictionaries, metadataVersion); + constructor(bodyChunks: Uint8Array[], nodes: FieldNode[], buffers: BufferRegion[], dictionaries: Map>, metadataVersion: MetadataVersion, variadicBufferCounts: number[] = []) { + super(new Uint8Array(0), nodes, buffers, dictionaries, metadataVersion, variadicBufferCounts); this.bodyChunks = bodyChunks; } protected readData(_type: T, _buffer = this.nextBufferRange()) { diff --git a/test/data/tables.ts b/test/data/tables.ts index e9674d9b..5ac11b30 100644 --- a/test/data/tables.ts +++ b/test/data/tables.ts @@ -22,12 +22,12 @@ import * as generate from '../generate-test-data.js'; import { Schema, Field, Dictionary } from 'apache-arrow'; -const listVectorGeneratorNames = ['list', 'fixedSizeList']; +const listVectorGeneratorNames = ['list', 'fixedSizeList', 'listView', 'largeListView']; const nestedVectorGeneratorNames = ['struct', 'denseUnion', 'sparseUnion', 'map']; const dictionaryKeyGeneratorNames = ['int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32']; const valueVectorGeneratorNames = [ 'null_', 'bool', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', - 'float16', 'float32', 'float64', 'utf8', 'largeUtf8', 'binary', 'largeBinary', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', + 'float16', 'float32', 'float64', 'utf8', 'largeUtf8', 'utf8View', 'binary', 'largeBinary', 'binaryView', 'fixedSizeBinary', 'dateDay', 'dateMillisecond', 'timestampSecond', 'timestampMillisecond', 'timestampMicrosecond', 'timestampNanosecond', 'timeSecond', 'timeMillisecond', 'timeMicrosecond', 'timeNanosecond', 'decimal', 'dictionary', 'intervalDayTime', 'intervalYearMonth', 'intervalMonthDayNano', diff --git a/test/generate-test-data.ts b/test/generate-test-data.ts index de4a8269..f61f2b73 100644 --- 
a/test/generate-test-data.ts +++ b/test/generate-test-data.ts @@ -16,20 +16,20 @@ // under the License. import { - makeData, Vector, Visitor, DataType, TypeMap, + makeData, Vector, vectorFromArray, Visitor, DataType, TypeMap, Table, Schema, Field, RecordBatch, Null, Bool, Int, Int8, Int16, Int32, Int64, Uint8, Uint16, Uint32, Uint64, Float, Float16, Float32, Float64, - Utf8, LargeUtf8, - Binary, LargeBinary, + Utf8, LargeUtf8, Utf8View, + Binary, LargeBinary, BinaryView, FixedSizeBinary, Date_, DateDay, DateMillisecond, Timestamp, TimestampSecond, TimestampMillisecond, TimestampMicrosecond, TimestampNanosecond, Time, TimeSecond, TimeMillisecond, TimeMicrosecond, TimeNanosecond, Decimal, - List, + List, ListView, LargeListView, Struct, Union, DenseUnion, SparseUnion, Dictionary, @@ -64,6 +64,8 @@ interface TestDataVectorGenerator extends Visitor { visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number, child?: Vector): GeneratedVector; + visit(type: T, length?: number, nullCount?: number): GeneratedVector; + visit(type: T, length?: number, nullCount?: number): GeneratedVector; visit(type: T, length?: number, nullCount?: number, child?: Vector): GeneratedVector; visit(type: T, length?: number, nullCount?: number, dictionary?: Vector): GeneratedVector; visit(type: T, length?: number, nullCount?: number, children?: Vector[]): GeneratedVector; @@ -79,8 +81,10 @@ interface TestDataVectorGenerator extends Visitor { visitFloat: typeof generateFloat; visitUtf8: typeof generateUtf8; visitLargeUtf8: typeof generateLargeUtf8; + visitUtf8View: typeof generateUtf8View; visitBinary: typeof generateBinary; visitLargeBinary: typeof generateLargeBinary; + visitBinaryView: typeof generateBinaryView; visitFixedSizeBinary: typeof generateFixedSizeBinary; visitDate: typeof generateDate; visitTimestamp: typeof generateTimestamp; @@ -106,14 +110,18 @@ 
TestDataVectorGenerator.prototype.visitUint64 = generateBigInt; TestDataVectorGenerator.prototype.visitFloat = generateFloat; TestDataVectorGenerator.prototype.visitUtf8 = generateUtf8; TestDataVectorGenerator.prototype.visitLargeUtf8 = generateLargeUtf8; +TestDataVectorGenerator.prototype.visitUtf8View = generateUtf8View; TestDataVectorGenerator.prototype.visitBinary = generateBinary; TestDataVectorGenerator.prototype.visitLargeBinary = generateLargeBinary; +TestDataVectorGenerator.prototype.visitBinaryView = generateBinaryView; TestDataVectorGenerator.prototype.visitFixedSizeBinary = generateFixedSizeBinary; TestDataVectorGenerator.prototype.visitDate = generateDate; TestDataVectorGenerator.prototype.visitTimestamp = generateTimestamp; TestDataVectorGenerator.prototype.visitTime = generateTime; TestDataVectorGenerator.prototype.visitDecimal = generateDecimal; TestDataVectorGenerator.prototype.visitList = generateList; +TestDataVectorGenerator.prototype.visitListView = generateListView; +TestDataVectorGenerator.prototype.visitLargeListView = generateLargeListView; TestDataVectorGenerator.prototype.visitStruct = generateStruct; TestDataVectorGenerator.prototype.visitUnion = generateUnion; TestDataVectorGenerator.prototype.visitDictionary = generateDictionary; @@ -222,8 +230,10 @@ export const float32 = (length = 100, nullCount = Math.trunc(length * 0.2)) => v export const float64 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Float64(), length, nullCount); export const utf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Utf8(), length, nullCount); export const largeUtf8 = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new LargeUtf8(), length, nullCount); +export const utf8View = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new Utf8View(), length, nullCount); export const binary = (length = 100, nullCount = Math.trunc(length * 
0.2)) => vectorGenerator.visit(new Binary(), length, nullCount); export const largeBinary = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new LargeBinary(), length, nullCount); +export const binaryView = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new BinaryView(), length, nullCount); export const fixedSizeBinary = (length = 100, nullCount = Math.trunc(length * 0.2), byteWidth = 8) => vectorGenerator.visit(new FixedSizeBinary(byteWidth), length, nullCount); export const dateDay = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new DateDay(), length, nullCount); export const dateMillisecond = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new DateMillisecond(), length, nullCount); @@ -237,6 +247,8 @@ export const timeMicrosecond = (length = 100, nullCount = Math.trunc(length * 0. export const timeNanosecond = (length = 100, nullCount = Math.trunc(length * 0.2)) => vectorGenerator.visit(new TimeNanosecond(), length, nullCount); export const decimal = (length = 100, nullCount = Math.trunc(length * 0.2), scale = 2, precision = 9, bitWidth = 128) => vectorGenerator.visit(new Decimal(scale, precision, bitWidth), length, nullCount); export const list = (length = 100, nullCount = Math.trunc(length * 0.2), child = defaultListChild) => vectorGenerator.visit(new List(child), length, nullCount); +export const listView = (length = 100, nullCount = Math.trunc(length * 0.2), child = defaultListChild) => vectorGenerator.visit(new ListView(child), length, nullCount); +export const largeListView = (length = 100, nullCount = Math.trunc(length * 0.2), child = defaultListChild) => vectorGenerator.visit(new LargeListView(child), length, nullCount); export const struct = (length = 100, nullCount = Math.trunc(length * 0.2), children: Field[] = defaultStructChildren()) => vectorGenerator.visit(new Struct(children), length, nullCount); export const denseUnion = 
(length = 100, nullCount = Math.trunc(length * 0.2), children: Field[] = defaultUnionChildren()) => vectorGenerator.visit(new DenseUnion(children.map((f) => f.typeId), children), length, nullCount); export const sparseUnion = (length = 100, nullCount = Math.trunc(length * 0.2), children: Field[] = defaultUnionChildren()) => vectorGenerator.visit(new SparseUnion(children.map((f) => f.typeId), children), length, nullCount); @@ -252,7 +264,53 @@ export const fixedSizeList = (length = 100, nullCount = Math.trunc(length * 0.2) export const map = (length = 100, nullCount = Math.trunc(length * 0.2), child: Field> = defaultMapChild()) => vectorGenerator.visit(new Map_(child), length, nullCount); export const vecs = { - null_, bool, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float16, float32, float64, utf8, largeUtf8, binary, largeBinary, fixedSizeBinary, dateDay, dateMillisecond, timestampSecond, timestampMillisecond, timestampMicrosecond, timestampNanosecond, timeSecond, timeMillisecond, timeMicrosecond, timeNanosecond, decimal, list, struct, denseUnion, sparseUnion, dictionary, intervalDayTime, intervalYearMonth, intervalMonthDayNano, fixedSizeList, map, durationSecond, durationMillisecond, durationMicrosecond, durationNanosecond + null_, + bool, + int8, + int16, + int32, + int64, + uint8, + uint16, + uint32, + uint64, + float16, + float32, + float64, + utf8, + largeUtf8, + utf8View, + binary, + largeBinary, + binaryView, + fixedSizeBinary, + dateDay, + dateMillisecond, + timestampSecond, + timestampMillisecond, + timestampMicrosecond, + timestampNanosecond, + timeSecond, + timeMillisecond, + timeMicrosecond, + timeNanosecond, + decimal, + list, + listView, + largeListView, + struct, + denseUnion, + sparseUnion, + dictionary, + intervalDayTime, + intervalYearMonth, + intervalMonthDayNano, + fixedSizeList, + map, + durationSecond, + durationMillisecond, + durationMicrosecond, + durationNanosecond, } as { [k: string]: (...args: any[]) => any }; function 
generateNull(this: TestDataVectorGenerator, type: T, length = 100): GeneratedVector { @@ -364,6 +422,13 @@ function generateLargeUtf8(this: TestDataVectorGenerator, t return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; } +function generateUtf8View(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { + const nullBitmap = createBitmap(length, nullCount); + const values = Array.from({ length }, (_, i) => isValid(nullBitmap, i) ? randomString(Math.trunc(Math.random() * 20)) : null); + const vector = vectorFromArray(values, type); + return { values: () => values, vector }; +} + function generateBinary(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); const valueOffsets = createVariableWidthOffsets32(length, nullBitmap, 10, 20, nullCount != 0); @@ -384,6 +449,13 @@ function generateLargeBinary(this: TestDataVectorGenerato return { values: () => values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, data })]) }; } +function generateBinaryView(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { + const nullBitmap = createBitmap(length, nullCount); + const values = Array.from({ length }, (_, i) => isValid(nullBitmap, i) ? 
randomBytes(Math.trunc(Math.random() * 20)) : null); + const vector = vectorFromArray(values, type); + return { values: () => values, vector }; +} + function generateFixedSizeBinary(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); const data = fillRandom(Uint8Array, length * type.byteWidth); @@ -493,6 +565,97 @@ function generateList(this: TestDataVectorGenerator, type: T, le return { values, vector: new Vector([makeData({ type, length, nullCount, nullBitmap, valueOffsets, child: childVec.data[0] })]) }; } +function generateListView(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { + const nullBitmap = createBitmap(length, nullCount); + const valueOffsets = new Int32Array(length); + const valueSizes = new Int32Array(length); + let totalValues = 0; + + iterateBitmap(length, nullBitmap, (i, valid) => { + valueOffsets[i] = totalValues; + if (!valid) { + valueSizes[i] = 0; + return; + } + const size = Math.trunc(rand() * 5); + valueSizes[i] = size; + totalValues += size; + }); + + const childLength = Math.max(totalValues, 1); + const childNullCount = Math.trunc(childLength * 0.2); + const childVector = this.visit(type.children[0].type, childLength, childNullCount).vector; + const childData = childVector.data[0]; + const childSlice = totalValues === childLength ? 
childData : childData.slice(0, totalValues); + + const vector = new Vector([makeData({ + type, + length, + nullCount, + nullBitmap, + valueOffsets, + valueSizes, + child: childSlice + })]); + + const values = memoize(() => { + const entries = new Array(length) as (T['TValue'] | null)[]; + for (let i = 0; i < length; i++) { + entries[i] = vector.get(i); + } + return entries; + }); + + return { values, vector }; +} + +function generateLargeListView(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2)): GeneratedVector { + const nullBitmap = createBitmap(length, nullCount); + const valueOffsets = new BigInt64Array(length); + const valueSizes = new BigInt64Array(length); + let totalValues = 0n; + + iterateBitmap(length, nullBitmap, (i, valid) => { + valueOffsets[i] = totalValues; + if (!valid) { + valueSizes[i] = 0n; + return; + } + const size = BigInt(Math.trunc(rand() * 5)); + valueSizes[i] = size; + totalValues += size; + }); + + const childLength = Number(totalValues > 0n ? totalValues : 1n); + const childNullCount = Math.trunc(childLength * 0.2); + const childVector = this.visit(type.children[0].type, childLength, childNullCount).vector; + const childData = childVector.data[0]; + const targetLength = Number(totalValues); + const childSlice = totalValues === BigInt(childLength) + ? 
childData + : childData.slice(0, targetLength); + + const vector = new Vector([makeData({ + type, + length, + nullCount, + nullBitmap, + valueOffsets, + valueSizes, + child: childSlice + })]); + + const values = memoize(() => { + const entries = new Array(length) as (T['TValue'] | null)[]; + for (let i = 0; i < length; i++) { + entries[i] = vector.get(i); + } + return entries; + }); + + return { values, vector }; +} + function generateFixedSizeList(this: TestDataVectorGenerator, type: T, length = 100, nullCount = Math.trunc(length * 0.2), child = this.visit(type.children[0].type, length * type.listSize, nullCount * type.listSize)): GeneratedVector { const nullBitmap = createBitmap(length, nullCount); const values = memoize(() => { diff --git a/test/unit/builders/builder-tests.ts b/test/unit/builders/builder-tests.ts index e8684010..95bbf74d 100644 --- a/test/unit/builders/builder-tests.ts +++ b/test/unit/builders/builder-tests.ts @@ -60,6 +60,8 @@ describe('Generated Test Data', () => { describe('TimeNanosecondBuilder', () => { validateBuilder(generate.timeNanosecond); }); describe('DecimalBuilder', () => { validateBuilder(generate.decimal); }); describe('ListBuilder', () => { validateBuilder(generate.list); }); + describe('ListViewBuilder', () => { validateBuilder(generate.listView); }); + describe('LargeListViewBuilder', () => { validateBuilder(generate.largeListView); }); describe('StructBuilder', () => { validateBuilder(generate.struct); }); describe('DenseUnionBuilder', () => { validateBuilder(generate.denseUnion); }); describe('SparseUnionBuilder', () => { validateBuilder(generate.sparseUnion); }); diff --git a/test/unit/builders/listview-tests.ts b/test/unit/builders/listview-tests.ts new file mode 100644 index 00000000..69a908b1 --- /dev/null +++ b/test/unit/builders/listview-tests.ts @@ -0,0 +1,199 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { ListView, LargeListView, Int32 } from '../../../src/type.js'; +import { Field } from '../../../src/schema.js'; +import { ListViewBuilder, LargeListViewBuilder } from '../../../src/builder/listview.js'; +import { Int32Builder } from '../../../src/builder/int.js'; +import { Vector } from '../../../src/vector.js'; + +describe('ListViewBuilder', () => { + it('should build ListView with basic values', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2, 3]); + builder.append([4, 5]); + builder.append([6]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([1, 2, 3])); + expect(vector.get(1)?.toArray()).toEqual(new Int32Array([4, 5])); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([6])); + }); + + it('should handle null values', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type, nullValues: [null] }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2]); + builder.append(null); + builder.append([3, 4, 
5]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([1, 2])); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([3, 4, 5])); + }); + + it('should handle empty lists', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([]); + builder.append([1, 2]); + builder.append([]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([])); + expect(vector.get(1)?.toArray()).toEqual(new Int32Array([1, 2])); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([])); + }); + + it('should handle multiple flushes', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2]); + const data1 = builder.flush(); + builder.append([3, 4]); + const data2 = builder.flush(); + + builder.finish(); + + const vector1 = new Vector([data1]); + const vector2 = new Vector([data2]); + + expect(vector1).toHaveLength(1); + expect(vector1.get(0)?.toArray()).toEqual(new Int32Array([1, 2])); + expect(vector2).toHaveLength(1); + expect(vector2.get(0)?.toArray()).toEqual(new Int32Array([3, 4])); + }); + + it('should build ListView with varying list sizes', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1]); + builder.append([2, 3]); + builder.append([4, 5, 6]); + builder.append([7, 8, 9, 10]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(4); + 
expect(vector.get(0)?.toArray()).toEqual(new Int32Array([1])); + expect(vector.get(1)?.toArray()).toEqual(new Int32Array([2, 3])); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([4, 5, 6])); + expect(vector.get(3)?.toArray()).toEqual(new Int32Array([7, 8, 9, 10])); + }); +}); + +describe('LargeListViewBuilder', () => { + it('should build LargeListView with basic values', () => { + const type = new LargeListView(new Field('item', new Int32())); + const builder = new LargeListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2, 3]); + builder.append([4, 5]); + builder.append([6]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([1, 2, 3])); + expect(vector.get(1)?.toArray()).toEqual(new Int32Array([4, 5])); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([6])); + }); + + it('should handle null values', () => { + const type = new LargeListView(new Field('item', new Int32())); + const builder = new LargeListViewBuilder({ type, nullValues: [null] }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2]); + builder.append(null); + builder.append([3, 4, 5]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new Int32Array([1, 2])); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([3, 4, 5])); + }); + + it('should handle empty lists', () => { + const type = new LargeListView(new Field('item', new Int32())); + const builder = new LargeListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([]); + builder.append([1, 2]); + builder.append([]); + + const vector = builder.finish().toVector(); + + expect(vector).toHaveLength(3); + expect(vector.get(0)?.toArray()).toEqual(new 
Int32Array([])); + expect(vector.get(1)?.toArray()).toEqual(new Int32Array([1, 2])); + expect(vector.get(2)?.toArray()).toEqual(new Int32Array([])); + }); + + it('should use BigInt offsets internally', () => { + const type = new LargeListView(new Field('item', new Int32())); + const builder = new LargeListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + + builder.append([1, 2]); + builder.append([3, 4, 5]); + + const data = builder.finish().flush(); + + // Verify that offsets and sizes are BigInt64Array + expect(data.valueOffsets).toBeInstanceOf(BigInt64Array); + expect(data.values).toBeInstanceOf(BigInt64Array); // sizes buffer + }); +}); + +describe('ListView type properties', () => { + it('should correctly report type name', () => { + const type = new ListView(new Field('item', new Int32())); + const builder = new ListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + expect(builder.type.toString()).toBe('ListView'); + }); + + it('should correctly report LargeListView type name', () => { + const type = new LargeListView(new Field('item', new Int32())); + const builder = new LargeListViewBuilder({ type }); + builder.addChild(new Int32Builder({ type: new Int32() }), 'item'); + expect(builder.type.toString()).toBe('LargeListView'); + }); +}); diff --git a/test/unit/builders/view-builders-tests.ts b/test/unit/builders/view-builders-tests.ts new file mode 100644 index 00000000..2c1958f9 --- /dev/null +++ b/test/unit/builders/view-builders-tests.ts @@ -0,0 +1,323 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import { BinaryView, Utf8View, ListView, LargeListView, Int32 } from '../../../src/type.js'; +import { Field } from '../../../src/schema.js'; +import { makeBuilder, vectorFromArray } from '../../../src/factories.js'; + +describe('BinaryViewBuilder', () => { + it('should build inline binary values (≤12 bytes)', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const values = [ + new Uint8Array([1, 2, 3]), + new Uint8Array([4, 5, 6, 7, 8, 9, 10, 11, 12]), + new Uint8Array([13]) + ]; + + for (const value of values) { + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toEqual(values[0]); + expect(vector.get(1)).toEqual(values[1]); + expect(vector.get(2)).toEqual(values[2]); + }); + + it('should build out-of-line binary values (>12 bytes)', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const value = new Uint8Array(100); + for (let i = 0; i < 100; i++) { + value[i] = i % 256; + } + + builder.append(value); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(1); + expect(vector.get(0)).toEqual(value); + }); + + it('should build mixed inline and out-of-line values', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const small = new Uint8Array([1, 2, 3]); + const large = new Uint8Array(50); + for (let i = 0; i < 50; i++) { + large[i] = i % 256; + } + + builder.append(small); + builder.append(large); + builder.append(small); + + const vector = builder.finish().toVector(); + 
expect(vector).toHaveLength(3); + expect(vector.get(0)).toEqual(small); + expect(vector.get(1)).toEqual(large); + expect(vector.get(2)).toEqual(small); + }); + + it('should handle null values', () => { + const builder = makeBuilder({ type: new BinaryView(), nullValues: [null] }); + + builder.append(new Uint8Array([1, 2, 3])); + builder.append(null); + builder.append(new Uint8Array([4, 5, 6])); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toEqual(new Uint8Array([1, 2, 3])); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)).toEqual(new Uint8Array([4, 5, 6])); + }); + + it('should handle empty values', () => { + const builder = makeBuilder({ type: new BinaryView() }); + + builder.append(new Uint8Array([])); + builder.append(new Uint8Array([1])); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toEqual(new Uint8Array([])); + expect(vector.get(1)).toEqual(new Uint8Array([1])); + }); + + it('should handle exactly 12-byte boundary values', () => { + const builder = makeBuilder({ type: new BinaryView() }); + const exactly12 = new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]); + const exactly13 = new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]); + + builder.append(exactly12); + builder.append(exactly13); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toEqual(exactly12); + expect(vector.get(1)).toEqual(exactly13); + }); + + it('should handle multiple flushes', () => { + const builder = makeBuilder({ type: new BinaryView() }); + + builder.append(new Uint8Array([1, 2])); + const data1 = builder.flush(); + expect(data1).toHaveLength(1); + + builder.append(new Uint8Array([3, 4])); + builder.append(new Uint8Array([5, 6])); + const data2 = builder.flush(); + expect(data2).toHaveLength(2); + }); +}); + +describe('Utf8ViewBuilder', () => { + it('should build inline string values 
(≤12 bytes)', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const values = ['hello', 'world', 'foo']; + + for (const value of values) { + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toBe('hello'); + expect(vector.get(1)).toBe('world'); + expect(vector.get(2)).toBe('foo'); + }); + + it('should build out-of-line string values (>12 bytes)', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const longString = 'This is a long string that exceeds 12 bytes'; + + builder.append(longString); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(1); + expect(vector.get(0)).toBe(longString); + }); + + it('should build mixed inline and out-of-line strings', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const short = 'hi'; + const long = 'This is a very long string that definitely exceeds the 12 byte inline capacity'; + + builder.append(short); + builder.append(long); + builder.append(short); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toBe(short); + expect(vector.get(1)).toBe(long); + expect(vector.get(2)).toBe(short); + }); + + it('should handle null values', () => { + const builder = makeBuilder({ type: new Utf8View(), nullValues: [null] }); + + builder.append('hello'); + builder.append(null); + builder.append('world'); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.get(0)).toBe('hello'); + expect(vector.get(1)).toBeNull(); + expect(vector.get(2)).toBe('world'); + }); + + it('should handle empty strings', () => { + const builder = makeBuilder({ type: new Utf8View() }); + + builder.append(''); + builder.append('a'); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toBe(''); + expect(vector.get(1)).toBe('a'); + }); + + it('should handle 
UTF-8 multibyte characters', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const values = ['🚀', '你好', 'Ñoño', 'emoji: 🎉']; + + for (const value of values) { + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(4); + expect(vector.get(0)).toBe('🚀'); + expect(vector.get(1)).toBe('你好'); + expect(vector.get(2)).toBe('Ñoño'); + expect(vector.get(3)).toBe('emoji: 🎉'); + }); + + it('should handle exactly 12-byte boundary strings', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const exactly12 = 'twelve bytes'; // ASCII: 12 bytes + const exactly13 = 'thirteen byte'; // ASCII: 13 bytes + + builder.append(exactly12); + builder.append(exactly13); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toBe(exactly12); + expect(vector.get(1)).toBe(exactly13); + }); + + it('should build from vectorFromArray', () => { + const values = ['hello', 'world', null, 'foo']; + const vector = vectorFromArray(values, new Utf8View()); + + expect(vector).toHaveLength(4); + expect(vector.get(0)).toBe('hello'); + expect(vector.get(1)).toBe('world'); + expect(vector.get(2)).toBeNull(); + expect(vector.get(3)).toBe('foo'); + }); + + it('should handle large batch of values', () => { + const builder = makeBuilder({ type: new Utf8View() }); + const count = 1000; + const values: string[] = []; + + for (let i = 0; i < count; i++) { + const value = i % 2 === 0 + ? 
`short_${i}` // inline + : `this_is_a_long_string_that_goes_out_of_line_${i}`; // out-of-line + values.push(value); + builder.append(value); + } + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(count); + + for (let i = 0; i < count; i++) { + expect(vector.get(i)).toBe(values[i]); + } + }); +}); + +describe('ListViewBuilder', () => { + const itemField = new Field('item', new Int32(), true); + + it('builds list views from plain arrays', () => { + const builder = makeBuilder({ type: new ListView(itemField) }); + builder.append([1, 2, 3]); + builder.append([]); + builder.append([4]); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.type).toBeInstanceOf(ListView); + expect(vector.get(0)?.toJSON()).toEqual([1, 2, 3]); + expect(vector.get(1)?.toJSON()).toEqual([]); + expect(vector.get(2)?.toJSON()).toEqual([4]); + }); + + it('handles nulls and vector values', () => { + const builder = makeBuilder({ type: new ListView(itemField), nullValues: [null] }); + const childVector = vectorFromArray([10, 11, 12], new Int32()); + + builder.append([5]); + builder.append(childVector.slice(1, 3)); + builder.append(null); + builder.append([]); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(4); + expect(vector.get(0)?.toJSON()).toEqual([5]); + expect(vector.get(1)?.toJSON()).toEqual([11, 12]); + expect(vector.get(2)).toBeNull(); + expect(vector.get(3)?.toJSON()).toEqual([]); + }); +}); + +describe('LargeListViewBuilder', () => { + const itemField = new Field('item', new Int32(), true); + + it('builds large list views with mixed lengths', () => { + const builder = makeBuilder({ type: new LargeListView(itemField) }); + builder.append([1, 2]); + builder.append([3, 4, 5, 6]); + builder.append([]); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(3); + expect(vector.type).toBeInstanceOf(LargeListView); + expect(vector.get(0)?.toJSON()).toEqual([1, 2]); + 
expect(vector.get(1)?.toJSON()).toEqual([3, 4, 5, 6]); + expect(vector.get(2)?.toJSON()).toEqual([]); + }); + + it('handles null values', () => { + const builder = makeBuilder({ type: new LargeListView(itemField), nullValues: [null] }); + builder.append(null); + builder.append([7, 8]); + + const vector = builder.finish().toVector(); + expect(vector).toHaveLength(2); + expect(vector.get(0)).toBeNull(); + expect(vector.get(1)?.toJSON()).toEqual([7, 8]); + }); +}); diff --git a/test/unit/generated-data-tests.ts b/test/unit/generated-data-tests.ts index 9affe5f6..be6fac07 100644 --- a/test/unit/generated-data-tests.ts +++ b/test/unit/generated-data-tests.ts @@ -39,8 +39,10 @@ describe('Generated Test Data', () => { describe('Float64', () => { validateVector(generate.float64()); }); describe('Utf8', () => { validateVector(generate.utf8()); }); describe('LargeUtf8', () => { validateVector(generate.largeUtf8()); }); + describe('Utf8View', () => { validateVector(generate.utf8View()); }); describe('Binary', () => { validateVector(generate.binary()); }); describe('LargeBinary', () => { validateVector(generate.largeBinary()); }); + describe('BinaryView', () => { validateVector(generate.binaryView()); }); describe('FixedSizeBinary', () => { validateVector(generate.fixedSizeBinary()); }); describe('DateDay', () => { validateVector(generate.dateDay()); }); describe('DateMillisecond', () => { validateVector(generate.dateMillisecond()); }); @@ -54,6 +56,8 @@ describe('Generated Test Data', () => { describe('TimeNanosecond', () => { validateVector(generate.timeNanosecond()); }); describe('Decimal', () => { validateVector(generate.decimal()); }); describe('List', () => { validateVector(generate.list()); }); + describe('ListView', () => { validateVector(generate.listView()); }); + describe('LargeListView', () => { validateVector(generate.largeListView()); }); describe('Struct', () => { validateVector(generate.struct()); }); describe('DenseUnion', () => { 
validateVector(generate.denseUnion()); }); describe('SparseUnion', () => { validateVector(generate.sparseUnion()); }); diff --git a/test/unit/vector/vector-tests.ts b/test/unit/vector/vector-tests.ts index 73c9cdbb..3a86d655 100644 --- a/test/unit/vector/vector-tests.ts +++ b/test/unit/vector/vector-tests.ts @@ -16,8 +16,9 @@ // under the License. import { - Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, makeVector, Struct, Utf8, LargeUtf8, util, Vector, vectorFromArray, makeData, FixedSizeList, Field, + Bool, DateDay, DateMillisecond, Dictionary, Float64, Int32, List, ListView, LargeListView, makeVector, Struct, Utf8, LargeUtf8, util, Vector, vectorFromArray, makeData, FixedSizeList, Field, } from 'apache-arrow'; +import { listView as generateListView, largeListView as generateLargeListView } from '../../generate-test-data.js'; describe(`makeVectorFromArray`, () => { describe(`works with null values`, () => { @@ -273,6 +274,100 @@ describe(`ListVector`, () => { }); }); +describe(`ListViewVector`, () => { + const generated = generateListView(); + const vector = generated.vector; + const expected = generated.values().map((value) => value == null ? null : value.toJSON()); + + test(`has listView type`, () => { + expect(vector.type).toBeInstanceOf(ListView); + }); + + test(`get value`, () => { + expected.forEach((expectedValue, index) => { + const actual = vector.get(index); + if (expectedValue === null) { + expect(actual).toBeNull(); + } else { + expect(actual?.toJSON()).toEqual(expectedValue); + } + }); + }); + + test(`iterates expected values`, () => { + const iterated = [] as (any[] | null)[]; + for (const value of vector) { + iterated.push(value == null ? 
null : value.toJSON()); + } + expect(iterated).toEqual(expected); + }); + + test(`indexOf matches non-null values`, () => { + for (let i = 0; i < vector.length; i++) { + const search = vector.get(i); + if (search !== null) { + const searchJSON = search.toJSON(); + const expectedIndex = expected.findIndex((value) => + value !== null && JSON.stringify(value) === JSON.stringify(searchJSON) + ); + expect(vector.indexOf(search)).toBe(expectedIndex); + } + } + }); + + test(`indexOf null matches first null`, () => { + const expectedNullIndex = expected.indexOf(null); + expect(vector.indexOf(null)).toBe(expectedNullIndex); + }); +}); + +describe(`LargeListViewVector`, () => { + const generated = generateLargeListView(); + const vector = generated.vector; + const expected = generated.values().map((value) => value == null ? null : value.toJSON()); + + test(`has largeListView type`, () => { + expect(vector.type).toBeInstanceOf(LargeListView); + }); + + test(`get value`, () => { + expected.forEach((expectedValue, index) => { + const actual = vector.get(index); + if (expectedValue === null) { + expect(actual).toBeNull(); + } else { + expect(actual?.toJSON()).toEqual(expectedValue); + } + }); + }); + + test(`iterates expected values`, () => { + const iterated = [] as (any[] | null)[]; + for (const value of vector) { + iterated.push(value == null ? 
null : value.toJSON()); + } + expect(iterated).toEqual(expected); + }); + + test(`indexOf matches non-null values`, () => { + for (let i = 0; i < vector.length; i++) { + const search = vector.get(i); + if (search !== null) { + const searchJSON = search.toJSON(); + const expectedIndex = expected.findIndex((value) => + value !== null && JSON.stringify(value) === JSON.stringify(searchJSON) + ); + expect(vector.indexOf(search)).toBe(expectedIndex); + } + } + }); + + test(`indexOf null matches first null`, () => { + const expectedNullIndex = expected.indexOf(null); + expect(vector.indexOf(null)).toBe(expectedNullIndex); + }); +}); + describe(`toArray()`, () => { test(`when some data blobs have been padded`, () => { const d1 = vectorFromArray([...new Array(16).keys()]);