|
1 | 1 | import ParquetWorker from './parquetWorker?worker&inline' |
2 | 2 | /// ^ the worker is bundled with the main thread code (inline) which is easier for users to import |
3 | 3 | /// (no need to copy the worker file to the right place) |
4 | | -import { AsyncBuffer, ColumnData, FileMetaData, ParquetReadOptions } from 'hyparquet' |
5 | | -import { asyncBufferFromUrl } from '../lib/utils.ts' |
6 | | - |
7 | | -// Serializable constructor for AsyncBuffers |
8 | | -export interface AsyncBufferFrom { |
9 | | - url: string |
10 | | - byteLength: number |
11 | | - headers?: Record<string, string> |
12 | | -} |
13 | | -// Same as ParquetReadOptions, but AsyncBufferFrom instead of AsyncBuffer |
14 | | -export interface ParquetReadWorkerOptions extends Omit<ParquetReadOptions, 'file'> { |
15 | | - from: AsyncBufferFrom |
16 | | - orderBy?: string |
17 | | - sortIndex?: boolean |
18 | | -} |
19 | | -// Row is defined in hightable, but not exported + we change any to unknown |
20 | | -export type Row = Record<string, unknown>; |
21 | | - |
22 | | -interface Message { |
23 | | - queryId: number |
24 | | -} |
25 | | -export interface ChunkMessage extends Message { |
26 | | - chunk: ColumnData |
27 | | -} |
28 | | -export interface ResultMessage extends Message { |
29 | | - result: Row[] |
30 | | -} |
31 | | -export interface IndicesMessage extends Message { |
32 | | - indices: number[] |
33 | | -} |
34 | | -export interface ErrorMessage extends Message { |
35 | | - error: Error |
36 | | -} |
37 | | - |
38 | | -export type ParquetMessage = ChunkMessage | ResultMessage | ErrorMessage |
39 | | -export type SortParquetMessage = IndicesMessage | ErrorMessage |
40 | | - |
41 | | -export interface ParquetSortIndexOptions { |
42 | | - metadata: FileMetaData |
43 | | - from: AsyncBufferFrom |
44 | | - orderBy: string |
45 | | -} |
46 | | - |
| 4 | +import { ColumnData } from 'hyparquet' |
| 5 | +import type { ParquetMessage, ParquetReadWorkerOptions, ParquetSortIndexOptions, Row, SortParquetMessage } from './types.d.ts' |
47 | 6 |
|
48 | 7 | let worker: Worker | undefined |
49 | 8 | let nextQueryId = 0 |
@@ -99,7 +58,6 @@ function getWorker() { |
99 | 58 | return worker |
100 | 59 | } |
101 | 60 |
|
102 | | - |
103 | 61 | /** |
104 | 62 | * Presents almost the same interface as parquetRead, but runs in a worker. |
105 | 63 | * This is useful for reading large parquet files without blocking the main thread. |
@@ -143,79 +101,3 @@ export function parquetSortIndexWorker({ metadata, from, orderBy }: ParquetSortI |
143 | 101 | }) |
144 | 102 | }) |
145 | 103 | } |
146 | | - |
147 | | -/** |
148 | | - * Convert AsyncBufferFrom to AsyncBuffer and cache results. |
149 | | - */ |
150 | | -export function asyncBufferFrom( |
151 | | - from: AsyncBufferFrom, |
152 | | -): Promise<AsyncBuffer> { |
153 | | - const key = JSON.stringify(from) |
154 | | - const cached = cache.get(key) |
155 | | - if (cached) return cached |
156 | | - const asyncBuffer = asyncBufferFromUrl(from).then(cachedAsyncBuffer) |
157 | | - cache.set(key, asyncBuffer) |
158 | | - return asyncBuffer |
159 | | -} |
160 | | -const cache = new Map<string, Promise<AsyncBuffer>>() |
161 | | - |
162 | | -export function compare<T>(a: T, b: T): number { |
163 | | - if (a < b) return -1 |
164 | | - if (a > b) return 1 |
165 | | - return 1 // TODO: how to handle nulls? |
166 | | -} |
167 | | - |
168 | | -// TODO(SL): once the types in cachedAsyncBuffer are fixed, import all the following from hyparquet |
169 | | -type Awaitable<T> = T | Promise<T>; |
170 | | - |
171 | | -function cachedAsyncBuffer(asyncBuffer: AsyncBuffer): AsyncBuffer { |
172 | | - const cache = new Map<string, Awaitable<ArrayBuffer>>() |
173 | | - const { byteLength } = asyncBuffer |
174 | | - return { |
175 | | - byteLength, |
176 | | - /** |
177 | | - * @param {number} start |
178 | | - * @param {number} [end] |
179 | | - * @returns {Awaitable<ArrayBuffer>} |
180 | | - */ |
181 | | - slice(start: number, end?: number): Awaitable<ArrayBuffer> { |
182 | | - const key = cacheKey(start, end, byteLength) |
183 | | - const cached = cache.get(key) |
184 | | - if (cached) return cached |
185 | | - // cache miss, read from file |
186 | | - const promise = asyncBuffer.slice(start, end) |
187 | | - cache.set(key, promise) |
188 | | - return promise |
189 | | - }, |
190 | | - } |
191 | | -} |
192 | | - |
193 | | -/** |
194 | | - * Returns canonical cache key for a byte range 'start,end'. |
195 | | - * Normalize int-range and suffix-range requests to the same key. |
196 | | - * |
197 | | - * @param {number} start start byte of range |
198 | | - * @param {number} [end] end byte of range, or undefined for suffix range |
199 | | - * @param {number} [size] size of file, or undefined for suffix range |
200 | | - * @returns {string} |
201 | | - */ |
202 | | -function cacheKey(start: number, end?: number, size?: number): string { |
203 | | - if (start < 0) { |
204 | | - if (end !== undefined) |
205 | | - throw new Error( |
206 | | - `invalid suffix range [${start.toString()}, ${end.toString()}]`, |
207 | | - ) |
208 | | - if (size === undefined) return `${start.toString()},` |
209 | | - return `${(size + start).toString()},${size.toString()}` |
210 | | - } else if (end !== undefined) { |
211 | | - if (start > end) |
212 | | - throw new Error( |
213 | | - `invalid empty range [${start.toString()}, ${end.toString()}]`, |
214 | | - ) |
215 | | - return `${start.toString()},${end.toString()}` |
216 | | - } else if (size === undefined) { |
217 | | - return `${start.toString()},` |
218 | | - } else { |
219 | | - return `${start.toString()},${size.toString()}` |
220 | | - } |
221 | | -} |
0 commit comments