Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 9 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,6 @@
[![workflow status](https://github.com/hyparam/demos/actions/workflows/ci_squirreling_demo.yml/badge.svg)](https://github.com/hyparam/demos/actions)
[![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT)

## HighTable Demo :classical_building:

HighTable is a react virtual-scroll table component for viewing large datasets.

Demo: https://hyparam.github.io/demos/hightable/

HighTable: https://github.com/hyparam/hightable

Example project that demonstrates loading random data into a HighTable.

Uses vite for running client-side react.

## Hyparquet Demo :parrot:

Hyparquet is an Apache Parquet file parser implemented entirely in JavaScript.
Expand Down Expand Up @@ -60,11 +48,14 @@ Includes version dropdown for time-traveling to view the history of a dataset.

Uses vite for running client-side react.

## Iceberg-auth Demo :lock:
## HighTable Demo :classical_building:

HighTable is a react virtual-scroll table component for viewing large datasets.

Demo: https://hyparam.github.io/demos/hightable/

Same icebird viewer, but gated by AWS Cognito OAuth so a whitelisted user can
read Iceberg tables from a **private** S3 bucket, and chat with a model via
**Bedrock InvokeModel** — all directly from the browser, no backend.
HighTable: https://github.com/hyparam/hightable

See [iceberg-auth/README.md](iceberg-auth/README.md) for the Cognito + IAM
setup steps. Requires `VITE_*` env vars to be configured before building.
Example project that demonstrates loading random data into a HighTable.

Uses vite for running client-side react.
1 change: 1 addition & 0 deletions hypvector/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ npm-debug.log*
*.njsproj
*.sln
*.sw?
.vite
2 changes: 1 addition & 1 deletion hypvector/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,5 @@ http-server dist/
- `hypvector` is currently consumed via a local `file:` dependency (`../../hypvector`).
Once it is published to npm this can be switched to a version range.
- The vector parquet on S3 (`s3.hyperparam.app/hypvector/wiki_en.vectors.parquet`) was
built by running `npm run data:embed` in the `hypvector` repo against the 50k
built by running `npm run data:embed` in the `hypvector` repo against the 156k
English Wikipedia sample and uploaded with `aws s3 cp ... --profile hyperparam-platypii`.
12 changes: 6 additions & 6 deletions hypvector/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,27 @@
"typecheck": "tsc"
},
"dependencies": {
"@huggingface/transformers": "3.7.5",
"@huggingface/transformers": "4.2.0",
"hyparquet": "1.26.0",
"hyparquet-compressors": "1.1.1",
"hyperparam": "0.4.14",
"hypvector": "0.1.0",
"hypvector": "0.1.1",
"react": "19.2.6",
"react-dom": "19.2.6"
},
"devDependencies": {
"@types/react": "19.2.15",
"@types/react-dom": "19.2.3",
"@vitejs/plugin-react": "6.0.2",
"@vitest/coverage-v8": "4.1.6",
"@vitest/coverage-v8": "4.1.7",
"eslint": "9.39.4",
"eslint-plugin-react": "7.37.5",
"eslint-plugin-react-hooks": "7.1.1",
"eslint-plugin-react-refresh": "0.5.2",
"globals": "17.6.0",
"typescript": "6.0.3",
"typescript-eslint": "8.59.4",
"vite": "8.0.13",
"vitest": "4.1.6"
"typescript-eslint": "8.60.0",
"vite": "8.0.14",
"vitest": "4.1.7"
}
}
66 changes: 54 additions & 12 deletions hypvector/src/Page.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { type FeatureExtractionPipeline, pipeline } from '@huggingface/transformers'
import { AsyncBuffer, FileMetaData, asyncBufferFromUrl, cachedAsyncBuffer, parquetMetadataAsync, parquetReadObjects } from 'hyparquet'
import { compressors } from 'hyparquet-compressors'
import { type SearchResult, searchVectors } from 'hypvector'
import { type SearchResult, prefetchBinary, searchVectors } from 'hypvector'
import { ReactNode, useCallback, useEffect, useRef, useState } from 'react'

const vectorsUrl = 'https://s3.hyperparam.app/hypvector/wiki_en.vectors.parquet'
Expand All @@ -26,21 +26,45 @@ interface DisplayResult extends SearchResult {
title?: string
}

/**
 * Mutable tally of reads observed by instrumentNetwork while this counter
 * is the one installed in the counter ref.
 */
interface NetCounter {
/** Number of slice() calls that reached the wrapped buffer. */
fetches: number
/** Sum of requested range lengths: (end ?? byteLength) - start per call. */
bytes: number
/** Highest number of simultaneously in-flight slice() calls observed. */
maxConcurrent: number
}

/**
 * Per-query figures rendered in the stats bar: two wall-clock timings plus
 * the network counter values captured for that query.
 */
interface QueryStats {
/** Elapsed milliseconds for the embedding step (performance.now() delta). */
embedMs: number
/** Elapsed milliseconds for the searchVectors call (performance.now() delta). */
searchMs: number
/** Copied from the query's NetCounter.fetches. */
fetches: number
/** Copied from the query's NetCounter.bytes. */
bytes: number
/** Copied from the query's NetCounter.maxConcurrent. */
maxConcurrent: number
}

/**
 * Wrap a raw (network-backed) AsyncBuffer to count actual fetches, bytes,
 * and peak in-flight concurrency. Mount this BELOW cachedAsyncBuffer so
 * cache hits are not counted as fetches.
 *
 * The counter is read from the ref on every call so the demo can swap
 * counters between queries without re-wrapping the buffer. While the ref
 * holds null, reads pass through uncounted.
 *
 * In-flight reads are tracked on every call, whether or not a counter is
 * installed, so each decrement has a matching increment. Otherwise reads
 * issued while the ref is null would drive the in-flight count negative
 * and later queries would under-report maxConcurrent.
 *
 * @param buffer underlying buffer whose slice() calls are observed
 * @param counterRef ref holding the counter to update, or null to skip counting
 * @returns an AsyncBuffer with the same byteLength that delegates slice()
 */
function instrumentNetwork(buffer: AsyncBuffer, counterRef: { current: NetCounter | null }): AsyncBuffer {
  let inFlight = 0
  return {
    byteLength: buffer.byteLength,
    slice(start: number, end?: number): ArrayBuffer | Promise<ArrayBuffer> {
      // Always track the read so the matching decrement below stays balanced.
      inFlight += 1
      const c = counterRef.current
      if (c) {
        c.fetches += 1
        c.bytes += (end ?? buffer.byteLength) - start
        if (inFlight > c.maxConcurrent) c.maxConcurrent = inFlight
      }
      function finish() { inFlight -= 1 }
      let result: ArrayBuffer | Promise<ArrayBuffer>
      try {
        result = buffer.slice(start, end)
      } catch (err) {
        // A synchronous throw must still release its in-flight slot.
        finish()
        throw err
      }
      // Async reads stay in flight until they settle (resolve or reject).
      if (result instanceof Promise) return result.finally(finish)
      finish()
      return result
    },
  }
}
Expand All @@ -56,6 +80,8 @@ export default function Page({ setError }: PageProps): ReactNode {
const extractorRef = useRef<FeatureExtractionPipeline | undefined>(undefined)
const vectorsBufferRef = useRef<AsyncBuffer | undefined>(undefined)
const vectorsMetaRef = useRef<FileMetaData | undefined>(undefined)
const binaryRef = useRef<Uint8Array | undefined>(undefined)
const netCounterRef = useRef<NetCounter | null>(null)
const wikiBufferRef = useRef<AsyncBuffer | undefined>(undefined)
const wikiMetaRef = useRef<FileMetaData | undefined>(undefined)

Expand All @@ -81,11 +107,16 @@ export default function Page({ setError }: PageProps): ReactNode {
let cancelled = false
async function load() {
const raw = await asyncBufferFromUrl({ url: vectorsUrl })
const cached = cachedAsyncBuffer(raw)
const counted = instrumentNetwork(raw, netCounterRef)
const cached = cachedAsyncBuffer(counted)
const meta = await parquetMetadataAsync(cached)
if (cancelled) return
vectorsBufferRef.current = cached
vectorsMetaRef.current = meta
// Pull the small binary column into RAM up front so every query skips
// phase-1 fetches. ~7.5 MB at 156k × 384-dim.
const binary = await prefetchBinary({ source: cached, metadata: meta, compressors })
binaryRef.current = binary
setVectorsStatus('ready')
}
load().catch((e: unknown) => {
Expand Down Expand Up @@ -122,25 +153,35 @@ export default function Page({ setError }: PageProps): ReactNode {
const embedMs = performance.now() - embedStart
signal.throwIfAborted()

const counter = { fetches: 0, bytes: 0 }
const instrumentedBuffer = instrumented(buffer, counter)
// Swap in a fresh network counter for this query; reads pass through
// the already-wired instrumentNetwork layer below cachedAsyncBuffer.
const counter: NetCounter = { fetches: 0, bytes: 0, maxConcurrent: 0 }
netCounterRef.current = counter

const searchStart = performance.now()
const hits = await searchVectors({
source: instrumentedBuffer,
source: buffer,
metadata,
query: queryVec,
topK,
binary: binaryRef.current,
signal,
compressors,
})
const searchMs = performance.now() - searchStart
netCounterRef.current = null
signal.throwIfAborted()

// Show scores immediately, then fill in titles.
const initial: DisplayResult[] = hits.map(h => ({ ...h }))
setResults(initial)
setStats({ embedMs, searchMs, fetches: counter.fetches, bytes: counter.bytes })
setStats({
embedMs,
searchMs,
fetches: counter.fetches,
bytes: counter.bytes,
maxConcurrent: counter.maxConcurrent,
})

// Look up titles in the wiki parquet using ids as row indices.
const { buffer: wb, metadata: wm } = await ensureWiki()
Expand Down Expand Up @@ -226,13 +267,14 @@ export default function Page({ setError }: PageProps): ReactNode {
</div>}

<div className='stats-bar'>
<span>50,000 wiki titles · 384-dim float32 · 249 MB</span>
<span>156,289 wiki titles · 384-dim float32 · 249 MB</span>
{stats && <>
<span className='spacer' />
<span>embed: <code>{stats.embedMs.toFixed(0)} ms</code></span>
<span>search: <code>{stats.searchMs.toFixed(0)} ms</code></span>
<span>fetches: <code>{stats.fetches}</code></span>
<span>read: <code>{formatBytes(stats.bytes)}</code></span>
<span>max concurrent: <code>{stats.maxConcurrent}</code></span>
</>}
</div>

Expand Down