Skip to content

Commit f69391b

Browse files
committed
feat(knowledge-base): implement shared-table mode for knowledge bases
- Introduced shared-table mode allowing multiple knowledge bases to share the same underlying storage tables, reducing table proliferation. - Added `ScopedTabularStorage` and `ScopedVectorStorage` wrappers to manage data access scoped by `kb_id`. - Updated registration process to support shared table names and added utility functions for schema management. - Enhanced documentation to include new shared-table mode features and usage examples.
1 parent d2456ec commit f69391b

File tree

17 files changed

+1101
-130
lines changed

17 files changed

+1101
-130
lines changed

bun.lock

Lines changed: 80 additions & 80 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/developers/03_extending.md

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -149,13 +149,14 @@ When defining task input schemas, you can use `format` annotations to enable aut
149149

150150
The system supports several format annotations out of the box:
151151

152-
| Format | Description | Helper Function |
153-
| ------------------------------ | ----------------------------------- | ----------------------------- |
154-
| `model` | Any AI model configuration | `TypeModel()` |
155-
| `model:TaskName` | Model compatible with specific task ||
156-
| `storage:tabular` | Tabular data dataset | `TypeTabularStorage()` |
157-
| `dataset:document-node-vector` | Vector storage dataset | `TypeChunkVectorRepository()` |
158-
| `dataset:document` | Document dataset | `TypeDocumentRepository()` |
152+
| Format | Description | Helper Function |
153+
| --------------------------------- | ----------------------------------- | ----------------------------- |
154+
| `model` | Any AI model configuration | `TypeModel()` |
155+
| `model:TaskName` | Model compatible with specific task ||
156+
| `storage:tabular` | Tabular data storage | `TypeTabularStorage()` |
157+
| `knowledge-base` | Knowledge base instance | `TypeKnowledgeBase()` |
158+
| `credential` | Credential from credential store ||
159+
| `tasks` | Task class from task registry ||
159160

160161
### Example: Using Format Annotations
161162

examples/cli/package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@
4747
"@inkjs/ui": "^2.0.0",
4848
"chalk": "^5.6.2",
4949
"commander": "^14.0.3",
50-
"ink": "^6.8.0",
51-
"react": "^19.2.4",
50+
"ink": "^7.0.0",
51+
"react": "^19.2.5",
5252
"smol-toml": "^1.6.1"
5353
},
5454
"devDependencies": {

examples/web/package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@
3333
"@uiw/react-codemirror": "^4.25.9",
3434
"@xyflow/react": "^12.10.2",
3535
"clsx": "^2.1.1",
36-
"react": "^19.2.4",
37-
"react-dom": "^19.2.4",
36+
"react": "^19.2.5",
37+
"react-dom": "^19.2.5",
3838
"react-icons": "^5.6.0",
3939
"react-resizable-panels": "^4.8.0",
4040
"tailwind-merge": "^3.3.0"

package.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -46,15 +46,15 @@
4646
"publish-workspaces": "bun ./scripts/publish-workspaces.ts"
4747
},
4848
"dependencies": {
49-
"caniuse-lite": "^1.0.30001786"
49+
"caniuse-lite": "^1.0.30001787"
5050
},
5151
"catalog": {
5252
"@huggingface/transformers": "^4.0.1",
5353
"@mediapipe/tasks-text": "^0.10.34",
5454
"@mediapipe/tasks-vision": "^0.10.34",
5555
"@mediapipe/tasks-audio": "^0.10.34",
5656
"@mediapipe/tasks-genai": "^0.10.27",
57-
"@anthropic-ai/sdk": "^0.82.0",
57+
"@anthropic-ai/sdk": "^0.86.1",
5858
"@google/generative-ai": "^0.24.1",
5959
"node-llama-cpp": "^3.18.1",
6060
"@huggingface/inference": "^4.13.15",
@@ -65,19 +65,19 @@
6565
"@modelcontextprotocol/sdk": "^1.29.0",
6666
"pg": "^8.20.0",
6767
"@electric-sql/pglite": "^0.4.3",
68-
"@supabase/supabase-js": "^2.101.1",
68+
"@supabase/supabase-js": "^2.102.1",
6969
"@sqlite.org/sqlite-wasm": "^3.51.2-build8",
7070
"better-sqlite3": "^12.8.0"
7171
},
7272
"devDependencies": {
73-
"@sqliteai/sqlite-vector": "^0.9.94",
73+
"@sqliteai/sqlite-vector": "^0.9.95",
7474
"@types/bun": "^1.3.11",
7575
"@types/node": "^24.12.2",
76-
"@typescript-eslint/eslint-plugin": "^8.58.0",
77-
"@typescript-eslint/parser": "^8.58.0",
78-
"@typescript/native-preview": "=7.0.0-dev.20260406.1",
79-
"@vitest/coverage-v8": "^4.1.2",
80-
"@vitest/ui": "4.1.2",
76+
"@typescript-eslint/eslint-plugin": "^8.58.1",
77+
"@typescript-eslint/parser": "^8.58.1",
78+
"@typescript/native-preview": "=7.0.0-dev.20260408.1",
79+
"@vitest/coverage-v8": "^4.1.3",
80+
"@vitest/ui": "4.1.3",
8181
"bunset": "^1.0.10",
8282
"concurrently": "^9.2.1",
8383
"eslint": "^10.2.0",
@@ -88,9 +88,9 @@
8888
"globals": "^17.4.0",
8989
"nyc": "^18.0.0",
9090
"prettier": "^3.8.1",
91-
"turbo": "^2.9.4",
91+
"turbo": "^2.9.5",
9292
"typescript": "6.0.2",
93-
"vitest": "^4.1.2"
93+
"vitest": "^4.1.3"
9494
},
9595
"engines": {
9696
"bun": "^1.3.11"

packages/ai/README.md

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -722,13 +722,14 @@ This resolution is handled by the input resolver system, which inspects schema `
722722

723723
### Supported Format Annotations
724724

725-
| Format | Description | Resolver |
726-
| --------------------------------- | ---------------------------------------- | -------------------------- |
727-
| `model` | Any AI model configuration | ModelRepository |
728-
| `model:TaskName` | Model compatible with specific task type | ModelRepository |
729-
| `repository:tabular` | Tabular data repository | TabularStorageRegistry |
730-
| `repository:document-node-vector` | Vector storage repository | VectorRepositoryRegistry |
731-
| `repository:document` | Document repository | DocumentRepositoryRegistry |
725+
| Format | Description | Resolver |
726+
| ----------------- | ---------------------------------------- | ----------------------- |
727+
| `model` | Any AI model configuration | ModelRepository |
728+
| `model:TaskName` | Model compatible with specific task type | ModelRepository |
729+
| `storage:tabular` | Tabular data storage | TabularStorageRegistry |
730+
| `knowledge-base` | Knowledge base instance | KnowledgeBaseRegistry |
731+
| `credential` | Credential from credential store | CredentialStoreRegistry |
732+
| `tasks` | Task class from task registry | TaskRegistry |
732733

733734
### Custom Model Validation
734735

packages/knowledge-base/README.md

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,22 @@ Document management, hierarchical chunking, and knowledge base infrastructure fo
2121
- [Tree Traversal](#tree-traversal)
2222
- [Lifecycle Management](#lifecycle-management)
2323
- [Registry](#registry)
24+
- [Shared-Table Mode](#shared-table-mode)
25+
- [Overview](#overview-1)
26+
- [Setting Up Shared Storage](#setting-up-shared-storage)
27+
- [Scoped Wrappers](#scoped-wrappers)
28+
- [Registering with Shared Tables](#registering-with-shared-tables)
29+
- [Schemas and Indexes](#schemas-and-indexes)
30+
- [When to Use Shared Tables](#when-to-use-shared-tables)
2431
- [Data Flow](#data-flow)
2532
- [Ingestion Pipeline](#ingestion-pipeline)
2633
- [Retrieval Pipeline](#retrieval-pipeline)
2734
- [API Reference](#api-reference)
2835
- [Document](#document)
2936
- [KnowledgeBase](#knowledgebase-1)
3037
- [createKnowledgeBase](#createknowledgebase)
38+
- [ScopedTabularStorage](#scopedtabularstorage)
39+
- [ScopedVectorStorage](#scopedvectorstorage)
3140
- [StructuralParser](#structuralparser)
3241
- [Type Helpers](#type-helpers)
3342
- [License](#license)
@@ -442,6 +451,134 @@ await task.run({ knowledgeBase: kb }); // Direct instance
442451
await task.run({ knowledgeBase: "my-kb" }); // Resolved from registry
443452
```
444453

454+
## Shared-Table Mode
455+
456+
### Overview
457+
458+
By default, each `KnowledgeBase` gets its own document table and chunk table. **Shared-table mode** lets multiple knowledge bases share the same underlying storage tables, partitioned by a `kb_id` column. This is useful when you have many knowledge bases and want to reduce table proliferation in your database.
459+
460+
```
461+
Default mode (per-KB tables): Shared-table mode:
462+
┌──────────────────────┐ ┌──────────────────────────┐
463+
│ kb_docs_my_kb │ │ shared_documents │
464+
│ (doc_id, data) │ │ (doc_id, kb_id, data) │
465+
├──────────────────────┤ │ ├─ kb_id = "kb-1" rows │
466+
│ kb_chunks_my_kb │ │ └─ kb_id = "kb-2" rows │
467+
│ (chunk_id, vector..) │ ├──────────────────────────┤
468+
├──────────────────────┤ │ shared_chunks │
469+
│ kb_docs_other_kb │ │ (chunk_id, kb_id, vec..) │
470+
│ (doc_id, data) │ │ ├─ kb_id = "kb-1" rows │
471+
├──────────────────────┤ │ └─ kb_id = "kb-2" rows │
472+
│ kb_chunks_other_kb │ └──────────────────────────┘
473+
│ (chunk_id, vector..) │
474+
└──────────────────────┘
475+
```
476+
477+
The `KnowledgeBase` class itself is unchanged — shared-table mode is implemented via thin wrapper classes (`ScopedTabularStorage`, `ScopedVectorStorage`) that inject `kb_id` on writes and filter by `kb_id` on reads.
478+
479+
### Setting Up Shared Storage
480+
481+
Create the shared storage instances once, globally:
482+
483+
```typescript
484+
import { InMemoryTabularStorage, InMemoryVectorStorage } from "@workglow/storage";
485+
import {
486+
SharedDocumentStorageSchema,
487+
SharedChunkVectorStorageSchema,
488+
SharedDocumentIndexes,
489+
SharedChunkIndexes,
490+
SHARED_DOCUMENT_TABLE,
491+
SHARED_CHUNK_TABLE,
492+
DocumentStorageKey,
493+
ChunkVectorPrimaryKey,
494+
} from "@workglow/knowledge-base";
495+
496+
const sharedDocStorage = new InMemoryTabularStorage(
497+
SharedDocumentStorageSchema,
498+
DocumentStorageKey,
499+
SharedDocumentIndexes
500+
);
501+
502+
const sharedChunkStorage = new InMemoryVectorStorage(
503+
SharedChunkVectorStorageSchema,
504+
ChunkVectorPrimaryKey,
505+
SharedChunkIndexes,
506+
1024 // vector dimensions
507+
);
508+
```
509+
510+
For SQL backends (SQLite, PostgreSQL), replace `InMemoryTabularStorage` / `InMemoryVectorStorage` with the appropriate implementations. The pre-defined index arrays passed alongside the shared schemas (`SharedDocumentIndexes`, `SharedChunkIndexes`) index `kb_id` and, for chunks, `[kb_id, doc_id]`, so scoped queries stay efficient.
511+
512+
### Scoped Wrappers
513+
514+
For each knowledge base, create scoped wrappers that filter to that KB's data:
515+
516+
```typescript
517+
import {
518+
ScopedTabularStorage,
519+
ScopedVectorStorage,
520+
KnowledgeBase,
521+
} from "@workglow/knowledge-base";
522+
523+
// KB 1
524+
const scopedDocs1 = new ScopedTabularStorage(sharedDocStorage, "kb-1");
525+
const scopedChunks1 = new ScopedVectorStorage(sharedChunkStorage, "kb-1");
526+
const kb1 = new KnowledgeBase("kb-1", scopedDocs1, scopedChunks1);
527+
528+
// KB 2
529+
const scopedDocs2 = new ScopedTabularStorage(sharedDocStorage, "kb-2");
530+
const scopedChunks2 = new ScopedVectorStorage(sharedChunkStorage, "kb-2");
531+
const kb2 = new KnowledgeBase("kb-2", scopedDocs2, scopedChunks2);
532+
```
533+
534+
Each `KnowledgeBase` instance works exactly the same as in default mode — all CRUD, search, and lifecycle operations are transparently scoped to the KB's data.
535+
536+
### Registering with Shared Tables
537+
538+
Pass `{ sharedTables: true }` when registering so that the metadata record uses the shared table names:
539+
540+
```typescript
541+
import { registerKnowledgeBase } from "@workglow/knowledge-base";
542+
543+
await registerKnowledgeBase("kb-1", kb1, { sharedTables: true });
544+
await registerKnowledgeBase("kb-2", kb2, { sharedTables: true });
545+
```
546+
547+
You can check whether a persisted record uses shared tables with the `isSharedTableMode` helper:
548+
549+
```typescript
550+
import { isSharedTableMode } from "@workglow/knowledge-base";
551+
552+
const record = await repo.getKnowledgeBase("kb-1");
553+
if (isSharedTableMode(record)) {
554+
// reconstruct using scoped wrappers
555+
}
556+
```
557+
558+
### Schemas and Indexes
559+
560+
The shared schemas augment the standard schemas with a `kb_id` column:
561+
562+
| Schema | Base Schema | Added Column |
563+
| ------------------------------- | -------------------------- | ------------ |
564+
| `SharedDocumentStorageSchema` | `DocumentStorageSchema` | `kb_id: string` |
565+
| `SharedChunkVectorStorageSchema`| `ChunkVectorStorageSchema` | `kb_id: string` |
566+
567+
Default shared table names: `SHARED_DOCUMENT_TABLE = "shared_documents"`, `SHARED_CHUNK_TABLE = "shared_chunks"`.
568+
569+
Pre-defined index arrays for efficient queries:
570+
- `SharedDocumentIndexes` — `[["kb_id"]]`
571+
- `SharedChunkIndexes` — `[["kb_id"], ["kb_id", "doc_id"]]`
572+
573+
### When to Use Shared Tables
574+
575+
| Scenario | Recommendation |
576+
| --- | --- |
577+
| Few knowledge bases, each large | Default (per-KB tables) — simpler, no `kb_id` overhead |
578+
| Many knowledge bases (e.g., per-user, per-tenant) | Shared tables — avoids table proliferation |
579+
| Need cross-KB queries | Shared tables — query the shared storage directly |
580+
| Using managed databases with table limits | Shared tables |
581+
445582
## Data Flow
446583

447584
### Ingestion Pipeline
@@ -640,6 +777,35 @@ interface CreateKnowledgeBaseOptions {
640777
}
641778
```
642779

780+
### ScopedTabularStorage
781+
782+
```typescript
783+
class ScopedTabularStorage<Schema, PrimaryKeyNames, Entity, PrimaryKey, InsertType>
784+
implements ITabularStorage<Schema, PrimaryKeyNames, Entity, PrimaryKey, InsertType>
785+
{
786+
constructor(inner: AnyTabularStorage, kbId: string);
787+
788+
// All ITabularStorage methods are implemented.
789+
// Writes inject kb_id, reads filter by kb_id, results strip kb_id.
790+
// setupDatabase() and destroy() are no-ops (shared storage lifecycle is external).
791+
}
792+
```
793+
794+
### ScopedVectorStorage
795+
796+
```typescript
797+
class ScopedVectorStorage<Metadata, Schema, Entity, PrimaryKeyNames>
798+
extends ScopedTabularStorage<Schema, PrimaryKeyNames, Entity>
799+
implements IVectorStorage<Metadata, Schema, Entity, PrimaryKeyNames>
800+
{
801+
constructor(inner: AnyVectorStorage, kbId: string);
802+
803+
getVectorDimensions(): number; // Delegates to inner
804+
similaritySearch(query, options?): Promise<(Entity & { score })[]>; // Post-filters by kb_id
805+
hybridSearch?(query, options): Promise<(Entity & { score })[]>; // Post-filters by kb_id
806+
}
807+
```
808+
643809
### StructuralParser
644810

645811
```typescript

packages/knowledge-base/src/common.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ export * from "./knowledge-base/KnowledgeBaseRepository";
1212
export * from "./knowledge-base/InMemoryKnowledgeBaseRepository";
1313
export * from "./knowledge-base/KnowledgeBaseRegistry";
1414
export * from "./knowledge-base/createKnowledgeBase";
15+
export * from "./knowledge-base/ScopedTabularStorage";
16+
export * from "./knowledge-base/ScopedVectorStorage";
17+
export * from "./knowledge-base/SharedTableSchemas";
1518

1619
export * from "./util/DatasetSchema";
1720
export * from "./document/Document";

packages/knowledge-base/src/knowledge-base/KnowledgeBaseRegistry.ts

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,9 @@ import {
1414
import { InMemoryKnowledgeBaseRepository } from "./InMemoryKnowledgeBaseRepository";
1515
import type { KnowledgeBase } from "./KnowledgeBase";
1616
import { KnowledgeBaseRepository } from "./KnowledgeBaseRepository";
17-
import { knowledgeBaseTableNames } from "./KnowledgeBaseSchema";
1817
import type { KnowledgeBaseRecord } from "./KnowledgeBaseSchema";
18+
import { knowledgeBaseTableNames } from "./KnowledgeBaseSchema";
19+
import { SHARED_CHUNK_TABLE, SHARED_DOCUMENT_TABLE } from "./SharedTableSchemas";
1920

2021
/**
2122
* Service token for the knowledge base registry
@@ -70,15 +71,28 @@ export function setGlobalKnowledgeBaseRepository(repository: KnowledgeBaseReposi
7071
globalServiceRegistry.registerInstance(KNOWLEDGE_BASE_REPOSITORY, repository);
7172
}
7273

74+
export interface RegisterKnowledgeBaseOptions {
75+
/** When true, record uses shared table names instead of per-KB table names. */
76+
readonly sharedTables?: boolean;
77+
}
78+
7379
/**
7480
* Registers a knowledge base globally by ID.
7581
* Adds to both the live Map and the persistent repository.
7682
*/
77-
export async function registerKnowledgeBase(id: string, kb: KnowledgeBase): Promise<void> {
83+
84+
export async function registerKnowledgeBase(
85+
id: string,
86+
kb: KnowledgeBase,
87+
options?: RegisterKnowledgeBaseOptions
88+
): Promise<void> {
7889
const kbs = getGlobalKnowledgeBases();
7990

8091
const now = new Date().toISOString();
81-
const tableNames = knowledgeBaseTableNames(id);
92+
const useShared = options?.sharedTables === true;
93+
const tableNames = useShared
94+
? { documentTable: SHARED_DOCUMENT_TABLE, chunkTable: SHARED_CHUNK_TABLE }
95+
: knowledgeBaseTableNames(id);
8296
const record: KnowledgeBaseRecord = {
8397
kb_id: id,
8498
title: kb.title,

packages/knowledge-base/src/knowledge-base/KnowledgeBaseSchema.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
*/
66

77
import type { DataPortSchemaObject, FromSchema } from "@workglow/util/schema";
8+
import { SHARED_CHUNK_TABLE, SHARED_DOCUMENT_TABLE } from "./SharedTableSchemas";
89

910
/**
1011
* Schema for persisting KnowledgeBase metadata to tabular storage.
@@ -50,3 +51,10 @@ export function knowledgeBaseTableNames(kbId: string): {
5051
chunkTable: `kb_chunks_${safe}`,
5152
};
5253
}
54+
55+
/**
56+
* Checks whether a KnowledgeBaseRecord uses shared-table mode.
57+
*/
58+
export function isSharedTableMode(record: KnowledgeBaseRecord): boolean {
59+
return record.document_table === SHARED_DOCUMENT_TABLE && record.chunk_table === SHARED_CHUNK_TABLE;
60+
}

0 commit comments

Comments
 (0)