-
Notifications
You must be signed in to change notification settings - Fork 656
feat: add Python 3.13 support with optional annoy dependency #1718
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
e7f2f78
e36a173
ae08b33
636874d
c4c6e9b
86d9aed
7db842b
2434ec0
c315832
3a29d19
15f9a70
9684e21
3aacd37
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,145 @@ | ||||||||||||||||
| # SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||||||||||||||||
| # SPDX-License-Identifier: Apache-2.0 | ||||||||||||||||
| # | ||||||||||||||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||||||||||||||
| # you may not use this file except in compliance with the License. | ||||||||||||||||
| # You may obtain a copy of the License at | ||||||||||||||||
| # | ||||||||||||||||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||||||||||||||||
| # | ||||||||||||||||
| # Unless required by applicable law or agreed to in writing, software | ||||||||||||||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||||||||||||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||||||||||||
| # See the License for the specific language governing permissions and | ||||||||||||||||
| # limitations under the License. | ||||||||||||||||
|
|
||||||||||||||||
| """Numpy-based drop-in replacement for annoy.AnnoyIndex. | ||||||||||||||||
|
|
||||||||||||||||
| This module provides a pure-numpy alternative to the Annoy library for | ||||||||||||||||
| nearest-neighbour search over embedding vectors. It is used as a fallback | ||||||||||||||||
| when annoy is not installed (e.g. on Python 3.13+ where the annoy C++ | ||||||||||||||||
| extension triggers a SIGILL). | ||||||||||||||||
|
|
||||||||||||||||
| For the typical guardrails index sizes (tens to hundreds of items) the | ||||||||||||||||
| brute-force cosine search is more than fast enough. | ||||||||||||||||
| """ | ||||||||||||||||
|
|
||||||||||||||||
| from typing import List, Optional, Tuple | ||||||||||||||||
|
|
||||||||||||||||
| import numpy as np | ||||||||||||||||
|
|
||||||||||||||||
|
|
||||||||||||||||
class NumpyAnnoyIndex:
    """A numpy-backed nearest-neighbour index that exposes the same API surface
    as ``annoy.AnnoyIndex`` for the subset used by NeMo Guardrails.

    Supported operations:
      * ``add_item(i, vector)``
      * ``build(n_trees)`` (no-op -- kept for interface compatibility)
      * ``get_nns_by_vector(vector, n, include_distances=False)``
      * ``save(path)`` / ``load(path)``

    The metric is *angular* distance, matching Annoy's default for text
    embeddings. Angular distance is defined as
    ``sqrt(2 * (1 - cos_sim))`` so that it is ``0`` for identical vectors
    and ``2`` for diametrically opposed ones.
    """

    def __init__(self, embedding_size: int, metric: str = "angular"):
        """Create an empty index for vectors of dimension *embedding_size*.

        The *metric* argument is stored for API compatibility; only the
        angular-distance computation is implemented.
        """
        self._embedding_size = embedding_size
        self._metric = metric
        # Sparse storage during build phase (id -> vector)
        self._vectors_dict: dict = {}
        # Dense numpy matrix after build()
        self._vectors: Optional[np.ndarray] = None
        self._built = False

    # ------------------------------------------------------------------
    # Build interface
    # ------------------------------------------------------------------

    def add_item(self, i: int, vector) -> None:
        """Add a single vector with integer id *i*."""
        self._vectors_dict[i] = np.asarray(vector, dtype=np.float32)

    def build(self, n_trees: int = 10) -> None:
        """Finalise the index. The *n_trees* parameter is ignored (kept
        for API compatibility with Annoy).

        NOTE(review): ids are used as row indices into a dense matrix of
        shape ``(max_id + 1, embedding_size)``. If ids are sparse, the gap
        rows stay zero and can be returned as (distant) neighbours --
        callers are expected to use contiguous ids starting at 0.
        """
        if not self._vectors_dict:
            self._vectors = np.empty((0, self._embedding_size), dtype=np.float32)
        else:
            max_id = max(self._vectors_dict.keys())
            self._vectors = np.zeros(
                (max_id + 1, self._embedding_size), dtype=np.float32
            )
            for idx, vec in self._vectors_dict.items():
                self._vectors[idx] = vec
        self._built = True

    # ------------------------------------------------------------------
    # Query interface
    # ------------------------------------------------------------------

    def get_nns_by_vector(
        self, vector, n: int, include_distances: bool = False
    ) -> Tuple[List[int], ...]:
        """Return the *n* nearest neighbours of *vector*.

        When *include_distances* is ``True`` the return value is a tuple
        ``(ids, distances)``; otherwise just ``ids``. If fewer than *n*
        items are indexed, all of them are returned.
        """
        if self._vectors is None or len(self._vectors) == 0:
            return ([], []) if include_distances else []

        query = np.asarray(vector, dtype=np.float32)

        # Cosine similarity via normalised dot product
        norms = np.linalg.norm(self._vectors, axis=1, keepdims=True)
        # Avoid division by zero for zero-vectors
        safe_norms = np.where(norms == 0, 1.0, norms)
        normed = self._vectors / safe_norms

        query_norm = np.linalg.norm(query)
        if query_norm == 0:
            query_normed = query
        else:
            query_normed = query / query_norm

        cos_sim = normed @ query_normed  # shape: (num_items,)

        # Angular distance (matches Annoy's definition); clip guards against
        # float round-off pushing |cos_sim| marginally above 1.
        cos_sim_clipped = np.clip(cos_sim, -1.0, 1.0)
        distances = np.sqrt(2.0 * (1.0 - cos_sim_clipped))

        # Get top-n indices (lowest distance first).
        # BUGFIX: np.argpartition requires kth in [-size, size - 1]; after
        # clamping n to len(distances), using n itself as kth raises
        # ValueError whenever the caller asks for >= len(distances) results
        # (common for small indexes). Clamp kth to size - 1 instead.
        n = min(n, len(distances))
        kth = min(n, len(distances) - 1)
        top_indices = np.argpartition(distances, kth)[:n]
        top_indices = top_indices[np.argsort(distances[top_indices])]

        ids = top_indices.tolist()
        if include_distances:
            return ids, distances[top_indices].tolist()
        return ids

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def save(self, path: str) -> None:
        """Save the index to disk as a ``.npy`` file.

        If the caller supplies a path ending in ``.ann`` (the annoy
        convention), we silently swap the extension to ``.npy`` so that
        both backends can coexist in the same cache directory.

        NOTE(review): ``np.save`` appends ``.npy`` when *path* has no
        suffix, so a path that ends in neither ``.ann`` nor ``.npy`` will
        be written to ``path + '.npy'`` while ``load`` reads *path*
        verbatim -- confirm callers always pass ``.ann``/``.npy`` paths.
        """
        if path.endswith(".ann"):
            path = path[:-4] + ".npy"
        if self._vectors is not None:
            np.save(path, self._vectors)

    def load(self, path: str) -> None:
        """Load a previously saved index from disk."""
        if path.endswith(".ann"):
            path = path[:-4] + ".npy"
        self._vectors = np.load(path).astype(np.float32)
        self._built = True
|
cluster2600 marked this conversation as resolved.
|
||||||||||||||||
Uh oh!
There was an error while loading. Please reload this page.