Merge pull request #19 from de9uch1/convert_opq

Extended faiss_to_nanopq() to convert faiss.IndexPreTransform (OPQMatrix+IndexPQ)
matsui528 · Jun 28, 2022 · 3a62e81 · 3a62e81
2 parents 4c1d724 + 593fbbe
commit 3a62e81
Show file tree

Hide file tree

Showing 2 changed files with 72 additions and 14 deletions.
diff --git a/nanopq/convert_faiss.py b/nanopq/convert_faiss.py
@@ -9,6 +9,7 @@
 
 import numpy as np
 
+from .opq import OPQ
 from .pq import PQ
 
 
@@ -48,29 +49,51 @@ def nanopq_to_faiss(pq_nanopq):
 
 
 def faiss_to_nanopq(pq_faiss):
-    """Convert a `faiss.IndexPQ <https://github.com/facebookresearch/faiss/blob/master/IndexPQ.h>`_ instance to :class:`nanopq.PQ`.
+    """Convert a `faiss.IndexPQ <https://github.com/facebookresearch/faiss/blob/master/IndexPQ.h>`_ 
+    or a `faiss.IndexPreTransform <https://github.com/facebookresearch/faiss/blob/master/IndexPreTransform.h>`_ instance to :class:`nanopq.PQ` or :class:`nanopq.OPQ`, respectively.
     To use this function, `faiss module needs to be installed <https://github.com/facebookresearch/faiss/blob/master/INSTALL.md>`_.
 
     Args:
-        pq_faiss (faiss.IndexPQ): An input PQ instance.
+        pq_faiss (Union[faiss.IndexPQ, faiss.IndexPreTransform]): An input PQ or OPQ instance.
 
     Returns:
         tuple:
-            * nanopq.PQ: A converted PQ instance, with the same codewords to the input.
+            * Union[nanopq.PQ, nanopq.OPQ]: A converted PQ or OPQ instance, with the same codewords as the input.
             * np.ndarray: Stored PQ codes in the input IndexPQ, with the shape=(N, M). This will be empty if codes are not stored
 
     """
-    assert isinstance(pq_faiss, faiss.IndexPQ), "Error. pq_faiss must be IndexPQ"
+    assert isinstance(
+        pq_faiss, (faiss.IndexPQ, faiss.IndexPreTransform)
+    ), "Error. pq_faiss must be IndexPQ or IndexPreTransform"
     assert pq_faiss.is_trained, "Error. pq_faiss must have been trained"
 
-    pq_nanopq = PQ(M=pq_faiss.pq.M, Ks=int(2 ** pq_faiss.pq.nbits))
-    pq_nanopq.Ds = int(pq_faiss.pq.d / pq_faiss.pq.M)
-
-    # Extract codewords from pq_IndexPQ.ProductQuantizer, reshape them to M*Ks*Ds
-    codewords = faiss.vector_to_array(pq_faiss.pq.centroids).reshape(
-        pq_nanopq.M, pq_nanopq.Ks, pq_nanopq.Ds
-    )
-
-    pq_nanopq.codewords = codewords
+    if isinstance(pq_faiss, faiss.IndexPreTransform):
+        opq_matrix: faiss.LinearTransform = faiss.downcast_VectorTransform(
+            pq_faiss.chain.at(0)
+        )
+        pq_faiss: faiss.IndexPQ = faiss.downcast_index(pq_faiss.index)
+        pq_nanopq = OPQ(M=pq_faiss.pq.M, Ks=int(2**pq_faiss.pq.nbits))
+        pq_nanopq.pq.Ds = int(pq_faiss.pq.d / pq_faiss.pq.M)
+
+        # Extract codewords from the wrapped IndexPQ's ProductQuantizer (pq_faiss.pq) and reshape them to (M, Ks, Ds)
+        codewords = faiss.vector_to_array(pq_faiss.pq.centroids).reshape(
+            pq_nanopq.M, pq_nanopq.Ks, pq_nanopq.Ds
+        )
+
+        pq_nanopq.pq.codewords = codewords
+        pq_nanopq.R = (
+            faiss.vector_to_array(opq_matrix.A)
+            .reshape(opq_matrix.d_out, opq_matrix.d_in)
+            .transpose(1, 0)
+        )
+    else:
+        pq_nanopq = PQ(M=pq_faiss.pq.M, Ks=int(2**pq_faiss.pq.nbits))
+        pq_nanopq.Ds = int(pq_faiss.pq.d / pq_faiss.pq.M)
+
+        # Extract codewords from the IndexPQ's ProductQuantizer (pq_faiss.pq) and reshape them to (M, Ks, Ds)
+        codewords = faiss.vector_to_array(pq_faiss.pq.centroids).reshape(
+            pq_nanopq.M, pq_nanopq.Ks, pq_nanopq.Ds
+        )
+        pq_nanopq.codewords = codewords
 
     return pq_nanopq, faiss.vector_to_array(pq_faiss.codes).reshape(-1, pq_faiss.pq.M)
diff --git a/tests/test_convert_faiss.py b/tests/test_convert_faiss.py
@@ -56,7 +56,7 @@ def test_nanopq_to_faiss(self):
 
         self.assertTrue(np.array_equal(ids1, ids2))
 
-    def test_faiss_to_nanopq(self):
+    def test_faiss_to_nanopq_pq(self):
         D, M, Ks = 32, 4, 256
         Nt, Nb, Nq = 2000, 10000, 100
         nbits = int(np.log2(Ks))
@@ -70,6 +70,41 @@ def test_faiss_to_nanopq(self):
         pq_faiss.add(x=Xb)
 
         pq_nanopq, Cb_faiss = nanopq.faiss_to_nanopq(pq_faiss=pq_faiss)
+        self.assertIsInstance(pq_nanopq, nanopq.PQ)
+        self.assertEqual(pq_nanopq.codewords.shape, (M, Ks, int(D / M)))
+
+        # Encoded results should be same
+        Cb_nanopq = pq_nanopq.encode(vecs=Xb)
+        self.assertTrue(np.array_equal(Cb_nanopq, Cb_faiss))
+
+        # Search result should be same
+        topk = 100
+        _, ids1 = pq_faiss.search(x=Xq, k=topk)
+        ids2 = np.array(
+            [
+                np.argsort(pq_nanopq.dtable(query=xq).adist(codes=Cb_nanopq))[:topk]
+                for xq in Xq
+            ]
+        )
+        self.assertTrue(np.array_equal(ids1, ids2))
+
+    def test_faiss_to_nanopq_opq(self):
+        D, M, Ks = 32, 4, 256
+        Nt, Nb, Nq = 2000, 10000, 100
+        nbits = int(np.log2(Ks))
+        assert nbits == 8
+        Xt = np.random.rand(Nt, D).astype(np.float32)
+        Xb = np.random.rand(Nb, D).astype(np.float32)
+        Xq = np.random.rand(Nq, D).astype(np.float32)
+
+        pq_faiss = faiss.IndexPQ(D, M, nbits)
+        opq_matrix = faiss.OPQMatrix(D, M=M)
+        pq_faiss = faiss.IndexPreTransform(opq_matrix, pq_faiss)
+        pq_faiss.train(x=Xt)
+        pq_faiss.add(x=Xb)
+
+        pq_nanopq, Cb_faiss = nanopq.faiss_to_nanopq(pq_faiss=pq_faiss)
+        self.assertIsInstance(pq_nanopq, nanopq.OPQ)
         self.assertEqual(pq_nanopq.codewords.shape, (M, Ks, int(D / M)))
 
         # Encoded results should be same