From 922f879e5e57f4824f672f992f4e820947833501 Mon Sep 17 00:00:00 2001 From: Hiroyuki Deguchi Date: Tue, 8 Feb 2022 15:20:10 +0900 Subject: [PATCH 1/2] Implement faiss OPQ+IndexPQ to nanopq.OPQ converter --- nanopq/__init__.py | 4 ++-- nanopq/convert_faiss.py | 39 +++++++++++++++++++++++++++++++++++++ tests/test_convert_faiss.py | 33 +++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 2 deletions(-) diff --git a/nanopq/__init__.py b/nanopq/__init__.py index df71855..88b0295 100644 --- a/nanopq/__init__.py +++ b/nanopq/__init__.py @@ -1,6 +1,6 @@ -__all__ = ["PQ", "OPQ", "DistanceTable", "nanopq_to_faiss", "faiss_to_nanopq"] +__all__ = ["PQ", "OPQ", "DistanceTable", "nanopq_to_faiss", "faiss_to_nanopq", "faiss_to_nanopq_opq"] __version__ = "0.1.10" -from .convert_faiss import faiss_to_nanopq, nanopq_to_faiss +from .convert_faiss import faiss_to_nanopq, nanopq_to_faiss, faiss_to_nanopq_opq from .opq import OPQ from .pq import PQ, DistanceTable diff --git a/nanopq/convert_faiss.py b/nanopq/convert_faiss.py index dda127f..b511812 100644 --- a/nanopq/convert_faiss.py +++ b/nanopq/convert_faiss.py @@ -10,6 +10,7 @@ import numpy as np from .pq import PQ +from .opq import OPQ def nanopq_to_faiss(pq_nanopq): @@ -74,3 +75,41 @@ def faiss_to_nanopq(pq_faiss): pq_nanopq.codewords = codewords return pq_nanopq, faiss.vector_to_array(pq_faiss.codes).reshape(-1, pq_faiss.pq.M) + + +def faiss_to_nanopq_opq(opq_faiss): + """Convert a `faiss.IndexPreTransform `_ instance to :class:`nanopq.OPQ`. + To use this function, `faiss module needs to be installed `_. + + Args: + opq_faiss (faiss.IndexPreTransform): An input OPQ instance. It only supports `OPQMatrix + IndexPQ`, not `+ IndexIVFPQ`. + + Returns: + tuple: + * nanopq.OPQ: A converted OPQ instance, with the same codewords to the input. + * np.ndarray: Stored PQ codes in the input IndexPQ, with the shape=(N, M). 
This will be empty if codes are not stored + + """ + assert isinstance(opq_faiss, faiss.IndexPreTransform), "Error. opq_faiss must be IndexPreTransform" + assert opq_faiss.is_trained, "Error. opq_faiss must have been trained" + + opq_matrix: faiss.LinearTransform = faiss.downcast_VectorTransform(opq_faiss.chain.at(0)) + pq_index: faiss.IndexPQ = faiss.downcast_index(opq_faiss.index) + + opq_nanopq = OPQ(M=pq_index.pq.M, Ks=int(2 ** pq_index.pq.nbits)) + opq_nanopq.pq.Ds = int(pq_index.pq.d / pq_index.pq.M) + + # Extract codewords from pq_IndexPQ.ProductQuantizer, reshape them to M*Ks*Ds + codewords = faiss.vector_to_array(pq_index.pq.centroids).reshape( + opq_nanopq.M, opq_nanopq.Ks, opq_nanopq.Ds + ) + + opq_nanopq.pq.codewords = codewords + + opq_nanopq.R = ( + faiss.vector_to_array(opq_matrix.A) + .reshape(opq_matrix.d_in, opq_matrix.d_out) + .transpose(1, 0) + ) + + return opq_nanopq, faiss.vector_to_array(pq_index.codes).reshape(-1, pq_index.pq.M) diff --git a/tests/test_convert_faiss.py b/tests/test_convert_faiss.py index dd23d9d..b1329ed 100644 --- a/tests/test_convert_faiss.py +++ b/tests/test_convert_faiss.py @@ -87,6 +87,39 @@ def test_faiss_to_nanopq(self): ) self.assertTrue(np.array_equal(ids1, ids2)) + def test_faiss_to_nanopq_opq(self): + D, M, Ks = 32, 4, 256 + Nt, Nb, Nq = 2000, 10000, 100 + nbits = int(np.log2(Ks)) + assert nbits == 8 + Xt = np.random.rand(Nt, D).astype(np.float32) + Xb = np.random.rand(Nb, D).astype(np.float32) + Xq = np.random.rand(Nq, D).astype(np.float32) + + pq_faiss = faiss.IndexPQ(D, M, nbits) + opq_matrix = faiss.OPQMatrix(D, M=M) + opq_faiss = faiss.IndexPreTransform(opq_matrix, pq_faiss) + opq_faiss.train(x=Xt) + opq_faiss.add(x=Xb) + + opq_nanopq, Cb_faiss = nanopq.faiss_to_nanopq_opq(opq_faiss=opq_faiss) + self.assertEqual(opq_nanopq.codewords.shape, (M, Ks, int(D / M))) + + # Encoded results should be same + Cb_nanopq = opq_nanopq.encode(vecs=Xb) + self.assertTrue(np.array_equal(Cb_nanopq, Cb_faiss)) + + # Search 
result should be same + topk = 100 + _, ids1 = opq_faiss.search(x=Xq, k=topk) + ids2 = np.array( + [ + np.argsort(opq_nanopq.dtable(query=xq).adist(codes=Cb_nanopq))[:topk] + for xq in Xq + ] + ) + self.assertTrue(np.array_equal(ids1, ids2)) + def test_faiss_nanopq_compare_accuracy(self): D, M, Ks = 32, 4, 256 Nt, Nb, Nq = 20000, 10000, 100 From 593fbbefba8246b3e7da5187dba39c11667d6e0a Mon Sep 17 00:00:00 2001 From: Hiroyuki Deguchi Date: Fri, 1 Apr 2022 18:07:11 +0900 Subject: [PATCH 2/2] Merge PQ and OPQ converters to faiss_to_nanopq() --- nanopq/__init__.py | 4 +- nanopq/convert_faiss.py | 88 +++++++++++++++---------------------- tests/test_convert_faiss.py | 20 +++++---- 3 files changed, 49 insertions(+), 63 deletions(-) diff --git a/nanopq/__init__.py b/nanopq/__init__.py index 88b0295..df71855 100644 --- a/nanopq/__init__.py +++ b/nanopq/__init__.py @@ -1,6 +1,6 @@ -__all__ = ["PQ", "OPQ", "DistanceTable", "nanopq_to_faiss", "faiss_to_nanopq", "faiss_to_nanopq_opq"] +__all__ = ["PQ", "OPQ", "DistanceTable", "nanopq_to_faiss", "faiss_to_nanopq"] __version__ = "0.1.10" -from .convert_faiss import faiss_to_nanopq, nanopq_to_faiss, faiss_to_nanopq_opq +from .convert_faiss import faiss_to_nanopq, nanopq_to_faiss from .opq import OPQ from .pq import PQ, DistanceTable diff --git a/nanopq/convert_faiss.py b/nanopq/convert_faiss.py index b511812..8c7fd7f 100644 --- a/nanopq/convert_faiss.py +++ b/nanopq/convert_faiss.py @@ -9,8 +9,8 @@ import numpy as np -from .pq import PQ from .opq import OPQ +from .pq import PQ def nanopq_to_faiss(pq_nanopq): @@ -49,67 +49,51 @@ def nanopq_to_faiss(pq_nanopq): def faiss_to_nanopq(pq_faiss): - """Convert a `faiss.IndexPQ `_ instance to :class:`nanopq.PQ`. + """Convert a `faiss.IndexPQ `_ + or a `faiss.IndexPreTransform `_ instance to :class:`nanopq.PQ` or :class:`nanopq.OPQ`, respectively. To use this function, `faiss module needs to be installed `_. Args: - pq_faiss (faiss.IndexPQ): An input PQ instance. 
+ pq_faiss (Union[faiss.IndexPQ, faiss.IndexPreTransform]): An input PQ or OPQ instance. Returns: tuple: - * nanopq.PQ: A converted PQ instance, with the same codewords to the input. + * Union[nanopq.PQ, nanopq.OPQ]: A converted PQ or OPQ instance, with the same codewords as the input. * np.ndarray: Stored PQ codes in the input IndexPQ, with the shape=(N, M). This will be empty if codes are not stored """ - assert isinstance(pq_faiss, faiss.IndexPQ), "Error. pq_faiss must be IndexPQ" + assert isinstance( + pq_faiss, (faiss.IndexPQ, faiss.IndexPreTransform) + ), "Error. pq_faiss must be IndexPQ or IndexPreTransform" assert pq_faiss.is_trained, "Error. pq_faiss must have been trained" - pq_nanopq = PQ(M=pq_faiss.pq.M, Ks=int(2 ** pq_faiss.pq.nbits)) - pq_nanopq.Ds = int(pq_faiss.pq.d / pq_faiss.pq.M) - - # Extract codewords from pq_IndexPQ.ProductQuantizer, reshape them to M*Ks*Ds - codewords = faiss.vector_to_array(pq_faiss.pq.centroids).reshape( - pq_nanopq.M, pq_nanopq.Ks, pq_nanopq.Ds - ) - - pq_nanopq.codewords = codewords + if isinstance(pq_faiss, faiss.IndexPreTransform): + opq_matrix: faiss.LinearTransform = faiss.downcast_VectorTransform( + pq_faiss.chain.at(0) + ) + pq_faiss: faiss.IndexPQ = faiss.downcast_index(pq_faiss.index) + pq_nanopq = OPQ(M=pq_faiss.pq.M, Ks=int(2**pq_faiss.pq.nbits)) + pq_nanopq.pq.Ds = int(pq_faiss.pq.d / pq_faiss.pq.M) + + # Extract codewords from pq_IndexPQ.ProductQuantizer, reshape them to M*Ks*Ds + codewords = faiss.vector_to_array(pq_faiss.pq.centroids).reshape( + pq_nanopq.M, pq_nanopq.Ks, pq_nanopq.Ds + ) + + pq_nanopq.pq.codewords = codewords + pq_nanopq.R = ( + faiss.vector_to_array(opq_matrix.A) + .reshape(opq_matrix.d_out, opq_matrix.d_in) + .transpose(1, 0) + ) + else: + pq_nanopq = PQ(M=pq_faiss.pq.M, Ks=int(2**pq_faiss.pq.nbits)) + pq_nanopq.Ds = int(pq_faiss.pq.d / pq_faiss.pq.M) + + # Extract codewords from pq_IndexPQ.ProductQuantizer, reshape them to M*Ks*Ds + codewords = 
faiss.vector_to_array(pq_faiss.pq.centroids).reshape( + pq_nanopq.M, pq_nanopq.Ks, pq_nanopq.Ds + ) + pq_nanopq.codewords = codewords return pq_nanopq, faiss.vector_to_array(pq_faiss.codes).reshape(-1, pq_faiss.pq.M) - - -def faiss_to_nanopq_opq(opq_faiss): - """Convert a `faiss.IndexPreTransform `_ instance to :class:`nanopq.OPQ`. - To use this function, `faiss module needs to be installed `_. - - Args: - opq_faiss (faiss.IndexPreTransform): An input OPQ instance. It only supports `OPQMatrix + IndexPQ`, not `+ IndexIVFPQ`. - - Returns: - tuple: - * nanopq.OPQ: A converted OPQ instance, with the same codewords to the input. - * np.ndarray: Stored PQ codes in the input IndexPQ, with the shape=(N, M). This will be empty if codes are not stored - - """ - assert isinstance(opq_faiss, faiss.IndexPreTransform), "Error. opq_faiss must be IndexPreTransform" - assert opq_faiss.is_trained, "Error. opq_faiss must have been trained" - - opq_matrix: faiss.LinearTransform = faiss.downcast_VectorTransform(opq_faiss.chain.at(0)) - pq_index: faiss.IndexPQ = faiss.downcast_index(opq_faiss.index) - - opq_nanopq = OPQ(M=pq_index.pq.M, Ks=int(2 ** pq_index.pq.nbits)) - opq_nanopq.pq.Ds = int(pq_index.pq.d / pq_index.pq.M) - - # Extract codewords from pq_IndexPQ.ProductQuantizer, reshape them to M*Ks*Ds - codewords = faiss.vector_to_array(pq_index.pq.centroids).reshape( - opq_nanopq.M, opq_nanopq.Ks, opq_nanopq.Ds - ) - - opq_nanopq.pq.codewords = codewords - - opq_nanopq.R = ( - faiss.vector_to_array(opq_matrix.A) - .reshape(opq_matrix.d_in, opq_matrix.d_out) - .transpose(1, 0) - ) - - return opq_nanopq, faiss.vector_to_array(pq_index.codes).reshape(-1, pq_index.pq.M) diff --git a/tests/test_convert_faiss.py b/tests/test_convert_faiss.py index b1329ed..2e2ba18 100644 --- a/tests/test_convert_faiss.py +++ b/tests/test_convert_faiss.py @@ -56,7 +56,7 @@ def test_nanopq_to_faiss(self): self.assertTrue(np.array_equal(ids1, ids2)) - def test_faiss_to_nanopq(self): + def 
test_faiss_to_nanopq_pq(self): D, M, Ks = 32, 4, 256 Nt, Nb, Nq = 2000, 10000, 100 nbits = int(np.log2(Ks)) @@ -70,6 +70,7 @@ def test_faiss_to_nanopq(self): pq_faiss.add(x=Xb) pq_nanopq, Cb_faiss = nanopq.faiss_to_nanopq(pq_faiss=pq_faiss) + self.assertIsInstance(pq_nanopq, nanopq.PQ) self.assertEqual(pq_nanopq.codewords.shape, (M, Ks, int(D / M))) # Encoded results should be same @@ -98,23 +99,24 @@ def test_faiss_to_nanopq_opq(self): pq_faiss = faiss.IndexPQ(D, M, nbits) opq_matrix = faiss.OPQMatrix(D, M=M) - opq_faiss = faiss.IndexPreTransform(opq_matrix, pq_faiss) - opq_faiss.train(x=Xt) - opq_faiss.add(x=Xb) + pq_faiss = faiss.IndexPreTransform(opq_matrix, pq_faiss) + pq_faiss.train(x=Xt) + pq_faiss.add(x=Xb) - opq_nanopq, Cb_faiss = nanopq.faiss_to_nanopq_opq(opq_faiss=opq_faiss) - self.assertEqual(opq_nanopq.codewords.shape, (M, Ks, int(D / M))) + pq_nanopq, Cb_faiss = nanopq.faiss_to_nanopq(pq_faiss=pq_faiss) + self.assertIsInstance(pq_nanopq, nanopq.OPQ) + self.assertEqual(pq_nanopq.codewords.shape, (M, Ks, int(D / M))) # Encoded results should be same - Cb_nanopq = opq_nanopq.encode(vecs=Xb) + Cb_nanopq = pq_nanopq.encode(vecs=Xb) self.assertTrue(np.array_equal(Cb_nanopq, Cb_faiss)) # Search result should be same topk = 100 - _, ids1 = opq_faiss.search(x=Xq, k=topk) + _, ids1 = pq_faiss.search(x=Xq, k=topk) ids2 = np.array( [ - np.argsort(opq_nanopq.dtable(query=xq).adist(codes=Cb_nanopq))[:topk] + np.argsort(pq_nanopq.dtable(query=xq).adist(codes=Cb_nanopq))[:topk] for xq in Xq ] )