File size: 9,300 Bytes

import tarfile
from collections import defaultdict
from pathlib import Path

import faiss
import numpy as np
import pyarrow as pa
import requests
from tqdm import tqdm

__all__ = ["RetrievalDatabase", "download_retrieval_databases"]

RETRIEVAL_DATABASES_URLS = {
    "cc12m": {
        "url": "https://storage-cased.alessandroconti.me/cc12m.tar.gz",
        "cache_subdir": "./cc12m/vit-l-14/",
    },
}


def download_retrieval_databases(cache_dir: str):
    """Download data if needed.

    Args:
        cache_dir (str): Path to cache directory. Defaults to "~/.cache/cased".
    """
    databases_path = Path(cache_dir, "databases")

    for name, items in RETRIEVAL_DATABASES_URLS.items():
        url = items["url"]
        database_path = Path(databases_path, name)
        if database_path.exists():
            continue

        # download data
        target_path = Path(databases_path, name + ".tar.gz")
        target_path.parent.mkdir(parents=True, exist_ok=True)
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            total_bytes_size = int(r.headers.get("content-length", 0))
            chunk_size = 8192
            p_bar = tqdm(
                desc="Downloading cc12m index",
                total=total_bytes_size,
                unit="iB",
                unit_scale=True,
            )
            with open(target_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    p_bar.update(len(chunk))
            p_bar.close()

        # extract data
        tar = tarfile.open(target_path, "r:gz")
        tar.extractall(target_path.parent)
        tar.close()
        target_path.unlink()


class RetrievalDatabaseMetadataProvider:
    """Metadata provider for the retrieval database.

    Args:
        metadata_dir (str): Path to the metadata directory.
    """

    def __init__(self, metadata_dir: str):
        metadatas = [str(a) for a in sorted(Path(metadata_dir).glob("**/*")) if a.is_file()]
        self.table = pa.concat_tables(
            [
                pa.ipc.RecordBatchFileReader(pa.memory_map(metadata, "r")).read_all()
                for metadata in metadatas
            ]
        )

    def get(self, ids):
        """Get the metadata for the given ids.

        Args:
            ids (list): List of ids.
        """
        columns = self.table.schema.names
        end_ids = [i + 1 for i in ids]
        t = pa.concat_tables([self.table[start:end] for start, end in zip(ids, end_ids)])
        return t.select(columns).to_pandas().to_dict("records")


class RetrievalDatabase:
    """Retrieval database.

    Args:
        database_name (str): Name of the database.
        cache_dir (str): Path to cache directory. Defaults to "~/.cache/cased".
    """

    def __init__(self, database_name: str, cache_dir: str):
        assert database_name in RETRIEVAL_DATABASES_URLS.keys(), (
            f"Database name should be one of "
            f"{list(RETRIEVAL_DATABASES_URLS.keys())}, got {database_name}."
        )

        database_dir = Path(cache_dir) / "databases"
        database_dir = database_dir / RETRIEVAL_DATABASES_URLS[database_name]["cache_subdir"]
        self._database_dir = database_dir

        image_index_fp = Path(database_dir) / "image.index"
        text_index_fp = Path(database_dir) / "text.index"

        image_index = (
            faiss.read_index(str(image_index_fp), faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
            if image_index_fp.exists()
            else None
        )
        text_index = (
            faiss.read_index(str(text_index_fp), faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
            if text_index_fp.exists()
            else None
        )

        metadata_dir = str(Path(database_dir) / "metadata")
        metadata_provider = RetrievalDatabaseMetadataProvider(metadata_dir)

        self._image_index = image_index
        self._text_index = text_index
        self._metadata_provider = metadata_provider

    def _map_to_metadata(self, indices: list, distances: list, embs: list, num_images: int):
        """Map the indices to metadata.

        Args:
            indices (list): List of indices.
            distances (list): List of distances.
            embs (list): List of results embeddings.
            num_images (int): Number of images.
        """
        results = []
        metas = self._metadata_provider.get(indices[:num_images])
        for key, (d, i, emb) in enumerate(zip(distances, indices, embs)):
            output = {}
            meta = None if key + 1 > len(metas) else metas[key]
            if meta is not None:
                output.update(self._meta_to_dict(meta))
            output["id"] = i.item()
            output["similarity"] = d.item()
            output["sample_z"] = emb.tolist()
            results.append(output)

        return results

    def _meta_to_dict(self, metadata):
        """Convert metadata to dict.

        Args:
            metadata (dict): Metadata.
        """
        output = {}
        for k, v in metadata.items():
            if isinstance(v, bytes):
                v = v.decode()
            elif type(v).__module__ == np.__name__:
                v = v.item()
            output[k] = v
        return output

    def _get_connected_components(self, neighbors):
        """Find connected components in a graph.

        Args:
            neighbors (dict): Dictionary of neighbors.
        """
        seen = set()

        def component(node):
            r = []
            nodes = {node}
            while nodes:
                node = nodes.pop()
                seen.add(node)
                nodes |= set(neighbors[node]) - seen
                r.append(node)
            return r

        u = []
        for node in neighbors:
            if node not in seen:
                u.append(component(node))
        return u

    def _deduplicate_embeddings(self, embeddings, threshold=0.94):
        """Deduplicate embeddings.

        Args:
            embeddings (np.matrix): Embeddings to deduplicate.
            threshold (float): Threshold to use for deduplication. Default is 0.94.
        """
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)
        l, _, indices = index.range_search(embeddings, threshold)

        same_mapping = defaultdict(list)

        for i in range(embeddings.shape[0]):
            start = l[i]
            end = l[i + 1]
            for j in indices[start:end]:
                same_mapping[int(i)].append(int(j))

        groups = self._get_connected_components(same_mapping)
        non_uniques = set()
        for g in groups:
            for e in g[1:]:
                non_uniques.add(e)

        return set(list(non_uniques))

    def query(
        self, query: np.matrix, modality: str = "text", num_samples: int = 10
    ) -> list[list[dict]]:
        """Query the database.

        Args:
            query (np.matrix): Query to search.
            modality (str): Modality to search. One of `image` or `text`. Default to `text`.
            num_samples (int): Number of samples to return. Default is 40.
        """
        index = self._image_index if modality == "image" else self._text_index

        distances, indices, embeddings = index.search_and_reconstruct(query, num_samples)
        results = [indices[i] for i in range(len(indices))]

        nb_results = [np.where(r == -1)[0] for r in results]
        total_distances = []
        total_indices = []
        total_embeddings = []
        for i in range(len(results)):
            num_res = nb_results[i][0] if len(nb_results[i]) > 0 else len(results[i])

            result_indices = results[i][:num_res]
            result_distances = distances[i][:num_res]
            result_embeddings = embeddings[i][:num_res]

            # normalise embeddings
            l2 = np.atleast_1d(np.linalg.norm(result_embeddings, 2, -1))
            l2[l2 == 0] = 1
            result_embeddings = result_embeddings / np.expand_dims(l2, -1)

            # deduplicate embeddings
            local_indices_to_remove = self._deduplicate_embeddings(result_embeddings)
            indices_to_remove = set()
            for local_index in local_indices_to_remove:
                indices_to_remove.add(result_indices[local_index])

            curr_indices = []
            curr_distances = []
            curr_embeddings = []
            for ind, dis, emb in zip(result_indices, result_distances, result_embeddings):
                if ind not in indices_to_remove:
                    indices_to_remove.add(ind)
                    curr_indices.append(ind)
                    curr_distances.append(dis)
                    curr_embeddings.append(emb)

            total_indices.append(curr_indices)
            total_distances.append(curr_distances)
            total_embeddings.append(curr_embeddings)

        if len(total_distances) == 0:
            return []

        total_results = []
        for i in range(len(total_distances)):
            results = self._map_to_metadata(
                total_indices[i], total_distances[i], total_embeddings[i], num_samples
            )
            total_results.append(results)

        return total_results