Source code for domino._embed.clip

from typing import Dict, Union

from .encoder import Encoder


def clip(
    variant: str = "ViT-B/32", device: Union[int, str] = "cpu"
) -> Dict[str, Encoder]:
    """Contrastive Language-Image Pre-training (CLIP) encoders [radford_2021]_.

    Includes encoders for the following modalities:

    - "text"
    - "image"

    Encoders will map these different modalities to the same embedding space.

    Args:
        variant (str, optional): A model name listed by `clip.available_models()`,
            or the path to a model checkpoint containing the state_dict. Defaults
            to "ViT-B/32".
        device (Union[int, str], optional): The device on which the encoders will
            be loaded. Defaults to "cpu".

    .. [radford_2021] Radford, A. et al. Learning Transferable Visual Models From
        Natural Language Supervision. arXiv [cs.CV] (2021)
    """
    try:
        from clip import load, tokenize
    except ImportError:
        raise ImportError(
            "To embed with CLIP, run `pip install git+https://github.com/openai/CLIP.git` "
            "and install domino with the `clip` extra. For example, "
            "`pip install domino[clip]`."
        )

    model, preprocess = load(variant, device=device)
    return {
        "image": Encoder(encode=model.encode_image, preprocess=preprocess),
        "text": Encoder(
            encode=model.encode_text,
            # squeeze out the batch dimension for compatibility with collate
            preprocess=lambda x: tokenize(x, truncate=True).squeeze(0),
        ),
    }
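
A minimal usage sketch, not part of the source above: it assumes the function is importable from this module path, that the returned Encoder objects expose `preprocess` and `encode` attributes as constructed above, and that torch, PIL, and the OpenAI CLIP package are installed. The image path is hypothetical.

import torch
from PIL import Image

from domino._embed.clip import clip

encoders = clip(variant="ViT-B/32", device="cpu")

# Embed a single image: preprocess to a tensor, add a batch dimension, then encode.
image = Image.open("photo.jpg")  # hypothetical path
image_input = encoders["image"].preprocess(image).unsqueeze(0)
with torch.no_grad():
    image_embedding = encoders["image"].encode(image_input)

# Embed a single caption: the text preprocess squeezes out the batch dimension
# (for compatibility with collate), so it is restored here before encoding.
text_input = encoders["text"].preprocess("a photo of a dog").unsqueeze(0)
with torch.no_grad():
    text_embedding = encoders["text"].encode(text_input)

# Both embeddings live in the same space, so cosine similarity is meaningful.
similarity = torch.cosine_similarity(image_embedding.float(), text_embedding.float())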