Source code for domino._embed.clip
from typing import Dict, Union
from .encoder import Encoder
def clip(
variant: str = "ViT-B/32", device: Union[int, str] = "cpu"
) -> Dict[str, Encoder]:
"""Contrastive Language-Image Pre-training (CLIP) encoders [radford_2021]_. Includes
encoders for the following modalities:
- "text"
- "image"
Encoders will map these different modalities to the same embedding space.
Args:
variant (str, optional): A model name listed by `clip.available_models()`, or
the path to a model checkpoint containing the state_dict. Defaults to
"ViT-B/32".
device (Union[int, str], optional): The device on which the encoders will be
loaded. Defaults to "cpu".
.. [radford_2021]
Radford, A. et al. Learning Transferable Visual Models From Natural Language
Supervision. arXiv [cs.CV] (2021)
"""
    try:
        from clip import load, tokenize
    except ImportError:
        raise ImportError(
            "To embed with CLIP, run "
            "`pip install git+https://github.com/openai/CLIP.git` "
            "and install domino with the `clip` extra, e.g. "
            "`pip install domino[clip]`."
        )
model, preprocess = load(variant, device=device)
return {
"image": Encoder(encode=model.encode_image, preprocess=preprocess),
"text": Encoder(
# need to squeeze out the batch dimension for compatibility with collate
encode=model.encode_text,
preprocess=lambda x: tokenize(x, truncate=True).squeeze(0),
),
}
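

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module proper: it assumes torch,
    # Pillow, and the OpenAI `clip` package are installed, that `Encoder`
    # exposes its `encode` and `preprocess` callables as attributes, and that
    # "example.jpg" is a hypothetical local image file.
    import torch
    from PIL import Image

    encoders = clip(variant="ViT-B/32", device="cpu")

    # Preprocess a single image and a single string, adding back a batch
    # dimension before encoding.
    image = encoders["image"].preprocess(Image.open("example.jpg")).unsqueeze(0)
    text = encoders["text"].preprocess("a photo of a cat").unsqueeze(0)

    with torch.no_grad():
        image_emb = encoders["image"].encode(image)
        text_emb = encoders["text"].encode(text)

    # Because both encoders target the same embedding space, cosine
    # similarity between the two embeddings is meaningful.
    similarity = torch.nn.functional.cosine_similarity(image_emb, text_emb)
    print(similarity.item())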