# test_compute_embeddings.py (forked from UKPLab/sentence-transformers)
"""
Computes embeddings
"""
import unittest
from sentence_transformers import SentenceTransformer
import numpy as np
class ComputeEmbeddingsTest(unittest.TestCase):
def setUp(self):
self.model = SentenceTransformer('paraphrase-distilroberta-base-v1')
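
    # encode(output_value='token_embeddings') returns one tensor per input sentence,
    # with a row per token instead of a single pooled sentence vector.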
    def test_encode_token_embeddings(self):
        """
        Test that encode(output_value='token_embeddings') works
        """
        sent = [
            "Hello Word, a test sentence",
            "Here comes another sentence",
            "My final sentence",
            "Sentences",
            "Sentence five five five five five five five",
        ]
        emb = self.model.encode(sent, output_value='token_embeddings', batch_size=2)
        assert len(emb) == len(sent)
        for s, e in zip(sent, emb):
            assert len(self.model.tokenize([s])['input_ids'][0]) == e.shape[0]
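
    # A single string yields a 1-D vector, a list of strings a 2-D array; the expected
    # sums are reference values for the paraphrase-distilroberta-base-v1 model.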
    def test_encode_single_sentences(self):
        # Single sentence
        emb = self.model.encode("Hello Word, a test sentence")
        assert emb.shape == (768,)
        assert abs(np.sum(emb) - 7.9811716) < 0.001

        # Single sentence as list
        emb = self.model.encode(["Hello Word, a test sentence"])
        assert emb.shape == (1, 768)
        assert abs(np.sum(emb) - 7.9811716) < 0.001

        # Sentence list
        emb = self.model.encode(["Hello Word, a test sentence", "Here comes another sentence", "My final sentence"])
        assert emb.shape == (3, 768)
        assert abs(np.sum(emb) - 22.968266) < 0.001
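
    # normalize_embeddings=True should return unit-length vectors (L2 norm of 1).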
    def test_encode_normalize(self):
        emb = self.model.encode(
            ["Hello Word, a test sentence", "Here comes another sentence", "My final sentence"],
            normalize_embeddings=True,
        )
        assert emb.shape == (3, 768)
        for norm in np.linalg.norm(emb, axis=1):
            assert abs(norm - 1) < 0.001
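
    # Each (text, text) tuple is treated as a single sentence-pair input and yields one embedding.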
    def test_encode_tuple_sentences(self):
        # Input a sentence tuple
        emb = self.model.encode([("Hello Word, a test sentence", "Second input for model")])
        assert emb.shape == (1, 768)
        assert abs(np.sum(emb) - 9.503508) < 0.001

        # List of sentence tuples
        emb = self.model.encode([
            ("Hello Word, a test sentence", "Second input for model"),
            ("My second tuple", "With two inputs"),
            ("Final tuple", "final test"),
        ])
        assert emb.shape == (3, 768)
        assert abs(np.sum(emb) - 32.14627) < 0.001
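

if __name__ == "__main__":
    # Allows running this test module directly, e.g. `python test_compute_embeddings.py`.
    unittest.main()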