Migrate to gensim >= 4.0.0

jeremy-costello · Jul 26, 2021 · 057418e · 057418e
1 parent 4939c09
commit 057418e
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 7 deletions.
diff --git a/octis/evaluation_metrics/coherence_metrics.py b/octis/evaluation_metrics/coherence_metrics.py
@@ -117,7 +117,7 @@ def score(self, model_output):
                 if len(topic) > 0:
                     local_simi = []
                     for w1, w2 in itertools.combinations(topic[0:self.topk], 2):
-                        if w1 in self._wv.vocab and w2 in self._wv.vocab:
+                        if w1 in self._wv.key_to_index.keys() and w2 in self._wv.key_to_index.keys():
                             local_simi.append(self._wv.similarity(w1, w2))
                     arrays.append(np.mean(local_simi))
             return np.mean(arrays)
@@ -167,7 +167,7 @@ def score(self, model_output):
             for topic in topics:
                 topic_coherence = 0
                 for w1, w2 in itertools.combinations(topic, 2):
-                    if w1 in self._wv.vocab and w2 in self._wv.vocab:
+                    if w1 in self._wv.key_to_index.keys() and w2 in self._wv.key_to_index.keys():
                         distance = spatial.distance.cosine(self._wv.__getitem__(w1), self._wv.__getitem__(w2))
                         topic_coherence += distance - 1
                         count = count + 1

diff --git a/octis/evaluation_metrics/similarity_metrics.py b/octis/evaluation_metrics/similarity_metrics.py
@@ -91,7 +91,7 @@ def score(self, model_output):
                 sim = 0
                 for word1 in list1[:self.topk]:
                     for word2 in list2[:self.topk]:
-                        if word1 in self.wv.wv.vocab and word2 in self.wv.wv.vocab:
+                        if word1 in self.wv.key_to_index.keys() and word2 in self.wv.key_to_index.keys():
                             sim = sim + self.wv.similarity(word1, word2)
                             word_counts = word_counts + 1
                 sim = sim / word_counts
@@ -134,11 +134,11 @@ def score(self, model_output):
                 centroid2 = np.zeros(self.wv.vector_size)
                 count1, count2 = 0, 0
                 for word1 in list1[:self.topk]:
-                    if word1 in self.wv.wv.vocab:
+                    if word1 in self.wv.key_to_index.keys():
                         centroid1 = centroid1 + self.wv[word1]
                         count1 += 1
                 for word2 in list2[:self.topk]:
-                    if word2 in self.wv.wv.vocab:
+                    if word2 in self.wv.key_to_index.keys():
                         centroid2 = centroid2 + self.wv[word2]
                         count2 += 1
                 centroid1 = centroid1 / count1

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-gensim==3.8.3
+gensim>=4.0.0
 nltk
 pandas
 spacy

diff --git a/tests/test_evaluation_metrics.py b/tests/test_evaluation_metrics.py
@@ -11,7 +11,7 @@
 from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO, KLDivergence, LogOddsRatio, \
     WordEmbeddingsInvertedRBO
 from octis.evaluation_metrics.similarity_metrics import WordEmbeddingsRBOMatch, PairwiseJaccardSimilarity, RBO, \
-    WordEmbeddingsCentroidSimilarity
+    WordEmbeddingsCentroidSimilarity, WordEmbeddingsPairwiseSimilarity
 
 from octis.evaluation_metrics.coherence_metrics import *
 from octis.dataset.dataset import Dataset
@@ -135,6 +135,12 @@ def test_similarity_measures(dataset, model_output):
     assert type(score) == np.float64 or type(score) == float
     assert 0 <= score <= 1
 
+    metric = WordEmbeddingsPairwiseSimilarity(topk=10)
+    score = metric.score(model_output)
+    assert type(score) == np.float64 or type(score) == float
+    assert 0 <= score <= 1
+
+
 
 def test_irbo(dataset, model_output):
     metric = InvertedRBO(topk=10)