From f913ee5a5b92323266cfccf0691292d541bc58ac Mon Sep 17 00:00:00 2001
From: "shengzhe.li" <shengzhe.li@sbintuitions.co.jp>
Date: Mon, 9 Sep 2024 23:23:20 +0900
Subject: [PATCH 1/7] Add ruri-large/base/small to leaderboard

---
 docs/results/cl-nagoya/ruri-base/summary.json | 62 +++++++++++++++++++
 .../results/cl-nagoya/ruri-large/summary.json | 62 +++++++++++++++++++
 .../results/cl-nagoya/ruri-small/summary.json | 62 +++++++++++++++++++
 leaderboard.md                                | 35 ++++++++---
 4 files changed, 214 insertions(+), 7 deletions(-)
 create mode 100644 docs/results/cl-nagoya/ruri-base/summary.json
 create mode 100644 docs/results/cl-nagoya/ruri-large/summary.json
 create mode 100644 docs/results/cl-nagoya/ruri-small/summary.json

diff --git a/docs/results/cl-nagoya/ruri-base/summary.json b/docs/results/cl-nagoya/ruri-base/summary.json
new file mode 100644
index 0000000..a7c7b05
--- /dev/null
+++ b/docs/results/cl-nagoya/ruri-base/summary.json
@@ -0,0 +1,62 @@
+{
+    "Classification": {
+        "amazon_counterfactual_classification": {
+            "macro_f1": 0.7665550732749669
+        },
+        "amazon_review_classification": {
+            "macro_f1": 0.5575876111411316
+        },
+        "massive_intent_classification": {
+            "macro_f1": 0.8141210121425055
+        },
+        "massive_scenario_classification": {
+            "macro_f1": 0.8848812917656395
+        }
+    },
+    "Reranking": {
+        "esci": {
+            "ndcg@10": 0.9290942178703699
+        }
+    },
+    "Retrieval": {
+        "jagovfaqs_22k": {
+            "ndcg@10": 0.7455660589538348
+        },
+        "jaqket": {
+            "ndcg@10": 0.5012253145754781
+        },
+        "mrtydi": {
+            "ndcg@10": 0.3545113073009125
+        },
+        "nlp_journal_abs_intro": {
+            "ndcg@10": 0.8689204088388403
+        },
+        "nlp_journal_title_abs": {
+            "ndcg@10": 0.9656989703684407
+        },
+        "nlp_journal_title_intro": {
+            "ndcg@10": 0.7531306059721564
+        }
+    },
+    "STS": {
+        "jsick": {
+            "spearman": 0.8231772134744029
+        },
+        "jsts": {
+            "spearman": 0.8342848039994751
+        }
+    },
+    "Clustering": {
+        "livedoor_news": {
+            "v_measure_score": 0.5427223607801758
+        },
+        "mewsc16": {
+            "v_measure_score": 0.5404099864321413
+        }
+    },
+    "PairClassification": {
+        "paws_x_ja": {
+            "binary_f1": 0.6237623762376238
+        }
+    }
+}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/ruri-large/summary.json b/docs/results/cl-nagoya/ruri-large/summary.json
new file mode 100644
index 0000000..e86c46b
--- /dev/null
+++ b/docs/results/cl-nagoya/ruri-large/summary.json
@@ -0,0 +1,62 @@
+{
+    "Classification": {
+        "amazon_counterfactual_classification": {
+            "macro_f1": 0.8080806321853091
+        },
+        "amazon_review_classification": {
+            "macro_f1": 0.5680171450057119
+        },
+        "massive_intent_classification": {
+            "macro_f1": 0.8255898596881264
+        },
+        "massive_scenario_classification": {
+            "macro_f1": 0.8956410349938264
+        }
+    },
+    "Reranking": {
+        "esci": {
+            "ndcg@10": 0.9298524733536755
+        }
+    },
+    "Retrieval": {
+        "jagovfaqs_22k": {
+            "ndcg@10": 0.7667506664925435
+        },
+        "jaqket": {
+            "ndcg@10": 0.6173871224245404
+        },
+        "mrtydi": {
+            "ndcg@10": 0.3803302462897418
+        },
+        "nlp_journal_abs_intro": {
+            "ndcg@10": 0.8712459719069233
+        },
+        "nlp_journal_title_abs": {
+            "ndcg@10": 0.9657898747088243
+        },
+        "nlp_journal_title_intro": {
+            "ndcg@10": 0.779665053945222
+        }
+    },
+    "STS": {
+        "jsick": {
+            "spearman": 0.8199959693684533
+        },
+        "jsts": {
+            "spearman": 0.8426164139167538
+        }
+    },
+    "Clustering": {
+        "livedoor_news": {
+            "v_measure_score": 0.5139491572866559
+        },
+        "mewsc16": {
+            "v_measure_score": 0.5225025331595674
+        }
+    },
+    "PairClassification": {
+        "paws_x_ja": {
+            "binary_f1": 0.6228813559322034
+        }
+    }
+}
\ No newline at end of file
diff --git a/docs/results/cl-nagoya/ruri-small/summary.json b/docs/results/cl-nagoya/ruri-small/summary.json
new file mode 100644
index 0000000..cb591ea
--- /dev/null
+++ b/docs/results/cl-nagoya/ruri-small/summary.json
@@ -0,0 +1,62 @@
+{
+    "Classification": {
+        "amazon_counterfactual_classification": {
+            "macro_f1": 0.7991935990685706
+        },
+        "amazon_review_classification": {
+            "macro_f1": 0.556129066893332
+        },
+        "massive_intent_classification": {
+            "macro_f1": 0.8148895285345188
+        },
+        "massive_scenario_classification": {
+            "macro_f1": 0.8787774569382543
+        }
+    },
+    "Reranking": {
+        "esci": {
+            "ndcg@10": 0.9300177985352138
+        }
+    },
+    "Retrieval": {
+        "jagovfaqs_22k": {
+            "ndcg@10": 0.736494039429321
+        },
+        "jaqket": {
+            "ndcg@10": 0.484437639428696
+        },
+        "mrtydi": {
+            "ndcg@10": 0.3342716158897666
+        },
+        "nlp_journal_abs_intro": {
+            "ndcg@10": 0.8768878489670099
+        },
+        "nlp_journal_title_abs": {
+            "ndcg@10": 0.9716879343439146
+        },
+        "nlp_journal_title_intro": {
+            "ndcg@10": 0.7608660955794895
+        }
+    },
+    "STS": {
+        "jsick": {
+            "spearman": 0.8343927017558587
+        },
+        "jsts": {
+            "spearman": 0.8213297790184827
+        }
+    },
+    "Clustering": {
+        "livedoor_news": {
+            "v_measure_score": 0.5096442244018489
+        },
+        "mewsc16": {
+            "v_measure_score": 0.5141045788711239
+        }
+    },
+    "PairClassification": {
+        "paws_x_ja": {
+            "binary_f1": 0.6211267605633802
+        }
+    }
+}
\ No newline at end of file
diff --git a/leaderboard.md b/leaderboard.md
index b07dbca..93d3988 100644
--- a/leaderboard.md
+++ b/leaderboard.md
@@ -7,7 +7,10 @@ The summary shows the average scores within each task.
 
 | Model                                         | Avg.      | Retrieval   | STS       | Classification   | Reranking   | Clustering   | PairClassification   |
 |:----------------------------------------------|:----------|:------------|:----------|:-----------------|:------------|:-------------|:---------------------|
-| OpenAI/text-embedding-3-large                 | **73.97** | **74.48**   | 82.52     | **77.58**        | **93.58**   | **53.32**    | 62.35                |
+| OpenAI/text-embedding-3-large                 | **73.97** | **74.48**   | 82.52     | **77.58**        | **93.58**   | 53.32        | 62.35                |
+| cl-nagoya/ruri-large                          | 73.45     | 73.02       | 83.13     | 77.43            | 92.99       | 51.82        | 62.29                |
+| cl-nagoya/ruri-base                           | 72.95     | 69.82       | 82.87     | 75.58            | 92.91       | **54.16**    | 62.38                |
+| cl-nagoya/ruri-small                          | 72.45     | 69.41       | 82.79     | 76.22            | 93.00       | 51.19        | 62.11                |
 | intfloat/multilingual-e5-large                | 71.65     | 70.98       | 79.70     | 72.89            | 92.96       | 51.24        | 62.15                |
 | OpenAI/text-embedding-3-small                 | 70.86     | 66.39       | 79.46     | 73.06            | 92.92       | 51.06        | 62.27                |
 | pkshatech/GLuCoSE-base-ja                     | 70.44     | 59.02       | 78.71     | 76.82            | 91.90       | 49.78        | **66.39**            |
@@ -33,8 +36,11 @@ The summary shows the average scores within each task.
 ## Retrieval
 | Model                                         | Avg.      | jagovfaqs_22k<br>(ndcg@10)   | jaqket<br>(ndcg@10)   | mrtydi<br>(ndcg@10)   | nlp_journal_abs_intro<br>(ndcg@10)   | nlp_journal_title_abs<br>(ndcg@10)   | nlp_journal_title_intro<br>(ndcg@10)   |
 |:----------------------------------------------|:----------|:-----------------------------|:----------------------|:----------------------|:-------------------------------------|:-------------------------------------|:---------------------------------------|
-| OpenAI/text-embedding-3-large                 | **74.48** | **72.41**                    | 48.21                 | 34.88                 | **99.33**                            | **96.55**                            | **95.47**                              |
-| intfloat/multilingual-e5-large                | 70.98     | 70.30                        | **58.78**             | **43.63**             | 86.00                                | 94.70                                | 72.48                                  |
+| OpenAI/text-embedding-3-large                 | **74.48** | 72.41                        | 48.21                 | 34.88                 | **99.33**                            | 96.55                                | **95.47**                              |
+| cl-nagoya/ruri-large                          | 73.02     | **76.68**                    | **61.74**             | 38.03                 | 87.12                                | 96.58                                | 77.97                                  |
+| intfloat/multilingual-e5-large                | 70.98     | 70.30                        | 58.78                 | **43.63**             | 86.00                                | 94.70                                | 72.48                                  |
+| cl-nagoya/ruri-base                           | 69.82     | 74.56                        | 50.12                 | 35.45                 | 86.89                                | 96.57                                | 75.31                                  |
+| cl-nagoya/ruri-small                          | 69.41     | 73.65                        | 48.44                 | 33.43                 | 87.69                                | **97.17**                            | 76.09                                  |
 | intfloat/multilingual-e5-base                 | 68.21     | 65.34                        | 50.67                 | 38.38                 | 87.10                                | 94.73                                | 73.05                                  |
 | intfloat/multilingual-e5-small                | 67.27     | 64.11                        | 49.97                 | 36.05                 | 85.21                                | 95.26                                | 72.99                                  |
 | OpenAI/text-embedding-3-small                 | 66.39     | 64.02                        | 33.94                 | 20.03                 | 98.47                                | 91.70                                | 90.17                                  |
@@ -60,7 +66,10 @@ The summary shows the average scores within each task.
 | Model                                         | Avg.      | jsick<br>(spearman)   | jsts<br>(spearman)   |
 |:----------------------------------------------|:----------|:----------------------|:---------------------|
 | cl-nagoya/sup-simcse-ja-large                 | **83.18** | **83.80**             | 82.57                |
-| OpenAI/text-embedding-3-large                 | 82.52     | 81.27                 | **83.77**            |
+| cl-nagoya/ruri-large                          | 83.13     | 82.00                 | **84.26**            |
+| cl-nagoya/ruri-base                           | 82.87     | 82.32                 | 83.43                |
+| cl-nagoya/ruri-small                          | 82.79     | 83.44                 | 82.13                |
+| OpenAI/text-embedding-3-large                 | 82.52     | 81.27                 | 83.77                |
 | cl-nagoya/sup-simcse-ja-base                  | 82.05     | 82.83                 | 81.27                |
 | cl-nagoya/unsup-simcse-ja-large               | 80.56     | 80.15                 | 80.98                |
 | intfloat/multilingual-e5-small                | 80.07     | 81.50                 | 78.65                |
@@ -85,9 +94,12 @@ The summary shows the average scores within each task.
 ## Classification
 | Model                                         | Avg.      | amazon_counterfactual<br>(macro_f1)   | amazon_review<br>(macro_f1)   | massive_intent<br>(macro_f1)   | massive_scenario<br>(macro_f1)   |
 |:----------------------------------------------|:----------|:--------------------------------------|:------------------------------|:-------------------------------|:---------------------------------|
-| OpenAI/text-embedding-3-large                 | **77.58** | 77.90                                 | **60.44**                     | **80.91**                      | **91.08**                        |
+| OpenAI/text-embedding-3-large                 | **77.58** | 77.90                                 | **60.44**                     | 80.91                          | **91.08**                        |
+| cl-nagoya/ruri-large                          | 77.43     | 80.81                                 | 56.80                         | **82.56**                      | 89.56                            |
 | pkshatech/GLuCoSE-base-ja                     | 76.82     | **82.44**                             | 58.07                         | 78.85                          | 87.94                            |
 | oshizo/sbert-jsnli-luke-japanese-base-lite    | 76.61     | 79.95                                 | 57.48                         | 80.26                          | 88.75                            |
+| cl-nagoya/ruri-small                          | 76.22     | 79.92                                 | 55.61                         | 81.49                          | 87.88                            |
+| cl-nagoya/ruri-base                           | 75.58     | 76.66                                 | 55.76                         | 81.41                          | 88.49                            |
 | cl-nagoya/unsup-simcse-ja-large               | 74.66     | 76.79                                 | 55.37                         | 79.13                          | 87.36                            |
 | MU-Kindai/Japanese-DiffCSE-BERT-base          | 73.77     | 78.10                                 | 51.56                         | 78.79                          | 86.63                            |
 | cl-nagoya/sup-simcse-ja-large                 | 73.73     | 73.21                                 | 54.76                         | 79.23                          | 87.72                            |
@@ -114,8 +126,11 @@ The summary shows the average scores within each task.
 | OpenAI/text-embedding-3-large                 | **93.58** | **93.58**           |
 | OpenAI/text-embedding-ada-002                 | 93.04     | 93.04               |
 | intfloat/multilingual-e5-small                | 93.03     | 93.03               |
+| cl-nagoya/ruri-small                          | 93.00     | 93.00               |
+| cl-nagoya/ruri-large                          | 92.99     | 92.99               |
 | intfloat/multilingual-e5-large                | 92.96     | 92.96               |
 | OpenAI/text-embedding-3-small                 | 92.92     | 92.92               |
+| cl-nagoya/ruri-base                           | 92.91     | 92.91               |
 | intfloat/multilingual-e5-base                 | 92.85     | 92.85               |
 | pkshatech/GLuCoSE-base-ja                     | 91.90     | 91.90               |
 | cl-nagoya/sup-simcse-ja-base                  | 91.83     | 91.83               |
@@ -137,12 +152,15 @@ The summary shows the average scores within each task.
 ## Clustering
 | Model                                         | Avg.      | livedoor_news<br>(v_measure_score)   | mewsc16<br>(v_measure_score)   |
 |:----------------------------------------------|:----------|:-------------------------------------|:-------------------------------|
-| OpenAI/text-embedding-3-large                 | **53.32** | 57.09                                | 49.55                          |
+| cl-nagoya/ruri-base                           | **54.16** | 54.27                                | **54.04**                      |
+| OpenAI/text-embedding-3-large                 | 53.32     | 57.09                                | 49.55                          |
+| cl-nagoya/ruri-large                          | 51.82     | 51.39                                | 52.25                          |
 | cl-nagoya/sup-simcse-ja-base                  | 51.79     | 52.67                                | 50.91                          |
 | intfloat/multilingual-e5-large                | 51.24     | **57.13**                            | 45.34                          |
+| cl-nagoya/ruri-small                          | 51.19     | 50.96                                | 51.41                          |
 | OpenAI/text-embedding-3-small                 | 51.06     | 54.57                                | 47.55                          |
 | cl-nagoya/sup-simcse-ja-large                 | 50.56     | 50.75                                | 50.38                          |
-| oshizo/sbert-jsnli-luke-japanese-base-lite    | 50.33     | 46.77                                | **53.89**                      |
+| oshizo/sbert-jsnli-luke-japanese-base-lite    | 50.33     | 46.77                                | 53.89                          |
 | pkshatech/GLuCoSE-base-ja                     | 49.78     | 49.89                                | 49.68                          |
 | cl-nagoya/unsup-simcse-ja-large               | 48.41     | 50.90                                | 45.92                          |
 | OpenAI/text-embedding-ada-002                 | 48.30     | 49.67                                | 46.92                          |
@@ -171,6 +189,7 @@ The summary shows the average scores within each task.
 | pkshatech/simcse-ja-bert-base-clcmlp          | 62.40     | 62.40                      |
 | OpenAI/text-embedding-ada-002                 | 62.40     | 62.40                      |
 | MU-Kindai/Japanese-SimCSE-BERT-base-unsup     | 62.38     | 62.38                      |
+| cl-nagoya/ruri-base                           | 62.38     | 62.38                      |
 | oshizo/sbert-jsnli-luke-japanese-base-lite    | 62.38     | 62.38                      |
 | MU-Kindai/Japanese-DiffCSE-BERT-base          | 62.38     | 62.38                      |
 | MU-Kindai/Japanese-SimCSE-BERT-base-sup       | 62.37     | 62.37                      |
@@ -179,10 +198,12 @@ The summary shows the average scores within each task.
 | MU-Kindai/Japanese-MixCSE-BERT-base           | 62.33     | 62.33                      |
 | sentence-transformers/LaBSE                   | 62.33     | 62.33                      |
 | colorfulscoop/sbert-base-ja                   | 62.31     | 62.31                      |
+| cl-nagoya/ruri-large                          | 62.29     | 62.29                      |
 | OpenAI/text-embedding-3-small                 | 62.27     | 62.27                      |
 | MU-Kindai/Japanese-SimCSE-BERT-large-unsup    | 62.27     | 62.27                      |
 | intfloat/multilingual-e5-base                 | 62.26     | 62.26                      |
 | sentence-transformers/stsb-xlm-r-multilingual | 62.20     | 62.20                      |
 | intfloat/multilingual-e5-small                | 62.19     | 62.19                      |
 | intfloat/multilingual-e5-large                | 62.15     | 62.15                      |
+| cl-nagoya/ruri-small                          | 62.11     | 62.11                      |
 

From 73a304dcc36b5a1aa13ec9ca7778dc83d6e5acb6 Mon Sep 17 00:00:00 2001
From: "shengzhe.li" <shengzhe.li@sbintuitions.co.jp>
Date: Tue, 10 Sep 2024 21:24:22 +0900
Subject: [PATCH 2/7] Add pkshatech/RoSEtta-base-ja and
 pkshatech/GLuCoSE-base-ja-v2 to leaderboard

---
 .../pkshatech/GLuCoSE-base-ja-v2/summary.json | 62 +++++++++++++++++++
 .../pkshatech/RoSEtta-base-ja/summary.json    | 62 +++++++++++++++++++
 leaderboard.md                                | 20 +++++-
 3 files changed, 141 insertions(+), 3 deletions(-)
 create mode 100644 docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json
 create mode 100644 docs/results/pkshatech/RoSEtta-base-ja/summary.json

diff --git a/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json b/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json
new file mode 100644
index 0000000..60223bc
--- /dev/null
+++ b/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json
@@ -0,0 +1,62 @@
+{
+    "Classification": {
+        "amazon_counterfactual_classification": {
+            "macro_f1": 0.7528271196943096
+        },
+        "amazon_review_classification": {
+            "macro_f1": 0.5561679575066396
+        },
+        "massive_intent_classification": {
+            "macro_f1": 0.8058990735631814
+        },
+        "massive_scenario_classification": {
+            "macro_f1": 0.8729457394926279
+        }
+    },
+    "Reranking": {
+        "esci": {
+            "ndcg@10": 0.9289703513027785
+        }
+    },
+    "Retrieval": {
+        "jagovfaqs_22k": {
+            "ndcg@10": 0.6842208748694516
+        },
+        "jaqket": {
+            "ndcg@10": 0.666162910609933
+        },
+        "mrtydi": {
+            "ndcg@10": 0.3679312414893066
+        },
+        "nlp_journal_abs_intro": {
+            "ndcg@10": 0.8961561684616985
+        },
+        "nlp_journal_title_abs": {
+            "ndcg@10": 0.9465973412523236
+        },
+        "nlp_journal_title_intro": {
+            "ndcg@10": 0.7514787290834406
+        }
+    },
+    "STS": {
+        "jsick": {
+            "spearman": 0.8499279029619572
+        },
+        "jsts": {
+            "spearman": 0.8150603412605322
+        }
+    },
+    "Clustering": {
+        "livedoor_news": {
+            "v_measure_score": 0.5165568486237136
+        },
+        "mewsc16": {
+            "v_measure_score": 0.4970285237567235
+        }
+    },
+    "PairClassification": {
+        "paws_x_ja": {
+            "binary_f1": 0.6239830208701804
+        }
+    }
+}
\ No newline at end of file
diff --git a/docs/results/pkshatech/RoSEtta-base-ja/summary.json b/docs/results/pkshatech/RoSEtta-base-ja/summary.json
new file mode 100644
index 0000000..5025c4d
--- /dev/null
+++ b/docs/results/pkshatech/RoSEtta-base-ja/summary.json
@@ -0,0 +1,62 @@
+{
+    "Classification": {
+        "amazon_counterfactual_classification": {
+            "macro_f1": 0.7006688790331752
+        },
+        "amazon_review_classification": {
+            "macro_f1": 0.5299983831023539
+        },
+        "massive_intent_classification": {
+            "macro_f1": 0.7952268533717546
+        },
+        "massive_scenario_classification": {
+            "macro_f1": 0.869707847800633
+        }
+    },
+    "Reranking": {
+        "esci": {
+            "ndcg@10": 0.9267539503767978
+        }
+    },
+    "Retrieval": {
+        "jagovfaqs_22k": {
+            "ndcg@10": 0.6379929234552755
+        },
+        "jaqket": {
+            "ndcg@10": 0.6533570255483011
+        },
+        "mrtydi": {
+            "ndcg@10": 0.3407337609040446
+        },
+        "nlp_journal_abs_intro": {
+            "ndcg@10": 0.9577227924391506
+        },
+        "nlp_journal_title_abs": {
+            "ndcg@10": 0.9282272189004226
+        },
+        "nlp_journal_title_intro": {
+            "ndcg@10": 0.7938878816204916
+        }
+    },
+    "STS": {
+        "jsick": {
+            "spearman": 0.8302539464008364
+        },
+        "jsts": {
+            "spearman": 0.7961383132420531
+        }
+    },
+    "Clustering": {
+        "livedoor_news": {
+            "v_measure_score": 0.5503116157834466
+        },
+        "mewsc16": {
+            "v_measure_score": 0.389105324755125
+        }
+    },
+    "PairClassification": {
+        "paws_x_ja": {
+            "binary_f1": 0.6218727662616155
+        }
+    }
+}
\ No newline at end of file
diff --git a/leaderboard.md b/leaderboard.md
index 93d3988..b41c49c 100644
--- a/leaderboard.md
+++ b/leaderboard.md
@@ -10,8 +10,10 @@ The summary shows the average scores within each task.
 | OpenAI/text-embedding-3-large                 | **73.97** | **74.48**   | 82.52     | **77.58**        | **93.58**   | 53.32        | 62.35                |
 | cl-nagoya/ruri-large                          | 73.45     | 73.02       | 83.13     | 77.43            | 92.99       | 51.82        | 62.29                |
 | cl-nagoya/ruri-base                           | 72.95     | 69.82       | 82.87     | 75.58            | 92.91       | **54.16**    | 62.38                |
+| pkshatech/GLuCoSE-base-ja-v2                  | 72.63     | 71.88       | **83.25** | 74.70            | 92.90       | 50.68        | 62.40                |
 | cl-nagoya/ruri-small                          | 72.45     | 69.41       | 82.79     | 76.22            | 93.00       | 51.19        | 62.11                |
 | intfloat/multilingual-e5-large                | 71.65     | 70.98       | 79.70     | 72.89            | 92.96       | 51.24        | 62.15                |
+| pkshatech/RoSEtta-base-ja                     | 71.23     | 71.87       | 81.32     | 72.39            | 92.68       | 46.97        | 62.19                |
 | OpenAI/text-embedding-3-small                 | 70.86     | 66.39       | 79.46     | 73.06            | 92.92       | 51.06        | 62.27                |
 | pkshatech/GLuCoSE-base-ja                     | 70.44     | 59.02       | 78.71     | 76.82            | 91.90       | 49.78        | **66.39**            |
 | intfloat/multilingual-e5-base                 | 70.12     | 68.21       | 79.84     | 69.30            | 92.85       | 48.26        | 62.26                |
@@ -20,7 +22,7 @@ The summary shows the average scores within each task.
 | cl-nagoya/sup-simcse-ja-base                  | 68.56     | 49.64       | 82.05     | 73.47            | 91.83       | 51.79        | 62.57                |
 | MU-Kindai/Japanese-SimCSE-BERT-large-unsup    | 66.89     | 47.38       | 78.99     | 73.13            | 91.30       | 48.25        | 62.27                |
 | oshizo/sbert-jsnli-luke-japanese-base-lite    | 66.75     | 43.00       | 76.60     | 76.61            | 91.56       | 50.33        | 62.38                |
-| cl-nagoya/sup-simcse-ja-large                 | 66.51     | 37.62       | **83.18** | 73.73            | 91.48       | 50.56        | 62.51                |
+| cl-nagoya/sup-simcse-ja-large                 | 66.51     | 37.62       | 83.18     | 73.73            | 91.48       | 50.56        | 62.51                |
 | cl-nagoya/unsup-simcse-ja-large               | 66.27     | 40.53       | 80.56     | 74.66            | 90.95       | 48.41        | 62.49                |
 | MU-Kindai/Japanese-SimCSE-BERT-base-unsup     | 66.23     | 46.36       | 77.49     | 73.30            | 91.16       | 46.68        | 62.38                |
 | MU-Kindai/Japanese-SimCSE-BERT-large-sup      | 65.28     | 40.82       | 78.28     | 73.47            | 90.95       | 45.81        | 62.35                |
@@ -37,7 +39,9 @@ The summary shows the average scores within each task.
 | Model                                         | Avg.      | jagovfaqs_22k<br>(ndcg@10)   | jaqket<br>(ndcg@10)   | mrtydi<br>(ndcg@10)   | nlp_journal_abs_intro<br>(ndcg@10)   | nlp_journal_title_abs<br>(ndcg@10)   | nlp_journal_title_intro<br>(ndcg@10)   |
 |:----------------------------------------------|:----------|:-----------------------------|:----------------------|:----------------------|:-------------------------------------|:-------------------------------------|:---------------------------------------|
 | OpenAI/text-embedding-3-large                 | **74.48** | 72.41                        | 48.21                 | 34.88                 | **99.33**                            | 96.55                                | **95.47**                              |
-| cl-nagoya/ruri-large                          | 73.02     | **76.68**                    | **61.74**             | 38.03                 | 87.12                                | 96.58                                | 77.97                                  |
+| cl-nagoya/ruri-large                          | 73.02     | **76.68**                    | 61.74                 | 38.03                 | 87.12                                | 96.58                                | 77.97                                  |
+| pkshatech/GLuCoSE-base-ja-v2                  | 71.88     | 68.42                        | **66.62**             | 36.79                 | 89.62                                | 94.66                                | 75.15                                  |
+| pkshatech/RoSEtta-base-ja                     | 71.87     | 63.80                        | 65.34                 | 34.07                 | 95.77                                | 92.82                                | 79.39                                  |
 | intfloat/multilingual-e5-large                | 70.98     | 70.30                        | 58.78                 | **43.63**             | 86.00                                | 94.70                                | 72.48                                  |
 | cl-nagoya/ruri-base                           | 69.82     | 74.56                        | 50.12                 | 35.45                 | 86.89                                | 96.57                                | 75.31                                  |
 | cl-nagoya/ruri-small                          | 69.41     | 73.65                        | 48.44                 | 33.43                 | 87.69                                | **97.17**                            | 76.09                                  |
@@ -65,12 +69,14 @@ The summary shows the average scores within each task.
 ## STS
 | Model                                         | Avg.      | jsick<br>(spearman)   | jsts<br>(spearman)   |
 |:----------------------------------------------|:----------|:----------------------|:---------------------|
-| cl-nagoya/sup-simcse-ja-large                 | **83.18** | **83.80**             | 82.57                |
+| pkshatech/GLuCoSE-base-ja-v2                  | **83.25** | **84.99**             | 81.51                |
+| cl-nagoya/sup-simcse-ja-large                 | 83.18     | 83.80                 | 82.57                |
 | cl-nagoya/ruri-large                          | 83.13     | 82.00                 | **84.26**            |
 | cl-nagoya/ruri-base                           | 82.87     | 82.32                 | 83.43                |
 | cl-nagoya/ruri-small                          | 82.79     | 83.44                 | 82.13                |
 | OpenAI/text-embedding-3-large                 | 82.52     | 81.27                 | 83.77                |
 | cl-nagoya/sup-simcse-ja-base                  | 82.05     | 82.83                 | 81.27                |
+| pkshatech/RoSEtta-base-ja                     | 81.32     | 83.03                 | 79.61                |
 | cl-nagoya/unsup-simcse-ja-large               | 80.56     | 80.15                 | 80.98                |
 | intfloat/multilingual-e5-small                | 80.07     | 81.50                 | 78.65                |
 | intfloat/multilingual-e5-base                 | 79.84     | 81.28                 | 78.39                |
@@ -100,6 +106,7 @@ The summary shows the average scores within each task.
 | oshizo/sbert-jsnli-luke-japanese-base-lite    | 76.61     | 79.95                                 | 57.48                         | 80.26                          | 88.75                            |
 | cl-nagoya/ruri-small                          | 76.22     | 79.92                                 | 55.61                         | 81.49                          | 87.88                            |
 | cl-nagoya/ruri-base                           | 75.58     | 76.66                                 | 55.76                         | 81.41                          | 88.49                            |
+| pkshatech/GLuCoSE-base-ja-v2                  | 74.70     | 75.28                                 | 55.62                         | 80.59                          | 87.29                            |
 | cl-nagoya/unsup-simcse-ja-large               | 74.66     | 76.79                                 | 55.37                         | 79.13                          | 87.36                            |
 | MU-Kindai/Japanese-DiffCSE-BERT-base          | 73.77     | 78.10                                 | 51.56                         | 78.79                          | 86.63                            |
 | cl-nagoya/sup-simcse-ja-large                 | 73.73     | 73.21                                 | 54.76                         | 79.23                          | 87.72                            |
@@ -113,6 +120,7 @@ The summary shows the average scores within each task.
 | intfloat/multilingual-e5-large                | 72.89     | 70.66                                 | 56.54                         | 75.78                          | 88.59                            |
 | MU-Kindai/Japanese-SimCSE-BERT-base-sup       | 72.76     | 76.20                                 | 52.06                         | 77.89                          | 84.90                            |
 | sentence-transformers/LaBSE                   | 72.66     | 73.61                                 | 51.70                         | 76.99                          | 88.35                            |
+| pkshatech/RoSEtta-base-ja                     | 72.39     | 70.07                                 | 53.00                         | 79.52                          | 86.97                            |
 | sentence-transformers/stsb-xlm-r-multilingual | 71.84     | 75.65                                 | 51.32                         | 74.28                          | 86.10                            |
 | pkshatech/simcse-ja-bert-base-clcmlp          | 71.30     | 67.49                                 | 50.85                         | 79.67                          | 87.20                            |
 | OpenAI/text-embedding-ada-002                 | 69.75     | 64.42                                 | 53.13                         | 74.57                          | 86.89                            |
@@ -131,7 +139,9 @@ The summary shows the average scores within each task.
 | intfloat/multilingual-e5-large                | 92.96     | 92.96               |
 | OpenAI/text-embedding-3-small                 | 92.92     | 92.92               |
 | cl-nagoya/ruri-base                           | 92.91     | 92.91               |
+| pkshatech/GLuCoSE-base-ja-v2                  | 92.90     | 92.90               |
 | intfloat/multilingual-e5-base                 | 92.85     | 92.85               |
+| pkshatech/RoSEtta-base-ja                     | 92.68     | 92.68               |
 | pkshatech/GLuCoSE-base-ja                     | 91.90     | 91.90               |
 | cl-nagoya/sup-simcse-ja-base                  | 91.83     | 91.83               |
 | sentence-transformers/LaBSE                   | 91.63     | 91.63               |
@@ -159,6 +169,7 @@ The summary shows the average scores within each task.
 | intfloat/multilingual-e5-large                | 51.24     | **57.13**                            | 45.34                          |
 | cl-nagoya/ruri-small                          | 51.19     | 50.96                                | 51.41                          |
 | OpenAI/text-embedding-3-small                 | 51.06     | 54.57                                | 47.55                          |
+| pkshatech/GLuCoSE-base-ja-v2                  | 50.68     | 51.66                                | 49.70                          |
 | cl-nagoya/sup-simcse-ja-large                 | 50.56     | 50.75                                | 50.38                          |
 | oshizo/sbert-jsnli-luke-japanese-base-lite    | 50.33     | 46.77                                | 53.89                          |
 | pkshatech/GLuCoSE-base-ja                     | 49.78     | 49.89                                | 49.68                          |
@@ -167,6 +178,7 @@ The summary shows the average scores within each task.
 | intfloat/multilingual-e5-base                 | 48.26     | 55.03                                | 41.49                          |
 | MU-Kindai/Japanese-SimCSE-BERT-large-unsup    | 48.25     | 53.20                                | 43.31                          |
 | pkshatech/simcse-ja-bert-base-clcmlp          | 47.53     | 44.77                                | 50.30                          |
+| pkshatech/RoSEtta-base-ja                     | 46.97     | 55.03                                | 38.91                          |
 | intfloat/multilingual-e5-small                | 46.91     | 54.70                                | 39.12                          |
 | MU-Kindai/Japanese-SimCSE-BERT-base-unsup     | 46.68     | 53.02                                | 40.35                          |
 | MU-Kindai/Japanese-SimCSE-BERT-large-sup      | 45.81     | 48.45                                | 43.17                          |
@@ -188,6 +200,7 @@ The summary shows the average scores within each task.
 | cl-nagoya/unsup-simcse-ja-base                | 62.44     | 62.44                      |
 | pkshatech/simcse-ja-bert-base-clcmlp          | 62.40     | 62.40                      |
 | OpenAI/text-embedding-ada-002                 | 62.40     | 62.40                      |
+| pkshatech/GLuCoSE-base-ja-v2                  | 62.40     | 62.40                      |
 | MU-Kindai/Japanese-SimCSE-BERT-base-unsup     | 62.38     | 62.38                      |
 | cl-nagoya/ruri-base                           | 62.38     | 62.38                      |
 | oshizo/sbert-jsnli-luke-japanese-base-lite    | 62.38     | 62.38                      |
@@ -204,6 +217,7 @@ The summary shows the average scores within each task.
 | intfloat/multilingual-e5-base                 | 62.26     | 62.26                      |
 | sentence-transformers/stsb-xlm-r-multilingual | 62.20     | 62.20                      |
 | intfloat/multilingual-e5-small                | 62.19     | 62.19                      |
+| pkshatech/RoSEtta-base-ja                     | 62.19     | 62.19                      |
 | intfloat/multilingual-e5-large                | 62.15     | 62.15                      |
 | cl-nagoya/ruri-small                          | 62.11     | 62.11                      |
 

From 9df586a2677472c1286727a9ce15cc5d2c2b4673 Mon Sep 17 00:00:00 2001
From: "shengzhe.li" <shengzhe.li@sbintuitions.co.jp>
Date: Wed, 11 Sep 2024 19:55:46 +0900
Subject: [PATCH 3/7] Fix leaderboard summary average score to use micro-average

---
 leaderboard.md      | 56 ++++++++++++++++++++++-----------------------
 make_leaderboard.py | 14 ++++++++++--
 2 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/leaderboard.md b/leaderboard.md
index b41c49c..45be95b 100644
--- a/leaderboard.md
+++ b/leaderboard.md
@@ -3,37 +3,37 @@ This leaderboard shows the results stored under `docs/results`. The scores are a
 
 ## Summary
 
-The summary shows the average scores within each task.
+The summary shows the average scores within each task. The average score is the micro-average over all datasets.
 
 | Model                                         | Avg.      | Retrieval   | STS       | Classification   | Reranking   | Clustering   | PairClassification   |
 |:----------------------------------------------|:----------|:------------|:----------|:-----------------|:------------|:-------------|:---------------------|
-| OpenAI/text-embedding-3-large                 | **73.97** | **74.48**   | 82.52     | **77.58**        | **93.58**   | 53.32        | 62.35                |
-| cl-nagoya/ruri-large                          | 73.45     | 73.02       | 83.13     | 77.43            | 92.99       | 51.82        | 62.29                |
-| cl-nagoya/ruri-base                           | 72.95     | 69.82       | 82.87     | 75.58            | 92.91       | **54.16**    | 62.38                |
-| pkshatech/GLuCoSE-base-ja-v2                  | 72.63     | 71.88       | **83.25** | 74.70            | 92.90       | 50.68        | 62.40                |
-| cl-nagoya/ruri-small                          | 72.45     | 69.41       | 82.79     | 76.22            | 93.00       | 51.19        | 62.11                |
-| intfloat/multilingual-e5-large                | 71.65     | 70.98       | 79.70     | 72.89            | 92.96       | 51.24        | 62.15                |
-| pkshatech/RoSEtta-base-ja                     | 71.23     | 71.87       | 81.32     | 72.39            | 92.68       | 46.97        | 62.19                |
-| OpenAI/text-embedding-3-small                 | 70.86     | 66.39       | 79.46     | 73.06            | 92.92       | 51.06        | 62.27                |
-| pkshatech/GLuCoSE-base-ja                     | 70.44     | 59.02       | 78.71     | 76.82            | 91.90       | 49.78        | **66.39**            |
-| intfloat/multilingual-e5-base                 | 70.12     | 68.21       | 79.84     | 69.30            | 92.85       | 48.26        | 62.26                |
-| intfloat/multilingual-e5-small                | 69.52     | 67.27       | 80.07     | 67.62            | 93.03       | 46.91        | 62.19                |
-| OpenAI/text-embedding-ada-002                 | 69.48     | 64.38       | 79.02     | 69.75            | 93.04       | 48.30        | 62.40                |
-| cl-nagoya/sup-simcse-ja-base                  | 68.56     | 49.64       | 82.05     | 73.47            | 91.83       | 51.79        | 62.57                |
-| MU-Kindai/Japanese-SimCSE-BERT-large-unsup    | 66.89     | 47.38       | 78.99     | 73.13            | 91.30       | 48.25        | 62.27                |
-| oshizo/sbert-jsnli-luke-japanese-base-lite    | 66.75     | 43.00       | 76.60     | 76.61            | 91.56       | 50.33        | 62.38                |
-| cl-nagoya/sup-simcse-ja-large                 | 66.51     | 37.62       | 83.18     | 73.73            | 91.48       | 50.56        | 62.51                |
-| cl-nagoya/unsup-simcse-ja-large               | 66.27     | 40.53       | 80.56     | 74.66            | 90.95       | 48.41        | 62.49                |
-| MU-Kindai/Japanese-SimCSE-BERT-base-unsup     | 66.23     | 46.36       | 77.49     | 73.30            | 91.16       | 46.68        | 62.38                |
-| MU-Kindai/Japanese-SimCSE-BERT-large-sup      | 65.28     | 40.82       | 78.28     | 73.47            | 90.95       | 45.81        | 62.35                |
-| MU-Kindai/Japanese-MixCSE-BERT-base           | 65.14     | 42.59       | 77.05     | 72.90            | 91.01       | 44.95        | 62.33                |
-| cl-nagoya/unsup-simcse-ja-base                | 65.07     | 40.23       | 78.72     | 73.07            | 91.16       | 44.77        | 62.44                |
-| MU-Kindai/Japanese-DiffCSE-BERT-base          | 64.77     | 41.79       | 75.50     | 73.77            | 90.95       | 44.22        | 62.38                |
-| sentence-transformers/LaBSE                   | 64.70     | 40.12       | 76.56     | 72.66            | 91.63       | 44.88        | 62.33                |
-| pkshatech/simcse-ja-bert-base-clcmlp          | 64.42     | 37.00       | 76.80     | 71.30            | 91.49       | 47.53        | 62.40                |
-| MU-Kindai/Japanese-SimCSE-BERT-base-sup       | 64.15     | 41.32       | 74.66     | 72.76            | 90.66       | 43.11        | 62.37                |
-| colorfulscoop/sbert-base-ja                   | 58.85     | 16.52       | 70.42     | 69.07            | 89.97       | 44.81        | 62.31                |
-| sentence-transformers/stsb-xlm-r-multilingual | 58.01     | 21.00       | 75.40     | 71.84            | 90.20       | 27.46        | 62.20                |
+| OpenAI/text-embedding-3-large                 | **74.05** | **74.48**   | 82.52     | **77.58**        | **93.58**   | 53.32        | 62.35                |
+| cl-nagoya/ruri-large                          | 73.31     | 73.02       | 83.13     | 77.43            | 92.99       | 51.82        | 62.29                |
+| pkshatech/GLuCoSE-base-ja-v2                  | 72.07     | 71.88       | **83.25** | 74.70            | 92.90       | 50.68        | 62.40                |
+| cl-nagoya/ruri-base                           | 71.91     | 69.82       | 82.87     | 75.58            | 92.91       | **54.16**    | 62.38                |
+| cl-nagoya/ruri-small                          | 71.53     | 69.41       | 82.79     | 76.22            | 93.00       | 51.19        | 62.11                |
+| intfloat/multilingual-e5-large                | 70.90     | 70.98       | 79.70     | 72.89            | 92.96       | 51.24        | 62.15                |
+| pkshatech/RoSEtta-base-ja                     | 70.76     | 71.87       | 81.32     | 72.39            | 92.68       | 46.97        | 62.19                |
+| OpenAI/text-embedding-3-small                 | 69.18     | 66.39       | 79.46     | 73.06            | 92.92       | 51.06        | 62.27                |
+| intfloat/multilingual-e5-base                 | 68.61     | 68.21       | 79.84     | 69.30            | 92.85       | 48.26        | 62.26                |
+| intfloat/multilingual-e5-small                | 67.71     | 67.27       | 80.07     | 67.62            | 93.03       | 46.91        | 62.19                |
+| pkshatech/GLuCoSE-base-ja                     | 67.29     | 59.02       | 78.71     | 76.82            | 91.90       | 49.78        | **66.39**            |
+| OpenAI/text-embedding-ada-002                 | 67.21     | 64.38       | 79.02     | 69.75            | 93.04       | 48.30        | 62.40                |
+| cl-nagoya/sup-simcse-ja-base                  | 63.36     | 49.64       | 82.05     | 73.47            | 91.83       | 51.79        | 62.57                |
+| MU-Kindai/Japanese-SimCSE-BERT-large-unsup    | 61.55     | 47.38       | 78.99     | 73.13            | 91.30       | 48.25        | 62.27                |
+| MU-Kindai/Japanese-SimCSE-BERT-base-unsup     | 60.83     | 46.36       | 77.49     | 73.30            | 91.16       | 46.68        | 62.38                |
+| oshizo/sbert-jsnli-luke-japanese-base-lite    | 60.77     | 43.00       | 76.60     | 76.61            | 91.56       | 50.33        | 62.38                |
+| cl-nagoya/unsup-simcse-ja-large               | 59.58     | 40.53       | 80.56     | 74.66            | 90.95       | 48.41        | 62.49                |
+| MU-Kindai/Japanese-MixCSE-BERT-base           | 59.03     | 42.59       | 77.05     | 72.90            | 91.01       | 44.95        | 62.33                |
+| cl-nagoya/sup-simcse-ja-large                 | 58.88     | 37.62       | 83.18     | 73.73            | 91.48       | 50.56        | 62.51                |
+| MU-Kindai/Japanese-SimCSE-BERT-large-sup      | 58.77     | 40.82       | 78.28     | 73.47            | 90.95       | 45.81        | 62.35                |
+| MU-Kindai/Japanese-DiffCSE-BERT-base          | 58.66     | 41.79       | 75.50     | 73.77            | 90.95       | 44.22        | 62.38                |
+| cl-nagoya/unsup-simcse-ja-base                | 58.39     | 40.23       | 78.72     | 73.07            | 91.16       | 44.77        | 62.44                |
+| sentence-transformers/LaBSE                   | 58.01     | 40.12       | 76.56     | 72.66            | 91.63       | 44.88        | 62.33                |
+| MU-Kindai/Japanese-SimCSE-BERT-base-sup       | 57.97     | 41.32       | 74.66     | 72.76            | 90.66       | 43.11        | 62.37                |
+| pkshatech/simcse-ja-bert-base-clcmlp          | 56.86     | 37.00       | 76.80     | 71.30            | 91.49       | 47.53        | 62.40                |
+| sentence-transformers/stsb-xlm-r-multilingual | 48.21     | 21.00       | 75.40     | 71.84            | 90.20       | 27.46        | 62.20                |
+| colorfulscoop/sbert-base-ja                   | 47.38     | 16.52       | 70.42     | 69.07            | 89.97       | 44.81        | 62.31                |
 
 ## Retrieval
 | Model                                         | Avg.      | jagovfaqs_22k<br>(ndcg@10)   | jaqket<br>(ndcg@10)   | mrtydi<br>(ndcg@10)   | nlp_journal_abs_intro<br>(ndcg@10)   | nlp_journal_title_abs<br>(ndcg@10)   | nlp_journal_title_intro<br>(ndcg@10)   |
diff --git a/make_leaderboard.py b/make_leaderboard.py
index ff3a330..0e43ccf 100644
--- a/make_leaderboard.py
+++ b/make_leaderboard.py
@@ -62,7 +62,14 @@ def format_score(score: float) -> str:
     table_list: list[list[str | float]] = []
     for model_signature, dataset_scores in task_results.items():
         model_scores = [dataset_scores[k] for k in dataset_keys]
-        average_score = sum(model_scores) / len(model_scores)
+        if task_name == SUMMARY_KEY:
+            scores_by_dataset = []
+            for _task_name, _task_results in all_results.items():
+                if _task_name != SUMMARY_KEY:
+                    scores_by_dataset.extend(list(_task_results[model_signature].values()))
+            average_score = sum(scores_by_dataset) / len(scores_by_dataset)
+        else:
+            average_score = sum(model_scores) / len(model_scores)
         table_list.append([model_signature, average_score, *model_scores])
 
     # sort by the average score
@@ -97,7 +104,10 @@ def format_score(score: float) -> str:
         f.write(f"## {task_name}\n")
 
         if task_name == SUMMARY_KEY:
-            f.write("\nThe summary shows the average scores within each task.\n\n")
+            f.write(
+                "\nThe summary shows the average scores within each task. "
+                "The average score is the average of scores by dataset.\n\n"
+            )
 
         f.write(markdown_table)
         f.write("\n\n")
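To make the effect of this change concrete, here is a minimal standalone sketch of the two averaging schemes (illustrative only, not part of the patch; the task names and score values below are hypothetical):

# A minimal sketch: how the summary "Avg." changes from a macro-average of
# per-task averages to a micro-average over all dataset scores.
task_scores: dict[str, list[float]] = {
    "Retrieval": [0.74, 0.48, 0.35],  # one score per dataset
    "STS": [0.82, 0.84],
    "Reranking": [0.93],
}

# Old behavior: average the per-task averages, so every task weighs equally.
task_averages = [sum(s) / len(s) for s in task_scores.values()]
macro_avg = sum(task_averages) / len(task_averages)

# New behavior (this patch): average over every dataset score directly,
# so tasks with more datasets carry proportionally more weight.
all_scores = [s for scores in task_scores.values() for s in scores]
micro_avg = sum(all_scores) / len(all_scores)

print(f"macro-average: {macro_avg:.4f}")  # 0.7611
print(f"micro-average: {micro_avg:.4f}")  # 0.6933

Under the micro-average, Retrieval's six datasets outweigh single-dataset tasks such as Reranking and PairClassification, which is why retrieval-strong models move up and retrieval-weak models move down in the updated summary table above.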

From 5242d370be95bfd753c60a821538e79d90bfc84e Mon Sep 17 00:00:00 2001
From: lsz05 <shengzhe.li@sbintuitions.co.jp>
Date: Thu, 12 Sep 2024 11:13:56 +0900
Subject: [PATCH 4/7] Update `pkshatech/RoSEtta-base-ja` scores

https://github.com/sbintuitions/JMTEB/issues/71#issuecomment-2343044173
---
 .../pkshatech/RoSEtta-base-ja/summary.json    | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/docs/results/pkshatech/RoSEtta-base-ja/summary.json b/docs/results/pkshatech/RoSEtta-base-ja/summary.json
index 5025c4d..d82af4b 100644
--- a/docs/results/pkshatech/RoSEtta-base-ja/summary.json
+++ b/docs/results/pkshatech/RoSEtta-base-ja/summary.json
@@ -1,62 +1,62 @@
 {
     "Classification": {
         "amazon_counterfactual_classification": {
-            "macro_f1": 0.7006688790331752
+            "macro_f1": 0.7005147244958231
         },
         "amazon_review_classification": {
-            "macro_f1": 0.5299983831023539
+            "macro_f1": 0.5263680453119501
         },
         "massive_intent_classification": {
-            "macro_f1": 0.7952268533717546
+            "macro_f1": 0.7983787583297884
         },
         "massive_scenario_classification": {
-            "macro_f1": 0.869707847800633
+            "macro_f1": 0.8709593192703351
         }
     },
     "Reranking": {
         "esci": {
-            "ndcg@10": 0.9267539503767978
+            "ndcg@10": 0.9268625513429571
         }
     },
     "Retrieval": {
         "jagovfaqs_22k": {
-            "ndcg@10": 0.6379929234552755
+            "ndcg@10": 0.6595934642903105
         },
         "jaqket": {
-            "ndcg@10": 0.6533570255483011
+            "ndcg@10": 0.6533452086105761
         },
         "mrtydi": {
-            "ndcg@10": 0.3407337609040446
+            "ndcg@10": 0.36731170141136216
         },
         "nlp_journal_abs_intro": {
-            "ndcg@10": 0.9577227924391506
+            "ndcg@10": 0.9553567926226499
         },
         "nlp_journal_title_abs": {
-            "ndcg@10": 0.9282272189004226
+            "ndcg@10": 0.940828991756893
         },
         "nlp_journal_title_intro": {
-            "ndcg@10": 0.7938878816204916
+            "ndcg@10": 0.8163161967769845
         }
     },
     "STS": {
         "jsick": {
-            "spearman": 0.8302539464008364
+            "spearman": 0.8383455453168481
         },
         "jsts": {
-            "spearman": 0.7961383132420531
+            "spearman": 0.7895388048564987
         }
     },
     "Clustering": {
         "livedoor_news": {
-            "v_measure_score": 0.5503116157834466
+            "v_measure_score": 0.5861760622672214
         },
         "mewsc16": {
-            "v_measure_score": 0.389105324755125
+            "v_measure_score": 0.4784844036038961
         }
     },
     "PairClassification": {
         "paws_x_ja": {
-            "binary_f1": 0.6218727662616155
+            "binary_f1": 0.6173974540311173
         }
     }
-}
\ No newline at end of file
+}

From 76fd77a346ac9af565463464750619961180b901 Mon Sep 17 00:00:00 2001
From: lsz05 <shengzhe.li@sbintuitions.co.jp>
Date: Thu, 12 Sep 2024 11:26:33 +0900
Subject: [PATCH 5/7] Update `pkshatech/GLuCoSE-base-ja-v2` scores

https://github.com/sbintuitions/JMTEB/issues/72#issuecomment-2343043103
---
 .../pkshatech/GLuCoSE-base-ja-v2/summary.json | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json b/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json
index 60223bc..7318aab 100644
--- a/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json
+++ b/docs/results/pkshatech/GLuCoSE-base-ja-v2/summary.json
@@ -1,62 +1,62 @@
 {
     "Classification": {
         "amazon_counterfactual_classification": {
-            "macro_f1": 0.7528271196943096
+            "macro_f1": 0.7492232749031491
         },
         "amazon_review_classification": {
-            "macro_f1": 0.5561679575066396
+            "macro_f1": 0.5530707609927811
         },
         "massive_intent_classification": {
-            "macro_f1": 0.8058990735631814
+            "macro_f1": 0.7979144461303402
         },
         "massive_scenario_classification": {
-            "macro_f1": 0.8729457394926279
+            "macro_f1": 0.8683641924034757
         }
     },
     "Reranking": {
         "esci": {
-            "ndcg@10": 0.9289703513027785
+            "ndcg@10": 0.9301469431250418
         }
     },
     "Retrieval": {
         "jagovfaqs_22k": {
-            "ndcg@10": 0.6842208748694516
+            "ndcg@10": 0.6979374757372254
         },
         "jaqket": {
-            "ndcg@10": 0.666162910609933
+            "ndcg@10": 0.6729417850207029
         },
         "mrtydi": {
-            "ndcg@10": 0.3679312414893066
+            "ndcg@10": 0.41858579533990486
         },
         "nlp_journal_abs_intro": {
-            "ndcg@10": 0.8961561684616985
+            "ndcg@10": 0.9029337913460675
         },
         "nlp_journal_title_abs": {
-            "ndcg@10": 0.9465973412523236
+            "ndcg@10": 0.9511153967130517
         },
         "nlp_journal_title_intro": {
-            "ndcg@10": 0.7514787290834406
+            "ndcg@10": 0.7580448576047344
         }
     },
     "STS": {
         "jsick": {
-            "spearman": 0.8499279029619572
+            "spearman": 0.849637366944316
         },
         "jsts": {
-            "spearman": 0.8150603412605322
+            "spearman": 0.8095684318108997
         }
     },
     "Clustering": {
         "livedoor_news": {
-            "v_measure_score": 0.5165568486237136
+            "v_measure_score": 0.5151536908540161
         },
         "mewsc16": {
-            "v_measure_score": 0.4970285237567235
+            "v_measure_score": 0.45782610528001805
         }
     },
     "PairClassification": {
         "paws_x_ja": {
-            "binary_f1": 0.6239830208701804
+            "binary_f1": 0.623716814159292
         }
     }
-}
\ No newline at end of file
+}

From 4dbe7a3da8ddefcaacdb3fbb4fbbc3e46c4ab5da Mon Sep 17 00:00:00 2001
From: "shengzhe.li" <shengzhe.li@sbintuitions.co.jp>
Date: Thu, 12 Sep 2024 11:28:56 +0900
Subject: [PATCH 6/7] Update leaderboard

---
 leaderboard.md | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/leaderboard.md b/leaderboard.md
index 45be95b..4b05e46 100644
--- a/leaderboard.md
+++ b/leaderboard.md
@@ -9,11 +9,11 @@ The summary shows the average scores within each task. The average score is the
 |:----------------------------------------------|:----------|:------------|:----------|:-----------------|:------------|:-------------|:---------------------|
 | OpenAI/text-embedding-3-large                 | **74.05** | **74.48**   | 82.52     | **77.58**        | **93.58**   | 53.32        | 62.35                |
 | cl-nagoya/ruri-large                          | 73.31     | 73.02       | 83.13     | 77.43            | 92.99       | 51.82        | 62.29                |
-| pkshatech/GLuCoSE-base-ja-v2                  | 72.07     | 71.88       | **83.25** | 74.70            | 92.90       | 50.68        | 62.40                |
+| pkshatech/GLuCoSE-base-ja-v2                  | 72.23     | 73.36       | 82.96     | 74.21            | 93.01       | 48.65        | 62.37                |
+| pkshatech/RoSEtta-base-ja                     | 72.04     | 73.21       | 81.39     | 72.41            | 92.69       | 53.23        | 61.74                |
 | cl-nagoya/ruri-base                           | 71.91     | 69.82       | 82.87     | 75.58            | 92.91       | **54.16**    | 62.38                |
 | cl-nagoya/ruri-small                          | 71.53     | 69.41       | 82.79     | 76.22            | 93.00       | 51.19        | 62.11                |
 | intfloat/multilingual-e5-large                | 70.90     | 70.98       | 79.70     | 72.89            | 92.96       | 51.24        | 62.15                |
-| pkshatech/RoSEtta-base-ja                     | 70.76     | 71.87       | 81.32     | 72.39            | 92.68       | 46.97        | 62.19                |
 | OpenAI/text-embedding-3-small                 | 69.18     | 66.39       | 79.46     | 73.06            | 92.92       | 51.06        | 62.27                |
 | intfloat/multilingual-e5-base                 | 68.61     | 68.21       | 79.84     | 69.30            | 92.85       | 48.26        | 62.26                |
 | intfloat/multilingual-e5-small                | 67.71     | 67.27       | 80.07     | 67.62            | 93.03       | 46.91        | 62.19                |
@@ -25,7 +25,7 @@ The summary shows the average scores within each task. The average score is the
 | oshizo/sbert-jsnli-luke-japanese-base-lite    | 60.77     | 43.00       | 76.60     | 76.61            | 91.56       | 50.33        | 62.38                |
 | cl-nagoya/unsup-simcse-ja-large               | 59.58     | 40.53       | 80.56     | 74.66            | 90.95       | 48.41        | 62.49                |
 | MU-Kindai/Japanese-MixCSE-BERT-base           | 59.03     | 42.59       | 77.05     | 72.90            | 91.01       | 44.95        | 62.33                |
-| cl-nagoya/sup-simcse-ja-large                 | 58.88     | 37.62       | 83.18     | 73.73            | 91.48       | 50.56        | 62.51                |
+| cl-nagoya/sup-simcse-ja-large                 | 58.88     | 37.62       | **83.18** | 73.73            | 91.48       | 50.56        | 62.51                |
 | MU-Kindai/Japanese-SimCSE-BERT-large-sup      | 58.77     | 40.82       | 78.28     | 73.47            | 90.95       | 45.81        | 62.35                |
 | MU-Kindai/Japanese-DiffCSE-BERT-base          | 58.66     | 41.79       | 75.50     | 73.77            | 90.95       | 44.22        | 62.38                |
 | cl-nagoya/unsup-simcse-ja-base                | 58.39     | 40.23       | 78.72     | 73.07            | 91.16       | 44.77        | 62.44                |
@@ -39,9 +39,9 @@ The summary shows the average scores within each task. The average score is the
 | Model                                         | Avg.      | jagovfaqs_22k<br>(ndcg@10)   | jaqket<br>(ndcg@10)   | mrtydi<br>(ndcg@10)   | nlp_journal_abs_intro<br>(ndcg@10)   | nlp_journal_title_abs<br>(ndcg@10)   | nlp_journal_title_intro<br>(ndcg@10)   |
 |:----------------------------------------------|:----------|:-----------------------------|:----------------------|:----------------------|:-------------------------------------|:-------------------------------------|:---------------------------------------|
 | OpenAI/text-embedding-3-large                 | **74.48** | 72.41                        | 48.21                 | 34.88                 | **99.33**                            | 96.55                                | **95.47**                              |
+| pkshatech/GLuCoSE-base-ja-v2                  | 73.36     | 69.79                        | **67.29**             | 41.86                 | 90.29                                | 95.11                                | 75.80                                  |
+| pkshatech/RoSEtta-base-ja                     | 73.21     | 65.96                        | 65.33                 | 36.73                 | 95.54                                | 94.08                                | 81.63                                  |
 | cl-nagoya/ruri-large                          | 73.02     | **76.68**                    | 61.74                 | 38.03                 | 87.12                                | 96.58                                | 77.97                                  |
-| pkshatech/GLuCoSE-base-ja-v2                  | 71.88     | 68.42                        | **66.62**             | 36.79                 | 89.62                                | 94.66                                | 75.15                                  |
-| pkshatech/RoSEtta-base-ja                     | 71.87     | 63.80                        | 65.34                 | 34.07                 | 95.77                                | 92.82                                | 79.39                                  |
 | intfloat/multilingual-e5-large                | 70.98     | 70.30                        | 58.78                 | **43.63**             | 86.00                                | 94.70                                | 72.48                                  |
 | cl-nagoya/ruri-base                           | 69.82     | 74.56                        | 50.12                 | 35.45                 | 86.89                                | 96.57                                | 75.31                                  |
 | cl-nagoya/ruri-small                          | 69.41     | 73.65                        | 48.44                 | 33.43                 | 87.69                                | **97.17**                            | 76.09                                  |
@@ -69,14 +69,14 @@ The summary shows the average scores within each task. The average score is the
 ## STS
 | Model                                         | Avg.      | jsick<br>(spearman)   | jsts<br>(spearman)   |
 |:----------------------------------------------|:----------|:----------------------|:---------------------|
-| pkshatech/GLuCoSE-base-ja-v2                  | **83.25** | **84.99**             | 81.51                |
-| cl-nagoya/sup-simcse-ja-large                 | 83.18     | 83.80                 | 82.57                |
+| cl-nagoya/sup-simcse-ja-large                 | **83.18** | 83.80                 | 82.57                |
 | cl-nagoya/ruri-large                          | 83.13     | 82.00                 | **84.26**            |
+| pkshatech/GLuCoSE-base-ja-v2                  | 82.96     | **84.96**             | 80.96                |
 | cl-nagoya/ruri-base                           | 82.87     | 82.32                 | 83.43                |
 | cl-nagoya/ruri-small                          | 82.79     | 83.44                 | 82.13                |
 | OpenAI/text-embedding-3-large                 | 82.52     | 81.27                 | 83.77                |
 | cl-nagoya/sup-simcse-ja-base                  | 82.05     | 82.83                 | 81.27                |
-| pkshatech/RoSEtta-base-ja                     | 81.32     | 83.03                 | 79.61                |
+| pkshatech/RoSEtta-base-ja                     | 81.39     | 83.83                 | 78.95                |
 | cl-nagoya/unsup-simcse-ja-large               | 80.56     | 80.15                 | 80.98                |
 | intfloat/multilingual-e5-small                | 80.07     | 81.50                 | 78.65                |
 | intfloat/multilingual-e5-base                 | 79.84     | 81.28                 | 78.39                |
@@ -106,8 +106,8 @@ The summary shows the average scores within each task. The average score is the
 | oshizo/sbert-jsnli-luke-japanese-base-lite    | 76.61     | 79.95                                 | 57.48                         | 80.26                          | 88.75                            |
 | cl-nagoya/ruri-small                          | 76.22     | 79.92                                 | 55.61                         | 81.49                          | 87.88                            |
 | cl-nagoya/ruri-base                           | 75.58     | 76.66                                 | 55.76                         | 81.41                          | 88.49                            |
-| pkshatech/GLuCoSE-base-ja-v2                  | 74.70     | 75.28                                 | 55.62                         | 80.59                          | 87.29                            |
 | cl-nagoya/unsup-simcse-ja-large               | 74.66     | 76.79                                 | 55.37                         | 79.13                          | 87.36                            |
+| pkshatech/GLuCoSE-base-ja-v2                  | 74.21     | 74.92                                 | 55.31                         | 79.79                          | 86.84                            |
 | MU-Kindai/Japanese-DiffCSE-BERT-base          | 73.77     | 78.10                                 | 51.56                         | 78.79                          | 86.63                            |
 | cl-nagoya/sup-simcse-ja-large                 | 73.73     | 73.21                                 | 54.76                         | 79.23                          | 87.72                            |
 | MU-Kindai/Japanese-SimCSE-BERT-large-sup      | 73.47     | 77.25                                 | 53.42                         | 76.83                          | 86.39                            |
@@ -120,7 +120,7 @@ The summary shows the average scores within each task. The average score is the
 | intfloat/multilingual-e5-large                | 72.89     | 70.66                                 | 56.54                         | 75.78                          | 88.59                            |
 | MU-Kindai/Japanese-SimCSE-BERT-base-sup       | 72.76     | 76.20                                 | 52.06                         | 77.89                          | 84.90                            |
 | sentence-transformers/LaBSE                   | 72.66     | 73.61                                 | 51.70                         | 76.99                          | 88.35                            |
-| pkshatech/RoSEtta-base-ja                     | 72.39     | 70.07                                 | 53.00                         | 79.52                          | 86.97                            |
+| pkshatech/RoSEtta-base-ja                     | 72.41     | 70.05                                 | 52.64                         | 79.84                          | 87.10                            |
 | sentence-transformers/stsb-xlm-r-multilingual | 71.84     | 75.65                                 | 51.32                         | 74.28                          | 86.10                            |
 | pkshatech/simcse-ja-bert-base-clcmlp          | 71.30     | 67.49                                 | 50.85                         | 79.67                          | 87.20                            |
 | OpenAI/text-embedding-ada-002                 | 69.75     | 64.42                                 | 53.13                         | 74.57                          | 86.89                            |
@@ -134,14 +134,14 @@ The summary shows the average scores within each task. The average score is the
 | OpenAI/text-embedding-3-large                 | **93.58** | **93.58**           |
 | OpenAI/text-embedding-ada-002                 | 93.04     | 93.04               |
 | intfloat/multilingual-e5-small                | 93.03     | 93.03               |
+| pkshatech/GLuCoSE-base-ja-v2                  | 93.01     | 93.01               |
 | cl-nagoya/ruri-small                          | 93.00     | 93.00               |
 | cl-nagoya/ruri-large                          | 92.99     | 92.99               |
 | intfloat/multilingual-e5-large                | 92.96     | 92.96               |
 | OpenAI/text-embedding-3-small                 | 92.92     | 92.92               |
 | cl-nagoya/ruri-base                           | 92.91     | 92.91               |
-| pkshatech/GLuCoSE-base-ja-v2                  | 92.90     | 92.90               |
 | intfloat/multilingual-e5-base                 | 92.85     | 92.85               |
-| pkshatech/RoSEtta-base-ja                     | 92.68     | 92.68               |
+| pkshatech/RoSEtta-base-ja                     | 92.69     | 92.69               |
 | pkshatech/GLuCoSE-base-ja                     | 91.90     | 91.90               |
 | cl-nagoya/sup-simcse-ja-base                  | 91.83     | 91.83               |
 | sentence-transformers/LaBSE                   | 91.63     | 91.63               |
@@ -164,21 +164,21 @@ The summary shows the average scores within each task. The average score is the
 |:----------------------------------------------|:----------|:-------------------------------------|:-------------------------------|
 | cl-nagoya/ruri-base                           | **54.16** | 54.27                                | **54.04**                      |
 | OpenAI/text-embedding-3-large                 | 53.32     | 57.09                                | 49.55                          |
+| pkshatech/RoSEtta-base-ja                     | 53.23     | **58.62**                            | 47.85                          |
 | cl-nagoya/ruri-large                          | 51.82     | 51.39                                | 52.25                          |
 | cl-nagoya/sup-simcse-ja-base                  | 51.79     | 52.67                                | 50.91                          |
-| intfloat/multilingual-e5-large                | 51.24     | **57.13**                            | 45.34                          |
+| intfloat/multilingual-e5-large                | 51.24     | 57.13                                | 45.34                          |
 | cl-nagoya/ruri-small                          | 51.19     | 50.96                                | 51.41                          |
 | OpenAI/text-embedding-3-small                 | 51.06     | 54.57                                | 47.55                          |
-| pkshatech/GLuCoSE-base-ja-v2                  | 50.68     | 51.66                                | 49.70                          |
 | cl-nagoya/sup-simcse-ja-large                 | 50.56     | 50.75                                | 50.38                          |
 | oshizo/sbert-jsnli-luke-japanese-base-lite    | 50.33     | 46.77                                | 53.89                          |
 | pkshatech/GLuCoSE-base-ja                     | 49.78     | 49.89                                | 49.68                          |
+| pkshatech/GLuCoSE-base-ja-v2                  | 48.65     | 51.52                                | 45.78                          |
 | cl-nagoya/unsup-simcse-ja-large               | 48.41     | 50.90                                | 45.92                          |
 | OpenAI/text-embedding-ada-002                 | 48.30     | 49.67                                | 46.92                          |
 | intfloat/multilingual-e5-base                 | 48.26     | 55.03                                | 41.49                          |
 | MU-Kindai/Japanese-SimCSE-BERT-large-unsup    | 48.25     | 53.20                                | 43.31                          |
 | pkshatech/simcse-ja-bert-base-clcmlp          | 47.53     | 44.77                                | 50.30                          |
-| pkshatech/RoSEtta-base-ja                     | 46.97     | 55.03                                | 38.91                          |
 | intfloat/multilingual-e5-small                | 46.91     | 54.70                                | 39.12                          |
 | MU-Kindai/Japanese-SimCSE-BERT-base-unsup     | 46.68     | 53.02                                | 40.35                          |
 | MU-Kindai/Japanese-SimCSE-BERT-large-sup      | 45.81     | 48.45                                | 43.17                          |
@@ -200,11 +200,11 @@ The summary shows the average scores within each task. The average score is the
 | cl-nagoya/unsup-simcse-ja-base                | 62.44     | 62.44                      |
 | pkshatech/simcse-ja-bert-base-clcmlp          | 62.40     | 62.40                      |
 | OpenAI/text-embedding-ada-002                 | 62.40     | 62.40                      |
-| pkshatech/GLuCoSE-base-ja-v2                  | 62.40     | 62.40                      |
 | MU-Kindai/Japanese-SimCSE-BERT-base-unsup     | 62.38     | 62.38                      |
 | cl-nagoya/ruri-base                           | 62.38     | 62.38                      |
 | oshizo/sbert-jsnli-luke-japanese-base-lite    | 62.38     | 62.38                      |
 | MU-Kindai/Japanese-DiffCSE-BERT-base          | 62.38     | 62.38                      |
+| pkshatech/GLuCoSE-base-ja-v2                  | 62.37     | 62.37                      |
 | MU-Kindai/Japanese-SimCSE-BERT-base-sup       | 62.37     | 62.37                      |
 | MU-Kindai/Japanese-SimCSE-BERT-large-sup      | 62.35     | 62.35                      |
 | OpenAI/text-embedding-3-large                 | 62.35     | 62.35                      |
@@ -217,7 +217,7 @@ The summary shows the average scores within each task. The average score is the
 | intfloat/multilingual-e5-base                 | 62.26     | 62.26                      |
 | sentence-transformers/stsb-xlm-r-multilingual | 62.20     | 62.20                      |
 | intfloat/multilingual-e5-small                | 62.19     | 62.19                      |
-| pkshatech/RoSEtta-base-ja                     | 62.19     | 62.19                      |
 | intfloat/multilingual-e5-large                | 62.15     | 62.15                      |
 | cl-nagoya/ruri-small                          | 62.11     | 62.11                      |
+| pkshatech/RoSEtta-base-ja                     | 61.74     | 61.74                      |
 

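Note: most of the churn in the leaderboard hunks above is mechanical. Rows are re-sorted by their new averages, and the `**bold**` markers move because they flag the best score in each column: once GLuCoSE-base-ja-v2's STS average drops from 83.25 to 82.96, the bold shifts to sup-simcse-ja-large's 83.18, and RoSEtta-base-ja's 58.62 takes the livedoor_news lead from multilingual-e5-large's 57.13. A minimal sketch of that re-bolding step; the helper and its inputs are illustrative, not from this repo:

    def bold_max(column: dict[str, float]) -> dict[str, str]:
        # Render one leaderboard column, bolding the column leader.
        best = max(column.values())
        return {
            model: f"**{v:.2f}**" if v == best else f"{v:.2f}"
            for model, v in column.items()
        }

    sts_avg = {  # updated STS averages from this patch
        "cl-nagoya/sup-simcse-ja-large": 83.18,
        "cl-nagoya/ruri-large": 83.13,
        "pkshatech/GLuCoSE-base-ja-v2": 82.96,
    }
    print(bold_max(sts_avg))  # sup-simcse-ja-large now carries the ** markers
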
From d3f81ec96476798545418a0366676d07145b179b Mon Sep 17 00:00:00 2001
From: "shengzhe.li" <shengzhe.li@sbintuitions.co.jp>
Date: Tue, 17 Sep 2024 14:48:12 +0900
Subject: [PATCH 7/7] Bump version to 1.3.2

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 57e4f74..743b6bd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ description = "The evaluation scripts for JMTEB (Japanese Massive Text Embedding
 name = "JMTEB"
 packages = [{from = "src", include = "jmteb"}]
 readme = "README.md"
-version = "1.3.1"
+version = "1.3.2"
 
 [tool.poetry.dependencies]
 python = ">=3.10,<4.0"
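
Note: since the project is managed with Poetry, the same bump could equally be produced with `poetry version patch`, which rewrites the `version` field from 1.3.1 to 1.3.2 in place.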