add template

wangyuxinwhy · Jun 18, 2023 · 7a4ad34 · 7a4ad34
1 parent f952089
commit 7a4ad34
Show file tree

Hide file tree

Showing 13 changed files with 175 additions and 36 deletions.
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,28 @@
+name: 🐛 Bug Report
+description: 提交报告以帮助我们重现和修复 bug
+labels: bug
+assignees:
+  - wangyuxinwhy
+body:
+  - type: markdown
+    attributes:
+      value: >
+        在提交您遇到的 bug 之前, 请在 [这里](https://github.com/wangyuxinwhy/uniem/issues?q=) 看看有没有人有类似的问题 🌝
+  - type: textarea
+    id: description
+    attributes:
+      label: 🐛 bug 说明
+    validations:
+      required: true
+  - type: dropdown
+    id: python-version
+    attributes:
+      label: Python Version
+      options:
+        - "3.10"
+        - "3.11"  # quoted like "3.10" so YAML keeps every option a string
+        - other
+  - type: markdown
+    attributes:
+      value: >
+        感谢您的支持 🎉!
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml
@@ -0,0 +1,16 @@
+name: 🚀 Feature request
+description: 提交您希望 uniem 支持的新特性
+labels: enhancement
+assignees:
+  - wangyuxinwhy
+body:
+  - type: textarea
+    id: description
+    attributes:
+      label: 🚀 The feature
+    validations:
+      required: true
+  - type: markdown
+    attributes:
+      value: >
+        感谢您的支持 🎉!
diff --git a/.github/ISSUE_TEMPLATE/mtebzh_dataset.yml b/.github/ISSUE_TEMPLATE/mtebzh_dataset.yml
@@ -0,0 +1,45 @@
+name: 💯 MTEB-zh Dataset
+description: 提交您希望 MTEB-zh 支持的数据集
+labels: dataset
+assignees:
+  - wangyuxinwhy
+body:
+  - type: markdown
+    attributes:
+      value: |
+        MTEB-zh 是以 [MTEB](https://github.com/embeddings-benchmark/mteb) 为基础，为中文文本嵌入模型设计的 Benchmark。
+        您需要提供以下信息，以帮助 uniem 团队将您提供的数据集加入到 MTEB-zh 评测标准中。
+  - type: input
+    id: name
+    attributes:
+      label: dataset name
+      description: 数据集的名称，请使用英文字母，数字，- 或者 _
+      placeholder: "Example: MedQQpairs" 
+    validations:
+      required: true
+  - type: input
+    id: id
+    attributes:
+      label: huggingface dataset id
+      description: MTEB-zh 和 MTEB 一样，需要您将数据集开源并托管在 [HuggingFace Datasets](https://huggingface.co/datasets)
+      placeholder: "Example: vegaviazhang/Med_QQpairs"
+    validations:
+      required: true
+  - type: dropdown  # optional: no validations block, so it may be left unselected
+    id: task_type
+    attributes:
+      label: task type
+      description: MTEB-zh 支持多种类型的文本嵌入评测任务，比如文本分类，文本检索，文本聚类等，如果您不确定您的数据集属于何种任务，可以不进行选择。
+      options:
+        - Classification
+        - PairClassification
+        - Reranking
+        - Retrieval
+        - BitextMining
+        - Clustering
+        - STS
+        - Summarization
+  - type: textarea
+    id: description
+    attributes:
+      label: dataset description
diff --git a/.github/ISSUE_TEMPLATE/mtebzh_model.yml b/.github/ISSUE_TEMPLATE/mtebzh_model.yml
@@ -0,0 +1,45 @@
+name: 💎 MTEB-zh Model
+description: 提交您希望 MTEB-zh 支持的模型
+labels: model
+assignees:
+  - wangyuxinwhy
+body:
+  - type: markdown
+    attributes:
+      value: |
+        MTEB-zh 是以 [MTEB](https://github.com/embeddings-benchmark/mteb) 为基础，为中文文本嵌入模型设计的 Benchmark。
+        您需要提供以下信息，以帮助 uniem 团队将您提供的模型加入到 MTEB-zh 评测标准中。
+  - type: input
+    id: name
+    attributes:
+      label: model name
+      description: 模型的名称，请使用英文字母，数字，- 或者 _
+      placeholder: "Example: m3e-base" 
+    validations:
+      required: true
+  - type: dropdown
+    id: type
+    attributes:
+      label: model type
+      description: |
+        MTEB-zh 支持多种类型的文本嵌入模型，比如 sentence_transformer, text2vec, luotuo 等。
+        您可以在[这里](https://github.com/wangyuxinwhy/uniem/blob/main/mteb-zh/mteb_zh/models.py)找到全部支持的类型。
+        如果 MTEB-zh 还未支持您的模型，请选择 others ，并在 model description 提供使用方式。
+      options:
+        - sentence_transformer
+        - text2vec
+        - luotuo
+        - erlangshen
+        - others
+    validations:  # must be a sibling of `attributes`, as in the other form elements
+      required: true
+  - type: input
+    id: id
+    attributes:
+      label: huggingface model id
+      description: 如果您的模型支持通过 [HuggingFace Models](https://huggingface.co/models) 进行加载，请提供 model id
+      placeholder: "Example: moka-ai/m3e-base"
+  - type: textarea
+    id: description
+    attributes:
+      label: model description
diff --git a/mteb-zh/mteb_zh/models.py b/mteb-zh/mteb_zh/models.py
@@ -28,39 +28,52 @@ class ModelType(str, Enum):
     azure = 'azure'
 
 
-def load_model(model_type: ModelType, model_name: str | None = None) -> MTEBModel:
+def load_model(model_type: ModelType, model_id: str | None = None) -> MTEBModel:
+    """Build the embedding model selected by ``model_type``.
+
+    ``model_id`` is required for ``sentence_transformer``; every other type
+    falls back to a default model when ``model_id`` is ``None``.
+
+    Raises:
+        ValueError: if ``model_id`` is missing for ``sentence_transformer`` or
+            ``model_type`` is not handled below.
+        ImportError: if ``text2vec`` is requested but not installed.
+    """
     match model_type:
         case ModelType.sentence_transformer:
-            if model_name is None:
-                raise ValueError('model_name must be specified for sentence_transformer')
-            return SentenceTransformer(model_name)
+            if model_id is None:
+                raise ValueError('model_id must be specified for sentence_transformer')
+            return SentenceTransformer(model_id)
         case ModelType.text2vec:
-            from text2vec import SentenceModel
+            try:
+                from text2vec import SentenceModel  # type: ignore
+            except ImportError as err:
+                raise ImportError('text2vec is not installed, please install it with "pip install text2vec"') from err
 
-            if model_name is None:
+            if model_id is None:
                 return SentenceModel()
             else:
-                return SentenceModel(model_name)
+                return SentenceModel(model_id)
         case ModelType.openai:
-            if model_name is None:
+            if model_id is None:
                 return OpenAIModel(model_name='text-embedding-ada-002')
             else:
-                return OpenAIModel(model_name=model_name)
+                return OpenAIModel(model_name=model_id)
         case ModelType.azure:
-            if model_name is None:
+            if model_id is None:
                 return AzureModel(model_name='text-embedding-ada-002')
             else:
-                return AzureModel(model_name=model_name)
+                return AzureModel(model_name=model_id)
         case ModelType.luotuo:
-            if model_name is None:
+            if model_id is None:
                 return LuotuoBertModel(model_name='silk-road/luotuo-bert')
             else:
-                return LuotuoBertModel(model_name=model_name)
+                return LuotuoBertModel(model_name=model_id)
         case ModelType.erlangshen:
-            if model_name is None:
+            if model_id is None:
                 return ErLangShenModel(model_name='IDEA-CCNL/Erlangshen-SimCSE-110M-Chinese')
             else:
-                return ErLangShenModel(model_name=model_name)
+                return ErLangShenModel(model_name=model_id)
         case _:
             raise ValueError(f'Unknown model type: {model_type}')
 

diff --git a/mteb-zh/run_mteb_zh.py b/mteb-zh/run_mteb_zh.py
@@ -34,21 +34,23 @@
 
 def main(
     model_type: Annotated[ModelType, typer.Option()],
+    model_id: str | None = None,
     model_name: str | None = None,
     task_type: TaskType = TaskType.Classification,
     output_folder: Path = Path('results'),
 ):
     output_folder = Path(output_folder)
-    model = load_model(model_type, model_name)
+    model = load_model(model_type, model_id)
 
     if task_type is TaskType.All:
         tasks = default_tasks
     else:
         tasks = [task for task in default_tasks if task.description['type'] == task_type.value]  # type: ignore
 
     evaluation = MTEB(tasks=tasks)
-    model_id = model_type.value + (f'-{model_name.replace("/", "-")}' if model_name else '')
-    evaluation.run(model, output_folder=str(output_folder / model_id))
+    if model_name is None:
+        model_name = model_type.value + (f'-{model_id.replace("/", "-")}' if model_id else '')
+    evaluation.run(model, output_folder=str(output_folder / model_name))
 
 
 if __name__ == '__main__':

diff --git a/scripts/process_zh_datasets.py b/scripts/process_zh_datasets.py
@@ -298,7 +298,7 @@ def load_miracl():
     dataset_dict = cast(DatasetDict, dataset_dict)
 
     try:
-        from zhconv import convert
+        from zhconv import convert  # type: ignore
     except ImportError:
         raise ImportError('Please install zhconv first: pip install zhconv')
 

diff --git a/uniem/data.py b/uniem/data.py
@@ -2,13 +2,13 @@
 from collections import defaultdict
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, cast, Sequence
+from typing import Any, Sequence, cast
 
 import torch
+from datasets import Dataset as HfDataset
 from torch.utils.data import Dataset, RandomSampler
 
-from datasets import Dataset as HfDataset
-from uniem.data_structures import RecordType, PairRecord, ScoredPairRecord, TripletRecord, get_record_type, record_type_cls_map
+from uniem.data_structures import PairRecord, RecordType, ScoredPairRecord, TripletRecord, get_record_type, record_type_cls_map
 from uniem.types import Tokenizer
 
 

diff --git a/uniem/finetuner.py b/uniem/finetuner.py
@@ -1,17 +1,18 @@
 import functools
-import os
 import logging
+import os
 from pathlib import Path
 from typing import Sequence, cast
 
 import torch
-from datasets import DatasetDict as HFDatasetDict, Dataset as HFDataset
 from accelerate import Accelerator
 from accelerate.utils import ProjectConfiguration, set_seed
+from datasets import Dataset as HFDataset
+from datasets import DatasetDict as HFDatasetDict
 from torch.utils.data import DataLoader
 from transformers import AutoTokenizer, get_cosine_schedule_with_warmup  # type: ignore
 
-from uniem.data import PairCollator, ScoredPairCollator, TripletCollator, FinetuneDataset, PrefixFinetuneDataset
+from uniem.data import FinetuneDataset, PairCollator, PrefixFinetuneDataset, ScoredPairCollator, TripletCollator
 from uniem.data_structures import RecordType, get_record_type
 from uniem.model import (
     EmbedderForPairInBatchNegTrain,
@@ -25,7 +26,6 @@
 from uniem.types import MixedPrecisionType
 from uniem.utils import create_adamw_optimizer, split_dataset_dict
 
-
 logger = logging.getLogger(__name__)
 RawDataset = Sequence[dict] | dict[str, Sequence[dict]] | HFDatasetDict | HFDataset
 
@@ -219,7 +219,7 @@ def __init__(
         super().__init__(model_name_or_path, dataset)
         self.special_prefix_tokens = additional_special_tokens
         self.prefix = ''.join(self.special_prefix_tokens) if prefix is None else prefix
-        self.tokenizer.add_special_tokens({'additional_special_tokens': additional_special_tokens})
+        self.tokenizer.add_special_tokens({'additional_special_tokens': additional_special_tokens})  # type: ignore
         self.additional_special_token_ids = self.tokenizer.convert_tokens_to_ids(additional_special_tokens)
         self.only_train_additional_special_tokens = only_train_additional_special_tokens
 
@@ -241,9 +241,10 @@ def create_finetune_model(
         if self.only_train_additional_special_tokens:
             for param in model.parameters():
                 param.requires_grad = False
-            embedding_layer = model.embedder.encoder.get_input_embeddings()
-            embedding_layer.weight.requires_grad = True
-            embedding_layer.weight.register_hook(hook)
+            embedding_layer_weight = model.embedder.encoder.get_input_embeddings().weight
+            embedding_layer_weight = cast(torch.nn.Parameter, embedding_layer_weight)
+            embedding_layer_weight.requires_grad = True
+            embedding_layer_weight.register_hook(hook)
         return model
 
     def run(

diff --git a/uniem/model.py b/uniem/model.py
@@ -2,10 +2,10 @@
 from pathlib import Path
 from typing import ClassVar, Literal, Type, TypeVar, cast
 
-import tqdm
-import torch
 import numpy as np
-from transformers import AutoTokenizer, AutoConfig, AutoModel, PreTrainedModel  # type: ignore
+import torch
+import tqdm
+from transformers import AutoConfig, AutoModel, AutoTokenizer, PreTrainedModel  # type: ignore
 
 from uniem.criteria import (
     CoSentLoss,

diff --git a/uniem/trainer.py b/uniem/trainer.py
@@ -3,11 +3,11 @@
 from typing import Any
 
 import torch
-from tqdm.auto import tqdm
 from accelerate import Accelerator
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LRScheduler
 from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
 
 
 class Trainer:

diff --git a/uniem/types.py b/uniem/types.py
@@ -2,11 +2,10 @@
 from enum import Enum
 from typing import Callable, TypeAlias
 
+from datasets import DatasetDict
 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
 
-from datasets import DatasetDict
-
 Tokenizer: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast
 
 

diff --git a/uniem/utils.py b/uniem/utils.py
@@ -1,5 +1,5 @@
-from itertools import islice
 import logging
+from itertools import islice
 from typing import Generator, Iterable, TypeVar
 
 import torch