From 7a4ad34e2ccde2e3f5a0ca99e5d5edbaf4d794ce Mon Sep 17 00:00:00 2001 From: wangyuxin Date: Sun, 18 Jun 2023 17:52:03 +0800 Subject: [PATCH] add template --- .github/ISSUE_TEMPLATE/bug_report.yml | 28 ++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.yml | 16 ++++++++ .github/ISSUE_TEMPLATE/mtebzh_dataset.yml | 45 ++++++++++++++++++++++ .github/ISSUE_TEMPLATE/mtebzh_model.yml | 45 ++++++++++++++++++++++ mteb-zh/mteb_zh/models.py | 31 ++++++++------- mteb-zh/run_mteb_zh.py | 8 ++-- scripts/process_zh_datasets.py | 2 +- uniem/data.py | 6 +-- uniem/finetuner.py | 17 ++++---- uniem/model.py | 6 +-- uniem/trainer.py | 2 +- uniem/types.py | 3 +- uniem/utils.py | 2 +- 13 files changed, 175 insertions(+), 36 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yml create mode 100644 .github/ISSUE_TEMPLATE/feature_request.yml create mode 100644 .github/ISSUE_TEMPLATE/mtebzh_dataset.yml create mode 100644 .github/ISSUE_TEMPLATE/mtebzh_model.yml diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..0f5889c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,28 @@ +name: 🐛 Bug Report +description: 提交报告以帮助我们重现和修复 bug +labels: bug +assignees: + - wangyuxinwhy +body: + - type: markdown + attributes: + value: > + 在提交您遇到的 bug 之前, 请在 [这里](https://github.com/wangyuxinwhy/uniem/issues?q=) 看看有没有人有类似的问题 🌝 + - type: textarea + id: description + attributes: + label: 🐛 bug 说明 + validations: + required: true + - type: dropdown + id: python-version + attributes: + label: Python Version + options: + - "3.10" + - "3.11" + - other + - type: markdown + attributes: + value: > + 感谢您的支持 🎉! 
\ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..9e0f6ba --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,16 @@ +name: 🚀 Feature request +description: 提交您希望 uniem 支持的新特性 +labels: enhancement +assignees: + - wangyuxinwhy +body: + - type: textarea + id: description + attributes: + label: 🚀 The feature + validations: + required: true + - type: markdown + attributes: + value: > + 感谢您的支持 🎉! \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/mtebzh_dataset.yml b/.github/ISSUE_TEMPLATE/mtebzh_dataset.yml new file mode 100644 index 0000000..bf52308 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/mtebzh_dataset.yml @@ -0,0 +1,45 @@ +name: 💯 MTEB-zh Dataset +description: 提交您希望 MTEB-zh 支持的数据集 +labels: dataset +assignees: + - wangyuxinwhy +body: + - type: markdown + attributes: + value: | + MTEB-zh 是以 [MTEB](https://github.com/embeddings-benchmark/mteb) 为基础 ,为中文文本嵌入模型设计的 Benchmark。 + 您需要提供以下信息,以帮助 uniem 团队将您提供的数据集加入到 MTEB-zh 评测标准中。 + - type: input + id: name + attributes: + label: dataset name + description: 数据集的名称,请使用英文字母,数字,- 或者 _ + placeholder: "Example: MedQQpairs" + validations: + required: true + - type: input + id: id + attributes: + label: huggingface dataset id + description: MTEB-zh 和 MTEB 一样,需要您将数据集开源并托管在 [HuggingFace Datasets](https://huggingface.co/datasets) + placeholder: "Example: vegaviazhang/Med_QQpairs" + validations: + required: true + - type: dropdown + id: task_type + attributes: + label: task type + description: MTEB-zh 支持多种类型的文本嵌入评测任务,比如文本分类,文本检索,文本聚类等,如果您不确定您的数据集属于何种任务,可以不进行选择。 + options: + - Classification + - PairClassification + - Reranking + - Retrieval + - BitextMining + - Clustering + - STS + - Summarization + - type: textarea + id: description + attributes: + label: dataset description diff --git a/.github/ISSUE_TEMPLATE/mtebzh_model.yml b/.github/ISSUE_TEMPLATE/mtebzh_model.yml new file mode 100644 
index 0000000..98f6426 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/mtebzh_model.yml @@ -0,0 +1,45 @@ +name: 💎 MTEB-zh Model +description: 提交您希望 MTEB-zh 支持的模型 +labels: model +assignees: + - wangyuxinwhy +body: + - type: markdown + attributes: + value: | + MTEB-zh 是以 [MTEB](https://github.com/embeddings-benchmark/mteb) 为基础 ,为中文文本嵌入模型设计的 Benchmark。 + 您需要提供以下信息,以帮助 uniem 团队将您提供的模型加入到 MTEB-zh 评测标准中。 + - type: input + id: name + attributes: + label: model name + description: 模型的名称,请使用英文字母,数字,- 或者 _ + placeholder: "Example: m3e-base" + validations: + required: true + - type: dropdown + id: type + attributes: + label: model type + description: | + MTEB-zh 支持多种类型的文本嵌入模型,比如 sentence_transformer, text2vec, luotuo 等。 + 您可以在[这里](https://github.com/wangyuxinwhy/uniem/blob/main/mteb-zh/mteb_zh/models.py)找到全部支持的类型。 + 如果 MTEB-zh 还未支持您的模型,请选择 others ,并在 model description 提供使用方式。 + options: + - sentence_transformer + - text2vec + - luotuo + - erlangshen + - others + validations: + required: true + - type: input + id: id + attributes: + label: huggingface model id + description: 如果您的模型支持通过 [HuggingFace Models](https://huggingface.co/models) 进行加载,请提供 model id + placeholder: "Example: moka-ai/m3e-base" + - type: textarea + id: description + attributes: + label: model description diff --git a/mteb-zh/mteb_zh/models.py b/mteb-zh/mteb_zh/models.py index 47c4209..c13e94c 100644 --- a/mteb-zh/mteb_zh/models.py +++ b/mteb-zh/mteb_zh/models.py @@ -28,39 +28,42 @@ class ModelType(str, Enum): azure = 'azure' -def load_model(model_type: ModelType, model_id: str | None = None) -> MTEBModel: +def load_model(model_type: ModelType, model_id: str | None = None) -> MTEBModel: match model_type: case ModelType.sentence_transformer: - if model_name is None: + if model_id is None: raise ValueError('model_name must be specified for sentence_transformer') - return SentenceTransformer(model_name) + return SentenceTransformer(model_id) case ModelType.text2vec: - from text2vec import SentenceModel + try: + from 
text2vec import SentenceModel # type: ignore + except ImportError: + raise ImportError('text2vec is not installed, please install it with "pip install text2vec"') - if model_name is None: + if model_id is None: return SentenceModel() else: - return SentenceModel(model_name) + return SentenceModel(model_id) case ModelType.openai: - if model_name is None: + if model_id is None: return OpenAIModel(model_name='text-embedding-ada-002') else: - return OpenAIModel(model_name=model_name) + return OpenAIModel(model_name=model_id) case ModelType.azure: - if model_name is None: + if model_id is None: return AzureModel(model_name='text-embedding-ada-002') else: - return AzureModel(model_name=model_name) + return AzureModel(model_name=model_id) case ModelType.luotuo: - if model_name is None: + if model_id is None: return LuotuoBertModel(model_name='silk-road/luotuo-bert') else: - return LuotuoBertModel(model_name=model_name) + return LuotuoBertModel(model_name=model_id) case ModelType.erlangshen: - if model_name is None: + if model_id is None: return ErLangShenModel(model_name='IDEA-CCNL/Erlangshen-SimCSE-110M-Chinese') else: - return ErLangShenModel(model_name=model_name) + return ErLangShenModel(model_name=model_id) case _: raise ValueError(f'Unknown model type: {model_type}') diff --git a/mteb-zh/run_mteb_zh.py b/mteb-zh/run_mteb_zh.py index 37f74fa..2593bd6 100644 --- a/mteb-zh/run_mteb_zh.py +++ b/mteb-zh/run_mteb_zh.py @@ -34,12 +34,13 @@ def main( model_type: Annotated[ModelType, typer.Option()], + model_id: str | None = None, model_name: str | None = None, task_type: TaskType = TaskType.Classification, output_folder: Path = Path('results'), ): output_folder = Path(output_folder) - model = load_model(model_type, model_name) + model = load_model(model_type, model_id) if task_type is TaskType.All: tasks = default_tasks @@ -47,8 +48,9 @@ def main( tasks = [task for task in default_tasks if task.description['type'] == task_type.value] # type: ignore evaluation = 
MTEB(tasks=tasks) - model_id = model_type.value + (f'-{model_name.replace("/", "-")}' if model_name else '') - evaluation.run(model, output_folder=str(output_folder / model_id)) + if model_name is None: + model_name = model_type.value + (f'-{model_id.replace("/", "-")}' if model_id else '') + evaluation.run(model, output_folder=str(output_folder / model_name)) if __name__ == '__main__': diff --git a/scripts/process_zh_datasets.py b/scripts/process_zh_datasets.py index 9dfe770..c1caf68 100644 --- a/scripts/process_zh_datasets.py +++ b/scripts/process_zh_datasets.py @@ -298,7 +298,7 @@ def load_miracl(): dataset_dict = cast(DatasetDict, dataset_dict) try: - from zhconv import convert + from zhconv import convert # type: ignore except ImportError: raise ImportError('Please install zhconv first: pip install zhconv') diff --git a/uniem/data.py b/uniem/data.py index 94bb4ba..6535b61 100644 --- a/uniem/data.py +++ b/uniem/data.py @@ -2,13 +2,13 @@ from collections import defaultdict from dataclasses import dataclass from pathlib import Path -from typing import Any, cast, Sequence +from typing import Any, Sequence, cast import torch +from datasets import Dataset as HfDataset from torch.utils.data import Dataset, RandomSampler -from datasets import Dataset as HfDataset -from uniem.data_structures import RecordType, PairRecord, ScoredPairRecord, TripletRecord, get_record_type, record_type_cls_map +from uniem.data_structures import PairRecord, RecordType, ScoredPairRecord, TripletRecord, get_record_type, record_type_cls_map from uniem.types import Tokenizer diff --git a/uniem/finetuner.py b/uniem/finetuner.py index e436ee2..1d81bbf 100644 --- a/uniem/finetuner.py +++ b/uniem/finetuner.py @@ -1,17 +1,18 @@ import functools -import os import logging +import os from pathlib import Path from typing import Sequence, cast import torch -from datasets import DatasetDict as HFDatasetDict, Dataset as HFDataset from accelerate import Accelerator from accelerate.utils import 
ProjectConfiguration, set_seed +from datasets import Dataset as HFDataset +from datasets import DatasetDict as HFDatasetDict from torch.utils.data import DataLoader from transformers import AutoTokenizer, get_cosine_schedule_with_warmup # type: ignore -from uniem.data import PairCollator, ScoredPairCollator, TripletCollator, FinetuneDataset, PrefixFinetuneDataset +from uniem.data import FinetuneDataset, PairCollator, PrefixFinetuneDataset, ScoredPairCollator, TripletCollator from uniem.data_structures import RecordType, get_record_type from uniem.model import ( EmbedderForPairInBatchNegTrain, @@ -25,7 +26,6 @@ from uniem.types import MixedPrecisionType from uniem.utils import create_adamw_optimizer, split_dataset_dict - logger = logging.getLogger(__name__) RawDataset = Sequence[dict] | dict[str, Sequence[dict]] | HFDatasetDict | HFDataset @@ -219,7 +219,7 @@ def __init__( super().__init__(model_name_or_path, dataset) self.special_prefix_tokens = additional_special_tokens self.prefix = ''.join(self.special_prefix_tokens) if prefix is None else prefix - self.tokenizer.add_special_tokens({'additional_special_tokens': additional_special_tokens}) + self.tokenizer.add_special_tokens({'additional_special_tokens': additional_special_tokens}) # type: ignore self.additional_special_token_ids = self.tokenizer.convert_tokens_to_ids(additional_special_tokens) self.only_train_additional_special_tokens = only_train_additional_special_tokens @@ -241,9 +241,10 @@ def create_finetune_model( if self.only_train_additional_special_tokens: for param in model.parameters(): param.requires_grad = False - embedding_layer = model.embedder.encoder.get_input_embeddings() - embedding_layer.weight.requires_grad = True - embedding_layer.weight.register_hook(hook) + embedding_layer_weight = model.embedder.encoder.get_input_embeddings().weight + embedding_layer_weight = cast(torch.nn.Parameter, embedding_layer_weight) + embedding_layer_weight.requires_grad = True + 
embedding_layer_weight.register_hook(hook) return model def run( diff --git a/uniem/model.py b/uniem/model.py index aaccc3f..260daf1 100644 --- a/uniem/model.py +++ b/uniem/model.py @@ -2,10 +2,10 @@ from pathlib import Path from typing import ClassVar, Literal, Type, TypeVar, cast -import tqdm -import torch import numpy as np -from transformers import AutoTokenizer, AutoConfig, AutoModel, PreTrainedModel # type: ignore +import torch +import tqdm +from transformers import AutoConfig, AutoModel, AutoTokenizer, PreTrainedModel # type: ignore from uniem.criteria import ( CoSentLoss, diff --git a/uniem/trainer.py b/uniem/trainer.py index bae84f6..38e7456 100644 --- a/uniem/trainer.py +++ b/uniem/trainer.py @@ -3,11 +3,11 @@ from typing import Any import torch -from tqdm.auto import tqdm from accelerate import Accelerator from torch.optim import Optimizer from torch.optim.lr_scheduler import LRScheduler from torch.utils.data import DataLoader +from tqdm.auto import tqdm class Trainer: diff --git a/uniem/types.py b/uniem/types.py index 8063477..dc5986b 100644 --- a/uniem/types.py +++ b/uniem/types.py @@ -2,11 +2,10 @@ from enum import Enum from typing import Callable, TypeAlias +from datasets import DatasetDict from transformers.tokenization_utils import PreTrainedTokenizer from transformers.tokenization_utils_fast import PreTrainedTokenizerFast -from datasets import DatasetDict - Tokenizer: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast diff --git a/uniem/utils.py b/uniem/utils.py index b8856ad..fb1f3d6 100644 --- a/uniem/utils.py +++ b/uniem/utils.py @@ -1,5 +1,5 @@ -from itertools import islice import logging +from itertools import islice from typing import Generator, Iterable, TypeVar import torch