Finetuner model class #42

Merged
merged 10 commits on Jul 11, 2023
Changes from 1 commit
🐞fix:fix type
wangyuxin committed Jul 11, 2023
commit d6dba67cb9a823a58441dada7fe0839b9747e6f2
4 changes: 2 additions & 2 deletions examples/finetune.ipynb
@@ -59,7 +59,7 @@
"\n",
"让我们看看这背后发生了什么,为什么可以这么简单?\n",
"\n",
"1. `FineTuner` 会自动加载 M3E 模型,您只需要声明即可,就像例子中的 `moka-ai/m3e-small`\n",
"1. `FineTuner` 会自动根据名称识别和加载模型,您只需要声明即可,就像例子中的 `moka-ai/m3e-small`,这会被识别为 M3E 类模型,`FinTuner` 还支持 sentence-transformers, text2vec 等模型\n",
"2. `FineTuner` 会自动识别数据格式,只要您的数据类型在 `FineTuner` 支持的范围内,`FineTuner` 就会自动识别并加以使用\n",
"3. `FineTuner` 会自动选择训练方式,`FineTuner` 会根据模型和数据集自动地选择训练方式,即 对比学习 或者 CoSent 等\n",
"4. `FineTuner` 会自动选择训练环境和超参数,`FineTuner` 会根据您的硬件环境自动选择训练设备,并根据模型、数据等各种信息自动建议最佳的超参数,lr, batch_size 等,当然您也可以自己手动进行调整\n",
@@ -407,7 +407,7 @@
"`FineTuner` 在设计实现的时候还提供了更多的灵活性,以 [SGPT](https://github.com/Muennighoff/sgpt) 为例,SGPT 和前面介绍的模型主要有以下三点不同:\n",
"\n",
"1. SGPT 使用 GPT 系列模型(transformer decoder)作为 Embedding 模型的基础模型\n",
"2. Embedding 向量的提取策略不再是 LastMeanPolling ,而是根据 token position 来加权平均\n",
"2. Embedding 向量的提取策略不再是 LastMeanPooling ,而是根据 token position 来加权平均\n",
"3. 使用 bitfit 的微调策略,在微调时只对模型的 bias 进行更新\n",
"\n",
"现在我们将效仿 SGPT 的训练策略,使用 Med_QQpairs 对 GPT2 进行微调。"
8 changes: 5 additions & 3 deletions scripts/train_m3e.py
@@ -18,8 +18,9 @@
create_uniem_embedder,
)
from uniem.trainer import Trainer
+ from uniem.training_strategy import BitFitTrainging
from uniem.types import MixedPrecisionType
- from uniem.utils import ConfigFile, apply_bitfit, convert_to_readable_string, create_adamw_optimizer
+ from uniem.utils import ConfigFile, convert_number_to_readable_string, create_adamw_optimizer

app = typer.Typer()

@@ -128,9 +129,10 @@ def main(
loss_type=loss_type,
)
if bitfit:
- apply_bitfit(model)
+ model = BitFitTrainging().apply_model(model)

num_training_paramters = sum(p.numel() for p in model.parameters() if p.requires_grad)
- accelerator.print(f'Number of training parameters: {convert_to_readable_string(num_training_paramters)}')
+ accelerator.print(f'Number of training parameters: {convert_number_to_readable_string(num_training_paramters)}')
embedder.encoder.config.pad_token_id = tokenizer.pad_token_id
model = accelerator.prepare(model)

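The `BitFitTrainging().apply_model(model)` call above replaces the old `apply_bitfit` helper. For context, a hedged sketch of what a BitFit-style strategy does (freeze everything except bias terms); this is an illustration, not the actual `uniem.training_strategy` source:

```python
import torch

def apply_bitfit_sketch(model: torch.nn.Module) -> torch.nn.Module:
    # BitFit: keep only bias parameters trainable and freeze all other weights.
    for name, param in model.named_parameters():
        param.requires_grad = 'bias' in name
    return model
```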
4 changes: 2 additions & 2 deletions tests/test_model.py
@@ -1,13 +1,13 @@
import pytest
import torch
from uniem.model import (
- AutoEmbedder,
EmbedderForPairInBatchNegTrain,
EmbedderForTripletInBatchNegTrain,
FirstLastEmbedder,
LastMeanEmbedder,
LastWeightedEmbedder,
Uniem,
+ UniemEmbedder,
create_attention_mask_from_input_ids,
create_uniem_embedder,
mean_pooling,
@@ -172,7 +172,7 @@ def test_auto_embedder(transformers_model, tmpdir, embedder_cls):
embedder = embedder_cls(transformers_model)

embedder.save_pretrained(tmpdir)
- new_embedder = AutoEmbedder.from_pretrained(tmpdir)
+ new_embedder = UniemEmbedder.from_pretrained(tmpdir)

assert isinstance(new_embedder, embedder_cls)
assert torch.allclose(
10 changes: 5 additions & 5 deletions tests/test_utils.py
@@ -1,8 +1,8 @@
- from uniem.utils import convert_to_readable_string
+ from uniem.utils import convert_number_to_readable_string


def test_convert_to_readable_string():
- assert convert_to_readable_string(123) == '123'
- assert convert_to_readable_string(1234) == '1.2k'
- assert convert_to_readable_string(1234567) == '1.2M'
- assert convert_to_readable_string(1234567890) == '1.2B'
+ assert convert_number_to_readable_string(123) == '123'
+ assert convert_number_to_readable_string(1234) == '1.2k'
+ assert convert_number_to_readable_string(1234567) == '1.2M'
+ assert convert_number_to_readable_string(1234567890) == '1.2B'
39 changes: 22 additions & 17 deletions uniem/finetuner.py
@@ -1,5 +1,6 @@
import logging
import os
+ from enum import Enum
from pathlib import Path
from typing import Iterable, Sequence, Sized, cast

@@ -21,7 +22,6 @@
)
from uniem.data_structures import RecordType, infer_record_type
from uniem.model import (
- AutoEmbedder,
Embedder,
EmbedderForPairInBatchNegTrain,
EmbedderForScoredPairTrain,
@@ -33,7 +33,7 @@
)
from uniem.trainer import Trainer
from uniem.training_strategy import FullParametersTraining, PrefixTraining, TrainingStrategy
- from uniem.types import MixedPrecisionType, ModelType, Tokenizer
+ from uniem.types import MixedPrecisionType, Tokenizer
from uniem.utils import create_adamw_optimizer, find_executable_batch_size, split_dataset_dict

logger = logging.getLogger(__name__)
@@ -43,6 +43,14 @@
SupportedDatasetDict = dict[str, SupportedDataset]


+ class ModelType(str, Enum):
+ uniem = 'uniem'
+ text2vec = 'text2vec'
+ sentence_transformers = 'sentence_transformers'
+ huggingface = 'huggingface'
+ custom = 'custom'


def suggest_lr(model: torch.nn.Module) -> float:
num_training_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

@@ -62,8 +70,8 @@ def __init__(
embedder: Embedder,
tokenizer: Tokenizer,
dataset: SupportedDatasetDict | SupportedDataset,
+ model_type: ModelType | str = ModelType.uniem,
record_type: RecordType | str | None = None,
- model_type: ModelType | str | None = None,
):
self.embedder = embedder
self.tokenizer = tokenizer
@@ -80,19 +88,19 @@ def __init__(

record_type = RecordType(record_type) if isinstance(record_type, str) else record_type
self.record_type = record_type or infer_record_type(next(iter(self.raw_train_dataset)))
- self.model_type = ModelType(model_type) if model_type is not None else None
+ self.model_type = ModelType(model_type)

@classmethod
def from_pretrained(
cls,
model_name_or_path: str,
dataset: SupportedDatasetDict | SupportedDataset,
- model_type: ModelType | str = ModelType.auto,
+ model_type: ModelType | str | None = None,
record_type: RecordType | str | None = None,
):
model_type = ModelType(model_type)

- if model_type is ModelType.auto:
+ if model_type is None:
if 'sentence-transformers' in model_name_or_path:
model_type = ModelType.sentence_transformers
elif 'text2vec' in model_name_or_path:
@@ -105,7 +113,7 @@ def from_pretrained(

match model_type:
case ModelType.uniem:
- embedder = AutoEmbedder.from_pretrained(model_name_or_path)
+ embedder = UniemEmbedder.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
case ModelType.huggingface | ModelType.text2vec:
embedder = create_uniem_embedder(model_name_or_path)
@@ -119,8 +127,8 @@
tokenizer = cast(Tokenizer, tokenizer)
except ImportError:
raise ImportError('can not find sentence_transformers, pip install sentence_transformers')
- case _:
- raise ValueError(f'Unknown model type: {model_type}')
+ case ModelType.custom:
+ raise ValueError('model_type is custom, you should create embedder by yourself')

return cls(embedder=embedder, tokenizer=tokenizer, dataset=dataset, record_type=record_type, model_type=model_type)
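A short sketch of how the name-based detection added above is expected to behave when `model_type` is left unset; the dataset records are hypothetical and the fallback branch for other names is truncated in this hunk:

```python
from uniem.finetuner import FineTuner, ModelType

# Hypothetical pair records, only to make the call self-contained.
dataset = [{'text': 'install python', 'text_pos': 'how to install python'}]

# With model_type=None, from_pretrained infers the backend from the model
# name; a name containing 'sentence-transformers' selects that backend
# (this requires the sentence_transformers package to be installed).
finetuner = FineTuner.from_pretrained(
    'sentence-transformers/all-MiniLM-L6-v2',
    dataset=dataset,
)
assert finetuner.model_type is ModelType.sentence_transformers
```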

@@ -161,8 +169,6 @@ def create_dataloaders(
data_collator = TripletCollator(tokenizer=self.tokenizer, max_length=max_length)
case RecordType.SCORED_PAIR:
data_collator = ScoredPairCollator(tokenizer=self.tokenizer, max_length=max_length)
- case _:
- raise ValueError('Only supports pair, triplet and scored pair record.')

if not isinstance(train_dataset, Sized) and shuffle:
shuffle = False
@@ -326,18 +332,19 @@ def run(
accelerator.wait_for_everyone()
accelerator.print('Training finished')

- if self.model_type is not None:
+ if self.model_type is not ModelType.custom:
save_dir = output_dir / 'model'
self.save_pretrained(save_dir)
accelerator.print(f'Saving model to {save_dir}')

unwrapped_model = cast(EmbedderForTrain, accelerator.unwrap_model(model))
embedder = unwrapped_model.embedder
return embedder

def save_pretrained(self, output_dir: Path | str):
output_dir = Path(output_dir)
match self.model_type:
- case ModelType.uniem:
+ case ModelType.uniem | ModelType.huggingface | ModelType.text2vec:
embedder = cast(UniemEmbedder, self.embedder)
embedder.save_pretrained(output_dir)
self.tokenizer.save_pretrained(output_dir)
@@ -346,7 +353,5 @@ def save_pretrained(self, output_dir: Path | str):

embedder = cast(SentenceTransformer, self.embedder)
embedder.save(str(output_dir))
- case None:
- raise ValueError('model_type is not set, can not save pretrained model')
- case _:
- raise ValueError(f'Unknown model type: {self.model_type}')
+ case ModelType.custom:
+ raise ValueError('model_type is custom, you should save model by yourself')
54 changes: 12 additions & 42 deletions uniem/model.py
@@ -1,12 +1,11 @@
- import importlib
from enum import Enum
from pathlib import Path
from typing import ClassVar, Literal, Protocol, Type, TypeVar, cast

import numpy as np
import torch
import tqdm
- from transformers import AutoModel, AutoTokenizer, PreTrainedModel # type: ignore
+ from transformers import AutoConfig, AutoTokenizer, PreTrainedModel # type: ignore

from uniem.criteria import (
CoSentLoss,
@@ -18,7 +17,7 @@
TripletInBatchNegSoftmaxContrastLoss,
)
from uniem.types import Tokenizer
from uniem.utils import create_attention_mask_from_input_ids, generate_batch
from uniem.utils import create_attention_mask_from_input_ids, generate_batch, load_hf_pretrained_model

T = TypeVar('T')

@@ -44,27 +43,6 @@ def mean_pooling(hidden_state: torch.Tensor, attention_mask: torch.Tensor | None
return torch.sum(hidden_state * attention_mask.unsqueeze(-1), dim=1) / torch.sum(attention_mask, dim=-1, keepdim=True)


- def load_hf_pretrained_model(
- model_name_or_path: str, model_class: str | None | Type[PreTrainedModel] | Type[AutoModel] = None
- ) -> PreTrainedModel:
- if model_class is None:
- model_class = AutoModel
- elif model_class in {'sentence_transformers', 'SentenceTransformer'}:
- try:
- from sentence_transformers import SentenceTransformer
-
- return SentenceTransformer(model_name_or_path) # type: ignore
- except ImportError:
- raise ImportError('can not find sentence_transformers, pip install sentence_transformers')
- elif isinstance(model_class, str):
- transformers_module = importlib.import_module('transformers')
- model_class = getattr(transformers_module, model_class)
-
- model = model_class.from_pretrained(model_name_or_path) # type: ignore
- model = cast(PreTrainedModel, model)
- return model


StrategyEmbedderClsMap: dict[PoolingStrategy, Type['UniemEmbedder']] = {}


@@ -100,8 +78,14 @@ def save_pretrained(self, path: str | Path):

@classmethod
def from_pretrained(cls, model_name_or_path: str):
- encoder = load_hf_pretrained_model(model_name_or_path)
- return cls(encoder)
+ config = AutoConfig.from_pretrained(str(model_name_or_path))
+ if hasattr(config, 'uniem_pooling_strategy'):
+ strategy_string = config.uniem_pooling_strategy
+ elif hasattr(config, 'uniem_embedding_strategy'):
+ strategy_string = config.uniem_embedding_strategy
+ else:
+ raise ValueError('Can not find uniem pooling strategy in config, Model is not trained by UniEmbedder.')
+ return create_uniem_embedder(str(model_name_or_path), pooling_strategy=strategy_string)

@property
def max_length(self):
@@ -177,20 +161,6 @@ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor | None =
return embeddings


- class AutoEmbedder:
- @classmethod
- def from_pretrained(cls, model_name_or_path: str | Path):
- encoder = load_hf_pretrained_model(str(model_name_or_path))
- if hasattr(encoder.config, 'uniem_pooling_strategy'):
- strategy_string = encoder.config.uniem_pooling_strategy
- elif hasattr(encoder.config, 'uniem_embedding_strategy'):
- strategy_string = encoder.config.uniem_embedding_strategy
- else:
- raise ValueError('Can not find uniem pooling strategy in config, Model is not trained by UniEmbedder.')
- embedder_cls = StrategyEmbedderClsMap[PoolingStrategy(strategy_string)]
- return embedder_cls(encoder)


def create_uniem_embedder(
model_name_or_path: str,
model_class: str | None = None,
@@ -363,9 +333,9 @@ def encode_single(self, sentence: str):

@classmethod
def from_pretrained(cls, model_name_or_path: str, **kwargs):
- encoder = AutoEmbedder.from_pretrained(model_name_or_path)
+ embedder = UniemEmbedder.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
- return cls(encoder, tokenizer, **kwargs)
+ return cls(embedder, tokenizer, **kwargs)

def save_pretrained(self, ouptut_dir: str):
self.embedder.save_pretrained(ouptut_dir)
1 change: 1 addition & 0 deletions uniem/trainer.py
@@ -81,6 +81,7 @@ def train(self):
self.validation_loss_tracker,
)
validation_metrics = self.add_prefix({'loss': validation_loss}, 'validation')
+ self.accelerator.print(f'Epoch {current_epoch} Validation loss: {validation_loss:.4f}')
self.accelerator.log(validation_metrics, step=current_epoch)

if self.save_on_epoch_end:
9 changes: 0 additions & 9 deletions uniem/types.py
@@ -15,15 +15,6 @@ class MixedPrecisionType(str, Enum):
no = 'no'


- class ModelType(str, Enum):
- auto = 'auto'
- uniem = 'uniem'
- text2vec = 'text2vec'
- sentence_transformers = 'sentence_transformers'
- huggingface = 'huggingface'
- custom = 'custom'


@dataclass
class DatasetDescription:
name: str