Skip to content

Commit

Permalink
support tokenizer_mode & support copy_tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
helloyongyang committed Apr 3, 2024
1 parent 58d2722 commit 5a6b4ea
Show file tree
Hide file tree
Showing 7 changed files with 66 additions and 17 deletions.
30 changes: 21 additions & 9 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,13 +1,25 @@
# editor
*.idea
.vscode

# model
*.pth
*.pt

# model exports, archives, build artifacts & logs
*.onnx
*.pk
*.model
*.zip
*.tar
*.pyc
*.log
*.o
*.so
*.a
*.exe
*.out
.idea
**.DS_Store**
**/__pycache__/**
**.swp
.vscode/
.env
save*
*log*
*pid*
*.ipynb*

# macOS
*.DS_Store
6 changes: 4 additions & 2 deletions llmc/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@


def main(config):
tokenizer = BaseTokenizer(config.model.path)
tokenizer = BaseTokenizer(config.model.path, config.model.tokenizer_mode)
model = MODEL_REGISTRY[config.model.type](
config.model.path, config.model.torch_dtype
)
Expand Down Expand Up @@ -44,7 +44,9 @@ def main(config):
logger.info(f"{ppl_eval.dataset} ppl : {ppl}")

if not config.get("calib", False):
blockwise_opt = ALGO_REGISTRY[config.quant.method](model, config.quant)
blockwise_opt = ALGO_REGISTRY[config.quant.method](
model, quant_config=config.quant, config=config
)
blockwise_opt.run_block_loop()
else:
dataset = BaseDataset(tokenizer.get_tokenizer(), config.calib)
Expand Down
11 changes: 10 additions & 1 deletion llmc/compression/quantization/base_blockwise_quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
LlmcMistralRMSNorm,
)
from .quant import Quantizer
from llmc.utils import copy_files


class BaseBlockwiseQuantization(BlockwiseOpt):
Expand Down Expand Up @@ -321,6 +322,14 @@ def deploy(self, quant_format):
self.model.replace_module_all(module, params_dict)
logger.info(f"-- deploy_{quant_format}_model done --")

@torch.no_grad()
def copy_tokenizer(self, path):
    """Copy tokenizer files from the source model directory into *path*.

    Files are matched by filename substring; the substrings come from
    ``config.save.tokenizer_file_substring`` and default to ``["token"]``.
    """
    for substring in self.config.save.get("tokenizer_file_substring", ["token"]):
        copy_files(self.config.model.path, path, substring)
    # Plain string: the original used an f-string with no placeholders.
    logger.info("copy tokenizer done --")

@torch.no_grad()
def save_model(self, path):
    """Persist the model weights to *path* and copy the tokenizer files along.

    The original called ``save_pretrained(path)`` twice back to back,
    doubling an expensive disk write for no effect — save once.
    """
    self.model.get_model().save_pretrained(path)
    logger.info("save model done --")
    self.copy_tokenizer(path)
11 changes: 9 additions & 2 deletions llmc/data/tokenizer/base_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,23 @@


class BaseTokenizer(metaclass=ABCMeta):
def __init__(self, tokenizer_path):
def __init__(self, tokenizer_path, tokenizer_mode):
    """Remember the tokenizer location/mode and build the tokenizer.

    tokenizer_mode must be "slow" or "fast"; anything else raises.
    """
    self.tokenizer_path = tokenizer_path
    self.tokenizer_mode = tokenizer_mode
    if self.tokenizer_mode not in ("slow", "fast"):
        raise Exception(f"Not support tokenizer_mode: {self.tokenizer_mode}")
    # "fast" selects the Rust-backed tokenizer; "slow" the pure-Python one.
    self.use_fast = self.tokenizer_mode == "fast"
    self.build_tokenizer()

def __str__(self):
    """Delegate the string form to the wrapped tokenizer."""
    return f"{self.tokenizer}"

def build_tokenizer(self):
    """Instantiate the AutoTokenizer, honoring the configured fast/slow mode.

    The diff text contained both the superseded ``use_fast=False`` line and
    the new ``use_fast=self.use_fast`` line inside one call (duplicate
    arguments / invalid syntax as file text); keep only the new call.
    """
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.tokenizer_path, use_fast=self.use_fast, trust_remote_code=True
    )

def get_tokenizer(self):
Expand Down
2 changes: 1 addition & 1 deletion llmc/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .utils import seed_all, check_config, mkdirs
from .utils import seed_all, check_config, mkdirs, copy_files
19 changes: 19 additions & 0 deletions llmc/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import os
import random
import numpy as np
from loguru import logger
import shutil


def seed_all(seed):
Expand Down Expand Up @@ -40,10 +42,27 @@ def check_weight_setting(weight_setting):
config.save.get("save_fake", False)
and config.save.get("save_quant", False)
), "Saving fake quant and saving real quant conflict now."
if config.model.get("tokenizer_mode", False):
assert (
config.model.tokenizer_mode == "slow"
or config.model.tokenizer_mode == "fast"
), "Tokenizer_mode should be slow or fast."
logger.info(f"Tokenizer_mode is set to {config.model.tokenizer_mode}.")
else:
config.model.tokenizer_mode = "slow"
logger.info("Tokenizer_mode is set to slow.")


def mkdirs(path):
    """Create *path* (with parents), failing loudly if it already exists.

    Raises:
        Exception: if *path* already exists, so a previous run's output
            directory is never silently reused or overwritten.
    """
    try:
        # EAFP with exist_ok=False avoids the check-then-create race the
        # original os.path.exists() + os.makedirs() pair had.
        os.makedirs(path, exist_ok=False)
    except FileExistsError:
        raise Exception(f"{path} existed before. Need check.") from None

def copy_files(source_dir, target_dir, substring):
    """Copy every regular file in *source_dir* whose name contains *substring*
    into *target_dir*.

    Fixes:
    - the log line printed the literal "(unknown)" instead of the filename;
    - directory entries matching *substring* would crash shutil.copy, so
      non-files are now skipped.
    """
    for filename in os.listdir(source_dir):
        if substring in filename:
            source_file = os.path.join(source_dir, filename)
            if not os.path.isfile(source_file):
                continue
            target_file = os.path.join(target_dir, filename)
            shutil.copy(source_file, target_file)
            logger.info(f"Copied {filename} to {target_dir}")
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
torch
loguru
transformers==4.34.0
datasets==2.16.1
huggingface-hub==0.20.2
datasets
huggingface-hub
sentencepiece
protobuf
accelerate
Expand Down

0 comments on commit 5a6b4ea

Please sign in to comment.