forked from BlinkDL/RWKV-LM
-
Notifications
You must be signed in to change notification settings - Fork 41
/
config-example.yaml
307 lines (277 loc) · 11.4 KB
/
config-example.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
# lightning.pytorch==2.0.2
seed_everything: true
trainer:
  # Configure the number of GPUs available on your machine
  accelerator: gpu
  devices: 1
  num_nodes: 1
  #
  # Configure the deepspeed strategy, we recommend you start with `deepspeed_stage_2_offload`
  # and adjust from there according to your training needs. `deepspeed_stage_3_offload` is useful
  # for training LoRA on large models on a single GPU.
  #
  # In general you would want to use the following:
  #
  # - deepspeed_stage_1 : Each of your GPUs has too much vram, and you do not know what to do
  #
  # - deepspeed_stage_2 : Optimal distributed training strategy, across multiple gpus each with sufficient vram
  # - deepspeed_stage_2_offload : Reduce vram usage by offloading the optimizer state and work to cpu
  #
  # - deepspeed_stage_3 : Split up the model across multiple gpus, useful for large models, at a performance cost
  # - deepspeed_stage_3_offload : Additional offloading, for even greater performance cost
  #
  # For more details see:
  # https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed-zero-stage-2
  #
  # FIXME: currently only deepspeed_stage_1 is supported, because deepspeed cannot handle repeated backward hooks.
  strategy: deepspeed_stage_1
  # Floating point precision for the model, because RWKV is built FOR bf16
  # you should pretty much never change this setting
  precision: bf16
  # Logger setting for wandb, if you want to enable wandb, uncomment the whole logger section
  # ---
  # logger:
  #   class_path: lightning.pytorch.loggers.WandbLogger
  #   init_args:
  #     name: null
  #     save_dir: .
  #     version: null
  #     offline: false
  #     dir: null
  #     id: null
  #     anonymous: null
  #     project: RWKV_training
  #     log_model: false
  #     experiment: null
  #     prefix: ''
  #     checkpoint_name: null
  #     job_type: null
  #     config: null
  #     entity: null
  #     reinit: null
  #     tags: ['RWKV']
  #     group: null
  #     notes: null
  #     magic: null
  #     config_exclude_keys: null
  #     config_include_keys: null
  #     mode: null
  #     allow_val_change: null
  #     resume: null
  #     force: null
  #     tensorboard: null
  #     sync_tensorboard: null
  #     monitor_gym: null
  #     save_code: null
  #     settings: null
  # Checkpoint settings for the training process
  callbacks:
    - class_path: lightning.pytorch.callbacks.ModelCheckpoint
      init_args:
        # Configure this to the path you want to save your checkpoints to
        # note that a subdir will be created with the name `epoch=x-step=y.ckpt`
        #
        # to convert a checkpoint to a model, you can use the
        # `python3 export_checkpoint.py <checkpoint path>` script,
        # which will create a `rwkv_model.pth` in the checkpoint directory.
        #
        # Do not use the `zero_to_fp32.py` script as that will have export format issues
        dirpath: /path/to/your/checkpoint/dir
        filename: null
        # Save the top/last K checkpoints
        save_top_k: 5
        # Choose the most recent checkpoints by steps
        monitor: 'step'
        mode: max
        # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt'
        # useful to simplify checkpoint resume scripts, at a price of disk performance
        save_last: null
        # DO NOT set this as true, as the model weight exported will have format issues
        # export as a checkpoint instead, and use the `export_checkpoint.py` script to convert it to a model
        save_weights_only: false
        # How frequently you want to save a checkpoint, in steps.
        # This will happen for every X data samples, where X = every_n_train_steps * accumulate_grad_batches
        #
        # In general you will want to avoid putting a low number (especially if accumulate_grad_batches <= 100)
        # as the checkpoint process will pause all the gpu training for some time, slowing down the overall process
        # However you do not want to configure too high of a number, where you will lose too much progress if the training crashes
        every_n_train_steps: 100
        every_n_epochs: null
        save_on_train_epoch_end: true
        train_time_interval: null
        # Other settings, you can probably leave alone
        verbose: false
        auto_insert_metric_name: true
  ########################################
  ## Training run parameter settings
  ########################################
  # Generally what you want to configure is the maximum number of epochs
  # Leave it as -1, and it will keep going forever till interrupted
  # Or set it as a number, and it will stop after that number of epochs
  max_epochs: -1
  min_epochs: null
  max_steps: -1
  min_steps: null
  max_time: null
  # Number of datasamples to train for each step, a data sample is considered
  # a "substep" in wandb logs, and a "step" is tracked as "trainer/global_step"
  #
  # This decides the number of datasamples to learn together from, before backproping
  # any weight changes at the end of the batch.
  #
  # Recommended to be a big enough number (like 128/256) where it prevents the training
  # loss from fluctuating in the process. But not too big of a number where the increased
  # GPU vRAM usage will cause the training to crash.
  #
  # You are also recommended to configure this to a large enough number to fully utilize
  # your GPU processing time %, and avoid idle time for the GPU between batches
  accumulate_grad_batches: 256
  # Various other settings, you probably want to leave alone
  fast_dev_run: false
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: null
  check_val_every_n_epoch: 1
  num_sanity_val_steps: 0
  log_every_n_steps: 1
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  gradient_clip_val: 1.0
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
########################################
## Training model settings
########################################
model:
  # Model to start the finetune/training process from
  load_model: /path/to/your/model.pth
  # The model size setting, this MUST match
  # your current model settings, refer to the model card
  # of the downloaded model for more details
  n_embd: 768
  n_layer: 12
  vocab_size: 50277
  # Context length to use for the training process
  # the larger the number (and batch size) the larger the vram usage
  #
  # Note that if the datasample context length is larger than the ctx_len
  # its training process would be split into ctx_len sized chunks.
  #
  # This allows the training of extremely large context lengths (eg. 100k),
  # without eating up too much vram by keeping the training context length
  # to a reasonable number suitable to the current GPU setup
  ctx_len: 2048
  # Data samples would be cut down to the respective max ctx_len_cutoffs
  # values if larger than ctx_len. If the data sample is larger than
  # the largest len_cutoff, the remaining data will be discarded
  ctx_len_cutoffs: [8192, 16384, 32768, 65536]
  # Experimental settings, number of tokens to skip in the data sample
  # prefix, for the respective cutoff length. Used to speed up the process
  ctx_len_warmup_steps: [0, 0, 0, 0]
  # Learning rate of the training process
  lr_init: 1.0e-04
  # Adam optimizer settings
  # You probably want to leave this alone, unless you know what you are doing
  beta1: 0.9
  beta2: 0.99
  adam_eps: 1.0e-08
  weight_decay: 0.01
  # torch.set_float32_matmul_precision, used to optimize operations with tensor cores
  # this should be set as null, for non cuda core GPUs
  torch_set_float32_matmul_precision: 'high'
  # torch_set_float32_matmul_precision: null
  # various other settings you probably should leave alone
  grad_cp: true
  warmup_steps: -1
  layerwise_lr: true
  dim_att: null
  dim_ffn: null
data:
  # dataset_path for the prebuilt dataset, using HF `load_from_disk()`
  #
  # Use this if you have built your own dataset and saved it with `save_to_disk()`
  # with source left as null. Otherwise configure this to a directory which the
  # dataset will be built and tokenized by the huggingface dataset process.
  data_path: /path/to/store/your/data_path/
  # Otherwise provide the source path, which is used as huggingface dataset path
  # this will be used to populate the dataset_path
  #
  # Use either the following
  # - hugging face dataset
  # - Directory path to a directory containing dataset files
  # - Path to a single dataset file
  # - hugging face dataset mode (ie: text,csv,etc - use data_dir, to configure the path then)
  # - null
  #
  # If source is disabled, all other params, except data_path, are effectively ignored
  # as the dataset building process is skipped. You are expected to prepare the huggingface
  # compliant dataset yourself, and save it to the data_path
  source: null
  # source: "teven/enwiki_100k" # Hugging face dataset
  # source: text # Text dir, used with source_data_dir
  # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt
  # Use data_dir, if you are using source=text/json/etc
  # this should be relative to the data_path
  source_data_dir: ../dataset-text/
  # After loading the dataset, split out test data used for validation,
  # This process is skipped if the dataset includes a test split
  # This process is skipped if set to zero
  test_split: 0.05
  test_split_shuffle: true
  # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer
  # If using a custom tokenizer, provide the tokenizer file path
  # ---
  tokenizer: neox
  # Minimum / Maximum token size of the dataset to use
  # useful for filtering out small noisy data samples from large datasets
  # (eg. removal of small articles of less than 1024 tokens from wikipedia)
  #
  # This is ignored, if set to -1
  # ---
  # min_token_size: 1024
  # max_token_size: -1
  # Rechunking of text dataset, this is done only when source is set as 'text'
  # and will merge the various sentences, into larger chunks up to the target size
  #
  # Defaults to 2048
  #
  # This is ignored, if source is not set as text
  # This is ignored, if set to zero
  # ---
  # text_rechunk_size: 2048
  # Custom text column to use, useful for dataset with alternative training columns labels
  # This is checked before multi column merging, default is null (disabled)
  # eg: 'code'
  # ---
  # custom_text_key: 'code'
  # Multi Column merging process, default setting is used to support and merge
  # "instruction", "input", "output", datasets. To disable set multi_column_keys to []
  #
  # A minimum of 2 columns is required, with non empty data, for the merge to occur
  # If no match is found, this will fallback to the default prompt/completion or text column,
  # or throw an error if the default fallback is not found
  # ---
  # multi_column_keys: ['instruction', 'input', 'output']
  # multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n']
  # multi_column_masking: [false, true, false]
  # multi_column_seperator: '\n\n'
  # If processing prompt/completion jsonl pairs, the prompt is masked by default
  # use this flag to disable this default behaviour
  # ---
  # disable_prompt_mask: false
# Path to the current checkpoint to continue training from
# (a Lightning `.ckpt` directory/file; leave as null to start fresh)
ckpt_path: null