forked from BlinkDL/RWKV-LM
-
Notifications
You must be signed in to change notification settings - Fork 41
/
config-example.yaml
307 lines (277 loc) · 11.4 KB
/
config-example.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
# lightning.pytorch==2.0.2
seed_everything: true
trainer:
  # Configure the number of GPUs available on your machine
  accelerator: gpu
  devices: 1
  num_nodes: 1
  #
  # Configure the deepspeed strategy, we recommend you start with `deepspeed_stage_2_offload`
  # and adjust from there according to your training needs. `deepspeed_stage_3_offload` is useful
  # for training LoRA on large models on a single GPU.
  #
  # In general you would want to use the following:
  #
  # - deepspeed_stage_1 : Each of your GPUs has too much vram, and you do not know what to do
  #
  # - deepspeed_stage_2 : Optimal distributed training strategy, across multiple gpus each with sufficient vram
  # - deepspeed_stage_2_offload : Reduce vram usage by offloading the optimizer state and work to cpu
  #
  # - deepspeed_stage_3 : Split up the model across multiple gpus, useful for large models, at a performance cost
  # - deepspeed_stage_3_offload : Additional offloading, for even greater performance cost
  #
  # For more details see:
  # https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed-zero-stage-2
  #
  # FIXME: currently only deepspeed_stage_1 is supported, because deepspeed cannot handle repeated backward hooks.
  strategy: deepspeed_stage_1
  # Floating point precision for the model, because RWKV is built FOR bf16
  # you should pretty much never change this setting
  precision: bf16
  # Logger setting for wandb, if you want to enable wandb, uncomment the whole logger section
  # ---
  # logger:
  #   class_path: lightning.pytorch.loggers.WandbLogger
  #   init_args:
  #     name: null
  #     save_dir: .
  #     version: null
  #     offline: false
  #     dir: null
  #     id: null
  #     anonymous: null
  #     project: RWKV_training
  #     log_model: false
  #     experiment: null
  #     prefix: ''
  #     checkpoint_name: null
  #     job_type: null
  #     config: null
  #     entity: null
  #     reinit: null
  #     tags: ['RWKV']
  #     group: null
  #     notes: null
  #     magic: null
  #     config_exclude_keys: null
  #     config_include_keys: null
  #     mode: null
  #     allow_val_change: null
  #     resume: null
  #     force: null
  #     tensorboard: null
  #     sync_tensorboard: null
  #     monitor_gym: null
  #     save_code: null
  #     settings: null
  # Checkpoint settings for the training process
  callbacks:
    - class_path: lightning.pytorch.callbacks.ModelCheckpoint
      init_args:
        # Configure this to the path you want to save your checkpoints to
        # note that a subdir will be created with the name `epoch=x-step=y.ckpt`
        #
        # to convert a checkpoint to a model, you can use the
        # `python3 export_checkpoint.py <checkpoint path>` script,
        # which will create a `rwkv_model.pth` in the checkpoint directory.
        #
        # Do not use the `zero_to_fp32.py` script as that will have export format issues
        dirpath: /path/to/your/checkpoint/dir
        filename: null
        # Save the top/last K checkpoints
        save_top_k: 5
        # Choose the most recent checkpoints by steps
        monitor: 'step'
        mode: max
        # If enabled (true), save a copy of the latest checkpoint to 'last.ckpt'
        # useful to simplify checkpoint resume scripts, at a price of disk performance
        save_last: null
        # DO NOT set this as true, as the model weight exported will have format issues
        # export as a checkpoint instead, and use the `export_checkpoint.py` script to convert it to a model
        save_weights_only: false
        # How frequently you want to save a checkpoint, in steps.
        # This will happen for every X data samples, where X = every_n_train_steps * accumulate_grad_batches
        #
        # In general you will want to avoid putting a low number (especially if accumulate_grad_batches <= 100)
        # as the checkpoint process will pause all the gpu training for some time, slowing down the overall process
        # However you do not want to configure too high of a number, where you will lose too much progress if the training crashes
        every_n_train_steps: 100
        every_n_epochs: null
        save_on_train_epoch_end: true
        train_time_interval: null
        # Other settings, you can probably leave alone
        verbose: false
        auto_insert_metric_name: true
  ########################################
  ## Training run parameter settings
  ########################################
  # Generally what you want to configure is the maximum number of epochs
  # Leave it as -1, and it will keep going forever till interrupted
  # Or set it as a number, and it will stop after that number of epochs
  max_epochs: -1
  min_epochs: null
  max_steps: -1
  min_steps: null
  max_time: null
  # Number of datasamples to train for each step, a data sample is considered
  # a "substep" in wandb logs, and a "step" is tracked as "trainer/global_step"
  #
  # This decides the number of datasamples to learn together from, before backproping
  # any weight changes at the end of the batch.
  #
  # Recommended to be a big enough number (like 128/256) where it prevents the training
  # loss from fluctuating in the process. But not too big of a number where the increased
  # GPU vRAM usage will cause the training to crash.
  #
  # You are also recommended to configure this to a large enough number to fully utilize
  # your GPU processing time %, and avoid idle time for the GPU between batches
  accumulate_grad_batches: 256
  # Various other settings, you probably want to leave alone
  fast_dev_run: false
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: null
  check_val_every_n_epoch: 1
  num_sanity_val_steps: 0
  log_every_n_steps: 1
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  gradient_clip_val: 1.0
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
########################################
## Training model settings
########################################
model:
  # Model to start the finetune/training process from
  load_model: /path/to/your/model.pth
  # The model size setting, this MUST match
  # your current model settings, refer to the model card
  # of the downloaded model for more details
  n_embd: 768
  n_layer: 12
  vocab_size: 50277
  # Context length to use for the training process
  # the larger the number (and batch size) the larger the vram usage
  #
  # Note that if the datasample context length is larger than the ctx_len
  # its training process would be split into ctx_len sized chunks.
  #
  # This allows the training of extremely large context lengths (eg. 100k),
  # without eating up too much vram by keeping the training context length
  # to a reasonable number suitable to the current GPU setup
  ctx_len: 2048
  # Data samples would be cut down to the respective max ctx_len_cutoffs
  # values if larger than ctx_len. If the data sample is larger than
  # the largest len_cutoff, the remaining data will be discarded
  ctx_len_cutoffs: [8192, 16384, 32768, 65536]
  # Experimental settings, number of tokens to skip in the data sample
  # prefix, for the respective cutoff length. Used to speed up the process
  ctx_len_warmup_steps: [0, 0, 0, 0]
  # Learning rate of the training process
  lr_init: 1.0e-04
  # Adam optimizer settings
  # You probably want to leave this alone, unless you know what you are doing
  beta1: 0.9
  beta2: 0.99
  adam_eps: 1.0e-08
  weight_decay: 0.01
  # torch.set_float32_matmul_precision, used to optimize operations with tensor cores
  # this should be set as null, for non cuda core GPUs
  torch_set_float32_matmul_precision: 'high'
  # torch_set_float32_matmul_precision: null
  # various other settings you probably should leave alone
  grad_cp: true
  warmup_steps: -1
  layerwise_lr: true
  dim_att: null
  dim_ffn: null
data:
  # dataset_path for the prebuilt dataset, using HF `load_from_disk()`
  #
  # Use this if you have built your own dataset and saved it with `save_to_disk()`
  # with source left as null. Otherwise configure this to a directory which the
  # dataset will be built and tokenized by the huggingface dataset process.
  data_path: /path/to/store/your/data_path/
  # Otherwise provide the source path, which is used as huggingface dataset path
  # this will be used to populate the dataset_path
  #
  # Use either the following
  # - hugging face dataset
  # - Directory path to a directory containing dataset files
  # - Path to a single dataset file
  # - hugging face dataset mode (ie: text,csv,etc - use data_dir, to configure the path then)
  # - null
  #
  # If source is disabled, all other params, except data_path, are effectively ignored
  # as the dataset building process is skipped. You are expected to prepare the huggingface
  # compliant dataset yourself, and save it to the data_path
  source: null
  # source: "teven/enwiki_100k" # Hugging face dataset
  # source: text # Text dir, used with source_data_dir
  # source: /home/ubuntu/RWKV-LM-LoRA/dataset-text/enwik8.txt
  # Use data_dir, if you are using source=text/json/etc
  # this should be relative to the data_path
  source_data_dir: ../dataset-text/
  # After loading the dataset, split out test data used for validation,
  # This process is skipped if the dataset includes a test split
  # This process is skipped if set to zero
  test_split: 0.05
  test_split_shuffle: true
  # Tokenizer to use, use either the inbuilt 'neox', or 'world' tokenizer
  # If using a custom tokenizer, provide the tokenizer file path
  # ---
  tokenizer: neox
  # Minimum / Maximum token size of the dataset to use
  # useful for filtering out small noisy data samples from large datasets
  # (eg. removal of small articles of less than 1024 tokens from wikipedia)
  #
  # This is ignored, if set to -1
  # ---
  # min_token_size: 1024
  # max_token_size: -1
  # Rechunking of text dataset, this is done only when source is set as 'text'
  # and will merge the various sentences, into larger chunks up to the target size
  #
  # Defaults to 2048
  #
  # This is ignored, if source is not set as text
  # This is ignored, if set to zero
  # ---
  # text_rechunk_size: 2048
  # Custom text column to use, useful for dataset with alternative training columns labels
  # This is checked before multi column merging, default is null (disabled)
  # eg: 'code'
  # ---
  # custom_text_key: 'code'
  # Multi Column merging process, default setting is used to support and merge
  # "instruction", "input", "output", datasets. To disable set multi_column_keys to []
  #
  # A minimum of 2 columns is required, with non empty data, for the merge to occur
  # If no match is found, this will fallback to the default prompt/completion or text column,
  # or throw an error if the default fallback is not found
  # ---
  # multi_column_keys: ['instruction', 'input', 'output']
  # multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n']
  # multi_column_masking: [false, true, false]
  # multi_column_seperator: '\n\n'
  # If processing prompt/completion jsonl pairs, the prompt is masked by default
  # use this flag to disable this default behaviour
  # ---
  # disable_prompt_mask: false
# Path to the current checkpoint to continue training from
# (a Lightning `.ckpt` directory/file; leave as null to start fresh)
ckpt_path: null