Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

2024 Release #96

Merged
merged 82 commits into main from nightly
Jan 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
82 commits
Select commit Hold shift + click to select a range
1543548
Fix tokenizer, dropout, bias for LoRA
danielhanchen Jan 6, 2024
14bee60
Update loader.py
danielhanchen Jan 6, 2024
82db08e
Merge branch 'main' into nightly
danielhanchen Jan 7, 2024
d78643d
Fix LoRA downcasting
danielhanchen Jan 7, 2024
b6a9841
Update _utils.py
danielhanchen Jan 7, 2024
1d0e1dc
Merge branch 'main' into nightly
danielhanchen Jan 8, 2024
3c41946
Saving to GGUF
danielhanchen Jan 8, 2024
25b15bc
fix
danielhanchen Jan 8, 2024
4bd8f7b
colab_quantize_to_gguf
danielhanchen Jan 8, 2024
b3e0e0f
move save modules
danielhanchen Jan 8, 2024
bb93e31
save module
danielhanchen Jan 8, 2024
29aaf66
Update __init__.py
danielhanchen Jan 8, 2024
605290f
Update save.py
danielhanchen Jan 8, 2024
4937926
Merge branch 'main' into nightly
danielhanchen Jan 9, 2024
f342605
Temp downgrade due to TRL issue
danielhanchen Jan 9, 2024
b332a8c
Merge branch 'main' into nightly
danielhanchen Jan 9, 2024
5dca87c
Merge branch 'main' into nightly
danielhanchen Jan 10, 2024
38f7c04
Fix up bugs
danielhanchen Jan 10, 2024
74037bb
Merge branch 'main' into nightly
danielhanchen Jan 11, 2024
7af483c
Faster saving + other changes
danielhanchen Jan 16, 2024
5ad792a
Update llama.py
danielhanchen Jan 16, 2024
2fc754f
Saving modules
danielhanchen Jan 17, 2024
48461a6
spelling
danielhanchen Jan 17, 2024
ed3e1db
Update llama.py
danielhanchen Jan 17, 2024
07202ff
Update save.py
danielhanchen Jan 17, 2024
7c1c87f
Update save.py
danielhanchen Jan 17, 2024
44430d1
Update loader.py
danielhanchen Jan 17, 2024
2ceed5d
Update llama.py
danielhanchen Jan 17, 2024
e932893
patch saving
danielhanchen Jan 17, 2024
08227b1
Update save.py
danielhanchen Jan 17, 2024
9cbf93d
Update save.py
danielhanchen Jan 17, 2024
16b6b6c
Update save.py
danielhanchen Jan 17, 2024
8610a17
patch saving
danielhanchen Jan 17, 2024
14a2185
Update save.py
danielhanchen Jan 17, 2024
6dfbff5
Update save.py
danielhanchen Jan 17, 2024
12740f5
Update save.py
danielhanchen Jan 17, 2024
cfd7f4c
Update save.py
danielhanchen Jan 17, 2024
4fc5743
Update save.py
danielhanchen Jan 17, 2024
eeda38d
Update save.py
danielhanchen Jan 17, 2024
3cf6b09
Update save.py
danielhanchen Jan 17, 2024
5dd4037
Update save.py
danielhanchen Jan 17, 2024
ef80889
Update save.py
danielhanchen Jan 17, 2024
7d6a5f6
Update save.py
danielhanchen Jan 17, 2024
4bbd370
Update save.py
danielhanchen Jan 17, 2024
0d9ea04
Update save.py
danielhanchen Jan 17, 2024
3a42afa
Update save.py
danielhanchen Jan 17, 2024
1e70ce1
Update save.py
danielhanchen Jan 17, 2024
ac04280
Update save.py
danielhanchen Jan 17, 2024
0a7e12e
original_model
danielhanchen Jan 18, 2024
d746560
Update save.py
danielhanchen Jan 18, 2024
3b0f92e
Update save.py
danielhanchen Jan 18, 2024
a6f1a29
Update save.py
danielhanchen Jan 18, 2024
4ef3fad
Update save.py
danielhanchen Jan 18, 2024
eab82e4
Update save.py
danielhanchen Jan 18, 2024
3801dc7
Update save.py
danielhanchen Jan 18, 2024
fdfd769
Update save.py
danielhanchen Jan 18, 2024
f2884c1
Update save.py
danielhanchen Jan 18, 2024
cc0ff3f
Update save.py
danielhanchen Jan 18, 2024
c549550
Update save.py
danielhanchen Jan 18, 2024
f755247
Update save.py
danielhanchen Jan 18, 2024
2fced37
Update save.py
danielhanchen Jan 18, 2024
ee7154f
Update save.py
danielhanchen Jan 18, 2024
5aaffb7
Update save.py
danielhanchen Jan 18, 2024
61e4c1c
Update save.py
danielhanchen Jan 18, 2024
9f11efb
Update save.py
danielhanchen Jan 18, 2024
43a025a
Update save.py
danielhanchen Jan 18, 2024
b90c385
Update save.py
danielhanchen Jan 18, 2024
573427b
Update save.py
danielhanchen Jan 18, 2024
2672abf
Update save.py
danielhanchen Jan 18, 2024
68670c6
Update save.py
danielhanchen Jan 18, 2024
2231bdd
Update save.py
danielhanchen Jan 18, 2024
074e79a
Update save.py
danielhanchen Jan 18, 2024
90a2f9b
saving to RAM leakage?
danielhanchen Jan 18, 2024
8e140af
Update save.py
danielhanchen Jan 18, 2024
d3b4a24
new_save_directory
danielhanchen Jan 18, 2024
f535904
Update save.py
danielhanchen Jan 18, 2024
386655e
Update save.py
danielhanchen Jan 18, 2024
e5e91f4
Update save.py
danielhanchen Jan 18, 2024
4a91ede
Update save.py
danielhanchen Jan 18, 2024
c53dee9
Update pyproject.toml
danielhanchen Jan 18, 2024
7196ef0
Update pyproject.toml
danielhanchen Jan 18, 2024
b2cebb8
Update pyproject.toml
danielhanchen Jan 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 42 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,25 @@ include-package-data = false
exclude = ["images*"]

[project.optional-dependencies]
huggingfacedev = [
"transformers @ git+https://github.com/huggingface/transformers",
"datasets",
"sentencepiece",
"accelerate",
"trl>=0.7.9",
"peft",
"tqdm",
"psutil",
]
huggingface = [
"transformers",
"datasets",
"sentencepiece",
"accelerate",
"trl",
"trl>=0.7.9",
"peft",
"packaging",
"ninja",
"tqdm",
"psutil",
]
cu118only = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
Expand All @@ -52,12 +62,12 @@ cu121only = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu118only_torch211 = [
cu118onlytorch211 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
]
cu121only_torch211 = [
cu121onlytorch211 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10'",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11'",
Expand All @@ -75,12 +85,12 @@ cu121 = [
cu118_torch211 = [
"unsloth[huggingface]",
"bitsandbytes",
"unsloth[cu118only_torch211]",
"unsloth[cu118onlytorch211]",
]
cu121_torch211 = [
"unsloth[huggingface]",
"bitsandbytes",
"unsloth[cu121only_torch211]",
"unsloth[cu121onlytorch211]",
]
kaggle = [
"unsloth[huggingface]",
Expand All @@ -93,30 +103,53 @@ colab = [
]
colab_ampere = [
"unsloth[cu121]",
"packaging",
"ninja",
"flash-attn",
]
colab_dev = [
"unsloth[huggingfacedev]",
"bitsandbytes",
"unsloth[cu121only]",
]
colab_ampere_dev = [
"unsloth[huggingfacedev]",
"bitsandbytes",
"unsloth[cu121only]",
"packaging",
"ninja",
"flash-attn",
]
cu118_ampere = [
"unsloth[huggingface]",
"bitsandbytes",
"unsloth[cu118only]",
"packaging",
"ninja",
"flash-attn",
]
cu121_ampere = [
"unsloth[huggingface]",
"bitsandbytes",
"unsloth[cu121only]",
"packaging",
"ninja",
"flash-attn",
]
cu118_ampere_torch211 = [
"unsloth[huggingface]",
"bitsandbytes",
"unsloth[cu118only_torch211]",
"unsloth[cu118onlytorch211]",
"packaging",
"ninja",
"flash-attn",
]
cu121_ampere_torch211 = [
"unsloth[huggingface]",
"bitsandbytes",
"unsloth[cu121only_torch211]",
"unsloth[cu121onlytorch211]",
"packaging",
"ninja",
"flash-attn",
]

Expand Down
3 changes: 1 addition & 2 deletions unsloth/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,7 @@
libcuda_dirs()
except:
warnings.warn(
"CUDA is not linked properly.\n"\
"We shall run `ldconfig /usr/lib64-nvidia` to try to fix it."
"Running `ldconfig /usr/lib64-nvidia` to link CUDA."\
)
os.system("ldconfig /usr/lib64-nvidia")
importlib.reload(bnb)
Expand Down
5 changes: 3 additions & 2 deletions unsloth/kernels/rms_layernorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,13 @@ def _rms_layernorm_forward(
r += row_idx * r_row_stride

X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32)
W_row = tl.load(W + col_offsets, mask = mask, other = 0)#.to(tl.float32)

row_var = tl.sum(X_row * X_row, axis = 0) / n_cols
inv_var = 1 / tl.sqrt(row_var + eps)
inv_var = 1.0 / tl.sqrt(row_var + eps)
tl.store(r, inv_var)
normed = X_row * inv_var
normed = normed.to(W_row.dtype) # Exact copy from HF
output = normed * W_row
tl.store(Y + col_offsets, output, mask = mask)
pass
Expand Down
12 changes: 7 additions & 5 deletions unsloth/kernels/swiglu.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,11 @@ def _fg_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
mask = offsets < n_elements

e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
g_row = tl.load(g + offsets, mask = mask, other = 0).to(tl.float32)
g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)

# f = e * sigmoid(e)
f_row = e_row / (1 + tl.exp(-e_row))
f_row = f_row.to(g_row.dtype) # Exact copy from HF
# h = f * g
h_row = f_row * g_row

Expand All @@ -53,12 +54,13 @@ def _DWf_DW_dfg_kernel(DW, e, g, n_elements, BLOCK_SIZE : tl.constexpr,):
offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
mask = offsets < n_elements

DW_row = tl.load(DW + offsets, mask = mask, other = 0).to(tl.float32)
e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
g_row = tl.load(g + offsets, mask = mask, other = 0).to(tl.float32)
DW_row = tl.load(DW + offsets, mask = mask, other = 0)#.to(tl.float32)
e_row = tl.load(e + offsets, mask = mask, other = 0)#.to(tl.float32)
g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)

# f = e * sigmoid(e)
se_row = 1 / (1 + tl.exp(-e_row))
se_row = 1 / (1 + tl.exp(-e_row.to(tl.float32)))
se_row = se_row.to(e_row.dtype) # Exact copy from HF
# f = e * se
f_row = e_row * se_row
# h = f * g
Expand Down
2 changes: 0 additions & 2 deletions unsloth/models/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,7 @@

import torch
from typing import Union, Optional, List, Any, Callable
import numpy as np
import warnings
import gc
warnings.filterwarnings(action = "ignore", category = UserWarning, module = "torch")
import bitsandbytes as bnb
from transformers.models.llama.modeling_llama import logger
Expand Down
Loading