import os
import sys
import math
import torch
from torch.nn import functional as F

## Symlink this file into the minGPT directory to import it, and run it from this directory
## (Python sux)

# create a GPT instance
from mingpt.model import GPT
from mingpt.utils import set_seed
set_seed(3407)

model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
model_config.vocab_size = 3
model_config.block_size = 11
model = GPT(model_config)
model.load_state_dict(torch.load('mingpt/model.pt'))
model.eval()

# serialize a tensor as base64-encoded raw bytes plus shape/dtype metadata
def tensor_to_json(tensor):
    import base64
    data = base64.b64encode(tensor.detach().numpy().tobytes()).decode()
    return {"shape": list(tensor.shape), "dtype": str(tensor.dtype), "data": data}

def save_tensor_dict_to_json(tensor_dict, filename, extra=None):
    import json
    items = { k: tensor_to_json(v) for k, v in tensor_dict.items() }
    if extra is not None:
        items = { **extra, **items }
    with open(filename, 'w') as f:
        json.dump(items, f, indent=4)

# export the model weights
extra = {'config': model_config.to_dict()}
save_tensor_dict_to_json(model.state_dict(), 'public/gpt-nano-sort-model.json', extra)

t0 = model.get_submodule('transformer.h.0')
t0Attn = t0.get_submodule('attn')
n_head = model_config.n_head
n_embd = model_config.n_embd

B = 3
T = model_config.block_size
C = n_embd

torch.random.manual_seed(34)
# transformer_input = torch.randn(B, T, C, requires_grad=False)
# print(transformer_input.flatten().tolist()[:10])

# forward passes that mirror minGPT's modules, but also return intermediate activations

def mlp_forward_with_capture(tModule, x):
    fc = tModule.c_fc(x)
    gelu = tModule.act(fc)
    res = tModule.c_proj(gelu)
    return res, { 'fc': fc, 'gelu': gelu }

def block_forward_with_capture(tModule, x):
    ln1 = tModule.ln_1(x)
    attn, attn_partials = transformer_forward_with_capture(tModule.attn, ln1)
    attnResid = x + attn
    ln2 = tModule.ln_2(attnResid)
    mlp, mlp_partials = mlp_forward_with_capture(tModule.mlp, ln2)
    mlpResid = attnResid + mlp
    return mlpResid, {
        'ln1': ln1,
        **attn_partials,
        'attnResid': attnResid,
        'ln2': ln2,
        **mlp_partials,
        'mlp': mlp,
        'mlpResid': mlpResid,
    }

def transformer_forward_with_capture(tModule, x):
    B, T, C = x.shape

    qkv = tModule.c_attn(x)
    q, k, v = qkv.split(n_embd, dim=2)
    k = k.view(B, T, n_head, C // n_head).transpose(1, 2) # (B, nh, T, hs)
    q = q.view(B, T, n_head, C // n_head).transpose(1, 2) # (B, nh, T, hs)
    v = v.view(B, T, n_head, C // n_head).transpose(1, 2) # (B, nh, T, hs)

    # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
    att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
    att = att.masked_fill(tModule.bias[:,:,:T,:T] == 0, float('-inf'))
    attSm = F.softmax(att, dim=-1)
    # att = self.attn_dropout(att)
    y = attSm @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
    y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

    # output projection
    yProj = tModule.c_proj(y)

    partials = {
        'q': q, 'k': k, 'v': v,     # projected vectors (B, nh, T, hs)
        'qkv': qkv,
        'att': att, 'attSm': attSm, # attention (B, nh, T, T)
        'y': y, 'yProj': yProj,     # output (B, T, C)
    }

    return yProj, partials

def gpt_forward_with_capture(model, idx):
    b, t = idx.size()
    assert t == T, f"For testing, only block size {T} is supported"
    pos = torch.arange(0, t, dtype=torch.long).unsqueeze(0) # shape (1, t)

    # forward the GPT model itself
    tok_emb = model.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
    pos_emb = model.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
    x = tok_emb + pos_emb

    partials = {
        'idx': idx.type(torch.float32),
        'tok_emb': tok_emb,
        'pos_emb': pos_emb,
        'x': x,
    }
    return x, partials
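# Optional sanity check (a minimal sketch): in eval mode dropout is a no-op, so the
# hand-rolled attention capture above should reproduce minGPT's CausalSelfAttention
# output exactly. The probe input, dedicated generator, and tolerance are illustrative
# assumptions; the separate generator leaves the global RNG stream untouched.
with torch.no_grad():
    _gen = torch.Generator().manual_seed(0)
    _probe = torch.randn(1, T, C, generator=_gen)
    _y, _ = transformer_forward_with_capture(t0Attn, _probe)
    if not torch.allclose(_y, t0Attn(_probe), atol=1e-6):
        print('ERROR: attention capture does not match CausalSelfAttention output')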
# first row: a fixed example sequence (6 input tokens followed by 5 zeros)
idx = torch.tensor([[0, 0, 2, 1, 0, 1, 0, 0, 0, 0, 0]], dtype=torch.long)

# add B - 1 extra rows with random input tokens, again padded with zeros
extraIdx = torch.cat([
    torch.randint(0, 3, (B - 1, 6), dtype=torch.long),
    torch.zeros((B - 1, 5), dtype=torch.long),
], dim=1)
if B > 2: # extraIdx only has B - 1 rows
    extraIdx[1, 0] = 1
idx = torch.cat([idx, extraIdx], dim=0)
print(idx)

transformer_input, partials0 = gpt_forward_with_capture(model, idx)
res, partials = block_forward_with_capture(t0, transformer_input)
partials = { **partials0, **partials }

# verify the captured block against the real module's output
resActual = t0(transformer_input)
if not torch.equal(res, resActual):
    print('ERROR: test block output does not match model output')

# run all blocks from the embedding output, capturing each block's output
x = transformer_input
for i, block in enumerate(model.transformer.h):
    x = block(x)
    partials[f'block{i}'] = x

x = model.transformer.ln_f(x)
partials['ln_f'] = x
x = model.lm_head(x)
partials['lm_head'] = x
probs = F.softmax(x, dim=-1)
partials['probs'] = probs

print(model_config.to_dict())
extra = {'config': { **model_config.to_dict(), 'B': B }}
save_tensor_dict_to_json(partials, 'public/gpt-nano-sort-t0-partials.json', extra)
print({ k: v.shape for k, v in partials.items() })
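# Round-trip check (a minimal sketch of how a consumer might decode the export;
# everything this script captures is float32, so the dtype is hard-coded here as
# an assumption rather than parsed from the stored dtype string).
import base64
import json
import numpy as np

with open('public/gpt-nano-sort-t0-partials.json') as f:
    exported = json.load(f)
entry = exported['probs']
decoded = np.frombuffer(base64.b64decode(entry['data']), dtype=np.float32).reshape(entry['shape'])
if not np.allclose(decoded, probs.detach().numpy()):
    print('ERROR: round-trip decode of probs does not match the in-memory tensor')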