False INTERNAL ASSERT FAILED bug whilst training Neural Network #128778

Open
tiwari-research-group opened this issue Jun 15, 2024 · 1 comment
Labels
module: linear algebra Issues related to specialized linear algebra operations in PyTorch; includes matrix multiply matmul triaged This issue has been looked at by a team member, and triaged and prioritized into an appropriate module

Comments

@tiwari-research-group

tiwari-research-group commented Jun 15, 2024

🐛 Describe the bug

from kan import *
import pickle
import control as ct
import scipy.io as sio

vars = sio.loadmat('variables.mat')
predict_time = int(vars['predict_time'][0,0])
dp = int(vars['dp'][0,0]) - (predict_time)
dt = float(vars['dt'][0,0])
tf = int(vars['tf'][0,0])
m = int(vars['m'][0,0])
nx = int(vars['nx'][0,0])

data_x = sio.loadmat('data_x.mat')
train_x = torch.FloatTensor(data_x['data_x'])
data_y = sio.loadmat('data_y.mat')
train_y = torch.FloatTensor(data_y['data_y'])
data_y2 = sio.loadmat('data_y2.mat')
train_y2 = torch.FloatTensor(data_y2['data_y2'])
data_u = sio.loadmat('data_u.mat')
train_u = torch.FloatTensor(data_u['data_u'])

lifted_space = 3
hidden_size = 2
learning_rate = 1
Loss_prev = 1e38
P = torch.cat((torch.eye(nx),torch.zeros(nx,lifted_space)),1)
criterion = torch.nn.MSELoss()

log = 1

grids = [5,5]
steps = 20

recon_losses = []
pred_losses = []

model = KAN(width=[nx,hidden_size,lifted_space], grid=grids[0], k=3, grid_eps=0, noise_scale_base=0.25)
i=0
for grid in grids:
    if i == 0:
        a = 0
    else:
        model = KAN(width=[nx,hidden_size,lifted_space], grid=grid, k=3).initialize_from_another_model(model, train_x)

    def train():
        optimizer = LBFGS(model.parameters(), lr=learning_rate, history_size=10, tolerance_grad=1e-32, tolerance_change=1e-32, tolerance_ys=1e-32)
        pbar = tqdm(range(steps), desc='description')

        for step in pbar:
            def closure():
                global pred_loss, recon_loss, K, B
                optimizer.zero_grad()

                phi_k = torch.transpose(model(train_x),0,1)
                phi_k = torch.cat((torch.transpose(train_x,0,1),phi_k),0)
                phi_kp1 = torch.transpose(model(train_y),0,1)
                phi_kp1 = torch.cat((torch.transpose(train_y,0,1),phi_kp1),0)

                W = torch.cat((phi_k,torch.transpose(train_u[:,:,0],0,1)),0)
                V = phi_kp1

                Vwt = torch.matmul(V,torch.transpose(W,0,1))
                Wwt = torch.matmul(W,torch.transpose(W,0,1))
                KB = torch.matmul(Vwt,torch.pinverse(Wwt))

                K = KB[:,0:-m]
                B = KB[:,-m:]

                # next time step prediction with the linear Koopman operator
                K_phi_plus_one = torch.matmul(K,phi_k) + torch.matmul(B,torch.transpose(train_u[:,:,0],0,1))
                x_plus_one = torch.transpose(torch.matmul(P,K_phi_plus_one),0,1)
                recon_loss = criterion(x_plus_one, train_y)

                for l in range(predict_time):
                    next_phi = torch.matmul(K,phi_k) + torch.matmul(B,torch.transpose(train_u[:,:,l],0,1)) # K: this is the linear operation
                    next_state = torch.transpose(torch.matmul(P,next_phi),0,1) # extracting the next state
                    next_state_encoded = model(next_state) # stacking original states on
                    phi_k = torch.transpose(torch.cat((next_state,next_state_encoded),1),0,1)
                pred_loss = criterion(next_state, train_y2) # Koopman (EDMD) prediction loss

                loss = recon_loss + pred_loss
                loss.backward()
                return loss

            if step % 1 == 0 and step < 50:
                model.update_grid_from_samples(train_x)

            optimizer.step(closure)

            if step % log == 0:
                pbar.set_description("reconstruction loss: %.2e | prediction loss: %.2e" % (recon_loss.cpu().detach().numpy(), pred_loss.cpu().detach().numpy()))

            recon_losses.append(recon_loss.detach().numpy())
            pred_losses.append(pred_loss.detach().numpy())
    train()
    i = i + 1

'''---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[3], line 74
72 recon_losses.append(recon_loss.detach().numpy())
73 pred_losses.append(pred_loss.detach().numpy())
---> 74 train()
75 i=i+1

Cell In[3], line 65
62 return loss
64 if step % 1 == 0 and step < 50:
---> 65 model.update_grid_from_samples(train_x)
67 optimizer.step(closure)
69 if step % log == 0:

File c:\Users\georg\Documents\University\FIT\Research\KAN Network\Quadcopter.venv\Lib\site-packages\kan\KAN.py:244, in KAN.update_grid_from_samples(self, x)
242 for l in range(self.depth):
243 self.forward(x)
--> 244 self.act_fun[l].update_grid_from_samples(self.acts[l])

File c:\Users\georg\Documents\University\FIT\Research\KAN Network\Quadcopter.venv\Lib\site-packages\kan\KANLayer.py:218, in KANLayer.update_grid_from_samples(self, x)
216 grid_uniform = torch.cat([grid_adaptive[:, [0]] - margin + (grid_adaptive[:, [-1]] - grid_adaptive[:, [0]] + 2 * margin) * a for a in np.linspace(0, 1, num=self.grid.shape[1])], dim=1)
217 self.grid.data = self.grid_eps * grid_uniform + (1 - self.grid_eps) * grid_adaptive
--> 218 self.coef.data = curve2coef(x_pos, y_eval, self.grid, self.k, device=self.device)

File c:\Users\georg\Documents\University\FIT\Research\KAN Network\Quadcopter.venv\Lib\site-packages\kan\spline.py:137, in curve2coef(x_eval, y_eval, grid, k, device)
135 # x_eval: (size, batch); y_eval: (size, batch); grid: (size, grid); k: scalar
136 mat = B_batch(x_eval, grid, k, device=device).permute(0, 2, 1)
--> 137 coef = torch.linalg.lstsq(mat.to('cpu'), y_eval.unsqueeze(dim=2).to('cpu')).solution[:, :, 0] # sometimes 'cuda' version may diverge
138 return coef.to(device)

RuntimeError: false INTERNAL ASSERT FAILED at "..\aten\src\ATen\native\BatchLinearAlgebra.cpp":1538, please report a bug to PyTorch. torch.linalg.lstsq: (Batch element 8): Argument 4 has illegal value. Most certainly there is a bug in the implementation calling the backend library.'''

Versions

StatusCode : 200
StatusDescription : OK
Content : # mypy: allow-untyped-defs

                # Unlike the rest of the PyTorch this file must be python2 compliant.
                # This script outputs relevant system environment info
                # Run it with `python collect_env.py` or `pytho...

RawContent : HTTP/1.1 200 OK
Connection: keep-alive
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
...
Forms : {}
Headers : {[Connection, keep-alive], [Content-Security-Policy, default-src 'none'; style-src 'unsafe-inline'; sandbox],
[Strict-Transport-Security, max-age=31536000], [X-Content-Type-Options, nosniff]...}
Images : {}
InputFields : {}
Links : {}
ParsedHtml : mshtml.HTMLDocumentClass
RawContentLength : 23357

cc @jianyuh @nikitaved @pearu @mruberry @walterddr @xwang233 @lezcano

@mikaylagawarecki mikaylagawarecki added module: linear algebra Issues related to specialized linear algebra operations in PyTorch; includes matrix multiply matmul triaged This issue has been looked at by a team member, and triaged and prioritized into an appropriate module labels Jun 17, 2024
@lezcano
Collaborator

lezcano commented Jun 17, 2024

It looks similar to #128564.
Can you edit c:\Users\georg\Documents\University\FIT\Research\KAN Network\Quadcopter.venv\Lib\site-packages\kan\spline.py and print the sizes of the matrices? Even better, could you pick the matrices that are being fed to lstsq and share them in a pastebin, or share their singular values?
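
For reference, a minimal debugging sketch along those lines (not part of pykan; `mat` and `y_eval` are the tensor names from the traceback, and the helper name and save path are arbitrary) could be called right before the torch.linalg.lstsq line in curve2coef:

# Hypothetical helper, not part of pykan: call it just before the
# torch.linalg.lstsq line in kan/spline.py::curve2coef.
import torch

def dump_lstsq_inputs(mat, y_eval, path='lstsq_inputs.pt'):
    # Shapes of the batched least-squares problem passed to lstsq.
    print('mat shape:', tuple(mat.shape), '| y_eval shape:', tuple(y_eval.shape))
    # Per-batch-element singular values reveal rank deficiency or extreme
    # ill-conditioning, a plausible trigger for the LAPACK "illegal value" error.
    svals = torch.linalg.svdvals(mat.detach().cpu())
    print('smallest singular value per batch element:', svals.min(dim=-1).values)
    # Save the exact inputs so they can be attached to the issue or a pastebin.
    torch.save({'mat': mat.detach().cpu(), 'y_eval': y_eval.detach().cpu()}, path)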

Also, what is your current environment? see #128564 (comment)
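
A quick way to produce that environment info from Python (assuming torch is importable) is the collect_env module itself:

# Equivalent to running `python -m torch.utils.collect_env`; it prints the
# PyTorch / CUDA / OS / library versions to paste into the Versions section.
from torch.utils import collect_env
collect_env.main()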
