False INTERNAL ASSERT FAILED bug whilst training Neural Network #128778

Open
tiwari-research-group opened this issue Jun 15, 2024 · 1 comment
Labels
module: linear algebra Issues related to specialized linear algebra operations in PyTorch; includes matrix multiply matmul triaged This issue has been looked at by a team member, and triaged and prioritized into an appropriate module

Comments

@tiwari-research-group

tiwari-research-group commented Jun 15, 2024

🐛 Describe the bug

from kan import *
import pickle
import control as ct
import scipy.io as sio

vars = sio.loadmat('variables.mat')
predict_time = int(vars['predict_time'][0,0])
dp = int(vars['dp'][0,0]) - (predict_time)
dt = float(vars['dt'][0,0])
tf = int(vars['tf'][0,0])
m = int(vars['m'][0,0])
nx = int(vars['nx'][0,0])

data_x = sio.loadmat('data_x.mat')
train_x = torch.FloatTensor(data_x['data_x'])
data_y = sio.loadmat('data_y.mat')
train_y = torch.FloatTensor(data_y['data_y'])
data_y2 = sio.loadmat('data_y2.mat')
train_y2 = torch.FloatTensor(data_y2['data_y2'])
data_u = sio.loadmat('data_u.mat')
train_u = torch.FloatTensor(data_u['data_u'])

lifted_space = 3
hidden_size = 2
learning_rate = 1
Loss_prev = 1e38
P = torch.cat((torch.eye(nx),torch.zeros(nx,lifted_space)),1)
criterion = torch.nn.MSELoss()

log = 1

grids = [5,5]
steps = 20

recon_losses = []
pred_losses = []

model = KAN(width=[nx,hidden_size,lifted_space], grid=grids[0], k=3, grid_eps=0, noise_scale_base=0.25)
i=0
for grid in grids:
    if i == 0:
        a = 0
    else:
        model = KAN(width=[nx,hidden_size,lifted_space], grid=grid, k=3).initialize_from_another_model(model, train_x)

    def train():
        optimizer = LBFGS(model.parameters(), lr=learning_rate, history_size=10, tolerance_grad=1e-32, tolerance_change=1e-32, tolerance_ys=1e-32)
        pbar = tqdm(range(steps), desc='description')

        for step in pbar:
            def closure():
                global pred_loss, recon_loss, K, B
                optimizer.zero_grad()

                phi_k = torch.transpose(model(train_x),0,1)
                phi_k = torch.cat((torch.transpose(train_x,0,1),phi_k),0)
                phi_kp1 = torch.transpose(model(train_y),0,1)
                phi_kp1 = torch.cat((torch.transpose(train_y,0,1),phi_kp1),0)

                W = torch.cat((phi_k,torch.transpose(train_u[:,:,0],0,1)),0)
                V = phi_kp1

                Vwt = torch.matmul(V,torch.transpose(W,0,1))
                Wwt = torch.matmul(W,torch.transpose(W,0,1))
                KB = torch.matmul(Vwt,torch.pinverse(Wwt))

                K = KB[:,0:-m]
                B = KB[:,-m:]

                # next time step prediction with the linear Koopman operator
                K_phi_plus_one = torch.matmul(K,phi_k) + torch.matmul(B,torch.transpose(train_u[:,:,0],0,1))
                x_plus_one = torch.transpose(torch.matmul(P,K_phi_plus_one),0,1)
                recon_loss = criterion(x_plus_one, train_y)

                for l in range(predict_time):
                    next_phi = torch.matmul(K,phi_k) + torch.matmul(B,torch.transpose(train_u[:,:,l],0,1)) # K: this is the linear operation
                    next_state = torch.transpose(torch.matmul(P,next_phi),0,1) # extracting the next state
                    next_state_encoded = model(next_state) # stacking original states on
                    phi_k = torch.transpose(torch.cat((next_state,next_state_encoded),1),0,1)
                pred_loss = criterion(next_state, train_y2) # Koopman (EDMD) prediction loss

                loss = recon_loss + pred_loss
                loss.backward()
                return loss

            if step % 1 == 0 and step < 50:
                model.update_grid_from_samples(train_x)

            optimizer.step(closure)

            if step % log == 0:
                pbar.set_description("reconstruction loss: %.2e | prediction loss: %.2e" % (recon_loss.cpu().detach().numpy(), pred_loss.cpu().detach().numpy()))

            recon_losses.append(recon_loss.detach().numpy())
            pred_losses.append(pred_loss.detach().numpy())
    train()
    i = i + 1

'''---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[3], line 74
72 recon_losses.append(recon_loss.detach().numpy())
73 pred_losses.append(pred_loss.detach().numpy())
---> 74 train()
75 i=i+1

Cell In[3], line 65
62 return loss
64 if step % 1 == 0 and step < 50:
---> 65 model.update_grid_from_samples(train_x)
67 optimizer.step(closure)
69 if step % log == 0:

File c:\Users\georg\Documents\University\FIT\Research\KAN Network\Quadcopter.venv\Lib\site-packages\kan\KAN.py:244, in KAN.update_grid_from_samples(self, x)
242 for l in range(self.depth):
243 self.forward(x)
--> 244 self.act_fun[l].update_grid_from_samples(self.acts[l])

File c:\Users\georg\Documents\University\FIT\Research\KAN Network\Quadcopter.venv\Lib\site-packages\kan\KANLayer.py:218, in KANLayer.update_grid_from_samples(self, x)
216 grid_uniform = torch.cat([grid_adaptive[:, [0]] - margin + (grid_adaptive[:, [-1]] - grid_adaptive[:, [0]] + 2 * margin) * a for a in np.linspace(0, 1, num=self.grid.shape[1])], dim=1)
217 self.grid.data = self.grid_eps * grid_uniform + (1 - self.grid_eps) * grid_adaptive
--> 218 self.coef.data = curve2coef(x_pos, y_eval, self.grid, self.k, device=self.device)

File c:\Users\georg\Documents\University\FIT\Research\KAN Network\Quadcopter.venv\Lib\site-packages\kan\spline.py:137, in curve2coef(x_eval, y_eval, grid, k, device)
135 # x_eval: (size, batch); y_eval: (size, batch); grid: (size, grid); k: scalar
136 mat = B_batch(x_eval, grid, k, device=device).permute(0, 2, 1)
--> 137 coef = torch.linalg.lstsq(mat.to('cpu'), y_eval.unsqueeze(dim=2).to('cpu')).solution[:, :, 0] # sometimes 'cuda' version may diverge
138 return coef.to(device)

RuntimeError: false INTERNAL ASSERT FAILED at "..\aten\src\ATen\native\BatchLinearAlgebra.cpp":1538, please report a bug to PyTorch. torch.linalg.lstsq: (Batch element 8): Argument 4 has illegal value. Most certainly there is a bug in the implementation calling the backend library.'''

Versions

StatusCode : 200
StatusDescription : OK
Content : # mypy: allow-untyped-defs

                # Unlike the rest of the PyTorch this file must be python2 compliant.
                # This script outputs relevant system environment info
                # Run it with `python collect_env.py` or `pytho...

RawContent : HTTP/1.1 200 OK
Connection: keep-alive
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
...
Forms : {}
Headers : {[Connection, keep-alive], [Content-Security-Policy, default-src 'none'; style-src 'unsafe-inline'; sandbox],
[Strict-Transport-Security, max-age=31536000], [X-Content-Type-Options, nosniff]...}
Images : {}
InputFields : {}
Links : {}
ParsedHtml : mshtml.HTMLDocumentClass
RawContentLength : 23357

cc @jianyuh @nikitaved @pearu @mruberry @walterddr @xwang233 @lezcano

@mikaylagawarecki mikaylagawarecki added module: linear algebra Issues related to specialized linear algebra operations in PyTorch; includes matrix multiply matmul triaged This issue has been looked at by a team member, and triaged and prioritized into an appropriate module labels Jun 17, 2024
@lezcano
Collaborator

lezcano commented Jun 17, 2024

It looks similar to #128564.
Can you edit c:\Users\georg\Documents\University\FIT\Research\KAN Network\Quadcopter.venv\Lib\site-packages\kan\spline.py and print the sizes of the matrices? Even better, could you pick the matrices that are being fed to lstsq and share them in a pastebin, or share their singular values?
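
For reference, a minimal debugging sketch along those lines (not part of pykan; `mat` and `y_eval` are the tensor names from the traceback, and the helper name and save path are arbitrary) could be called right before the torch.linalg.lstsq line in curve2coef:

# Hypothetical helper, not part of pykan: call it just before the
# torch.linalg.lstsq line in kan/spline.py::curve2coef.
import torch

def dump_lstsq_inputs(mat, y_eval, path='lstsq_inputs.pt'):
    # Shapes of the batched least-squares problem passed to lstsq.
    print('mat shape:', tuple(mat.shape), '| y_eval shape:', tuple(y_eval.shape))
    # Per-batch-element singular values reveal rank deficiency or extreme
    # ill-conditioning, a plausible trigger for the LAPACK "illegal value" error.
    svals = torch.linalg.svdvals(mat.detach().cpu())
    print('smallest singular value per batch element:', svals.min(dim=-1).values)
    # Save the exact inputs so they can be attached to the issue or a pastebin.
    torch.save({'mat': mat.detach().cpu(), 'y_eval': y_eval.detach().cpu()}, path)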

Also, what is your current environment? see #128564 (comment)
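
A quick way to produce that environment info from Python (assuming torch is importable) is the collect_env module itself:

# Equivalent to running `python -m torch.utils.collect_env`; it prints the
# PyTorch / CUDA / OS / library versions to paste into the Versions section.
from torch.utils import collect_env
collect_env.main()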
