#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jun  7 21:06:46 2019

@author: decastro
"""
#%%
import numpy as np
import matplotlib.pyplot as plt
from multiple_spacing_tests import lar_rec, get_order, get_residual, observed_significance_spacing, stacked_bar

class MultipleTest:
    def __init__(self):
        self.reject =[]
        self.indexes = []
        self.names = []
#%% Data 1:  DRUG = APV
X = np.loadtxt('design.txt')
y = np.loadtxt('observation.txt')
n,p = np.shape(X)
lambdas, indexes, correls, R, Z = lar_rec(X, y, normalization = True)
lars = [lambdas, indexes, correls]
order_irrep = get_order(Z,R)

#Variance estimation
t=101
R1, Z1 = get_residual(R, Z, t)
w, v = np.linalg.eig(R1)
w = np.real(w)
w2 = np.zeros(np.shape(w))
for k in range(np.size(w)):
    if abs(w[k])>1e-8:
        w2[k] = abs(w[k])**(-0.5)  
        R2 = np.real(np.dot(v,np.dot(np.diag(w2),np.transpose(v))))
        Z2 = np.dot(R2,Z1)
# We conpute the variance estimation
d = np.linalg.matrix_rank(R2, hermitian=True)
V2 = v[:,0:d]
Y2 = np.real(np.dot(np.transpose(V2),Z2))
var = (np.sum(Y2**2)-d*((np.sum(Y2)/d)**2))/(d-1)
sigma = np.sqrt(var)

# Multiple Testing
alpha = 0.2
number_hyp = order_irrep
hat_alpha = np.zeros(number_hyp)
for start in range(number_hyp):
    hat_alpha[start] = observed_significance_spacing(lars, sigma, start)
p_ordered = sorted(hat_alpha, reverse=False)
x = np.cumsum(np.ones(number_hyp))
plt.plot(x, p_ordered)
plt.plot(x, alpha*x/number_hyp)
plt.tight_layout()
plt.xlabel(r'$k$')
plt.xlim([1,24])
plt.ylim([0,0.2])
plt.ylabel(r'$p_{(k)}$')
plt.title('BH for Multiple Spacings on resistance to APV')
plt.show()

# We found that 20 hyptohese are significant
k = 20

multiple_test = MultipleTest()
hat_alpha[np.isnan(hat_alpha)] = 0 
rejected = hat_alpha<=alpha*k/number_hyp
multiple_test.reject = rejected
rej = 1*rejected
ind = indexes[:number_hyp]*rej
ind = ind[ind!=0]
ind = ind.astype(int)
multiple_test.indexes = ind
true = np.genfromtxt('tsm_true.txt', delimiter="\n", dtype=None)
names = np.genfromtxt('predictors_names.txt', dtype=str)
rejected_names = names[ind]
multiple_test.names = rejected_names

# Results:
# 13 discoveries: 13 true and 0 false discoveries (fdp = 0)
# K = 26
# n = 767 and p = 201
#%% Data 2: DRUG = ATV
X = np.loadtxt('X_ATV.txt')
y = np.loadtxt('Y_ATV.txt')
n,p = np.shape(X)
lambdas, indexes, correls, R, Z = lar_rec(X, y, normalization = True)
lars = [lambdas, indexes, correls]
order_irrep = get_order(Z,R)

#Variance estimation
t=101
R1, Z1 = get_residual(R, Z, t)
w, v = np.linalg.eig(R1)
w = np.real(w)
w2 = np.zeros(np.shape(w))
for k in range(np.size(w)):
    if abs(w[k])>1e-8:
        w2[k] = abs(w[k])**(-0.5)  
        R2 = np.real(np.dot(v,np.dot(np.diag(w2),np.transpose(v))))
        Z2 = np.dot(R2,Z1)
# We conpute the variance estimation
d = np.linalg.matrix_rank(R2, hermitian=True)
V2 = v[:,0:d]
Y2 = np.real(np.dot(np.transpose(V2),Z2))
var = (np.sum(Y2**2)-d*((np.sum(Y2)/d)**2))/(d-1)
sigma = np.sqrt(var)

# Multiple Testing
alpha = 0.2
number_hyp = order_irrep
hat_alpha = np.zeros(number_hyp)
for start in range(number_hyp):
    hat_alpha[start] = observed_significance_spacing(lars, sigma, start)
p_ordered = sorted(hat_alpha, reverse=False)
x = np.cumsum(np.ones(number_hyp))
plt.plot(x, p_ordered)
plt.plot(x, alpha*x/number_hyp)
plt.tight_layout()
plt.xlabel(r'$k$')
plt.xlim([1,13])
plt.ylim([0,0.2])
plt.ylabel(r'$p_{(k)}$')
plt.title('BH for Multiple Spacings on resistance to ATV')
plt.show()

k = 13    
multiple_test = MultipleTest()

hat_alpha[np.isnan(hat_alpha)] = 0 
rejected = hat_alpha<=alpha*k/number_hyp
multiple_test.reject = rejected
rej = 1*rejected
ind = indexes[:number_hyp]*rej
ind = ind[ind!=0]
ind = ind.astype(int)
multiple_test.indexes = ind
true = np.genfromtxt('tsm_true.txt', delimiter="\n", dtype=None)
names = np.genfromtxt('predictors_names_atv.txt', dtype=str)
rejected_names = names[ind]
multiple_test.names = rejected_names

# 12 discoveries: 10 true and 2 false discoveries
# K = 13
# n = 328 and p = 147
#%% Data 3: DRUG = IDV
X = np.loadtxt('X_IDV.txt')
y = np.loadtxt('Y_IDV.txt')
n,p = np.shape(X)
lambdas, indexes, correls, R, Z = lar_rec(X, y, normalization = True)
lars = [lambdas, indexes, correls]
order_irrep = get_order(Z,R)

#Variance estimation
t=101
R1, Z1 = get_residual(R, Z, t)
w, v = np.linalg.eig(R1)
w = np.real(w)
w2 = np.zeros(np.shape(w))
for k in range(np.size(w)):
    if abs(w[k])>1e-8:
        w2[k] = abs(w[k])**(-0.5)  
        R2 = np.real(np.dot(v,np.dot(np.diag(w2),np.transpose(v))))
        Z2 = np.dot(R2,Z1)
# We conpute the variance estimation
d = np.linalg.matrix_rank(R2, hermitian=True)
V2 = v[:,0:d]
Y2 = np.real(np.dot(np.transpose(V2),Z2))
var = (np.sum(Y2**2)-d*((np.sum(Y2)/d)**2))/(d-1)
sigma = np.sqrt(var)

# Multiple Testing
alpha = 0.2
number_hyp = order_irrep
hat_alpha = np.zeros(number_hyp)
for start in range(number_hyp):
    hat_alpha[start] = observed_significance_spacing(lars, sigma, start)
p_ordered = sorted(hat_alpha, reverse=False)
x = np.cumsum(np.ones(number_hyp))
plt.plot(x, p_ordered)
plt.plot(x, alpha*x/number_hyp)
plt.tight_layout()
plt.xlabel(r'$k$')
plt.xlim([1,22])
plt.ylim([0,0.15])
plt.ylabel(r'$p_{(k)}$')
plt.title('BH for Multiple Spacings on resistance to IDV')
plt.show()

k = 21

multiple_test = MultipleTest()

hat_alpha[np.isnan(hat_alpha)] = 0 
rejected = hat_alpha<=alpha*k/number_hyp
multiple_test.reject = rejected
rej = 1*rejected
ind = indexes[:number_hyp]*rej
ind = ind[ind!=0]
ind = ind.astype(int)
multiple_test.indexes = ind
true = np.genfromtxt('tsm_true.txt', delimiter="\n", dtype=None)
names = np.genfromtxt('predictors_names_idv.txt', dtype=str)
rejected_names = names[ind]
multiple_test.names = rejected_names

# 17 discoveries: 14 true and 3 false discoveries (fdp = 0.3)
# K = 40
# n = 825 and p = 206

#%% Data 4: DRUG = LPV
X = np.loadtxt('X_LPV.txt')
y = np.loadtxt('Y_LPV.txt')
n,p = np.shape(X)
lambdas, indexes, correls, R, Z = lar_rec(X, y, normalization = True)
lars = [lambdas, indexes, correls]
order_irrep = get_order(Z,R)

#Variance estimation
t=101
R1, Z1 = get_residual(R, Z, t)
w, v = np.linalg.eig(R1)
w = np.real(w)
w2 = np.zeros(np.shape(w))
for k in range(np.size(w)):
    if abs(w[k])>1e-8:
        w2[k] = abs(w[k])**(-0.5)  
        R2 = np.real(np.dot(v,np.dot(np.diag(w2),np.transpose(v))))
        Z2 = np.dot(R2,Z1)
# We conpute the variance estimation
d = np.linalg.matrix_rank(R2, hermitian=True)
V2 = v[:,0:d]
Y2 = np.real(np.dot(np.transpose(V2),Z2))
var = (np.sum(Y2**2)-d*((np.sum(Y2)/d)**2))/(d-1)
sigma = np.sqrt(var)

# Multiple Testing
alpha = 0.2
number_hyp = order_irrep
hat_alpha = np.zeros(number_hyp)
for start in range(number_hyp):
    hat_alpha[start] = observed_significance_spacing(lars, sigma, start)
p_ordered = sorted(hat_alpha, reverse=False)
x = np.cumsum(np.ones(number_hyp))
plt.plot(x, p_ordered)
plt.plot(x, alpha*x/number_hyp)
plt.tight_layout()
plt.xlabel(r'$k$')
plt.xlim([1,8])
plt.ylim([0,0.20])
plt.ylabel(r'$p_{(k)}$')
plt.title('BH for Multiple Spacings on resistance to LPV')
plt.show()

k = 8

multiple_test = MultipleTest()

hat_alpha[np.isnan(hat_alpha)] = 0 
rejected = hat_alpha<=alpha*k/number_hyp
multiple_test.reject = rejected
rej = 1*rejected
ind = indexes[:number_hyp]*rej
ind = ind[ind!=0]
ind = ind.astype(int)
multiple_test.indexes = ind
true = np.genfromtxt('tsm_true.txt', delimiter="\n", dtype=None)
names = np.genfromtxt('predictors_names_lpv.txt', dtype=str)
rejected_names = names[ind]
multiple_test.names = rejected_names

# 8 discoveries: 7 true and 1 false discoveries
# K = 8
# n = 515 and p = 183

#%% Data 5: DRUG = NFV
X = np.loadtxt('X_NFV.txt')
y = np.loadtxt('Y_NFV.txt')
n,p = np.shape(X)
lambdas, indexes, correls, R, Z = lar_rec(X, y, normalization = True)
lars = [lambdas, indexes, correls]
order_irrep = get_order(Z,R)

#Variance estimation
t=101
R1, Z1 = get_residual(R, Z, t)
w, v = np.linalg.eig(R1)
w = np.real(w)
w2 = np.zeros(np.shape(w))
for k in range(np.size(w)):
    if abs(w[k])>1e-8:
        w2[k] = abs(w[k])**(-0.5)  
        R2 = np.real(np.dot(v,np.dot(np.diag(w2),np.transpose(v))))
        Z2 = np.dot(R2,Z1)
# We conpute the variance estimation
d = np.linalg.matrix_rank(R2, hermitian=True)
V2 = v[:,0:d]
Y2 = np.real(np.dot(np.transpose(V2),Z2))
var = (np.sum(Y2**2)-d*((np.sum(Y2)/d)**2))/(d-1)
sigma = np.sqrt(var)

# Multiple Testing
alpha = 0.2
number_hyp = order_irrep
hat_alpha = np.zeros(number_hyp)
for start in range(number_hyp):
    hat_alpha[start] = observed_significance_spacing(lars, sigma, start)
p_ordered = sorted(hat_alpha, reverse=False)
x = np.cumsum(np.ones(number_hyp))
plt.plot(x, p_ordered)
plt.plot(x, alpha*x/number_hyp)
plt.tight_layout()
plt.xlabel(r'$k$')
plt.xlim([1,27])
plt.ylim([0,0.20])
plt.ylabel(r'$p_{(k)}$')
plt.title('BH for Multiple Spacings on resistance to NFV')
plt.show()

k = 26
    
multiple_test = MultipleTest()

hat_alpha[np.isnan(hat_alpha)] = 0 
rejected = hat_alpha<=alpha*k/number_hyp
multiple_test.reject = rejected
rej = 1*rejected
ind = indexes[:number_hyp]*rej
ind = ind[ind!=0]
ind = ind.astype(int)
multiple_test.indexes = ind
true = np.genfromtxt('tsm_true.txt', delimiter="\n", dtype=None)
names = np.genfromtxt('predictors_names_nfv.txt', dtype=str)
rejected_names = names[ind]
multiple_test.names = rejected_names

# 16 discoveries: 14 true and 2 false discoveries
# K = 56
# n = 842 and p = 207

#%% Data 6: DRUG = RTV <- False Negatives
X = np.loadtxt('X_RTV.txt')
y = np.loadtxt('Y_RTV.txt')
n,p = np.shape(X)
lambdas, indexes, correls, R, Z = lar_rec(X, y, normalization = True)
lars = [lambdas, indexes, correls]
order_irrep = get_order(Z,R)

#Variance estimation
t=101
R1, Z1 = get_residual(R, Z, t)
w, v = np.linalg.eig(R1)
w = np.real(w)
w2 = np.zeros(np.shape(w))
for k in range(np.size(w)):
    if abs(w[k])>1e-8:
        w2[k] = abs(w[k])**(-0.5)  
        R2 = np.real(np.dot(v,np.dot(np.diag(w2),np.transpose(v))))
        Z2 = np.dot(R2,Z1)
# We conpute the variance estimation
d = np.linalg.matrix_rank(R2, hermitian=True)
V2 = v[:,0:d]
Y2 = np.real(np.dot(np.transpose(V2),Z2))
var = (np.sum(Y2**2)-d*((np.sum(Y2)/d)**2))/(d-1)
sigma = np.sqrt(var)

# Multiple Testing
alpha = 0.2
number_hyp = order_irrep
hat_alpha = np.zeros(number_hyp)
for start in range(number_hyp):
    hat_alpha[start] = observed_significance_spacing(lars, sigma, start)
p_ordered = sorted(hat_alpha, reverse=False)
x = np.cumsum(np.ones(number_hyp))
plt.plot(x, p_ordered)
plt.plot(x, alpha*x/number_hyp)
plt.tight_layout()
plt.xlabel(r'$k$')
plt.xlim([1,29])
plt.ylim([0,0.20])
plt.ylabel(r'$p_{(k)}$')
plt.title('BH for Multiple Spacings on resistance to RTV')
plt.show()

k = 27
    
multiple_test = MultipleTest()

hat_alpha[np.isnan(hat_alpha)] = 0 
rejected = hat_alpha<=alpha*k/number_hyp
multiple_test.reject = rejected
rej = 1*rejected
ind = indexes[:number_hyp]*rej
ind = ind[ind!=0]
ind = ind.astype(int)
multiple_test.indexes = ind
true = np.genfromtxt('tsm_true.txt', delimiter="\n", dtype=None)
names = np.genfromtxt('predictors_names_nfv.txt', dtype=str)
rejected_names = names[ind]
multiple_test.names = rejected_names

# 21 discoveries: 16 true and 5 false discoveries 
# K = 38
# n = 793 and p = 205

#%% Data 7: DRUG = SQV
X = np.loadtxt('X_SQV.txt')
y = np.loadtxt('Y_SQV.txt')
n,p = np.shape(X)
lambdas, indexes, correls, R, Z = lar_rec(X, y, normalization = True)
lars = [lambdas, indexes, correls]
order_irrep = get_order(Z,R)

#Variance estimation
t=101
R1, Z1 = get_residual(R, Z, t)
w, v = np.linalg.eig(R1)
w = np.real(w)
w2 = np.zeros(np.shape(w))
for k in range(np.size(w)):
    if abs(w[k])>1e-8:
        w2[k] = abs(w[k])**(-0.5)  
        R2 = np.real(np.dot(v,np.dot(np.diag(w2),np.transpose(v))))
        Z2 = np.dot(R2,Z1)
# We conpute the variance estimation
d = np.linalg.matrix_rank(R2, hermitian=True)
V2 = v[:,0:d]
Y2 = np.real(np.dot(np.transpose(V2),Z2))
var = (np.sum(Y2**2)-d*((np.sum(Y2)/d)**2))/(d-1)
sigma = np.sqrt(var)

# Multiple Testing
alpha = 0.2
number_hyp = order_irrep
hat_alpha = np.zeros(number_hyp)
for start in range(number_hyp):
    hat_alpha[start] = observed_significance_spacing(lars, sigma, start)
p_ordered = sorted(hat_alpha, reverse=False)
x = np.cumsum(np.ones(number_hyp))
plt.plot(x, p_ordered)
plt.plot(x, alpha*x/number_hyp)
plt.tight_layout()
plt.xlabel(r'$k$')
plt.xlim([1,9])
plt.ylim([0,0.20])
plt.ylabel(r'$p_{(k)}$')
plt.title('BH for Multiple Spacings on resistance to SQV')
plt.show()

k = 18
    
multiple_test = MultipleTest()

hat_alpha[np.isnan(hat_alpha)] = 0 
rejected = hat_alpha<=alpha*k/number_hyp
multiple_test.reject = rejected
rej = 1*rejected
ind = indexes[:number_hyp]*rej
ind = ind[ind!=0]
ind = ind.astype(int)
multiple_test.indexes = ind
true = np.genfromtxt('tsm_true.txt', delimiter="\n", dtype=None)
names = np.genfromtxt('predictors_names_nfv.txt', dtype=str)
rejected_names = names[ind]
multiple_test.names = rejected_names

# 8 discoveries: 7 true and 1 false discoveries
# K = 10
# n = 793 and p = 205

#%%
# Results from Knockoff and BHq
comparisons = np.genfromtxt('comparisons.txt', delimiter=",", dtype=str)


#APV
plt.figure(figsize=(6, 4))
category_labels = ['Spacing BH', 'Knockoff', 'BHq']
series_labels = ['In TSM list', 'NOT in TSM list']

data = [
    [13, 19, 17],
    [0, 3, 8]
]

stacked_bar(
    data, 
    series_labels, 
    category_labels=category_labels, 
    show_values=True, 
    value_format="{:.1f}",
    y_label="HIV-1 protease positions selected",
    grid=False
)

plt.ylim([0,35])
plt.title('Resitance to APV')
plt.savefig('barAPV.png')
plt.show()

# ATV
data = [
    [10, 22, 20],
    [2, 9, 7]
]

stacked_bar(
    data, 
    series_labels, 
    category_labels=category_labels, 
    show_values=True, 
    value_format="{:.1f}",
    y_label="HIV-1 protease positions selected",
    grid=False
)

plt.ylim([0,35])
plt.title('Resitance to ATV')
plt.savefig('barATV.png')
plt.show()


# IDV
data = [
    [14, 19, 20],
    [3, 12, 10]
]

stacked_bar(
    data, 
    series_labels, 
    category_labels=category_labels, 
    show_values=True, 
    value_format="{:.1f}",
    y_label="HIV-1 protease positions selected",
    grid=False
)

plt.ylim([0,35])
plt.title('Resitance to IDV')
plt.savefig('barIDV.png')
plt.show()

# LPV
data = [
    [7, 16, 18],
    [1, 1, 3]
]

stacked_bar(
    data, 
    series_labels, 
    category_labels=category_labels, 
    show_values=True, 
    value_format="{:.1f}",
    y_label="HIV-1 protease positions selected",
    grid=False
)

plt.ylim([0,35])
plt.title('Resitance to LPV')
plt.savefig('barLPV.png')
plt.show()

# NFV
data = [
    [14, 21, 22],
    [2, 2, 7]
]

stacked_bar(
    data, 
    series_labels, 
    category_labels=category_labels, 
    show_values=True, 
    value_format="{:.1f}",
    y_label="HIV-1 protease positions selected",
    grid=False
)

plt.ylim([0,35])
plt.title('Resitance to NFV')
plt.savefig('barNFV.png')
plt.show()

# RTV
data = [
    [16, 19, 18],
    [5, 8, 8]
]


stacked_bar(
    data, 
    series_labels, 
    category_labels=category_labels, 
    show_values=True, 
    value_format="{:.1f}",
    y_label="HIV-1 protease positions selected",
    grid=False
)

plt.ylim([0,35])
plt.title('Resitance to RTV')
plt.savefig('barRTV.png')
plt.show()

# SQV
data = [
    [7, 17, 19],
    [2, 4, 11]
]

stacked_bar(
    data, 
    series_labels, 
    category_labels=category_labels, 
    show_values=True, 
    value_format="{:.1f}",
    y_label="HIV-1 protease positions selected",
    grid=False
)

plt.ylim([0,35])
plt.title('Resitance to SQV')
plt.savefig('barSQV.png')
plt.show()

#