Working estimation using scipy.linalg.lstsq

open-risk · Feb 28, 2019 · 650c3c9 · 650c3c9
1 parent d1fd1cb
commit 650c3c9
Show file tree

Hide file tree

Showing 10 changed files with 305 additions and 527 deletions.
diff --git a/correlationMatrix/model.py b/correlationMatrix/model.py
@@ -33,6 +33,10 @@
 from sklearn.linear_model import LinearRegression
 from sklearn.preprocessing import scale
 
+from scipy.linalg import lstsq
+
+# https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.lstsq.html
+
 import matplotlib.pyplot as plt
 
 import correlationMatrix as cm
@@ -297,6 +301,9 @@ def decompose(self, method):
  """
  pass
 
+ def stress(self, scenario, method):
+ pass
+
  @property
  def validation_status(self):
  return self.validated
@@ -637,8 +644,9 @@ def calculate_correlation(self, model_name, input1_url, input2_url):
 class FactorCorrelationMatrix(CorrelationMatrix):
  """ The FactorCorrelationMatrix class fits a variety of factor models
 
- It stores the derived
+ It stores the derived parameters and modelled correlation matrix values
 
+ TODO compute and store confidence intervals
 
  """
 
@@ -659,36 +667,6 @@ def fit(self, data, method='UniformSingleFactor'):
 
  """
 
- # Response (dependent) variables
- Y = data.values
- # Control that the response variables (r) have the right correlation
- # rho = data.corr(method='pearson').values
- # print(rho)
-
- # Compute the row average (Market factor)
- F = data.mean(axis=1).values
- # print(np.std(F0))
-
- # print(np.std(F1))
- # Normalize the market factor to unit variance
- # Replicate the market factor for the multiple regression
- # X = np.tile(F1, (Y.shape[1], 1)).transpose()
- X = scale(F, with_mean=False, with_std=True)
- # np.reshape(X, (100, 1))
- # np.reshape(X, (-1, 1))
-
- print(Y.shape)
- print(X.shape)
-
- corrs = []
- for i in range(Y.shape[1]):
- rho, p = sp.pearsonr(X, Y[:, i])
- corrs.append(rho)
- print(np.mean(corrs)**2)
-
-
-
-
  # print(data.describe())
  # print(len(data.index))
  # rho, p = sp.pearsonr(data['S4'], F)
@@ -704,18 +682,45 @@ def fit(self, data, method='UniformSingleFactor'):
  # print(rho)
 
  if method == 'UniformSingleFactor':
+ # Response (dependent) variables
+ Y = data.values
+ # Control that the response variables (r) have the right correlation
+ # rho = data.corr(method='pearson').values
+ # print(rho)
+
+ # Compute the row average (Market factor)
+ F = data.mean(axis=1).values
+ # print(np.std(F0))
+
+ # print(np.std(F1))
+ # Normalize the market factor to unit variance
+ # Replicate the market factor for the multiple regression
+ # X = np.tile(F1, (Y.shape[1], 1)).transpose()
+ X = scale(F, with_mean=False, with_std=True)
+ # np.reshape(X, (100, 1))
+ # np.reshape(X, (-1, 1))
+
+ print(Y.shape)
+ print(X.shape)
+
+ corrs = []
+ for i in range(Y.shape[1]):
+ rho, p = sp.pearsonr(X, Y[:, i])
+ corrs.append(rho)
+ print(np.mean(corrs) ** 2)
  # res = smf.ols(formula='r ~ F', data=data).fit()
  # estimator = LinearRegression(fit_intercept=False)
  # results = estimator.fit(X, Y)
  # print(dir(results))
  # print(results.coef_)
- # b = np.linalg.lstsq(X[:, 0], Y[:,0])
+
+ b = lstsq(X[:, 0], Y[:, 0])
 
  # X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
  # # y = 1 * x_0 + 2 * x_1 + 3
  # y = np.dot(X, np.array([1, 2])) + 3
- reg = LinearRegression().fit(X, Y)
- print(reg.coef_)
+ # reg = LinearRegression().fit(X, Y)
+ # print(reg.coef_)
 
  # estimator = ols._MultivariateOLS(X, Y)
  # results = estimator.fit()
@@ -724,3 +729,114 @@ def fit(self, data, method='UniformSingleFactor'):
  # plt.show()
 
  # self.matrix = rho
+
+ elif method == 'CAPMModel':
+ print("Lets do this")
+
+ raw_data = data.values
+
+ print(raw_data.shape)
+ n = raw_data.shape[1] - 1
+ # the market factor
+ X = raw_data[:, 3]
+ # the single stock
+ Y = raw_data[:, 0]
+
+ # for i in range(3):
+ # Y = raw_data[:, i]
+ # print(i, sp.pearsonr(Y, X))
+
+ # plt.plot(X, Y)
+ # plt.show()
+
+ """
+ scipy lstsq API
+ 
+ Parameters: 
+ 
+ a : (M, N) array_like Left hand side matrix (2-D array).
+ b : (M,) or (M, K) array_like Right hand side matrix or vector (1-D or 2-D array).
+ cond : float, optional Cutoff for ‘small’ singular values; used to determine effective rank of a. 
+ Singular values smaller than rcond * largest_singular_value are considered zero.
+ overwrite_a : bool, optional Discard data in a (may enhance performance). Default is False.
+ overwrite_b : bool, optional Discard data in b (may enhance performance). Default is False.
+ check_finite : bool, optional Whether to check that the input matrices contain only finite numbers. 
+ Disabling may give a performance gain, but may result in problems (crashes, non-termination) 
+ if the inputs do contain infinities or NaNs.
+ lapack_driver : str, optional Which LAPACK driver is used to solve the least-squares problem. 
+ Options are 'gelsd', 'gelsy', 'gelss'. Default ('gelsd') is a good choice. However, 'gelsy' can be slightly faster on many problems. 'gelss' was used historically. It is generally slow but uses less memory.
+ 
+ Returns: 
+ 
+ x : (N,) or (N, K) ndarray Least-squares solution. Return shape matches shape of b.
+ residues : (0,) or () or (K,) ndarray Sums of residues, squared 2-norm for each column in b - a x. 
+ If rank of matrix a is < N or N > M, or 'gelsy' is used, this is a length zero array. If b was 1-D, 
+ this is a () shape array (numpy scalar), otherwise the shape is (K,).
+ rank : int Effective rank of matrix a.
+ s : (min(M,N),) ndarray or None Singular values of a. The condition number of a is abs(s[0] / s[-1]). 
+ None is returned when 'gelsy' is used.
+
+ """
+
+ #TODO stack the individual stock values when the model ask for single loading
+ #TODO grouped stock loadings
+
+ # Find all the loadings of entities to the market factor
+ a = X
+ n = len(a)
+ a.shape = (n, 1)
+ for i in range(3):
+ b = raw_data[:, i]
+ p, res, rnk, s = lstsq(a, b)
+ print(p, res, rnk, s)
+
+ # # Control that the response variables (r) have the right correlation
+ # # rho = data.corr(method='pearson').values
+ # # print(rho)
+ #
+ # # Compute the row average (Market factor)
+ # F = data.mean(axis=1).values
+ # # print(np.std(F0))
+ #
+ # # print(np.std(F1))
+ # # Normalize the market factor to unit variance
+ # # Replicate the market factor for the multiple regression
+ # # X = np.tile(F1, (Y.shape[1], 1)).transpose()
+ # X = scale(F, with_mean=False, with_std=True)
+ # # np.reshape(X, (100, 1))
+ # # np.reshape(X, (-1, 1))
+ #
+ # print(Y.shape)
+ # print(X.shape)
+ #
+ # corrs = []
+ # for i in range(Y.shape[1]):
+ # rho, p = sp.pearsonr(X, Y[:, i])
+ # corrs.append(rho)
+ # print(np.mean(corrs)**2)
+
+ elif method == 'APTModel':
+
+ raw_data = data.values
+ print(raw_data.shape)
+
+ m = 3
+ n = raw_data.shape[1]
+ a = raw_data[:, n-m:n]
+ print(a.shape)
+
+ # b = raw_data[:, 5]
+ # n = raw_data.shape[1] - 1
+ # # the market factor
+ # X = raw_data[:, 3]
+ # # the single stock
+ # Y = raw_data[:, 0]
+ #
+ # a = X
+ # n = len(a)
+ # a.shape = (n, 1)
+ for i in range(n-m):
+ b = raw_data[:, i]
+ p, res, rnk, s = lstsq(a, b)
+ print(p, res, rnk, s)
+
diff --git a/correlationMatrix/utils/dataset_generators.py b/correlationMatrix/utils/dataset_generators.py
@@ -40,3 +40,85 @@ def multivariate_normal(correlationmatrix, sample):
  data = np.random.multivariate_normal(mean, correlationmatrix.matrix, sample)
  columns = ['S' + str(i) for i in range(0, correlationmatrix.dimension)]
  return pd.DataFrame(data, columns=columns)
+
+
+def capm_model(n, b, sample):
+ """
+ Generate samples from a stylized CAPM model
+
+ Suitable for testing estimation algorithms
+
+ The data format is a table with columns per entity ID and a final market factor
+
+ :param int n: The number of distinct entities to simulate
+ :param list: A list of loading to the market factor
+ :param int sample: The number of samples to simulate
+ :rtype: pandas dataframe
+
+ .. note:: This emulates typical downloads of data from market data API's
+
+ """
+ if len(b) != n:
+ print("Inconsistent input values, length of loadings matrix differs from simulated entities")
+ exit()
+
+ errors = np.random.normal(loc=0.0, scale=1.0, size=(n, sample))
+ market_factor = np.random.normal(loc=0.0, scale=1.0, size=(1, sample))
+ returns = np.outer(b, market_factor) + errors
+ print(returns.shape)
+ print(market_factor.shape)
+ print(returns.shape)
+ data = np.append(returns, market_factor, 0)
+ print(data.shape)
+ columns = ['S' + str(i) for i in range(0, n)]
+ columns.append('Market Factor')
+ print(columns)
+ print(data[:, :5])
+ df = pd.DataFrame(data.transpose(), columns=columns)
+ return df
+
+
+def apt_model(n, b, m, rho, sample):
+ """
+ Generate samples from a stylized APT model
+
+ Suitable for testing estimation algorithms
+
+ The data format is a table with columns per entity ID and market factors
+
+ :param int n: The number of distinct entities to simulate
+ :param list b: A list of loading to the market factor
+ :param int m: The number of market factors
+ :param rho: The correlation matrix to use for the simulation of the market factors
+ :param int sample: The number of samples to simulate
+ :rtype: pandas dataframe
+
+ .. note:: This emulates typical downloads of data from market data API's
+
+ """
+ if len(b) != m:
+ print("Inconsistent input values, length of loadings matrix differs from simulated market factors")
+ exit()
+
+ if rho.dimension != m:
+ print("Inconsistent input values, dimension of factor correlation matrix differs from simulated factors")
+ exit()
+
+ mean = np.zeros(rho.dimension)
+ market_factors = np.random.multivariate_normal(mean, rho.matrix, sample).transpose()
+
+ errors = np.random.normal(loc=0.0, scale=1.0, size=(n, sample))
+
+ print(market_factors.shape)
+ print(errors.shape)
+
+ returns = np.dot(b, market_factors) + errors
+ print(returns.shape)
+
+ data = np.append(returns, market_factors, 0)
+ print(data.shape)
+ columns = ['S' + str(i) for i in range(0, n)] + ['F' + str(i) for i in range(0, m)]
+ print(columns)
+ print(data[:, :5])
+ df = pd.DataFrame(data.transpose(), columns=columns)
+ return df