0.1.1 Cleaned up PairwiseCorrelation, matrix_print

open-risk · Mar 7, 2019 · c75a18e · c75a18e
1 parent 76ca355
commit c75a18e
Show file tree

Hide file tree

Showing 3 changed files with 142 additions and 126 deletions.
diff --git a/correlationMatrix/model.py b/correlationMatrix/model.py
@@ -16,6 +16,7 @@
 
 * correlationMatrix_ implements the functionality of single period correlation matrix
 * TODO correlationMatrixSet_ provides a container for a multiperiod correlation matrix collection
+* TODO PairwiseCorrelation implements functionality for pairwise data analysis of timeseries
 * EmpiricalCorrelationMatrix implements the functionality of a continuously observed correlation matrix
 
 """
@@ -32,9 +33,82 @@
 from sklearn.preprocessing import scale
 
 from correlationMatrix.settings import EIGENVALUE_TOLERANCE
+from correlationMatrix.utils.converters import matrix_print
 
 
-# https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.lstsq.html
+def get_data(data_url, timeout=None):
+    """ Fetch a JSON document from a URL
+
+    :param data_url: address of the resource to download
+    :type data_url: str
+    :param timeout: seconds to wait for the server; None (the default) waits indefinitely
+    :type timeout: float
+    :returns: the decoded JSON payload of the response
+    :raises requests.HTTPError: if the server answers with a 4xx / 5xx status
+
+    """
+    r = requests.get(data_url, timeout=timeout)
+    # Fail here on an error status instead of handing an error page to the JSON decoder
+    r.raise_for_status()
+    return r.json()
+
+
+def make_uniform(dates1, values1, dates2, values2):
+    """ Make two timeseries uniform by keeping only their common observation dates
+
+    The result follows the order in which the dates appear in dates1, so the
+    output is deterministic. If a date is repeated within a series, its first
+    occurrence is used.
+
+    :param dates1: observation dates of the first timeseries (hashable values)
+    :type dates1: list
+    :param values1: observations of the first timeseries, aligned with dates1
+    :type values1: list
+    :param dates2: observation dates of the second timeseries (hashable values)
+    :type dates2: list
+    :param values2: observations of the second timeseries, aligned with dates2
+    :type values2: list
+    :returns: two lists (x, y) with the values of each timeseries on the common dates
+
+    """
+    # Position of the first occurrence of every date in the second timeseries
+    first_index2 = {}
+    for i2, date in enumerate(dates2):
+        first_index2.setdefault(date, i2)
+
+    x = []
+    y = []
+    seen = set()
+    for i1, date in enumerate(dates1):
+        # Skip dates missing from the second timeseries and repeated dates
+        if date in seen or date not in first_index2:
+            continue
+        seen.add(date)
+        x.append(values1[i1])
+        y.append(values2[first_index2[date]])
+
+    return x, y
+
+
+class PairwiseCorrelation(object):
+    """ Pairwise correlation measures between two timeseries
+
+    Every measure returns the pair (rho, p) unpacked from the corresponding
+    routine of the statistics module aliased as sp in this file
+    (presumably scipy.stats - confirm against the import block)
+
+    """
+
+    def pearsonr(self, x, y):
+        """ Calculate the linear (Pearson) correlation
+
+        :param x: observations of the first timeseries
+        :param y: observations of the second timeseries (same length as x)
+        :returns: tuple (rho, p)
+
+        """
+        # Must be the Pearson routine: calling kendalltau here returned a rank correlation
+        rho, p = sp.pearsonr(x, y)
+        return rho, p
+
+    def kendallr(self, x, y):
+        """ Calculate the Kendall correlation between two timeseries
+
+        :param x: observations of the first timeseries
+        :param y: observations of the second timeseries (same length as x)
+        :returns: tuple (rho, p)
+
+        """
+        rho, p = sp.kendalltau(x, y)
+        return rho, p
+
+    def spearmanr(self, x, y):
+        """ Calculate the Spearman correlation
+
+        :param x: observations of the first timeseries
+        :param y: observations of the second timeseries (same length as x)
+        :returns: tuple (rho, p)
+
+        """
+        rho, p = sp.spearmanr(x, y)
+        return rho, p
+
+    @staticmethod
+    def _extract_series(raw_data):
+        """ Extract the (dates, values) lists from a decoded API response (API dependent) """
+        series = json.loads(raw_data['_items'][0]['json_dump'])
+        return series['Dates'], series['Values']
+
+    def calculate(self, model_name, input1_url, input2_url):
+        """ Download two timeseries and calculate the requested correlation measure
+
+        :param model_name: Pearson_Correlation, Kendall_Correlation or Spearman_Correlation
+        :type model_name: str
+        :param input1_url: location of the first timeseries
+        :type input1_url: str
+        :param input2_url: location of the second timeseries
+        :type input2_url: str
+        :returns: dict with keys 'rho' and 'p'; both are None if model_name is not recognized
+
+        """
+        # Get data from URL
+        # TODO specify valid formats
+        raw_data1 = get_data(input1_url)
+        raw_data2 = get_data(input2_url)
+
+        # Process response (API dependent)
+        dates1, values1 = self._extract_series(raw_data1)
+        dates2, values2 = self._extract_series(raw_data2)
+
+        # Make data uniform
+        # TODO expand on missing data / dataquality treatment
+        x, y = make_uniform(dates1, values1, dates2, values2)
+
+        rho = None
+        p = None
+
+        if model_name == 'Pearson_Correlation':
+            rho, p = self.pearsonr(x, y)
+        elif model_name == 'Kendall_Correlation':
+            rho, p = self.kendallr(x, y)
+        elif model_name == 'Spearman_Correlation':
+            rho, p = self.spearmanr(x, y)
+        return {'rho': rho, 'p': p}
 
 
 class CorrelationMatrix:
@@ -263,31 +337,19 @@ def characterize(self):
  pass
 
  def print(self, format_type='Standard', accuracy=2):
- """ Pretty print a correlation matrix
-
- :param format_type: formatting options (Standard, Percent)
- :type format_type: str
- :param accuracy: number of decimals to display
- :type accuracy: int
-
- """
- for s_in in range(self.matrix.shape[0]):
- for s_out in range(self.matrix.shape[1]):
- if format_type is 'Standard':
- format_string = "{0:." + str(accuracy) + "f}"
- print(format_string.format(self.matrix[s_in, s_out]) + ' ', end='')
- elif format_type is 'Percent':
- print("{0:.2f}%".format(100 * self.matrix[s_in, s_out]) + ' ', end='')
- print('')
- print('')
+ matrix_print(self.matrix, format_type=format_type, accuracy=accuracy)
 
  def decompose(self, method):
  """
- TODO Create a decomposition of the correlation matrix according to the selected method
  :param method:
  :return:
  """
- pass
+ if method == 'cholesky':
+ L = np.linalg.cholesky(self.matrix)
+ return L
+ elif method == 'svd':
+ U, S, VH = np.linalg.svd(self.matrix, full_matrices=True)
+ return U, S, VH
 
  def stress(self, scenario, method):
  """
@@ -321,36 +383,13 @@ class EmpiricalCorrelationMatrix(CorrelationMatrix):
  """
 
  def __init__(self, **kwargs):
+ super().__init__(**kwargs)
 
- super().__init__(values=None, type=None, json_file=None, csv_file=None, **kwargs)
-
- """ Create a new correlations matrix from sampled data
+ """ Create a new correlation matrix from sampled data
  
 
  """
 
- def get_data(self, data_url):
- r = requests.get(data_url)
- return r.json()
-
- def make_uniform(self, dates1, values1, dates2, values2):
- # make the two timeseries arrays uniform (select common observation dates)
- # find common dates
- # return values on common dates
- common_dates = list(set(dates1).intersection(dates2))
-
- new_values1 = []
- new_values2 = []
- for date in common_dates:
- i1 = dates1.index(date)
- i2 = dates2.index(date)
- new_values1.append(values1[i1])
- new_values2.append(values2[i2])
-
- x = new_values1
- y = new_values2
- return x, y
-
  def fit(self, data, method='pearson'):
  """
  Calculate correlation according to desired measure
@@ -359,66 +398,42 @@ def fit(self, data, method='pearson'):
  rho = data.corr(method=method).values
  self.matrix = rho
 
- def pearsonr(self, data):
- # make input1 and input2 uniform
- # x, y = self.make_uniform(dates1, values1, dates2, values2)
- # rho, p = sp.pearsonr(x, y)
- # return rho, p
- rho = data.corr(method='pearson').values
- self.matrix = rho
-
- # calculate the kendall correlation
- def kendallr(self, dates1, values1, dates2, values2):
- # make input1 and input2 uniform
- x, y = self.make_uniform(dates1, values1, dates2, values2)
- # print(x, y)
- rho, p = sp.kendalltau(x, y)
- return rho, p
-
- # calculate the spearman correlation
- def spearmanr(self, dates1, values1, dates2, values2):
- # make input1 and input2 uniform
- x, y = self.make_uniform(dates1, values1, dates2, values2)
- rho, p = sp.spearmanr(x, y)
- return rho, p
-
- # the correlation collection (ADD OTHER functions)
- def calculate_correlation(self, model_name, input1_url, input2_url):
- # python models evaluated directly
- # c++ models evaluated via CGI requests
 
- # print(model_name)
- # Exctract timeseries data for calculation
-
- raw_data1 = self.get_data(input1_url)
- raw_data2 = self.get_data(input2_url)
-
- json_string1 = raw_data1['_items'][0]['json_dump']
- Data1 = json.loads(json_string1)
- dates1 = Data1['Dates']
- values1 = Data1['Values']
+class FactorCorrelationMatrix(CorrelationMatrix):
+ """ The FactorCorrelationMatrix class
+ - fits a variety of factor models
+ - stores the derived parameters and modelled correlation matrix values
+ TODO compute and store confidence intervals
 
+ Factor Models are estimated using OLS in various incarnations
+ Get the full scoop on lstsq at https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.lstsq.html
 
- json_string2 = raw_data2['_items'][0]['json_dump']
- Data2 = json.loads(json_string2)
- dates2 = Data2['Dates']
- values2 = Data2['Values']
+ scipy lstsq API
 
- if model_name == 'Pearson_Correlation':
- rho, p = self.pearsonr(dates1, values1, dates2, values2)
- elif model_name == 'Kendall_Correlation':
- rho, p = self.kendallr(dates1, values1, dates2, values2)
- elif model_name == 'Spearman_Correlation':
- rho, p = self.spearmanr(dates1, values1, dates2, values2)
- return {'rho': rho, 'p': p}
+ Parameters:
 
+ a : (M, N) array_like Left hand side matrix (2-D array).
+ b : (M,) or (M, K) array_like Right hand side matrix or vector (1-D or 2-D array).
+ cond : float, optional Cutoff for ‘small’ singular values; used to determine effective rank of a.
+ Singular values smaller than rcond * largest_singular_value are considered zero.
+ overwrite_a : bool, optional Discard data in a (may enhance performance). Default is False.
+ overwrite_b : bool, optional Discard data in b (may enhance performance). Default is False.
+ check_finite : bool, optional Whether to check that the input matrices contain only finite numbers.
+ Disabling may give a performance gain, but may result in problems (crashes, non-termination)
+ if the inputs do contain infinities or NaNs.
+ lapack_driver : str, optional Which LAPACK driver is used to solve the least-squares problem.
+ Options are 'gelsd', 'gelsy', 'gelss'. Default ('gelsd') is a good choice. However, 'gelsy' can be slightly faster on many problems. 'gelss' was used historically. It is generally slow but uses less memory.
 
-class FactorCorrelationMatrix(CorrelationMatrix):
- """ The FactorCorrelationMatrix class fits a variety of factor models
+ Returns:
 
- It stores the derived parameters and modelled correlation matrix values
+ x : (N,) or (N, K) ndarray Least-squares solution. Return shape matches shape of b.
+ residues : (0,) or () or (K,) ndarray Sums of residues, squared 2-norm for each column in b - a x.
+ If rank of matrix a is < N or N > M, or 'gelsy' is used, this is a length zero array. If b was 1-D,
+ this is a () shape array (numpy scalar), otherwise the shape is (K,).
+ rank : int Effective rank of matrix a.
+ s : (min(M,N),) ndarray or None Singular values of a. The condition number of a is abs(s[0] / s[-1]).
+ None is returned when 'gelsy' is used.
 
- TODO compute and store confidence intervals
 
  """
 
@@ -431,37 +446,12 @@ def __init__(self, **kwargs):
 
  def fit(self, data, method='UniformSingleFactor'):
  """
+
+ Method: 'UniformSingleFactor'
  Estimate a single factor model with uniform loadings
  - The single factor is constructed as the average of all realizations
  - Uniform loadings imply all return realizations are of the same variable r
 
-
- scipy lstsq API
-
- Parameters:
-
- a : (M, N) array_like Left hand side matrix (2-D array).
- b : (M,) or (M, K) array_like Right hand side matrix or vector (1-D or 2-D array).
- cond : float, optional Cutoff for ‘small’ singular values; used to determine effective rank of a.
- Singular values smaller than rcond * largest_singular_value are considered zero.
- overwrite_a : bool, optional Discard data in a (may enhance performance). Default is False.
- overwrite_b : bool, optional Discard data in b (may enhance performance). Default is False.
- check_finite : bool, optional Whether to check that the input matrices contain only finite numbers.
- Disabling may give a performance gain, but may result in problems (crashes, non-termination)
- if the inputs do contain infinities or NaNs.
- lapack_driver : str, optional Which LAPACK driver is used to solve the least-squares problem.
- Options are 'gelsd', 'gelsy', 'gelss'. Default ('gelsd') is a good choice. However, 'gelsy' can be slightly faster on many problems. 'gelss' was used historically. It is generally slow but uses less memory.
-
- Returns:
-
- x : (N,) or (N, K) ndarray Least-squares solution. Return shape matches shape of b.
- residues : (0,) or () or (K,) ndarray Sums of residues, squared 2-norm for each column in b - a x.
- If rank of matrix a is < N or N > M, or 'gelsy' is used, this is a length zero array. If b was 1-D,
- this is a () shape array (numpy scalar), otherwise the shape is (K,).
- rank : int Effective rank of matrix a.
- s : (min(M,N),) ndarray or None Singular values of a. The condition number of a is abs(s[0] / s[-1]).
- None is returned when 'gelsy' is used.
-
  """
 
  if method == 'UniformSingleFactor':

diff --git a/correlationMatrix/utils/converters.py b/correlationMatrix/utils/converters.py
@@ -33,5 +33,26 @@ def datetime_to_float(dataframe):
  start_date = dataframe['Time'].min()
  end_date = dataframe['Time'].max()
  total_days = (pd.to_datetime(end_date) - pd.to_datetime(start_date)).days
- dataframe['Time'] = dataframe['Time'].apply(lambda x: (pd.to_datetime(x) - pd.to_datetime(start_date)).days / total_days)
+ dataframe['Time'] = dataframe['Time'].apply(
+ lambda x: (pd.to_datetime(x) - pd.to_datetime(start_date)).days / total_days)
  return [start_date, end_date, total_days], dataframe
+
+
+def matrix_print(A, format_type='Standard', accuracy=2):
+    """ Pretty print a matrix
+
+    Each row is printed on its own line with every entry followed by a single
+    space; an empty line is printed after the last row. Any other format_type
+    prints only the empty lines.
+
+    :param A: the matrix to print (must offer .shape and [row, column] indexing)
+    :param format_type: formatting options (Standard, Percent)
+    :type format_type: str
+    :param accuracy: number of decimals to display (applies to the Standard format;
+        the Percent format always shows two decimals)
+    :type accuracy: int
+
+    """
+    # Loop invariant, so build it once
+    standard_format = "{0:." + str(accuracy) + "f}"
+    for s_in in range(A.shape[0]):
+        for s_out in range(A.shape[1]):
+            # Compare by value: identity ('is') only matches interned strings
+            if format_type == 'Standard':
+                print(standard_format.format(A[s_in, s_out]) + ' ', end='')
+            elif format_type == 'Percent':
+                print("{0:.2f}%".format(100 * A[s_in, s_out]) + ' ', end='')
+        print('')
+    print('')
diff --git a/examples/python/matrix_operations.py b/examples/python/matrix_operations.py
@@ -22,6 +22,7 @@
 
 import correlationMatrix as cm
 from correlationMatrix import dataset_path
+from correlationMatrix.utils.converters import matrix_print
 
 print("> Initialize a 3x3 matrix with values")
 A = cm.CorrelationMatrix(values=[[1.0, 0.2, 0.2], [0.2, 1.0, 0.2], [0.2, 0.2, 1.0]])
@@ -72,6 +73,10 @@
 
 # Generate a random matrix
 print("> Generate a random correlation matrix")
-G = cm.generate_random_matrix(100)
+G = cm.generate_random_matrix(10)
 print(G.validate())
 
+# Apply Cholesky decomposition
+print("> Calculate its Cholesky decomposition")
+matrix_print(G.decompose('cholesky'), accuracy=2)
+