Various linear regression attempts (non-functional)
open-risk committed Feb 28, 2019
1 parent 1752942 commit d1fd1cb
Showing 6 changed files with 166 additions and 181 deletions.
259 changes: 90 additions & 169 deletions correlationMatrix/model.py
@@ -29,6 +29,11 @@
import scipy.stats as sp
from scipy.linalg import eigh
from scipy.linalg import inv
import statsmodels.multivariate.multivariate_ols as ols
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import scale

import matplotlib.pyplot as plt

import correlationMatrix as cm
from correlationMatrix.settings import EIGENVALUE_TOLERANCE
@@ -91,7 +96,6 @@ def __init__(self, values=None, type=None, json_file=None, csv_file=None, **kwargs):
            self.matrix = np.asarray(values)
            self.validated = False
        elif type is not None:
            print('given type')
            self.matrix = np.identity(2)
            if type == 'UniformSingleFactor':
                rho = kwargs.get('rho')
@@ -107,19 +111,16 @@ def __init__(self, values=None, type=None, json_file=None, csv_file=None, **kwargs):
            # validation flag is set to True for modelled Matrices
            self.validated = True
        elif json_file is not None:
            print('given file')
            # Initialize from file in json format
            q = pd.read_json(json_file)
            self.matrix = np.asarray(q.values)
            self.validated = False
        elif csv_file is not None:
            print('given file')
            # Initialize from file in csv format
            q = pd.read_csv(csv_file, index_col=None)
            self.matrix = np.asarray(q.values)
            self.validated = False
        else:
            print('no input')
            # Default instance (2x2 identity matrix)
            default = np.identity(2)
            self.matrix = np.asarray(default)
@@ -570,7 +571,14 @@ def make_uniform(self, dates1, values1, dates2, values2):
        y = new_values2
        return x, y

    # calculate the correlation matrix according to the chosen measure
    def fit(self, data, method='pearson'):
        """
        Calculate correlation according to the desired measure
        """
        rho = data.corr(method=method).values
        self.matrix = rho

    def pearsonr(self, data):
        # make input1 and input2 uniform
        # x, y = self.make_uniform(dates1, values1, dates2, values2)
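The new `fit` above delegates to pandas' built-in correlation estimators. A minimal usage sketch follows; the sample data is illustrative, while the call pattern mirrors the example script further down this diff:

import numpy as np
import pandas as pd

import correlationMatrix as cm

# Illustrative sample: three series, two of them driven by a common factor
rng = np.random.default_rng(42)
common = rng.standard_normal(500)
data = pd.DataFrame({
    'S1': common + rng.standard_normal(500),
    'S2': common + rng.standard_normal(500),
    'S3': rng.standard_normal(500),
})

myMatrix = cm.EmpiricalCorrelationMatrix()
myMatrix.fit(data, method='pearson')  # 'kendall' and 'spearman' also work
myMatrix.print()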
@@ -627,179 +635,92 @@ def calculate_correlation(self, model_name, input1_url, input2_url):


class FactorCorrelationMatrix(CorrelationMatrix):
    """ The FactorCorrelationMatrix class fits a variety of factor models.
    It stores matrices estimated using any of the standard correlation metrics
    (Pearson, Kendall tau, Spearman), together with the derived correlation matrix.
    """

    def __init__(self, **kwargs):
        """ Create a new correlation matrix from sampled data.
        """
        super().__init__(values=None, type=None, json_file=None, csv_file=None, **kwargs)
        # self.samples = kwargs.get('samples')

    def get_data(self, data_url):
        r = requests.get(data_url)
        return r.json()

    def make_uniform(self, dates1, values1, dates2, values2):
        # make the two timeseries arrays uniform (select common observation dates)
        # find common dates and return the values on those dates
        common_dates = list(set(dates1).intersection(dates2))

        new_values1 = []
        new_values2 = []
        for date in common_dates:
            i1 = dates1.index(date)
            i2 = dates2.index(date)
            new_values1.append(values1[i1])
            new_values2.append(values2[i2])

        x = new_values1
        y = new_values2
        return x, y

    # inputs to the library are assumed to be timeseries of (dates, values) pairs

    # calculate the pearson correlation
    def pearsonr(self, dates1, values1, dates2, values2):
        # make input1 and input2 uniform
        x, y = self.make_uniform(dates1, values1, dates2, values2)
        rho, p = sp.pearsonr(x, y)
        return rho, p

    # calculate the kendall correlation
    def kendallr(self, dates1, values1, dates2, values2):
        # make input1 and input2 uniform
        x, y = self.make_uniform(dates1, values1, dates2, values2)
        rho, p = sp.kendalltau(x, y)
        return rho, p

    # calculate the spearman correlation
    def spearmanr(self, dates1, values1, dates2, values2):
        # make input1 and input2 uniform
        x, y = self.make_uniform(dates1, values1, dates2, values2)
        rho, p = sp.spearmanr(x, y)
        return rho, p

    # the correlation collection (ADD OTHER functions)
    def calculate_correlation(self, model_name, input1_url, input2_url):
        # python models evaluated directly
        # c++ models evaluated via CGI requests

        # Extract timeseries data for the calculation
        raw_data1 = self.get_data(input1_url)
        raw_data2 = self.get_data(input2_url)

        json_string1 = raw_data1['_items'][0]['json_dump']
        Data1 = json.loads(json_string1)
        dates1 = Data1['Dates']
        values1 = Data1['Values']

        json_string2 = raw_data2['_items'][0]['json_dump']
        Data2 = json.loads(json_string2)
        dates2 = Data2['Dates']
        values2 = Data2['Values']

        if model_name == 'Pearson_Correlation':
            rho, p = self.pearsonr(dates1, values1, dates2, values2)
        elif model_name == 'Kendall_Correlation':
            rho, p = self.kendallr(dates1, values1, dates2, values2)
        elif model_name == 'Spearman_Correlation':
            rho, p = self.spearmanr(dates1, values1, dates2, values2)
        return {'rho': rho, 'p': p}

    def hierarchical(self):
        ########## Fit Hierarchical Factor Model ##########
        # We need to separate the residuals: if the residuals are correlated
        # with each other, we cannot build the graphs. In the kiwi paper the
        # residuals load on a single sector, whereas here we have five sectors.
        # First fit the sector model; the sector indices are simply the
        # averages of the companies within each sector.
        #
        # Input:
        #   CSV file of scaled log-return data
        #
        # Output:
        #   Linear factor model and residuals
        #
        # Prototype in R:
        #
        # rm(list=ls())
        # library(corrplot)
        # # Read closing data from csv file
        # setwd("C:\\Users\\lixua\\Desktop\\version1.4")
        # # setwd('/home/philippos/Desktop/R_Development/version1.2.1')
        # source('SectorsNCompanies.R')
        # # setwd('/home/philippos/Desktop/R_Development/Current')
        # df <- read.csv('cleaned_returns_data.csv', sep=",")
        #
        # ### Calculate the Sector Loadings on the Index and the Sector Residuals ###
        #
        # sector_fit <- lm(data.matrix(df[, 51:55]) ~ Index, data = df)
        # s_load <- sector_fit$coefficients
        # s_res <- data.frame(sector_fit$residuals)
        # s_corr <- cor(s_res)
        # corrplot(s_corr)
        #
        # ### Calculate company loadings on the Index and the company Residuals ###
        #
        # df2 <- data.frame(df[, 1:50], s_res, df["Index"])
        #
        # company_fit <- lm(data.matrix(df2[, 1:50])
        #                   ~ Index + S_FINA + S_HLTH + S_TECH + S_OILG + S_CONS, data = df2)
        # c_load <- company_fit$coefficients
        # c_res <- data.frame(company_fit$residuals)
        # c_corr <- cor(c_res)
        # corrplot(c_corr, tl.cex = 0.5)
        #
        # ### Store Sector and Company Residuals ###
        # write.table(s_res, file="sector_residuals.csv", sep=",", row.names = FALSE, col.names = TRUE)
        # write.table(c_res, file="company_residuals.csv", sep=",", row.names = FALSE, col.names = TRUE)
        pass
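The commented R prototype above amounts to a two-stage least-squares scheme: regress the sector indices on the market index, then regress the companies on the index plus the sector residuals. A minimal Python sketch of that scheme is below; it assumes a DataFrame with an `Index` column plus sector and company columns as in the R prototype, and the helper name and signature are illustrative, not part of the library:

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

def hierarchical_fit(df, company_cols, sector_cols):
    """Two-stage factor model: sector indices on the market index,
    then companies on the index plus the sector residuals."""
    index = df[['Index']].values

    # Stage 1: sector loadings on the market index; keep the residuals
    sector_fit = LinearRegression().fit(index, df[sector_cols].values)
    s_res = df[sector_cols].values - sector_fit.predict(index)

    # Stage 2: company loadings on the index plus the sector residuals
    X = np.hstack([index, s_res])
    company_fit = LinearRegression().fit(X, df[company_cols].values)
    c_res = df[company_cols].values - company_fit.predict(X)

    return (sector_fit.coef_, company_fit.coef_,
            pd.DataFrame(s_res, columns=sector_cols, index=df.index),
            pd.DataFrame(c_res, columns=company_cols, index=df.index))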
    def fit(self, data, method='UniformSingleFactor'):
        """
        Estimate a single factor model with uniform loadings
        - The single factor is constructed as the average of all realizations
        - Uniform loadings imply all return realizations are of the same variable r
        """
        # Response (dependent) variables
        Y = data.values
        # Control that the response variables (r) have the right correlation
        # rho = data.corr(method='pearson').values

        # Compute the row average (the Market factor)
        F = data.mean(axis=1).values
        # Normalize the market factor to unit variance
        X = scale(F, with_mean=False, with_std=True)

        # Average squared correlation of the factor with each response
        corrs = []
        for i in range(Y.shape[1]):
            rho, p = sp.pearsonr(X, Y[:, i])
            corrs.append(rho)
        print(np.mean(corrs) ** 2)

        if method == 'UniformSingleFactor':
            # res = smf.ols(formula='r ~ F', data=data).fit()
            # scikit-learn expects a 2D design matrix, hence the reshape of the
            # single factor into a column vector
            reg = LinearRegression().fit(X.reshape(-1, 1), Y)
            print(reg.coef_)
            # estimator = ols._MultivariateOLS(X, Y)
            # results = estimator.fit()
            # print(results)

        # The estimated matrix is not yet stored
        # self.matrix = rho
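What this regression is aiming at can be checked end to end. Below is a self-contained sketch for illustration, not library code: simulate returns from a uniform single factor model, rebuild the factor as the cross-sectional average, and regress. With uniform loadings the squared loading estimates the asset correlation, up to an upward bias of (1 - rho)/n because the averaged factor proxy is noisy for a small cross-section:

import numpy as np
from sklearn.linear_model import LinearRegression

# Simulate r_i = sqrt(rho) * F + sqrt(1 - rho) * e_i, so that corr(r_i, r_j) = rho
rng = np.random.default_rng(0)
rho, n_entities, n_obs = 0.3, 10, 10000
F = rng.standard_normal(n_obs)
eps = rng.standard_normal((n_obs, n_entities))
Y = np.sqrt(rho) * F[:, None] + np.sqrt(1 - rho) * eps

# Factor proxy: cross-sectional average, scaled to unit variance
m = Y.mean(axis=1)
X = (m / m.std()).reshape(-1, 1)  # 2D design matrix, as scikit-learn requires

reg = LinearRegression(fit_intercept=False).fit(X, Y)

# The mean squared loading estimates the asset correlation; expect roughly
# rho + (1 - rho) / n_entities due to the noisy factor proxy
print(reg.coef_.mean() ** 2)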
21 changes: 14 additions & 7 deletions examples/python/empirical_correlation_matrix.py
@@ -9,7 +9,7 @@
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
# either express or implied. See the License for the specific language governing permissions and
# limitations under the License.


@@ -57,15 +57,22 @@

# Step
# Estimate the empirical correlation matrix using the Pearson measure
print("> Step 3: Estimate the empirical correlation matrix using the Pearson measure")
print("> Step 3a: Estimate the empirical correlation matrix using the Pearson measure")
myMatrix = cm.EmpiricalCorrelationMatrix()
print(myMatrix.validated)
# print(myMatrix.validated)
# print(type(myMatrix))
# myMatrix.print()
myMatrix.pearsonr(data)
# myMatrix.pearsonr(data)
myMatrix.fit(data, method='pearson')
myMatrix.print()
myMatrix.validate()
print(myMatrix.validated)
print("> Step 3b: Estimate the empirical correlation matrix using the Kendall measure")
myMatrix.fit(data, method='kendall')
myMatrix.print()
print("> Step 3c: Estimate the empirical correlation matrix using the Spearman measure")
myMatrix.fit(data, method='spearman')
myMatrix.print()
# myMatrix.validate()
# print(myMatrix.validated)
# myEstimator = aj.AalenJohansenEstimator(states=myState)
# labels = {'Timestamp': 'Time', 'From_State': 'From', 'To_State': 'To', 'ID': 'ID'}
# etm, times = myEstimator.fit(sorted_data, labels=labels)
@@ -116,4 +123,4 @@
plt.savefig("correlation_probabilities.png")
plt.show()
"""
"""
6 changes: 3 additions & 3 deletions examples/python/generate_synthetic_data.py
@@ -42,11 +42,11 @@
    # This dataset creates the simplest possible (uniform single factor) correlation matrix
    # Correlation Matrix definition
    # n: number of entities to generate
    myMatrix = cm.CorrelationMatrix(type='UniformSingleFactor', rho=0.2, n=10)
    myMatrix.print()
    myMatrix = cm.CorrelationMatrix(type='UniformSingleFactor', rho=0.3, n=10)
    # myMatrix.print()
    # Generate multivariate normal data with that correlation matrix (a pandas frame)
    # s: number of samples per entity
    data = dataset_generators.multivariate_normal(myMatrix, sample=1000)
    data = dataset_generators.multivariate_normal(myMatrix, sample=100)
    data.to_csv(dataset_path + 'synthetic_data1.csv', index=False)

elif dataset == 2:
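For orientation, the 'UniformSingleFactor' matrix type used in the synthetic data example above is the equicorrelation matrix: ones on the diagonal and rho everywhere else. A numpy equivalent of that construction (a sketch, not the library's own constructor):

import numpy as np

rho, n = 0.3, 10
# Equicorrelation matrix: 1 on the diagonal, rho off the diagonal
matrix = rho * np.ones((n, n)) + (1 - rho) * np.identity(n)
print(matrix)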