rm(list = ls()) 
# remove working space

library(readxl)
library(leaps)
library(car)
library(MASS)

setwd("~/Desktop/Final_Project")

# PART1
##################################################################################
# All-subsets regression with exhaustive algorithm before testing correlation
# monthly
mon_data <- data.frame(read_excel("Group_Project_Data.xlsx"), sheet = 1)

mon_sec_re      <- mon_data[, 2]
mon_rf          <- mon_data[, 4]
mon_sec_exc_re  <- mon_sec_re - mon_rf

cpi <- mon_data[, 5]
dpi <- mon_data[, 6]
id  <- mon_data[, 7]
une <- mon_data[, 8]
afs <- mon_data[, 9]
per <- mon_data[, 10]
pbv <- mon_data[, 11]
pm  <- mon_data[, 12]
roe <- mon_data[, 13]
emp <- mon_data[, 14]

attach(mon_data)
mon_leaps <- regsubsets(mon_sec_exc_re ~ cpi + dpi + id + une + afs 
                     + per + pbv + pm + roe + emp, data = mon_data, 
                     nbest = 1)
summary(mon_leaps)

# plot statistic by subset size
subsets(mon_leaps,  abbrev = 2, legend = FALSE, statistic="cp", las = 1,
        main = 
          "Monthly all-subsets regression with exhaustive algorithm (bf corr)")
subsets(mon_leaps,  abbrev = 2, legend = FALSE, statistic="adjr2", las = 1,
        main = 
          "Monthly all-subsets regression with exhaustive algorithm (bf corr)")

detach(mon_data)
# selection_result: mon: dpi, per, pbv, pm.

##################################################################################
# All-subsets regression with exhaustive algorithm before testing correlation

# annually
ann_data <- data.frame(read_excel("Group_Project_Data.xlsx"), sheet = 2)

ann_sec_return <- ann_data[, 2]
ann_rf         <- ann_data[, 3]
ann_sec_exc_re <- ann_sec_return - ann_rf

cspi <- ann_data[, 4]
vc   <- ann_data[, 5]
dmee <- ann_data[, 6]
pde  <- ann_data[, 7]
pce  <- ann_data[, 8]
he   <- ann_data[, 9]
ap   <- ann_data[, 10]
ir   <- ann_data[, 11]
ce   <- ann_data[, 12]
dpi  <- ann_data[, 13]

attach(ann_data)
ann_leaps <- regsubsets(ann_sec_exc_re ~ cspi + vc + dmee + pde + pce 
                     + he + ap + ir + ce + dpi, data = ann_data, nbest = 1)
summary(ann_leaps)

subsets(ann_leaps,  abbrev = 2, legend = TRUE, statistic="cp", las = 1,
        main = 
          "Annual all-subsets regression with exhaustive algorithm (bf corr)")
subsets(ann_leaps,  abbrev = 2, min.size = 3, legend = TRUE, 
        statistic="cp", las = 1,
        main = 
          "Annual all-subsets regression with exhaustive algorithm (bf corr)")
detach(ann_data)
# selection_result: ann: cspi, dmee, ap, ir, ce.

##################################################################################
# Correlation among Variables

# monthly
mon_cormatrix <- matrix(NA, 10, 10)

rownames(mon_cormatrix) <- c("cpi", "dpi", "id", "une", "afs", 
                             "per", "pbv", "pm", "roe", "emp")
colnames(mon_cormatrix) <- c("cpi", "dpi", "id", "une", "afs", 
                               "per", "pbv", "pm", "roe", "emp")

for (i in 5:14)
{
  for (j in 5:14)
  {
    x <- mon_data[, i]
    y <- mon_data[, j]
    mon_cormatrix[i - 4, j - 4]<- cor(x, y)
  }
}
print(mon_cormatrix, digits = 3)

# mon: pm & roe are highly correlated. 
# mon: pbv & per are highly correlated.
# exclude pbv
attach(mon_data)
mon_leaps <- regsubsets(mon_sec_exc_re ~ cpi + dpi + id + une + afs 
                        + per + pm + roe + emp, data = mon_data, 
                        nbest = 1)
summary(mon_leaps)

subsets(mon_leaps,  abbrev = 2, legend = FALSE, statistic="cp", las = 1, 
        main = "Monthly all-subsets regression with exhaustive algorithm")
detach(mon_data)

# selection_result: mon: dpi, per, pbv, pm.
# selection_result_after_testing_correlation
# mon: dpi, per, roe.

# compare the result of excluding roe and pbv at the begining
attach(mon_data)
com_mon_leaps <- regsubsets(mon_sec_exc_re ~ cpi + dpi + id + une + afs 
                            + per + roe + emp, data = mon_data, 
                            nbest = 1)
summary(com_mon_leaps)

subsets(com_mon_leaps,  abbrev = 2, legend = FALSE, statistic="cp", las = 1,
        main =
        "Monthly all-subsets regression with exhaustive algorithm (exclude corr)")
detach(mon_data)
# selection_result: mon: dpi, per, roe.
# conclusion: same.

##################################################################################
# Correlation among Variables

# annually
ann_cormatrix <- matrix(NA, 10, 10)

rownames(ann_cormatrix) <- c("cspi", "vc", "dmee", "pde", "pce", "he", 
                             "ap", "ir", "ce", "dpi")
colnames(ann_cormatrix) <- c("cspi", "vc", "dmee", "pde", "pce", "he", 
                             "ap", "ir", "ce", "dpi")

for (i in 4:13)
{
  for (j in 4:13)
  {
    x <- ann_data[, i]
    y <- ann_data[, j]
    ann_cormatrix[i - 3, j - 3]<- cor(x, y)
  }
}
print(ann_cormatrix, digits = 3)

# ann: ap & ir are highly correlated.
# ann: ce & dpi are highly correlated.
# exclude ir

attach(ann_data)
ann_leaps <- regsubsets(ann_sec_exc_re ~ cspi + vc + dmee + pde + pce 
                        + he + ap + ce + dpi, data = ann_data, nbest = 1)
summary(ann_leaps)

subsets(ann_leaps,  abbrev = 2, legend = TRUE, statistic="cp", las = 1,
        main = "Annual all-subsets regression with exhaustive algorithm")
subsets(ann_leaps,  abbrev = 2, min.size = 3, legend = TRUE, 
        statistic="cp", las = 1, 
        main = "Annual all-subsets regression with exhaustive algorithm")
detach(ann_data)
# selection_result_after_testing_correlation
# ann: cspi, ap, dpi, ce
# and we exclude ce, which is identical to excluding ce and ir at the beginning

# compare the result of excluding ce and ir at the begining
attach(ann_data)
com_ann_leaps <- regsubsets(ann_sec_exc_re ~ cspi + vc + dmee + pde + pce 
                        + he + ap + dpi, data = ann_data, nbest = 1)
summary(com_ann_leaps)

subsets(com_ann_leaps,  abbrev = 2, min.size = 3, legend = TRUE, 
        statistic = "cp", las = 1,
        main =
        "Annual all-subsets regression with exhaustive algorithm (exclude corr)")
detach(ann_data)
# selection_result: ann: cspi, ap, dpi.
# conclusion: Same.

##################################################################################
# Regression: Final model

mon_rslt <- lm(mon_sec_exc_re ~ dpi + per + roe, mon_data)
ann_rslt <- lm(ann_sec_exc_re ~ cspi + dpi + ap, ann_data)

summary(mon_rslt)
summary(ann_rslt)

##################################################################################
# Regression Diagnostics

# monthly
# Test for Autocorrelated Errors
durbinWatsonTest(mon_rslt)

# Evaluate homoscedasticity
# non-constant error variance test
ncvTest(mon_rslt)
# plot studentized residuals vs. fitted values 
spreadLevelPlot(mon_rslt)

# Test Normality of Residuals
# qq plot for studentized residuals
qqPlot(mon_rslt, las = 1, main = "Mon QQ Plot")
# distribution of studentized residuals
mon_sresid <- studres(mon_rslt)

hist(mon_sresid, freq=FALSE, las = 1,
     main = "Monthly distribution of studentized residuals")

mon_xfit <- seq(min(mon_sresid), max(mon_sresid),length = 40) 
mon_yfit <- dnorm(mon_xfit) 

lines(mon_xfit, mon_yfit)

# annually
durbinWatsonTest(ann_rslt)

ncvTest(ann_rslt)
spreadLevelPlot(ann_rslt)

qqPlot(ann_rslt, las = 1, main = "Ann QQ Plot")
ann_sresid <- studres(ann_rslt)

hist(sresid, freq=FALSE, las = 1,
     main = "Annual distribution of studentized residuals")

ann_xfit <- seq(min(ann_sresid), max(ann_sresid), length = 40) 
ann_yfit <- dnorm(ann_xfit) 

lines(ann_xfit, ann_yfit)

# PART2
##################################################################################
# Prediction

# monthly
n <- dim(mon_data)[1]

t_mon_sec_re     <- mon_data[2:n, 2]
t_mon_rf         <- mon_data[2:n, 4]
t_mon_sec_exc_re <- t_mon_sec_re - t_mon_rf

t_cpi <- mon_data[1:n-1, 5]
t_dpi <- mon_data[1:n-1, 6]
t_id  <- mon_data[1:n-1, 7]
t_une <- mon_data[1:n-1, 8]
t_afs <- mon_data[1:n-1, 9]
t_per <- mon_data[1:n-1, 10]
t_pbv <- mon_data[1:n-1, 11]
t_pm  <- mon_data[1:n-1, 12]
t_roe <- mon_data[1:n-1, 13]
t_emp <- mon_data[1:n-1, 14]

attach(mon_data)
t_mon_leaps <- regsubsets(t_mon_sec_exc_re ~ t_cpi + t_dpi + t_id + t_une 
                          + t_afs + t_per + t_pbv + t_pm + t_roe + t_emp, 
                          data = mon_data, nbest = 1)
summary(t_mon_leaps)

# plot statistic by subset size
subsets(t_mon_leaps,  abbrev = 2, legend = FALSE, statistic="cp", las = 1,
        main = "Monthly all-subsets regression (prediction)")
detach(mon_data)
# mon: pm & roe are highly correlated. 
# mon: pbv & per are highly correlated.
# selection_result: mon: pbv, roe

#############################################################################
# annually
ann_data <- data.frame(read_excel("Group_Project_Data.xlsx"), sheet = 2)

m <- dim(ann_data)[1]

t_ann_sec_return <- ann_data[2:m, 2]
t_ann_rf         <- ann_data[2:m, 3]
t_ann_sec_exc_re <- t_ann_sec_return - t_ann_rf

t_cspi <- ann_data[1:m-1, 4]
t_vc   <- ann_data[1:m-1, 5]
t_dmee <- ann_data[1:m-1, 6]
t_pde  <- ann_data[1:m-1, 7]
t_pce  <- ann_data[1:m-1, 8]
t_he   <- ann_data[1:m-1, 9]
t_ap   <- ann_data[1:m-1, 10]
t_ir   <- ann_data[1:m-1, 11]
t_ce   <- ann_data[1:m-1, 12]
t_dpi  <- ann_data[1:m-1, 13]

attach(ann_data)
t_ann_leaps <- regsubsets(t_ann_sec_exc_re ~ t_cspi + t_vc + t_dmee + t_pde 
                          + t_pce + t_he + t_ap + t_ir + t_ce + t_dpi, 
                          data = ann_data, nbest = 1)
summary(t_ann_leaps)

subsets(t_ann_leaps,  abbrev = 2, min.size = 3, legend = TRUE, 
        statistic="cp", las = 1,
        main = "Annual all-subsets regression (prediction)")
detach(mon_data)
# ann: ap & ir are highly correlated.
# ann: ce & dpi are highly correlated.
# selection_result_after_corrleaiton_test: ann: cspi,ir, dpi.

##################################################################################
# Regression: Final model
t_mon_rslt <- lm(t_mon_sec_exc_re ~ t_pbv + t_roe, mon_data)
t_ann_rslt <- lm(t_ann_sec_exc_re ~ t_cspi + t_ir + t_dpi, ann_data)

summary(t_mon_rslt)
summary(t_ann_rslt)

##################################################################################