-
Notifications
You must be signed in to change notification settings - Fork 0
/
regressionoffitbit.r
98 lines (86 loc) · 3.81 KB
/
regressionoffitbit.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
library(ggplot2)
library(leaps)
# Constructing the myData frame!
myData <- read.csv(#path to my data from the Fitbit API, saves as csv#)
myData <- cbind(myData[,(2:10)])
names(myData) <- c('date', 'calories', 'steps', 'fairlyActive', 'veryActive', 'heartRate', 'sleep', 'temp', 'onCampus')
# Now we have a lot of nice data to play with!
#We will omit the line that has a missing heart rate value
myData <- na.omit(myData)
#splitting data into training and building
predict <- data.frame()
testing <- data.frame()
set.seed(38)
predictIndex <- c(sample.int(92, 70))
for(i in (1:length(predictIndex))){
predict <- rbind(predict, myData[predictIndex[i],] )
}
names(predict) <- names(myData)
testingIndex <- c(1:92)
for(i in (1:92)){
for(j in (1:70))
{
if(i == predictIndex[[j]]){
testingIndex[i]<-0
}
}
}
for(i in (1:92)){
if(testingIndex[i]>0){
testing <- rbind(testing, myData[testingIndex[i],] )
}
}
# To get a feel, I first plot some pairs of variables
# Plotting heart rate versus calories in ggplot2
heartplot <- ggplot(data=predict, aes(x=(heartRate), y = calories, color="calories")) +geom_point(size=3)
heartplot <- heartplot + stat_smooth(method="lm", color="blue")
heartplot <- heartplot + coord_cartesian(xlim=c(68,82), ylim=c(1400, 2800))
heartplot
# Plotting steps versus calories in ggplot2
stepplot <- ggplot(data=predict, aes(x=steps, y = calories, color="calories")) +geom_point(size=3)
stepplot <- stepplot + stat_smooth(method="lm", color="blue")
stepplot <- stepplot + coord_cartesian(xlim=c(70,36600), ylim=c(1400, 2800))
stepplot
#Let's try a combined linear model
# one with everything thrown in!
fullModel <- (lm(predict$calories ~ predict$steps + predict$fairlyActive + predict$veryActive + predict$heartRate + predict$sleep + predict$temp + predict$onCampus))
summary(fullModel)
plot(fullModel)
# heteroskedasticity not an issue in residuals vs fitted
# Q-Q plot nice
# the full model shows the strongest input is the number of steps
# intuition tells me maybe sleep is inversely related to calories burnt - I'm buring less when I'm asleep than awake right?
# maybe a model with a transformed sleep is in order
inverseSleep <- c(1/predict$sleep)
summary(lm(predict$calories ~ inverseSleep))
# looks pretty insignificant on its own - what if thrown in with some other variables?
summary(lm(predict$calories ~ inverseSleep + predict$steps + predict$heartRate +predict$fairlyActive + predict$veryActive + predict$temp + predict$onCampus))
# inverse sleep has a bigger p-value than untransformed sleep!
# will keep sleep as is
#use of forward stepwise procedure on the full model
subs <- regsubsets(predict$calories ~ predict$steps + predict$fairlyActive + predict$veryActive + predict$heartRate + predict$sleep + predict$temp + predict$onCampus, method = "forward", data=myData , nbest=1)
plot(subs, scale="adjr2")
sumsubs <- summary(subs)
sumsubs
# steps is the most important variable to keep, then hfairlyActive, then temperature
red1 <- (lm(predict$calories ~ predict$steps + predict$heartRate + predict$sleep))
summary(red1)
plot(red1)
# this model shows only steps to be significant
# making things simpler and just using steps as a predictor
summary(lm(predict$calories ~ predict$steps))
plot(lm(predict$calories ~ predict$steps))
#confidence inerval for three variable model
confint(red1)
# now to use that training data to see if it works any!
calories <- predict$calories
steps <- predict$steps
heartRate <- predict$heartRate
sleep <- predict$sleep
summary(predict(lm(calories ~ steps + heartRate + sleep), newdata=testing))
predictions <- data.frame(predict(lm(calories ~ steps + heartRate + sleep), newdata=testing))
squareSum <- sum((testing$calories-predictions)^2)
meanSquareSum <- (squareSum/length(testing$calories))
anova(red1)
anova(fullModel)
# mean square errors of full model is larger than reduced model