Skip to content

Commit

Permalink
Fix Roger's additions.
Browse files Browse the repository at this point in the history
  • Loading branch information
Aleksandar Prokopec committed Oct 17, 2012
1 parent c39189e commit a9fc883
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 68 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
target
project/target
tmp
tmp1
tmp2
149 changes: 84 additions & 65 deletions src/main/scala/org/collperf/statistics.scala
Original file line number Diff line number Diff line change
Expand Up @@ -11,118 +11,137 @@ import org.apache.commons.math3.distribution.FDistribution

object Statistics {

/**
* Let Y = (Y_1, ..., Y_n) data resulting from a parametric law F of
* scalar parameter θ. A confidence interval (B_i, B_s) is a statistic
* in the form of an interval containing θ with a specified probability.
/** Let Y = (Y_1, ..., Y_n) be data drawn from a parametric distribution F with
* scalar parameter θ. A confidence interval (B_i, B_s) is a statistic
* in the form of an interval that contains θ with a specified probability.
*/
def CI(seq: Seq[Long], alpha: Double): (Double, Double) = {
def confidenceInterval(seq: Seq[Long], alpha: Double): (Double, Double) = {
val n = seq.length
val xbar = mean(seq)
val S = sampleStandardDeviation(seq)
val S = stdev(seq)
/* Student's distribution could be used all the time because it converges
* towards the normal distribution as n grows.
*/
if (n < 30) {
(xbar - qt(1 - alpha / 2, n - 1) * S / sqrt(n),
xbar + qt(1 - alpha / 2, n - 1) * S / sqrt(n))
(xbar - qt(1 - alpha / 2, n - 1) * S / sqrt(n), xbar + qt(1 - alpha / 2, n - 1) * S / sqrt(n))
} else {
(xbar - qsnorm(1 - alpha / 2) * S / sqrt(n),
xbar + qsnorm(1 - alpha / 2) * S / sqrt(n))
(xbar - qsnorm(1 - alpha / 2) * S / sqrt(n), xbar + qsnorm(1 - alpha / 2) * S / sqrt(n))
}
}

// For two alternatives
def CITest(alt1: Seq[Long], alt2: Seq[Long]): Boolean = {
/*val diffM = mean(alt1) - mean(alt2)
val S1 = sampleStandardDeviation(alt1)
val S2 = sampleStandardDeviation(alt2)
/** Compares two alternative sets of measurements at significance level `alpha`
* (i.e. using a `1 - alpha` confidence interval for the difference of their means).
*/
def confidenceIntervalTest(alt1: Seq[Long], alt2: Seq[Long], alpha: Double): Boolean = {
val m1 = mean(alt1)
val m2 = mean(alt2)
val s1 = stdev(alt1)
val s2 = stdev(alt2)
val n1 = alt1.length
val n2 = alt2.length
confidenceIntervalTest(m1, m2, s1, s2, n1, n2, alpha)
}

/** Compares two alternative sets of measurements given a significance level `alpha`, and
* the mean, standard deviation and number of measurements for each set.
*/
def confidenceIntervalTest(m1: Double, m2: Double, S1: Double, S2: Double, n1: Int, n2: Int, alpha: Double): Boolean = {
val diffM = m1 - m2
val diffS = sqrt(S1 * S1 / n1 + S2 * S2 / n2)
var CI = (0.0, 0.0)
if (n1 >= 30 && n2 >= 30) {
CI = (diffM - qsnorm(1 - alpha / 2) * diffS,
diffM + qsnorm(1 - alpha / 2) * diffS)
} else {
val CI = if (n1 < 30 || n2 < 30) {
val ndf = math.round(pow(pow(S1, 2) / n1 + pow(S2, 2) / n2, 2) / (pow(pow(S1, 2) / n1, 2) / (n1 - 1) + pow(pow(S2, 2) / n2, 2) / (n2 - 1)))
CI = (diffM - qt(1 - alpha / 2, ndf) * diffS,
diffM + qt(1 - alpha / 2, ndf) * diffS)
(diffM - qt(1 - alpha / 2, ndf) * diffS, diffM + qt(1 - alpha / 2, ndf) * diffS)
} else {
(diffM - qsnorm(1 - alpha / 2) * diffS, diffM + qsnorm(1 - alpha / 2) * diffS)
}
/* If 0 is within the confidence interval, we conclude that there is no
statistical difference between the two alternatives */
(!(CI._1 <= 0 && 0 <= CI._2))*/
false
CI._1 <= 0 && 0 <= CI._2
}

/**
* ANOVA separates the total variation in a set of measurements into a component due to random fluctuations
* in the measurements and a component due to the actualdifferences among the alternatives. [...]
* If the variation between the alternatives is larger than the variation within each alternative, then
* it can be concluded that there is a statistically significant difference between the alternatives.
* Ref : Statistically Rigorous Java Performance Evaluation, Andy Georges, Dries Buytaert, Lieven Eeckhout
/** ANOVA separates the total variation in a set of measurements into a component due to random fluctuations
* in the measurements and a component due to the actual differences among the alternatives.
*
* If the variation between the alternatives is larger than the variation within each alternative, then
* it can be concluded that there is a statistically significant difference between the alternatives.
*
* For more information see: Statistically Rigorous Java Performance Evaluation, Andy Georges, Dries Buytaert, Lieven Eeckhout
*/
def ANOVAFTest(history: Seq[Seq[Long]], newest: Seq[Long]): Boolean = {
/*val alternatives = newest +: history
val means = for(a <- alternatives) yield mean(a)
val overallMean = means.reduceLeft(_ + _) / means.length
// TODO : we should verify here that each alternative has the same number of measurements !
val n = alternatives.head.length
val SSA = n * (means.reduceLeft((sum: Long, t: Long) => sum + ((t - overallMean) * (t - overallMean))))
def ANOVAFTest(history: Seq[Seq[Long]], alpha: Double): Boolean = {
val alternatives = history
val means: Seq[Double] = for(a <- alternatives) yield mean(a)
val overallMean: Double = means.reduceLeft(_ + _) / means.length

val SSA = (means zip history.map(_.length)).foldLeft(0.0) { (sum: Double, p: (Double, Int)) =>
val yi = p._1
val ni = p._2
sum + ni * (yi - overallMean) * (yi - overallMean)
}

/* Computation of SSE */
val k = alternatives.length
val doubleSumTerms = for(j <- 0 until k ; i <- 0 until n) yield pow(alternatives(j)(i) - means(j), 2);
val doubleSumTerms = for ((alternative, mean) <- alternatives zip means; yij <- alternative) yield (yij - mean) * (yij - mean)
val SSE = doubleSumTerms reduceLeft (_ + _)

val F = SSA * (k * (n - 1)) / (SSE * (k - 1))
val K = alternatives.length
val N = alternatives.foldLeft(0)(_ + _.size)
val F = SSA / SSE * (N - K) / (K - 1)

(F > qf(1 - alpha, k - 1, k * (n - 1)))*/
false
F <= qf(1 - alpha, K - 1, N - K)
}

def CoV(measurements: Seq[Long]): Boolean = {
// val cov = sampleStandardDeviation(measurements) / mean(measurements)
false // might be useful later
/** Compares the coefficient of variation to some `threshold` value.
*
* This heuristic can be used to detect if the measurement has stabilized.
*/
def CoV(measurements: Seq[Long], threshold: Double): Boolean = {
val cov = stdev(measurements) / mean(measurements)
cov <= threshold
}

def mean(seq : Seq[Long]): Long = {
/*if (seq.length == 0) 0 else seq reduceLeft(_ + _) / seq.length*/
0
}
/** Computes the mean of the sequence of measurements. */
def mean(seq : Seq[Long]): Double = seq.sum * 1.0 / seq.length

/**
* The sample standard sample deviation. It is the square root of S², unbiased estimator for the variance.
/** The sample standard deviation. It is the square root of S², the unbiased estimator of the variance.
*/
def sampleStandardDeviation(seq: Seq[Long]): Double = {
def stdev(seq: Seq[Long]): Double = {
val xbar = mean(seq)
sqrt(seq.reduceLeft((sum: Long, xi: Long) => sum + ((xi - xbar) * (xi - xbar))) / (seq.length - 1))
val squaresum: Double = seq.foldLeft(0.0)((sum, xi) => sum + (xi - xbar) * (xi - xbar))
sqrt(squaresum / (seq.length - 1))
}

/**
* Quantile function for the Student's t distribution.
* Let 0 < p < 1. The p-th quantile of the cumulative distribution function F(x) is defined as
* x_p = inf{x : F(x) >= p}
* For most of the continuous random variables, x_p is unique and is equal to x_p = F^(-1)(p), where
* F^(-1) is the inverse function of F. Thus, x_p is the value for which Pr(X <= x_p) = p. In particular,
* the 0.5-th quantile is called the median of F.
/** Quantile function for the Student's t distribution.
* Let 0 < p < 1. The p-th quantile of the cumulative distribution function F(x) is defined as
* x_p = inf{x : F(x) >= p}
* For most continuous random variables, x_p is unique and equal to F^(-1)(p), where
* F^(-1) is the inverse function of F. Thus, x_p is the value for which Pr(X <= x_p) = p. In particular,
* the 0.5-th quantile is called the median of F.
*/
private def qt(p: Double, df: Double): Double = {
  // Invert the CDF of a Student's t distribution with `df` degrees of freedom at probability `p`.
  val studentT = new TDistribution(df)
  studentT.inverseCumulativeProbability(p)
}

/**
* Quantile function for the standard (μ = 0, σ = 1) normal distribution
/** Quantile function for the standard (μ = 0, σ = 1) normal distribution.
*/
private def qsnorm(p: Double): Double = {
  // The no-argument constructor yields the standard normal distribution (mean 0, standard deviation 1).
  val standardNormal = new NormalDistribution()
  standardNormal.inverseCumulativeProbability(p)
}

/**
* Quantile function for the F distribution
/** Quantile function for the F distribution.
*/
private def qf(p: Double, df1: Double, df2: Double): Double = {
  // `df1` and `df2` are the numerator and denominator degrees of freedom of the F distribution;
  // the result is the value x for which Pr(X <= x) = p.
  new FDistribution(df1, df2).inverseCumulativeProbability(p)
}

}
}













6 changes: 3 additions & 3 deletions src/test/scala/org/collperf/SeqTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class NewJvmMinNoGcReinstSeqTest extends SeqTesting with PerformanceTest with Pe

abstract class SeqTesting extends PerformanceTest {

val largesizes = Gen.range("size")(500000, 5000000, 500000)
val largesizes = Gen.range("size")(500000, 5000000, 250000)

val lists = for {
size <- largesizes
Expand Down Expand Up @@ -113,7 +113,7 @@ abstract class SeqTesting extends PerformanceTest {
_.reduce(_ + _)
}
}
/*

measure method "filter" in {
using(arrays) curve("Array") apply {
_.filter(_ % 2 == 0)
Expand Down Expand Up @@ -156,7 +156,7 @@ abstract class SeqTesting extends PerformanceTest {
using(mutablelists) curve("LinkedList") apply {
_.groupBy(_ % 10)
}
}*/
}

}

Expand Down

0 comments on commit a9fc883

Please sign in to comment.