Skip to content

Commit

Permalink
Merge pull request JuliaLang#14413 from JuliaLang/sb/quantile
Browse files Browse the repository at this point in the history
improve quantile: reduce allocations, use partial sort
  • Loading branch information
jakebolewski committed Dec 16, 2015
2 parents afcbab1 + f6ad90e commit 2778a52
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 47 deletions.
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,10 @@ Library improvements
(to alter Windows command-line generation) and `windows_hide` (to
suppress creation of new console windows) ([#13780]).

* Statistics:

* Improve performance of `quantile` ([#14413]).

Deprecated or removed
---------------------

Expand Down
7 changes: 0 additions & 7 deletions base/docs/helpdb/Base.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2450,13 +2450,6 @@ Like `mapreduce(f, op, v0, itr)`. In general, this cannot be used with empty col
"""
mapreduce(f, op, itr)

"""
quantile!(v, p)
Like `quantile`, but overwrites the input vector.
"""
quantile!

"""
accept(server[,client])
Expand Down
4 changes: 4 additions & 0 deletions base/float.jl
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ convert(::Type{AbstractFloat}, x::UInt128) = convert(Float64, x) # LOSSY

float(x) = convert(AbstractFloat, x)

# for constructing arrays
float{T<:Number}(::Type{T}) = typeof(float(zero(T)))
float{T}(::Type{T}) = Any

for Ti in (Int8, Int16, Int32, Int64)
@eval begin
unsafe_trunc(::Type{$Ti}, x::Float32) = box($Ti,fptosi($Ti,unbox(Float32,x)))
Expand Down
118 changes: 86 additions & 32 deletions base/statistics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -515,51 +515,105 @@ median{T}(v::AbstractArray{T}, region) = mapslices(median!, v, region)

# for now, use the R/S definition of quantile; may want variants later
# see ?quantile in R -- this is type 7
# TODO: need faster implementation (use select!?)
#
function quantile!(v::AbstractVector, q::AbstractVector)
isempty(v) && throw(ArgumentError("empty data array"))
isempty(q) && throw(ArgumentError("empty quantile array"))
"""
quantile!([q, ] v, p; sorted=false)
Compute the quantile(s) of a vector `v` at the probabilities `p`, with optional output into
array `q` (if not provided, a new output array is created). The keyword argument `sorted`
indicates whether `v` can be assumed to be sorted; if `false` (the default), then the
elements of `v` may be partially sorted.
The elements of `p` should be on the interval [0,1], and `v` should not have any `NaN`
values.
Quantiles are computed via linear interpolation between the points `((k-1)/(n-1), v[k])`,
for `k = 1:n` where `n = length(v)`. This corresponds to Definition 7 of Hyndman and Fan
(1996), and is the same as the R default.
# make sure the quantiles are in [0,1]
q = bound_quantiles(q)
* Hyndman, R.J and Fan, Y. (1996) "Sample Quantiles in Statistical Packages",
*The American Statistician*, Vol. 50, No. 4, pp. 361-365
"""
function quantile!(q::AbstractArray, v::AbstractVector, p::AbstractArray;
sorted::Bool=false)
size(p) == size(q) || throw(DimensionMismatch())

isempty(v) && throw(ArgumentError("empty data vector"))

lv = length(v)
lq = length(q)
if !sorted
minp, maxp = extrema(p)
lo = floor(Int,1+minp*(lv-1))
hi = ceil(Int,1+maxp*(lv-1))

index = 1 .+ (lv-1)*q
lo = floor(Int,index)
hi = ceil(Int,index)
sort!(v)
# only need to perform partial sort
sort!(v, 1, lv, PartialQuickSort(lo:hi), Base.Sort.Forward)
end
isnan(v[end]) && throw(ArgumentError("quantiles are undefined in presence of NaNs"))
i = find(index .> lo)
r = float(v[lo])
h = (index.-lo)[i]
r[i] = (1.-h).*r[i] + h.*v[hi[i]]
return r

for i = 1:length(p)
@inbounds q[i] = _quantile(v,p[i])
end
return q
end

"""
quantile(v, ps)
quantile!(v::AbstractVector, p::AbstractArray; sorted::Bool=false) =
quantile!(similar(p,float(eltype(v))), v, p; sorted=sorted)

Compute the quantiles of a vector `v` at a specified set of probability values `ps`. Note: Julia does not ignore `NaN` values in the computation.
"""
quantile(v::AbstractVector, q::AbstractVector) = quantile!(copy(v),q)
"""
quantile(v, p)
function quantile!(v::AbstractVector, p::Real;
sorted::Bool=false)
isempty(v) && throw(ArgumentError("empty data vector"))

Compute the quantile of a vector `v` at the probability `p`. Note: Julia does not ignore `NaN` values in the computation.
"""
quantile(v::AbstractVector, q::Number) = quantile(v,[q])[1]
lv = length(v)
if !sorted
lo = floor(Int,1+p*(lv-1))
hi = ceil(Int,1+p*(lv-1))

function bound_quantiles(qs::AbstractVector)
epsilon = 100*eps()
if (any(qs .< -epsilon) || any(qs .> 1+epsilon))
throw(ArgumentError("quantiles out of [0,1] range"))
# only need to perform partial sort
sort!(v, 1, lv, PartialQuickSort(lo:hi), Base.Sort.Forward)
end
[min(1,max(0,q)) for q = qs]
isnan(v[end]) && throw(ArgumentError("quantiles are undefined in presence of NaNs"))

return _quantile(v,p)
end

# Core quantile lookup function: assumes `v` sorted
@inline function _quantile(v::AbstractVector, p::Real)
T = float(eltype(v))
isnan(p) && return T(NaN)

lv = length(v)
index = 1 + (lv-1)*p
1 <= index <= lv || error("input probability out of [0,1] range")

indlo = floor(index)
i = trunc(Int,indlo)

if index == indlo
return T(v[i])
else
h = T(index - indlo)
return (1-h)*T(v[i]) + h*T(v[i+1])
end
end


"""
quantile(v, p; sorted=false)
Compute the quantile(s) of a vector `v` at a specified probability or vector `p`. The
keyword argument `sorted` indicates whether `v` can be assumed to be sorted.
The `p` should be on the interval [0,1], and `v` should not have any `NaN` values.
Quantiles are computed via linear interpolation between the points `((k-1)/(n-1), v[k])`,
for `k = 1:n` where `n = length(v)`. This corresponds to Definition 7 of Hyndman and Fan
(1996), and is the same as the R default.
* Hyndman, R.J and Fan, Y. (1996) "Sample Quantiles in Statistical Packages",
*The American Statistician*, Vol. 50, No. 4, pp. 361-365
"""
quantile(v::AbstractVector, p; sorted::Bool=false) =
quantile!(sorted ? v : copy!(similar(v),v), p; sorted=sorted)


##### histogram #####
Expand Down
20 changes: 13 additions & 7 deletions doc/stdlib/math.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1753,23 +1753,29 @@ Statistics
Compute the midpoints of the bins with edges ``e``\ . The result is a vector/range of length ``length(e) - 1``\ . Note: Julia does not ignore ``NaN`` values in the computation.

.. function:: quantile(v, ps)
.. function:: quantile(v, p; sorted=false)

.. Docstring generated from Julia source
Compute the quantiles of a vector ``v`` at a specified set of probability values ``ps``\ . Note: Julia does not ignore ``NaN`` values in the computation.
Compute the quantile(s) of a vector ``v`` at a specified probability or vector ``p``\ . The keyword argument ``sorted`` indicates whether ``v`` can be assumed to be sorted.

.. function:: quantile(v, p)
The ``p`` should be on the interval [0,1], and ``v`` should not have any ``NaN`` values.

.. Docstring generated from Julia source
Quantiles are computed via linear interpolation between the points ``((k-1)/(n-1), v[k])``\ , for ``k = 1:n`` where ``n = length(v)``\ . This corresponds to Definition 7 of Hyndman and Fan (1996), and is the same as the R default.

Compute the quantile of a vector ``v`` at the probability ``p``\ . Note: Julia does not ignore ``NaN`` values in the computation.
* Hyndman, R.J and Fan, Y. (1996) "Sample Quantiles in Statistical Packages", *The American Statistician*, Vol. 50, No. 4, pp. 361-365

.. function:: quantile!(v, p)
.. function:: quantile!([q, ] v, p; sorted=false)

.. Docstring generated from Julia source
Like ``quantile``\ , but overwrites the input vector.
Compute the quantile(s) of a vector ``v`` at the probabilities ``p``\ , with optional output into array ``q`` (if not provided, a new output array is created). The keyword argument ``sorted`` indicates whether ``v`` can be assumed to be sorted; if ``false`` (the default), then the elements of ``v`` may be partially sorted.

The elements of ``p`` should be on the interval [0,1], and ``v`` should not have any ``NaN`` values.

Quantiles are computed via linear interpolation between the points ``((k-1)/(n-1), v[k])``\ , for ``k = 1:n`` where ``n = length(v)``\ . This corresponds to Definition 7 of Hyndman and Fan (1996), and is the same as the R default.

* Hyndman, R.J and Fan, Y. (1996) "Sample Quantiles in Statistical Packages", *The American Statistician*, Vol. 50, No. 4, pp. 361-365

.. function:: cov(x[, corrected=true])

Expand Down
6 changes: 5 additions & 1 deletion test/statistics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -319,8 +319,12 @@ end
@test midpoints(Float64[1.0:1.0:10.0;]) == Float64[1.5:1.0:9.5;]

@test quantile([1,2,3,4],0.5) == 2.5
@test quantile([1,2,3,4],[0.5]) == [2.5]
@test quantile([1., 3],[.25,.5,.75])[2] == median([1., 3])
@test quantile([0.:100.;],[.1,.2,.3,.4,.5,.6,.7,.8,.9])[1] == 10.0
@test quantile(100.0:-1.0:0.0, 0.0:0.1:1.0) == collect(0.0:10.0:100.0)
@test quantile(0.0:100.0, 0.0:0.1:1.0, sorted=true) == collect(0.0:10.0:100.0)
@test quantile(100f0:-1f0:0.0, 0.0:0.1:1.0) == collect(0f0:10f0:100f0)


# test invalid hist nbins argument (#9999)
@test_throws ArgumentError hist(Int[], -1)
Expand Down

0 comments on commit 2778a52

Please sign in to comment.