uses SimilaritySearch 0.8
sadit committed Dec 7, 2021
1 parent 087cb32 commit 2cda28a
Showing 6 changed files with 70 additions and 73 deletions.
4 changes: 2 additions & 2 deletions Project.toml
@@ -1,7 +1,7 @@
name = "KCenters"
uuid = "5d8de97f-65f8-4dd6-a15b-0f89c36a43ce"
authors = ["Eric S. Tellez <[email protected]>"]
version = "0.4.10"
version = "0.5.0"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -19,6 +19,6 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
CategoricalArrays = "0.8, 0.9, 0.10"
Distances = "0.10"
MLDataUtils = "0.5"
SimilaritySearch = "0.6, 0.7"
SimilaritySearch = "0.8"
StatsBase = "0.32, 0.33"
julia = "1.5"
13 changes: 6 additions & 7 deletions src/centerselection.jl
@@ -67,12 +67,11 @@ end
MedoidSelection(; dist=SqL2Distance(), ratio=0.5) = MedoidSelection(dist, convert(Float32, ratio))
KnnCentroidSelection(; sel1=CentroidSelection(), sel2=CentroidSelection(), dist=SqL2Distance(), k=0) = KnnCentroidSelection(sel1, sel2, dist, convert(Int32, k))

center(::CentroidSelection, lst::AbstractVector{ObjectType}) where {ObjectType<:AbstractVector{N}} where {N<:Real} =
mean(lst)
center(::CentroidSelection, lst::AbstractDatabase) = mean(convert(Vector, lst))
center(::CentroidSelection, lst) = mean(lst)
center(::RandomCenterSelection, lst) = rand(lst)

center(::RandomCenterSelection, lst::AbstractVector) = rand(lst)

function center(sel::MedoidSelection, lst::AbstractVector)
function center(sel::MedoidSelection, lst)
if sel.ratio < 1.0
ss = randsubseq(1:length(lst), sel.ratio)
if length(ss) > 0
@@ -93,9 +92,9 @@ function center(sel::MedoidSelection, lst::AbstractVector)
lst[argmin(L)]
end

function center(sel::KnnCentroidSelection, lst::AbstractVector)
function center(sel::KnnCentroidSelection, lst)
c = center(sel.sel1, lst)
seq = ExhaustiveSearch(sel.dist, lst)
seq = ExhaustiveSearch(sel.dist, convert(AbstractVector, lst))
k = sel.k == 0 ? ceil(Int32, log2(length(lst))) : sel.k
k = max(1, k)
center(sel.sel2, lst[[id for (id, dist) in search(seq, c, k)]])
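
The selection methods drop their AbstractVector restrictions, so any indexable collection (plain vectors or the new database types) can be passed. A hypothetical usage sketch, assuming the selection types and `center` remain part of the public API as before; the data is made up.

using KCenters

lst = [rand(Float32, 4) for _ in 1:100]        # a made-up group of points

center(CentroidSelection(), lst)               # component-wise mean of the group
center(RandomCenterSelection(), lst)           # a random member of the group
center(MedoidSelection(ratio=0.5), lst)        # the member minimizing the summed distance over a sample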
69 changes: 39 additions & 30 deletions src/clustering.jl
@@ -6,7 +6,7 @@ using CategoricalArrays, StatsBase, MLDataUtils
export enet, dnet, kcenters, associate_centroids, ClusteringData

"""
struct ClusteringData{DataType<:AbstractVector}
struct ClusteringData{DataType}
# n elements in the dataset, m centers
centers::DataType # centers, m entries
freqs::Vector{Int32} # number of elements associated to each center, m entries
@@ -18,7 +18,7 @@ export enet, dnet, kcenters, associate_centroids, ClusteringData
The datastructure output of our clustering procedures
"""
struct ClusteringData{DataType<:AbstractVector}
struct ClusteringData{DataType<:AbstractDatabase}
# n elements in the dataset, m centers
centers::DataType # centers, m entries
freqs::Vector{Int32} # number of elements associated to each center, m entries
Expand All @@ -29,14 +29,15 @@ struct ClusteringData{DataType<:AbstractVector}
end

"""
kcenters(dist::PreMetric, X::AbstractVector{T}, y::CategoricalArray, sel::AbstractCenterSelection=CentroidSelection()) where T
kcenters(dist::PreMetric, X, y::CategoricalArray, sel::AbstractCenterSelection=CentroidSelection())
Computes a center per region (each region is defined by the set of items having the same label in `y`).
The output is compatible with `kcenters` function when `eltype(y)` is Int
"""
function kcenters(dist::PreMetric, X::AbstractVector{T}, y::CategoricalArray, sel::AbstractCenterSelection=CentroidSelection()) where T
function kcenters(dist::PreMetric, X, y::CategoricalArray, sel::AbstractCenterSelection=CentroidSelection())
X = convert(AbstractDatabase, X)
m = length(levels(y))
centers = Vector{T}(undef, m)
centers = Vector(undef, m)
freqs = zeros(Int32, m)
invindex = labelmap(y.refs)

@@ -46,9 +47,11 @@ function kcenters(dist::PreMetric, X::AbstractVector{T}, y::CategoricalArray, se
freqs[i] = length(elements)
end

distances = Float32[evaluate(dist, X[i], centers[y.refs[i]]) for i in eachindex(X)]
codes = Int32.(y.refs)
ClusteringData(centers, freqs, compute_dmax(m, codes, distances), codes, distances, Float32[sum(distances)])
let centers = VectorDatabase(centers)
distances = Float32[evaluate(dist, X[i], centers[y.refs[i]]) for i in eachindex(X)]
codes = Int32.(y.refs)
ClusteringData(centers, freqs, compute_dmax(m, codes, distances), codes, distances, Float32[sum(distances)])
end
end

function compute_dmax(m, codes, distances)
@@ -62,8 +65,8 @@ function compute_dmax(m, codes, distances)
end

"""
kcenters(dist::PreMetric, X::AbstractVector{T}, k::Integer; sel::AbstractCenterSelection=CentroidSelection(), initial=:fft, maxiters=0, tol=0.001, recall=1.0) where T
kcenters(dist::PreMetric, X::AbstractVector{T}, C::AbstractzVector{T}; sel::AbstractCenterSelection=CentroidSelection(), maxiters=30, tol=0.001, recall=1.0) where T
kcenters(dist::PreMetric, X, k::Integer; sel::AbstractCenterSelection=CentroidSelection(), initial=:fft, maxiters=0, tol=0.001, recall=1.0)
kcenters(dist::PreMetric, X, C; sel::AbstractCenterSelection=CentroidSelection(), maxiters=30, tol=0.001, recall=1.0)
Performs a kcenters clustering of `X` using `dist` as distance function and `sel` to compute center objects.
It is based on the Lloyd's algorithm yet using different algorithms as initial clusters:
@@ -77,32 +80,34 @@ It is based on the Lloyd's algorithm yet using different algorithms as initial c
If recall is 1.0 then an exhaustive search is made to find associations of each item to its nearest cluster; if ``0 < recall < 1`` then an approximate index
(`SearchGraph` from `SimilaritySearch.jl`) will be used for the same purpose; the `recall` controls the expected search quality (trade with search time).
"""
function kcenters(dist::PreMetric, X::AbstractVector{T}, k::Integer; sel::AbstractCenterSelection=CentroidSelection(), initial=:fft, maxiters=10, tol=0.001, recall=1.0, verbose=false) where T
if initial == :fft
function kcenters(dist::PreMetric, X, k::Integer; sel::AbstractCenterSelection=CentroidSelection(), initial=:fft, maxiters=10, tol=0.001, recall=1.0, verbose=false)
X = convert(AbstractDatabase, X)

if initial === :fft
m = 0
irefs = enet(dist, X, k+m).irefs
if m > 0
irefs = irefs[1+m:end]
end
initial = X[irefs]
elseif initial == :dnet
elseif initial === :dnet
irefs = dnet(dist, X, k).irefs
resize!(irefs, k)
initial = X[irefs]
elseif initial == :sfft
elseif initial === :sfft
n = length(X)
m = min(n, ceil(Int, sqrt(n)) + k)
X_ = X[unique(rand(1:n, m))]
C = enet(dist, X_, k, verbose=verbose)
initial = X_[C.irefs]
elseif initial == :sdnet
elseif initial === :sdnet
n = length(X)
m = min(n, ceil(Int, sqrt(n)) + k)
X_ = X[unique(rand(1:n, m))]
irefs = dnet(dist, X_, k, verbose=verbose).irefs
resize!(irefs, k)
initial = X_[irefs]
elseif initial == :fftdensity
elseif initial === :fftdensity
n = length(X)
m = min(n, ceil(Int, log(n)) + 2 * k)

@@ -121,18 +126,16 @@ function kcenters(dist::PreMetric, X::AbstractVector{T}, k::Integer; sel::Abstra
XX = X[irefs]
C = enet(dist, XX, k)
initial = XX[C.irefs]
elseif initial == :rand
elseif initial === :rand
initial = rand(X, k)
elseif initial isa Symbol
error("Unknown kind of initial value $initial")
else
initial = initial::AbstractVector{T}
end

kcenters(dist, X, initial, sel=sel, maxiters=maxiters, tol=tol, recall=recall, verbose=verbose)
kcenters_(dist, X, initial, sel=sel, maxiters=maxiters, tol=tol, recall=recall, verbose=verbose)
end

function kcenters(dist::PreMetric, X::AbstractVector{T}, C::AbstractVector{T}; sel::AbstractCenterSelection=CentroidSelection(), maxiters=-1, tol=0.001, recall=1.0, verbose=true) where T
function kcenters_(dist::PreMetric, X::AbstractDatabase, C; sel::AbstractCenterSelection=CentroidSelection(), maxiters=-1, tol=0.001, recall=1.0, verbose=true)
# Lloyd's algorithm
n = length(X)
numcenters = length(C)
@@ -142,11 +145,12 @@ function kcenters(dist::PreMetric, X::AbstractVector{T}, C::AbstractVector{T}; s
end

function create_index(CC)
CC = convert(AbstractDatabase, CC)
if recall >= 1.0
ExhaustiveSearch(dist, CC)
else
idx = SearchGraph(; dist)
append!(idx, CC)
idx = SearchGraph(; db=CC, dist)
index!(idx)
end
end

@@ -155,40 +159,45 @@ function kcenters(dist::PreMetric, X::AbstractVector{T}, C::AbstractVector{T}; s
distances = zeros(Float32, n)
err = Float32[typemax(Float32), associate_centroids_and_compute_error!(X, create_index(C), codes, distances, freqs)]
iter = 0

CC = C
clusters = [Int[] for i in 1:numcenters]
while iter < maxiters && err[end-1] - err[end] >= tol
iter += 1
verbose && println(stderr, "*** starting iteration: $iter; err: $err ***")
clusters = [Int[] for i in 1:numcenters]
for c in clusters
empty!(c)
end

for (objID, plist) in enumerate(codes)
for refID in plist
push!(clusters[refID], objID)
end
end

verbose && println(stderr, "*** computing centroids ***")
resize!(CC, length(clusters))
Threads.@threads for i in 1:length(clusters)
plist = clusters[i]
# C[i] can be empty because we could be using approximate search
# CC[i] can be empty because we could be using approximate search
if length(plist) > 0
C[i] = center(sel, X[plist])
c = center(sel, X[plist])
CC[i] = c
end
end

verbose && println(stderr, "*** computing $(numcenters) nearest references ***")
s = associate_centroids_and_compute_error!(X, create_index(C), codes, distances, freqs)
s = associate_centroids_and_compute_error!(X, create_index(CC), codes, distances, freqs)
push!(err, s)
isnan(err[end]) && error("ERROR invalid score $err")
verbose && println(stderr, "*** new score with $(numcenters) references: $err ***")
end

verbose && println(stderr, "*** finished computation of $(numcenters) references, err: $err ***")
ClusteringData(C, freqs, compute_dmax(numcenters, codes, distances), codes, distances, err)
ClusteringData(VectorDatabase(CC), freqs, compute_dmax(numcenters, codes, distances), codes, distances, err)
end

function associate_centroids_and_compute_error!(X, index::AbstractSearchContext, codes, distances, counters)
Threads.@threads for objID in 1:length(X)
#for objID in 1:length(X)
res = KnnResult(1)
search(index, X[objID], res)
codes[objID] = argmin(res)
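
After these changes `kcenters` converts its input to an AbstractDatabase, resolves the `initial` strategy, and hands the Lloyd refinement to the internal `kcenters_`. A minimal, hypothetical run; the data, `k`, and keyword values are made up.

using KCenters, SimilaritySearch, CategoricalArrays

X = MatrixDatabase(rand(Float32, 8, 1_000))    # 1000 points in 8 dimensions
dist = SqL2Distance()

C = kcenters(dist, X, 16; initial=:fft, maxiters=10, recall=1.0)
C.centers    # the 16 selected centers (a VectorDatabase)
C.freqs      # number of points associated with each center

y = categorical(rand(1:3, 1_000))              # labeled variant: one center per class
Cy = kcenters(dist, X, y)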
42 changes: 17 additions & 25 deletions src/dnet.jl
Expand Up @@ -5,65 +5,57 @@ using SimilaritySearch
using Random
export dnet

struct MaskedDistance{DataType<:AbstractVector,DistType<:PreMetric} <: PreMetric
dist::DistType
db::DataType
end

SimilaritySearch.evaluate(m::MaskedDistance, i::Integer, j::Integer) = @inbounds evaluate(m.dist, m.db[i], m.db[j])

"""
dnet(callback::Function, dist::PreMetric, X::AbstractVector{T}, k::Integer) where {T}
dnet(callback::Function, dist::PreMetric, X::AbstractDatabase, k::Integer)
A `k`-net is a set of points `M` such that each object in `X` can be:
- It is in `M`
- It is in the knn set of an object in `M` (defined with the distance function `dist`)
The size of `M` is determined by \$\\lceil |X| / k \\rceil\$
The dnet function uses the `callback` function as an output mechanism. This function is called on each center as `callback(centerId, res)` where
res is a `KnnResult` object (from SimilaritySearch.jl).
The dnet function uses the `callback` function as an output mechanism. This function is called on each center as `callback(centerId, res, dbmap)` where
res is a `KnnResult` object (from SimilaritySearch.jl) and dbmap is the current mapping from sub-database positions to indexes in `X`
"""
function dnet(callback::Function, dist::PreMetric, X::AbstractVector{T}, k::Integer) where {T}
function dnet(callback::Function, dist::PreMetric, X::AbstractDatabase, k::Integer)
N = length(X)
metadist = (a::Int, b::Int) -> evaluate(dist, X[a], X[b])

I = ExhaustiveSearch(MaskedDistance(dist, X), shuffle!(collect(1:N)))
S = SubDatabase(X, shuffle!(collect(1:N)))
I = ExhaustiveSearch(dist, S)
res = KnnResult(k)

while length(I.db) > 0
while length(I) > 0
empty!(res)
n = length(I.db)
search(I, n, res)
callback(I.db[n], res, I.db)
n = length(I)
search(I, I[n], res)
callback(S.map[n], res, S.map)
m = n - length(res)
rlist = sort!([id_ for (id_, dist_) in res])
numzeros = 0
while length(rlist) > 0
if rlist[end] > m
I.db[rlist[end]] = 0
S.map[rlist[end]] = 0
pop!(rlist)
numzeros += 1
else
break
end
end

E = @view I.db[m+1:end]
E = @view S.map[m+1:end]
sort!(E)
E = @view I.db[m+1+numzeros:end]
E = @view S.map[m+1+numzeros:end]
if length(E) > 0
I.db[rlist] .= E
S.map[rlist] .= E
end

resize!(I.db, m)
resize!(S.map, m)
end
end


"""
dnet(dist::PreMetric, X::AbstractVector{T}, numcenters::Integer) where T
dnet(dist::PreMetric, X::AbstractDatabase, numcenters::Integer)
Selects `numcenters` far from each other based on density nets.
@@ -78,7 +70,7 @@ Returns a named tuple ``(nn, irefs, dmax)``.
- `dmax` a list of coverage-radius of each center (aligned with irefs centers) smallest distance among centers
"""
function dnet(dist::PreMetric, X::AbstractVector{T}, numcenters::Integer; verbose=false) where T
function dnet(dist::PreMetric, X::AbstractDatabase, numcenters::Integer; verbose=false)
# criterion = change_criterion(0.01)
n = length(X)
irefs = Int32[]
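
With the MaskedDistance wrapper removed, `dnet` searches over a shuffled SubDatabase and reports object ids through its `map`. A hypothetical call of the callback form, with made-up data and `k`; each callback receives the center's id in `X`, its KnnResult, and the current id mapping.

using KCenters, SimilaritySearch

X = MatrixDatabase(rand(Float32, 4, 300))
k = 30                                         # each center covers about k points

dnet(SqL2Distance(), X, k) do centerID, res, dbmap
    println("center ", centerID, " covers ", length(res), " points")
end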
13 changes: 5 additions & 8 deletions src/enet.jl
@@ -9,7 +9,7 @@ function _ignore3(a, b, c)
end

"""
fftraversal(callback::Function, dist::PreMetric, X::AbstractVector{T}, stop, callbackdist=_ignore3) where {T}
fftraversal(callback::Function, dist::PreMetric, X::AbstractDatabase, stop, callbackdist=_ignore3)
Selects a number of farthest points in `X`, using a farthest first traversal
@@ -22,17 +22,14 @@ Selects a number of farthest points in `X`, using a farthest first traversal
- The callbackdist function is called on each distance evaluation between pivots and items in the dataset
`callbackdist(index-pivot, index-item, distance)`
"""
function fftraversal(callback::Function, dist::PreMetric, X::AbstractVector{T}, stop, callbackdist=_ignore3) where {T}
function fftraversal(callback::Function, dist::PreMetric, X::AbstractDatabase, stop, callbackdist=_ignore3)
N = length(X)
D = Vector{Float64}(undef, N)
dmaxlist = Float64[]
dset = [typemax(Float64) for i in 1:N]
imax::Int = rand(1:N)
dmax::Float64 = typemax(Float64)
if N == 0
return
end

N == 0 && return
k::Int = 0

@inbounds while k <= N
Expand Down Expand Up @@ -78,7 +75,7 @@ end


"""
enet(dist::PreMetric, X::AbstractVector{T}, numcenters::Int, knr::Int=1; verbose=false) where T
enet(dist::PreMetric, X::AbstractDatabase, numcenters::Int, knr::Int=1; verbose=false) where T
Selects `numcenters` far from each other based on Farthest First Traversal.
@@ -94,7 +91,7 @@ Returns a named tuple \$(nn, irefs, dmax)\$.
- `dmax` smallest distance among centers
"""
function enet(dist::PreMetric, X::AbstractVector{T}, numcenters::Integer, knr::Integer=1; verbose=false) where T
function enet(dist::PreMetric, X::AbstractDatabase, numcenters::Integer, knr::Integer=1; verbose=false) where T
# refs = Vector{Float64}[]
irefs = Int32[]
nn = [KnnResult(knr) for i in 1:length(X)]
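
`fftraversal` and `enet` likewise take an AbstractDatabase now. A hypothetical sketch of building a farthest-first net and reading the named tuple documented above; the sizes are made up.

using KCenters, SimilaritySearch

X = MatrixDatabase(rand(Float32, 4, 500))
E = enet(SqL2Distance(), X, 32)

E.irefs    # indexes (into X) of the 32 selected centers
E.dmax     # smallest distance among the selected centers
E.nn       # per-point KnnResult holding its nearest center(s)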
2 changes: 1 addition & 1 deletion test/kcenters.jl
Expand Up @@ -4,7 +4,7 @@
using Test
using Random, SimilaritySearch, KCenters, StatsBase

const X = [rand(4) for i in 1:1000]
const X = MatrixDatabase(rand(Float32, 4, 10000))

@testset "Clustering with enet" begin
for i in 2:5
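
The test data moves from a vector of vectors to a MatrixDatabase, which treats each column of the wrapped matrix as one item. A tiny sketch of that wrapper; the values are made up.

using SimilaritySearch

X = MatrixDatabase(rand(Float32, 4, 10))
length(X)    # 10 items
X[1]         # the first 4-dimensional column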
