Commit 6d71394

Merge pull request #102 from PyDataBlog/optimise-mini
Optimise mini
2 parents 5f7f48a + afae56a commit 6d71394

6 files changed: +98 -62 lines changed

.travis.yml

Lines changed: 2 additions & 1 deletion
@@ -7,6 +7,7 @@ julia:
   - 1.3
   - 1.4
   - 1.5
+  - 1.6
   - nightly
 after_success:
   - julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Coveralls.submit(process_folder())'
@@ -16,7 +17,7 @@ jobs:
   fast_finish: true
   include:
     - stage: Documentation
-      julia: 1.5
+      julia: 1.6
       script: julia --project=docs -e '
         using Pkg;
         Pkg.develop(PackageSpec(path=pwd()));

docs/src/index.md

Lines changed: 1 addition & 1 deletion
@@ -127,8 +127,8 @@ r.converged # whether the procedure converged
 - [Elkan()](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf) - Recommended for high dimensional data.
 - [Yinyang()](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf) - Recommended for large dimensions and/or large number of clusters.
 - [Coreset()](http://proceedings.mlr.press/v51/lucic16-supp.pdf) - Recommended for very fast clustering of very large datasets, when extreme accuracy is not important.
+- [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - Recommended for extremely large datasets.
 - [Geometric()](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf) - (Coming soon)
-- [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - (Coming soon)

 ### Practical Usage Examples

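The new `MiniBatch()` entry can be exercised like the other algorithms in the list above; a minimal sketch, adapted from the docstring added in src/mini_batch.jl below (the data shape and batch size are illustrative):

```julia
using ParallelKMeans

X = rand(30, 100_000)  # 100_000 random points in 30 dimensions

r = kmeans(MiniBatch(100), X, 3)  # 3 clusters, batches of 100 samples per iteration
r.totalcost   # cost over the whole dataset at the returned centroids
r.converged   # whether the early-stopping criterion was met
```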
src/kmeans.jl

Lines changed: 16 additions & 6 deletions
@@ -170,20 +170,30 @@ alternatively one can use `rand` to choose random points for init.

 A `KmeansResult` structure representing labels, centroids, and sum_squares is returned.
 """
-function kmeans(alg::AbstractKMeansAlg, design_matrix, k; weights = nothing,
+function kmeans(alg::AbstractKMeansAlg, design_matrix, k;
+                weights = nothing,
                 n_threads = Threads.nthreads(),
-                k_init = "k-means++", max_iters = 300,
-                tol = eltype(design_matrix)(1e-6), verbose = false,
-                init = nothing, rng = Random.GLOBAL_RNG, metric = Euclidean())
+                k_init = "k-means++",
+                max_iters = 300,
+                tol = eltype(design_matrix)(1e-6),
+                verbose = false,
+                init = nothing,
+                rng = Random.GLOBAL_RNG,
+                metric = Euclidean())

     nrow, ncol = size(design_matrix)

     # Create containers based on the dimensions and specifications
     containers = create_containers(alg, design_matrix, k, nrow, ncol, n_threads)

     return kmeans!(alg, containers, design_matrix, k, weights, metric;
-                   n_threads = n_threads, k_init = k_init, max_iters = max_iters,
-                   tol = tol, verbose = verbose, init = init, rng = rng)
+                   n_threads = n_threads,
+                   k_init = k_init,
+                   max_iters = max_iters,
+                   tol = tol,
+                   verbose = verbose,
+                   init = init,
+                   rng = rng)

 end

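For orientation, a sketch of a call that spells out the reformatted keyword list (the data and values are illustrative; defaults are those shown in the signature above):

```julia
using ParallelKMeans, Random

X = rand(5, 1_000)  # design matrix: 5 features × 1_000 observations (columns)

r = kmeans(Lloyd(), X, 4;
           n_threads = Threads.nthreads(),
           k_init = "k-means++",
           max_iters = 300,
           tol = 1e-6,
           verbose = false,
           rng = Random.GLOBAL_RNG)

r.converged  # whether the procedure converged
```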
src/lloyd.jl

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ Basic algorithm for k-means calculation.
 struct Lloyd <: AbstractKMeansAlg end

 """
-    Kmeans!(alg::AbstractKMeansAlg, containers, design_matrix, k; n_threads = nthreads(), k_init="k-means++", max_iters=300, tol=1e-6, verbose=true)
+    kmeans!(alg::AbstractKMeansAlg, containers, design_matrix, k; n_threads = nthreads(), k_init="k-means++", max_iters=300, tol=1e-6, verbose=true)

 Mutable version of `kmeans` function. Definition of arguments and results can be
 found in `kmeans`.

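To make the renamed `kmeans!` entry point concrete, here is a hedged sketch of how it is reached through the public `kmeans` wrapper shown in src/kmeans.jl above. The containers are internal, so treat this as illustrative rather than a supported API:

```julia
using ParallelKMeans, Distances

alg = Lloyd()
X = rand(3, 500)
nrow, ncol = size(X)

# Mirrors the body of `kmeans`: build the per-algorithm containers, then
# call the mutating kernel with weights and metric as positional arguments.
containers = ParallelKMeans.create_containers(alg, X, 2, nrow, ncol, Threads.nthreads())
r = ParallelKMeans.kmeans!(alg, containers, X, 2, nothing, Euclidean();
                           max_iters = 300, tol = 1e-6, verbose = false)
```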
src/mini_batch.jl

Lines changed: 72 additions & 43 deletions
@@ -1,7 +1,14 @@
 """
     MiniBatch(b::Int)
+`b` represents the size of the batch which should be sampled.

 Sculley et al. 2007 Mini batch k-means algorithm implementation.
+
+```julia
+X = rand(30, 100_000) # 100_000 random points in 30 dimensions
+
+kmeans(MiniBatch(100), X, 3) # 3 clusters, MiniBatch algorithm with 100 batch samples at each iteration
+```
 """
 struct MiniBatch <: AbstractKMeansAlg
     b::Int # batch size
@@ -10,8 +17,8 @@ end

 MiniBatch() = MiniBatch(100)

-function kmeans!(alg::MiniBatch, X, k;
-                 weights = nothing, metric = Euclidean(), n_threads = Threads.nthreads(),
+function kmeans!(alg::MiniBatch, containers, X, k,
+                 weights = nothing, metric = Euclidean(); n_threads = Threads.nthreads(),
                  k_init = "k-means++", init = nothing, max_iters = 300,
                  tol = eltype(X)(1e-6), max_no_improvement = 10, verbose = false, rng = Random.GLOBAL_RNG)

@@ -26,99 +33,100 @@ function kmeans!(alg::MiniBatch, X, k;
     N = zeros(T, k)

     # Initialize nearest centers for both batch and whole dataset labels
-    final_labels = Vector{Int}(undef, ncol) # dataset labels
-
     converged = false
     niters = 0
     counter = 0
     J_previous = zero(T)
     J = zero(T)
-
-    # TODO: Main Steps. Batch update centroids until convergence
+    totalcost = zero(T)
+    batch_rand_idx = containers.batch_rand_idx
+
+    # Main Steps. Batch update centroids until convergence
     while niters <= max_iters # Step 4 in paper

         # b examples picked randomly from X (Step 5 in paper)
-        batch_rand_idx = isnothing(weights) ? rand(rng, 1:ncol, alg.b) : wsample(rng, 1:ncol, weights, alg.b)
-        batch_sample = X[:, batch_rand_idx]
+        batch_rand_idx = isnothing(weights) ? rand!(rng, batch_rand_idx, 1:ncol) : wsample!(rng, 1:ncol, weights, batch_rand_idx)

         # Cache/label the batch samples nearest to the centers (Step 6 & 7)
-        @inbounds for i in axes(batch_sample, 2)
-            min_dist = distance(metric, batch_sample, centroids, i, 1)
+        @inbounds for i in batch_rand_idx
+            min_dist = distance(metric, X, centroids, i, 1)
             label = 1

             for j in 2:size(centroids, 2)
-                dist = distance(metric, batch_sample, centroids, i, j)
+                dist = distance(metric, X, centroids, i, j)
                 label = dist < min_dist ? j : label
                 min_dist = dist < min_dist ? dist : min_dist
             end

-            final_labels[batch_rand_idx[i]] = label
-        end
-
-        # TODO: Batch gradient step
-        @inbounds for j in axes(batch_sample, 2) # iterate over examples (Step 9)
+            containers.labels[i] = label

-            # Get cached center/label for this x => labels[batch_rand_idx[j]] (Step 10)
-            label = final_labels[batch_rand_idx[j]]
+            ##### Batch gradient step #####
+            # iterate over examples (each column) ==> (Step 9)
+            # Get cached center/label for each example label = labels[i] => (Step 10)
+
             # Update per-center counts
-            N[label] += isnothing(weights) ? 1 : weights[j] # verify (Step 11)
+            N[label] += isnothing(weights) ? 1 : weights[i] # (Step 11)

             # Get per-center learning rate (Step 12)
             lr = 1 / N[label]

-            # Take gradient step (Step 13) # TODO: Replace with an allocation-less loop.
-            centroids[:, label] .= (1 - lr) .* centroids[:, label] .+ (lr .* batch_sample[:, j])
+            # Take gradient step (Step 13) # TODO: Replace with faster loop?
+            @views centroids[:, label] .= (1 - lr) .* centroids[:, label] .+ (lr .* X[:, i])
         end

-        # TODO: Reassign all labels based on new centres generated from the latest sample
-        final_labels = reassign_labels(X, metric, final_labels, centroids)
+        # Reassign all labels based on new centres generated from the latest sample
+        containers.labels .= reassign_labels(X, metric, containers.labels, centroids)

-        # TODO: Calculate cost on whole dataset after reassignment and check for convergence
-        J = sum_of_squares(X, final_labels, centroids) # just a placeholder for now
+        # Calculate cost on whole dataset after reassignment and check for convergence
+        @parallelize 1 ncol sum_of_squares(containers, X, containers.labels, centroids, weights, metric)
+        J = sum(containers.sum_of_squares)

         if verbose
             # Show progress and terminate if J stopped decreasing.
             println("Iteration $niters: Jclust = $J")
         end

-        # TODO: Check for early stopping convergence
+        # Check for early stopping convergence
        if (niters > 1) & (abs(J - J_previous) < (tol * J))
            counter += 1

            # Declare convergence if max_no_improvement criterion is met
            if counter >= max_no_improvement
                converged = true
-               # TODO: Compute label assignment for the complete dataset
-               final_labels = reassign_labels(X, metric, final_labels, centroids)
+               # Compute label assignment for the complete dataset
+               containers.labels .= reassign_labels(X, metric, containers.labels, centroids)

-               # TODO: Compute totalcost for the complete dataset
-               J = sum_of_squares(X, final_labels, centroids) # just a placeholder for now
+               # Compute totalcost for the complete dataset
+               @parallelize 1 ncol sum_of_squares(containers, X, containers.labels, centroids, weights, metric)
+               totalcost = sum(containers.sum_of_squares)
                break
            end
        else
            counter = 0
+       end
+
+       # Warn users if model doesn't converge at max iterations
+       if (niters > max_iters) & (!converged)
+
+           if verbose
+               println("Clustering model failed to converge. Labelling data with latest centroids.")
+           end
+           containers.labels = reassign_labels(X, metric, containers.labels, centroids)

+           # Compute totalcost for unconverged model
+           @parallelize 1 ncol sum_of_squares(containers, X, containers.labels, centroids, weights, metric)
+           totalcost = sum(containers.sum_of_squares)
+           break
        end

        J_previous = J
        niters += 1
    end

-   return centroids, niters, converged, final_labels, J # TODO: push learned artifacts to KmeansResult
-   #return KmeansResult(centroids, containers.labels, T[], Int[], T[], totalcost, niters, converged)
+   # Push learned artifacts to KmeansResult
+   return KmeansResult(centroids, containers.labels, T[], Int[], T[], totalcost, niters, converged)
 end

-# TODO: Only being used to test generic implementation. Get rid off after!
-function sum_of_squares(x, labels, centre)
-    s = 0.0
-
-    for i in axes(x, 2)
-        for j in axes(x, 1)
-            s += (x[j, i] - centre[j, labels[i]])^2
-        end
-    end
-    return s
-end

 function reassign_labels(DMatrix, metric, labels, centres)
     @inbounds for i in axes(DMatrix, 2)
@@ -135,3 +143,24 @@ function reassign_labels(DMatrix, metric, labels, centres)
     end
     return labels
 end
+
+"""
+    create_containers(::MiniBatch, k, nrow, ncol, n_threads)
+
+Internal function for the creation of all necessary intermediate structures.
+
+- `centroids_new` - container which holds new positions of centroids
+- `centroids_cnt` - container which holds number of points for each centroid
+- `labels` - vector which holds labels of corresponding points
+- `sum_of_squares` - vector which holds the sum of squares values for each thread
+"""
+function create_containers(alg::MiniBatch, X, k, nrow, ncol, n_threads)
+    # Initiate placeholders to avoid allocations
+    T = eltype(X)
+    labels = Vector{Int}(undef, ncol) # labels vector
+    sum_of_squares = Vector{T}(undef, 1) # total_sum_calculation
+    batch_rand_idx = Vector{Int}(undef, alg.b)
+
+    return (batch_rand_idx = batch_rand_idx,
+            labels = labels, sum_of_squares = sum_of_squares)
+end

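The heart of the change above is Sculley's per-center gradient step: each center keeps a running count of the samples assigned to it, the learning rate decays as 1 / N[label], and the center is pulled toward each batch sample by that fraction. A self-contained sketch of just that update, independent of the package's containers (all names here are illustrative, not the package's internals):

```julia
# Minimal sketch of the mini-batch k-means gradient step (Sculley et al. 2007).
# `X` holds points as columns; `centroids` has one column per cluster.
function minibatch_step!(centroids, N, X, batch_idx)
    @inbounds for i in batch_idx
        # Nearest center under squared Euclidean distance (Steps 6 & 7).
        label, min_dist = 1, Inf
        for j in axes(centroids, 2)
            d = sum(abs2, @view(X[:, i]) .- @view(centroids[:, j]))
            if d < min_dist
                min_dist, label = d, j
            end
        end

        N[label] += 1      # per-center sample count (Step 11)
        lr = 1 / N[label]  # decaying per-center learning rate (Step 12)

        # Convex combination pulls the center toward the sample (Step 13).
        @views centroids[:, label] .= (1 - lr) .* centroids[:, label] .+ lr .* X[:, i]
    end
    return centroids
end

X = rand(2, 1_000)
centroids = X[:, rand(1:1_000, 3)]  # crude random init; the package uses k-means++
N = zeros(3)

for _ in 1:100
    minibatch_step!(centroids, N, X, rand(1:1_000, 50))
end
```

Because the count N[label] only grows, later samples move a center less and less, which is what lets the algorithm settle without a separate step-size schedule.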
test/test90_minibatch.jl

Lines changed: 6 additions & 10 deletions
@@ -16,9 +16,9 @@ end
     rng = StableRNG(2020)
     X = rand(rng, 3, 100)

-    baseline = [kmeans(Lloyd(), X, 2).totalcost for i in 1:1_000] |> mean |> round
-    # TODO: Switch to kmeans after full implementation
-    res = [ParallelKMeans.kmeans!(MiniBatch(50), X, 2)[end] for i in 1:1_000] |> mean |> round
+    baseline = [kmeans(Lloyd(), X, 2; max_iters=100_000).totalcost for i in 1:200] |> mean |> round
+
+    res = [kmeans(MiniBatch(10), X, 2; max_iters=100_000).totalcost for i in 1:200] |> mean |> round

     @test baseline == res
 end
@@ -28,13 +28,9 @@ end
     rng = StableRNG(2020)
     X = rand(rng, 3, 100)

-    baseline = [kmeans(Lloyd(), X, 2;
-                       tol=1e-6, metric=Cityblock(),
-                       max_iters=500).totalcost for i in 1:1000] |> mean |> floor
-    # TODO: Switch to kmeans after full implementation
-    res = [ParallelKMeans.kmeans!(MiniBatch(), X, 2;
-                                  metric=Cityblock(), tol=1e-6,
-                                  max_iters=500)[end] for i in 1:1000] |> mean |> floor
+    baseline = [kmeans(Lloyd(), X, 2; metric=Cityblock(), max_iters=100_000).totalcost for i in 1:200] |> mean |> round
+
+    res = [kmeans(MiniBatch(10), X, 2; metric=Cityblock(), max_iters=100_000).totalcost for i in 1:200] |> mean |> round

     @test baseline == res
 end
