Commit 6d71394

Merge pull request #102 from PyDataBlog/optimise-mini
Optimise mini
2 parents 5f7f48a + afae56a commit 6d71394

6 files changed: +98 -62 lines changed

.travis.yml

Lines changed: 2 additions & 1 deletion
@@ -7,6 +7,7 @@ julia:
   - 1.3
   - 1.4
   - 1.5
+  - 1.6
   - nightly
 after_success:
   - julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Coveralls.submit(process_folder())'
@@ -16,7 +17,7 @@ jobs:
   fast_finish: true
   include:
     - stage: Documentation
-      julia: 1.5
+      julia: 1.6
       script: julia --project=docs -e '
         using Pkg;
         Pkg.develop(PackageSpec(path=pwd()));

docs/src/index.md

Lines changed: 1 addition & 1 deletion
@@ -127,8 +127,8 @@ r.converged # whether the procedure converged
 - [Elkan()](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf) - Recommended for high dimensional data.
 - [Yinyang()](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf) - Recommended for large dimensions and/or large number of clusters.
 - [Coreset()](http://proceedings.mlr.press/v51/lucic16-supp.pdf) - Recommended for very fast clustering of very large datasets, when extreme accuracy is not important.
+- [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - Recommended for extremely large datasets.
 - [Geometric()](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf) - (Coming soon)
-- [MiniBatch()](https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf) - (Coming soon)

 ### Practical Usage Examples

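The new `MiniBatch()` entry can be exercised like the other algorithms in the list above; a minimal sketch, adapted from the docstring added in src/mini_batch.jl below (the data shape and batch size are illustrative):

```julia
using ParallelKMeans

X = rand(30, 100_000)  # 100_000 random points in 30 dimensions

r = kmeans(MiniBatch(100), X, 3)  # 3 clusters, batches of 100 samples per iteration
r.totalcost   # cost over the whole dataset at the returned centroids
r.converged   # whether the early-stopping criterion was met
```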
src/kmeans.jl

Lines changed: 16 additions & 6 deletions
@@ -170,20 +170,30 @@ alternatively one can use `rand` to choose random points for init.

 A `KmeansResult` structure representing labels, centroids, and sum_squares is returned.
 """
-function kmeans(alg::AbstractKMeansAlg, design_matrix, k; weights = nothing,
+function kmeans(alg::AbstractKMeansAlg, design_matrix, k;
+                weights = nothing,
                 n_threads = Threads.nthreads(),
-                k_init = "k-means++", max_iters = 300,
-                tol = eltype(design_matrix)(1e-6), verbose = false,
-                init = nothing, rng = Random.GLOBAL_RNG, metric = Euclidean())
+                k_init = "k-means++",
+                max_iters = 300,
+                tol = eltype(design_matrix)(1e-6),
+                verbose = false,
+                init = nothing,
+                rng = Random.GLOBAL_RNG,
+                metric = Euclidean())

     nrow, ncol = size(design_matrix)

     # Create containers based on the dimensions and specifications
     containers = create_containers(alg, design_matrix, k, nrow, ncol, n_threads)

     return kmeans!(alg, containers, design_matrix, k, weights, metric;
-                   n_threads = n_threads, k_init = k_init, max_iters = max_iters,
-                   tol = tol, verbose = verbose, init = init, rng = rng)
+                   n_threads = n_threads,
+                   k_init = k_init,
+                   max_iters = max_iters,
+                   tol = tol,
+                   verbose = verbose,
+                   init = init,
+                   rng = rng)

 end

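For orientation, a sketch of a call that spells out the reformatted keyword list (the data and values are illustrative; defaults are those shown in the signature above):

```julia
using ParallelKMeans, Random

X = rand(5, 1_000)  # design matrix: 5 features × 1_000 observations (columns)

r = kmeans(Lloyd(), X, 4;
           n_threads = Threads.nthreads(),
           k_init = "k-means++",
           max_iters = 300,
           tol = 1e-6,
           verbose = false,
           rng = Random.GLOBAL_RNG)

r.converged  # whether the procedure converged
```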
src/lloyd.jl

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ Basic algorithm for k-means calculation.
 struct Lloyd <: AbstractKMeansAlg end

 """
-    Kmeans!(alg::AbstractKMeansAlg, containers, design_matrix, k; n_threads = nthreads(), k_init="k-means++", max_iters=300, tol=1e-6, verbose=true)
+    kmeans!(alg::AbstractKMeansAlg, containers, design_matrix, k; n_threads = nthreads(), k_init="k-means++", max_iters=300, tol=1e-6, verbose=true)

 Mutable version of `kmeans` function. Definition of arguments and results can be
 found in `kmeans`.

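To make the renamed `kmeans!` entry point concrete, here is a hedged sketch of how it is reached through the public `kmeans` wrapper shown in src/kmeans.jl above. The containers are internal, so treat this as illustrative rather than a supported API:

```julia
using ParallelKMeans, Distances

alg = Lloyd()
X = rand(3, 500)
nrow, ncol = size(X)

# Mirrors the body of `kmeans`: build the per-algorithm containers, then
# call the mutating kernel with weights and metric as positional arguments.
containers = ParallelKMeans.create_containers(alg, X, 2, nrow, ncol, Threads.nthreads())
r = ParallelKMeans.kmeans!(alg, containers, X, 2, nothing, Euclidean();
                           max_iters = 300, tol = 1e-6, verbose = false)
```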
src/mini_batch.jl

Lines changed: 72 additions & 43 deletions
@@ -1,7 +1,14 @@
 """
     MiniBatch(b::Int)
+`b` represents the size of the batch which should be sampled.

 Sculley et al. 2007 Mini batch k-means algorithm implementation.
+
+```julia
+X = rand(30, 100_000) # 100_000 random points in 30 dimensions
+
+kmeans(MiniBatch(100), X, 3) # 3 clusters, MiniBatch algorithm with 100 batch samples at each iteration
+```
 """
 struct MiniBatch <: AbstractKMeansAlg
     b::Int # batch size
@@ -10,8 +17,8 @@ end

 MiniBatch() = MiniBatch(100)

-function kmeans!(alg::MiniBatch, X, k;
-                 weights = nothing, metric = Euclidean(), n_threads = Threads.nthreads(),
+function kmeans!(alg::MiniBatch, containers, X, k,
+                 weights = nothing, metric = Euclidean(); n_threads = Threads.nthreads(),
                  k_init = "k-means++", init = nothing, max_iters = 300,
                  tol = eltype(X)(1e-6), max_no_improvement = 10, verbose = false, rng = Random.GLOBAL_RNG)

@@ -26,99 +33,100 @@ function kmeans!(alg::MiniBatch, X, k;
     N = zeros(T, k)

     # Initialize nearest centers for both batch and whole dataset labels
-    final_labels = Vector{Int}(undef, ncol) # dataset labels
-
     converged = false
     niters = 0
     counter = 0
     J_previous = zero(T)
     J = zero(T)
-
-    # TODO: Main Steps. Batch update centroids until convergence
+    totalcost = zero(T)
+    batch_rand_idx = containers.batch_rand_idx
+
+    # Main Steps. Batch update centroids until convergence
     while niters <= max_iters # Step 4 in paper

         # b examples picked randomly from X (Step 5 in paper)
-        batch_rand_idx = isnothing(weights) ? rand(rng, 1:ncol, alg.b) : wsample(rng, 1:ncol, weights, alg.b)
-        batch_sample = X[:, batch_rand_idx]
+        batch_rand_idx = isnothing(weights) ? rand!(rng, batch_rand_idx, 1:ncol) : wsample!(rng, 1:ncol, weights, batch_rand_idx)

         # Cache/label the batch samples nearest to the centers (Step 6 & 7)
-        @inbounds for i in axes(batch_sample, 2)
-            min_dist = distance(metric, batch_sample, centroids, i, 1)
+        @inbounds for i in batch_rand_idx
+            min_dist = distance(metric, X, centroids, i, 1)
             label = 1

             for j in 2:size(centroids, 2)
-                dist = distance(metric, batch_sample, centroids, i, j)
+                dist = distance(metric, X, centroids, i, j)
                 label = dist < min_dist ? j : label
                 min_dist = dist < min_dist ? dist : min_dist
             end

-            final_labels[batch_rand_idx[i]] = label
-        end
-
-        # TODO: Batch gradient step
-        @inbounds for j in axes(batch_sample, 2) # iterate over examples (Step 9)
+            containers.labels[i] = label

-            # Get cached center/label for this x => labels[batch_rand_idx[j]] (Step 10)
-            label = final_labels[batch_rand_idx[j]]
+            ##### Batch gradient step #####
+            # iterate over examples (each column) ==> (Step 9)
+            # Get cached center/label for each example label = labels[i] => (Step 10)
+
             # Update per-center counts
-            N[label] += isnothing(weights) ? 1 : weights[j] # verify (Step 11)
+            N[label] += isnothing(weights) ? 1 : weights[i] # (Step 11)

             # Get per-center learning rate (Step 12)
             lr = 1 / N[label]

-            # Take gradient step (Step 13) # TODO: Replace with an allocation-less loop.
-            centroids[:, label] .= (1 - lr) .* centroids[:, label] .+ (lr .* batch_sample[:, j])
+            # Take gradient step (Step 13) # TODO: Replace with faster loop?
+            @views centroids[:, label] .= (1 - lr) .* centroids[:, label] .+ (lr .* X[:, i])
         end

-        # TODO: Reassign all labels based on new centres generated from the latest sample
-        final_labels = reassign_labels(X, metric, final_labels, centroids)
+        # Reassign all labels based on new centres generated from the latest sample
+        containers.labels .= reassign_labels(X, metric, containers.labels, centroids)

-        # TODO: Calculate cost on whole dataset after reassignment and check for convergence
-        J = sum_of_squares(X, final_labels, centroids) # just a placeholder for now
+        # Calculate cost on whole dataset after reassignment and check for convergence
+        @parallelize 1 ncol sum_of_squares(containers, X, containers.labels, centroids, weights, metric)
+        J = sum(containers.sum_of_squares)

         if verbose
             # Show progress and terminate if J stopped decreasing.
             println("Iteration $niters: Jclust = $J")
         end

-        # TODO: Check for early stopping convergence
+        # Check for early stopping convergence
        if (niters > 1) & (abs(J - J_previous) < (tol * J))
            counter += 1

            # Declare convergence if max_no_improvement criterion is met
            if counter >= max_no_improvement
                converged = true
-               # TODO: Compute label assignment for the complete dataset
-               final_labels = reassign_labels(X, metric, final_labels, centroids)
+               # Compute label assignment for the complete dataset
+               containers.labels .= reassign_labels(X, metric, containers.labels, centroids)

-               # TODO: Compute totalcost for the complete dataset
-               J = sum_of_squares(X, final_labels, centroids) # just a placeholder for now
+               # Compute totalcost for the complete dataset
+               @parallelize 1 ncol sum_of_squares(containers, X, containers.labels, centroids, weights, metric)
+               totalcost = sum(containers.sum_of_squares)
                break
            end
        else
            counter = 0
+       end
+
+       # Warn users if model doesn't converge at max iterations
+       if (niters > max_iters) & (!converged)
+
+           if verbose
+               println("Clustering model failed to converge. Labelling data with latest centroids.")
+           end
+           containers.labels = reassign_labels(X, metric, containers.labels, centroids)

+           # Compute totalcost for unconverged model
+           @parallelize 1 ncol sum_of_squares(containers, X, containers.labels, centroids, weights, metric)
+           totalcost = sum(containers.sum_of_squares)
+           break
        end

        J_previous = J
        niters += 1
    end

-   return centroids, niters, converged, final_labels, J # TODO: push learned artifacts to KmeansResult
-   #return KmeansResult(centroids, containers.labels, T[], Int[], T[], totalcost, niters, converged)
+   # Push learned artifacts to KmeansResult
+   return KmeansResult(centroids, containers.labels, T[], Int[], T[], totalcost, niters, converged)
 end

-# TODO: Only being used to test generic implementation. Get rid off after!
-function sum_of_squares(x, labels, centre)
-    s = 0.0
-
-    for i in axes(x, 2)
-        for j in axes(x, 1)
-            s += (x[j, i] - centre[j, labels[i]])^2
-        end
-    end
-    return s
-end

 function reassign_labels(DMatrix, metric, labels, centres)
     @inbounds for i in axes(DMatrix, 2)
@@ -135,3 +143,24 @@ function reassign_labels(DMatrix, metric, labels, centres)
     end
     return labels
 end
+
+"""
+    create_containers(::MiniBatch, k, nrow, ncol, n_threads)
+
+Internal function for the creation of all necessary intermediate structures.
+
+- `centroids_new` - container which holds new positions of centroids
+- `centroids_cnt` - container which holds number of points for each centroid
+- `labels` - vector which holds labels of corresponding points
+- `sum_of_squares` - vector which holds the sum of squares values for each thread
+"""
+function create_containers(alg::MiniBatch, X, k, nrow, ncol, n_threads)
+    # Initiate placeholders to avoid allocations
+    T = eltype(X)
+    labels = Vector{Int}(undef, ncol) # labels vector
+    sum_of_squares = Vector{T}(undef, 1) # total_sum_calculation
+    batch_rand_idx = Vector{Int}(undef, alg.b)
+
+    return (batch_rand_idx = batch_rand_idx,
+            labels = labels, sum_of_squares = sum_of_squares)
+end

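The heart of the change above is Sculley's per-center gradient step: each center keeps a running count of the samples assigned to it, the learning rate decays as 1 / N[label], and the center is pulled toward each batch sample by that fraction. A self-contained sketch of just that update, independent of the package's containers (all names here are illustrative, not the package's internals):

```julia
# Minimal sketch of the mini-batch k-means gradient step (Sculley et al. 2007).
# `X` holds points as columns; `centroids` has one column per cluster.
function minibatch_step!(centroids, N, X, batch_idx)
    @inbounds for i in batch_idx
        # Nearest center under squared Euclidean distance (Steps 6 & 7).
        label, min_dist = 1, Inf
        for j in axes(centroids, 2)
            d = sum(abs2, @view(X[:, i]) .- @view(centroids[:, j]))
            if d < min_dist
                min_dist, label = d, j
            end
        end

        N[label] += 1      # per-center sample count (Step 11)
        lr = 1 / N[label]  # decaying per-center learning rate (Step 12)

        # Convex combination pulls the center toward the sample (Step 13).
        @views centroids[:, label] .= (1 - lr) .* centroids[:, label] .+ lr .* X[:, i]
    end
    return centroids
end

X = rand(2, 1_000)
centroids = X[:, rand(1:1_000, 3)]  # crude random init; the package uses k-means++
N = zeros(3)

for _ in 1:100
    minibatch_step!(centroids, N, X, rand(1:1_000, 50))
end
```

Because the count N[label] only grows, later samples move a center less and less, which is what lets the algorithm settle without a separate step-size schedule.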
test/test90_minibatch.jl

Lines changed: 6 additions & 10 deletions
@@ -16,9 +16,9 @@ end
     rng = StableRNG(2020)
     X = rand(rng, 3, 100)

-    baseline = [kmeans(Lloyd(), X, 2).totalcost for i in 1:1_000] |> mean |> round
-    # TODO: Switch to kmeans after full implementation
-    res = [ParallelKMeans.kmeans!(MiniBatch(50), X, 2)[end] for i in 1:1_000] |> mean |> round
+    baseline = [kmeans(Lloyd(), X, 2; max_iters=100_000).totalcost for i in 1:200] |> mean |> round
+
+    res = [kmeans(MiniBatch(10), X, 2; max_iters=100_000).totalcost for i in 1:200] |> mean |> round

     @test baseline == res
 end
@@ -28,13 +28,9 @@ end
     rng = StableRNG(2020)
     X = rand(rng, 3, 100)

-    baseline = [kmeans(Lloyd(), X, 2;
-                       tol=1e-6, metric=Cityblock(),
-                       max_iters=500).totalcost for i in 1:1000] |> mean |> floor
-    # TODO: Switch to kmeans after full implementation
-    res = [ParallelKMeans.kmeans!(MiniBatch(), X, 2;
-                                  metric=Cityblock(), tol=1e-6,
-                                  max_iters=500)[end] for i in 1:1000] |> mean |> floor
+    baseline = [kmeans(Lloyd(), X, 2; metric=Cityblock(), max_iters=100_000).totalcost for i in 1:200] |> mean |> round
+
+    res = [kmeans(MiniBatch(10), X, 2; metric=Cityblock(), max_iters=100_000).totalcost for i in 1:200] |> mean |> round

     @test baseline == res
 end
