PyDataBlog
diff --git a/‎Project.toml‎
Lines changed: 6 additions & 4 deletions b/‎Project.toml‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎benchmark/bench01_distance.jl‎
Lines changed: 2 additions & 2 deletions b/‎benchmark/bench01_distance.jl‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs/src/index.md‎
Lines changed: 4 additions & 3 deletions b/‎docs/src/index.md‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎src/ParallelKMeans.jl‎
Lines changed: 2 additions & 3 deletions b/‎src/ParallelKMeans.jl‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎src/coreset.jl‎
Lines changed: 6 additions & 5 deletions b/‎src/coreset.jl‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎src/elkan.jl‎
Lines changed: 3 additions & 2 deletions b/‎src/elkan.jl‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎src/hamerly.jl‎
Lines changed: 3 additions & 2 deletions b/‎src/hamerly.jl‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎src/kmeans.jl‎
Lines changed: 17 additions & 4 deletions b/‎src/kmeans.jl‎
Lines changed: 17 additions & 4 deletions
diff --git a/‎src/lloyd.jl‎
Lines changed: 8 additions & 6 deletions b/‎src/lloyd.jl‎
Lines changed: 8 additions & 6 deletions
diff --git a/‎src/mlj_interface.jl‎
Lines changed: 47 additions & 17 deletions b/‎src/mlj_interface.jl‎
Lines changed: 47 additions & 17 deletions
@@ -1,24 +1,26 @@
 name = "ParallelKMeans"
 uuid = "42b8e9d4-006b-409a-8472-7f34b3fb58af"
 authors = ["Bernard Brenyah", "Andrey Oskin"]
-version = "0.1.6"
+version = "0.1.7"
 
 [deps]
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
-StatsBase = "0.32, 0.33"
-julia = "1.3"
 Distances = "0.8.2"
 MLJModelInterface = "0.2.1"
+StatsBase = "0.32, 0.33"
+julia = "1.3"
 
 [extras]
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 
 [targets]
-test = ["Test", "Random", "Suppressor", "MLJBase"]
+test = ["Test", "Random", "Suppressor", "MLJBase", "StableRNGs"]
@@ -10,12 +10,12 @@ Random.seed!(2020)
 X = rand(3, 100_000)
 centroids = rand(3, 2)
 d = Vector{Float64}(undef, 100_000)
-suite["100kx3"] = @benchmarkable ParallelKMeans.colwise!($d, $X, $centroids)
+suite["100kx3"] = @benchmarkable ParallelKMeans.chunk_colwise($d, $X, $centroids, 1, nothing, 1:100_000, 1)
 
 X = rand(10, 100_000)
 centroids = rand(10, 2)
 d = Vector{Float64}(undef, 100_000)
-suite["100kx10"] = @benchmarkable ParallelKMeans.colwise!($d, $X, $centroids)
+suite["100kx10"] = @benchmarkable ParallelKMeans.chunk_colwise($d, $X, $centroids, 1, nothing, 1:100_000, 1)
 
 end # module
 
 
@@ -74,12 +74,12 @@ git checkout experimental
 - [X] Full Implementation of Triangle inequality based on [Elkan - 2003 "Using the Triangle Inequality to Accelerate K-Means"](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf).
 - [X] Implementation of [Yinyang K-Means: A Drop-In Replacement of the Classic K-Means with Consistent Speedup](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf).
 - [X] Implementation of [Coresets](http://proceedings.mlr.press/v51/lucic16-supp.pdf).
-- [ ] Implementation of [Geometric methods to accelerate k-means algorithm](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf).
 - [X] Support for weighted K-means.
+- [X] Support of MLJ Random generation hyperparameter.
 - [ ] Support for other distance metrics supported by [Distances.jl](https://github.com/JuliaStats/Distances.jl#supported-distances).
-- [ ] Support of MLJ Random generation hyperparameter.
+- [ ] Implementation of [Geometric methods to accelerate k-means algorithm](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf).
 - [ ] Native support for tabular data inputs outside of MLJModels' interface.
-- [ ] Refactoring and finalizaiton of API desgin.
+- [ ] Refactoring and finalization of API design.
 - [ ] GPU support.
 - [ ] Distributed calculations support.
 - [ ] Optimization of code base.
@@ -207,6 +207,7 @@ ________________________________________________________________________________
 - 0.1.4 Bug fixes.
 - 0.1.5 Added `Yinyang` algorithm.
 - 0.1.6 Added support for weighted k-means; Added `Coreset` algorithm; improved support for different types of the design matrix.
+- 0.1.7 Added `Yinyang` and `Coreset` support in MLJ interface; added `weights` support in MLJ; added RNG seed support in MLJ interface and through all algorithms.
 
 ## Contributing
 
 
@@ -1,20 +1,19 @@
 module ParallelKMeans
 
 using StatsBase
+using Random
 import MLJModelInterface
 import Base.Threads: @spawn
 import Distances
 
-const MMI = MLJModelInterface
-
 include("kmeans.jl")
 include("seeding.jl")
 include("lloyd.jl")
 include("hamerly.jl")
 include("elkan.jl")
 include("yinyang.jl")
-include("mlj_interface.jl")
 include("coreset.jl")
+include("mlj_interface.jl")
 
 export kmeans
 export Lloyd, Hamerly, Elkan, Yinyang, 阴阳, Coreset
 
@@ -38,9 +38,10 @@ Coreset(alg::AbstractKMeansAlg) = Coreset(100, alg)
 function kmeans!(alg::Coreset, containers, X, k, weights;
                 n_threads = Threads.nthreads(),
                 k_init = "k-means++", max_iters = 300,
-                tol = eltype(design_matrix)(1e-6), verbose = false, init = nothing)
+                tol = eltype(design_matrix)(1e-6), verbose = false,
+                init = nothing, rng = Random.GLOBAL_RNG)
     nrow, ncol = size(X)
-    centroids = isnothing(init) ? smart_init(X, k, n_threads, init=k_init).centroids : deepcopy(init)
+    centroids = isnothing(init) ? smart_init(X, k, n_threads, weights, rng, init=k_init).centroids : deepcopy(init)
 
     T = eltype(X)
     # Steps 2-4 of the paper's algorithm 3
@@ -54,14 +55,14 @@ function kmeans!(alg::Coreset, containers, X, k, weights;
     @parallelize n_threads ncol chunk_update_sensitivity(alg, containers)
 
     # sample from containers.s
-    coreset_ids = wsample(1:ncol, containers.s, alg.m)
+    coreset_ids = wsample(rng, 1:ncol, containers.s, alg.m)
     coreset = X[:, coreset_ids]
     # create new weights as 1/s[i]
     coreset_weights = one(T) ./ @view containers.s[coreset_ids]
 
     # run usual kmeans for new set with new weights.
-    res = kmeans(alg.alg, coreset, k, coreset_weights, tol = tol, max_iters = max_iters,
-        verbose = verbose, init = centroids, n_threads = n_threads)
+    res = kmeans(alg.alg, coreset, k, weights = coreset_weights, tol = tol, max_iters = max_iters,
+        verbose = verbose, init = centroids, n_threads = n_threads, rng = rng)
 
     @parallelize n_threads ncol chunk_apply(alg, containers, res.centers, X, weights)
 
 
@@ -21,9 +21,10 @@ struct Elkan <: AbstractKMeansAlg end
 function kmeans!(alg::Elkan, containers, X, k, weights;
                 n_threads = Threads.nthreads(),
                 k_init = "k-means++", max_iters = 300,
-                tol = eltype(X)(1e-6), verbose = false, init = nothing)
+                tol = eltype(X)(1e-6), verbose = false,
+                init = nothing, rng = Random.GLOBAL_RNG)
     nrow, ncol = size(X)
-    centroids = init == nothing ? smart_init(X, k, n_threads, weights, init=k_init).centroids : deepcopy(init)
+    centroids = init == nothing ? smart_init(X, k, n_threads, weights, rng, init=k_init).centroids : deepcopy(init)
 
     update_containers(alg, containers, centroids, n_threads)
     @parallelize n_threads ncol chunk_initialize(alg, containers, centroids, X, weights)
 
@@ -21,9 +21,10 @@ struct Hamerly <: AbstractKMeansAlg end
 function kmeans!(alg::Hamerly, containers, X, k, weights;
                 n_threads = Threads.nthreads(),
                 k_init = "k-means++", max_iters = 300,
-                tol = eltype(X)(1e-6), verbose = false, init = nothing)
+                tol = eltype(X)(1e-6), verbose = false,
+                init = nothing, rng = Random.GLOBAL_RNG)
     nrow, ncol = size(X)
-    centroids = init == nothing ? smart_init(X, k, n_threads, weights, init=k_init).centroids : deepcopy(init)
+    centroids = init == nothing ? smart_init(X, k, n_threads, weights, rng, init=k_init).centroids : deepcopy(init)
 
     @parallelize n_threads ncol chunk_initialize(alg, containers, centroids, X, weights)
 
 
@@ -40,6 +40,16 @@ struct KmeansResult{C<:AbstractMatrix{<:AbstractFloat},D<:Real,WC<:Real} <: Clus
     converged::Bool            # whether the procedure converged
 end
 
+"""
+    splitter(n, k)
+
+Internal utility function: split the sequence `1:n` into `k` contiguous chunks of
+approximately equal size.
+
+Returns a vector of `k` `UnitRange`s that together cover `1:n` without overlap.
+Chunk boundaries are the ceilings of `k + 1` evenly spaced points between `0` and
+`n`, so when `k > n` some of the chunks are empty.
+"""
+function splitter(n, k)
+    # Ceil straight to `Int` instead of building a float vector and converting it.
+    bounds = ceil.(Int, range(0, n, length = k + 1))
+    return [(bounds[i] + 1):bounds[i + 1] for i in 1:k]
+end
+
 """
     @parallelize(n_threads, ncol, f)
 
@@ -120,7 +130,8 @@ function sum_of_squares(containers, x, labels, centre, weights, r, idx)
 end
 
 """
-    Kmeans([alg::AbstractKMeansAlg,] design_matrix, k; n_threads = nthreads(), k_init="k-means++", max_iters=300, tol=1e-6, verbose=true)
+    kmeans([alg::AbstractKMeansAlg,] design_matrix, k; n_threads = nthreads(),
+    k_init="k-means++", max_iters=300, tol=1e-6, verbose=true, rng = Random.GLOBAL_RNG)
 
 This main function employs the K-means algorithm to cluster all examples
 in the training data (design_matrix) into k groups using either the
@@ -146,16 +157,18 @@ alternatively one can use `rand` to choose random points for init.
 
 A `KmeansResult` structure representing labels, centroids, and sum_squares is returned.
 """
-function kmeans(alg::AbstractKMeansAlg, design_matrix, k, weights = nothing;
+function kmeans(alg::AbstractKMeansAlg, design_matrix, k;
+                weights = nothing,
                 n_threads = Threads.nthreads(),
                 k_init = "k-means++", max_iters = 300,
-                tol = eltype(design_matrix)(1e-6), verbose = false, init = nothing)
+                tol = eltype(design_matrix)(1e-6), verbose = false,
+                init = nothing, rng = Random.GLOBAL_RNG)
     nrow, ncol = size(design_matrix)
+    # Build the containers for `alg` from the data dimensions and the thread count.
     containers = create_containers(alg, design_matrix, k, nrow, ncol, n_threads)
 
+    # Hand off to the algorithm-specific `kmeans!`; `weights` is passed positionally,
+    # every other option (including `rng`) is forwarded as a keyword.
     return kmeans!(alg, containers, design_matrix, k, weights, n_threads = n_threads,
                     k_init = k_init, max_iters = max_iters, tol = tol,
-                    verbose = verbose, init = init)
+                    verbose = verbose, init = init, rng = rng)
 end
 
 
 
@@ -17,9 +17,10 @@ centroids and so on, which are used during calculations.
 function kmeans!(alg::Lloyd, containers, X, k, weights;
                 n_threads = Threads.nthreads(),
                 k_init = "k-means++", max_iters = 300,
-                tol = eltype(design_matrix)(1e-6), verbose = false, init = nothing)
+                tol = eltype(design_matrix)(1e-6), verbose = false,
+                init = nothing, rng = Random.GLOBAL_RNG)
     nrow, ncol = size(X)
-    centroids = isnothing(init) ? smart_init(X, k, n_threads, weights, init=k_init).centroids : deepcopy(init)
+    centroids = isnothing(init) ? smart_init(X, k, n_threads, weights, rng, init=k_init).centroids : deepcopy(init)
 
     T = eltype(X)
     converged = false
@@ -61,12 +62,13 @@ function kmeans!(alg::Lloyd, containers, X, k, weights;
     return KmeansResult(centroids, containers.labels, T[], Int[], T[], totalcost, niters, converged)
 end
 
+# Method without an explicit algorithm: runs `kmeans` with `Lloyd()` and forwards
+# every keyword argument unchanged.
-kmeans(design_matrix, k, weights = nothing;
+kmeans(design_matrix, k;
+    weights = nothing,
     n_threads = Threads.nthreads(),
     k_init = "k-means++", max_iters = 300, tol = 1e-6,
-    verbose = false, init = nothing) =
-        kmeans(Lloyd(), design_matrix, k, weights; n_threads = n_threads, k_init = k_init, max_iters = max_iters, tol = tol,
-            verbose = verbose, init = init)
+    verbose = false, init = nothing, rng = Random.GLOBAL_RNG) =
+        kmeans(Lloyd(), design_matrix, k; weights = weights, n_threads = n_threads, k_init = k_init, max_iters = max_iters, tol = tol,
+            verbose = verbose, init = init, rng = rng)
 
 """
     create_containers(::Lloyd, k, nrow, ncol, n_threads)
 
@@ -1,46 +1,51 @@
 # Expose all instances of user-specified structs and package artifacts.
+const MMI = MLJModelInterface
+
 const ParallelKMeans_Desc = "Parallel & lightning fast implementation of all available variants of the KMeans clustering algorithm
                              in native Julia. Compatible with Julia 1.3+"
 
 # available variants for reference
 const MLJDICT = Dict(:Lloyd => Lloyd(),
                      :Hamerly => Hamerly(),
-                     :Elkan => Elkan())
+                     :Elkan => Elkan(),
+                     :Yinyang => Yinyang(),
+                     :Coreset => Coreset(),
+                     # 阴阳 ("yin-yang") selects the Yinyang algorithm, same as :Yinyang.
+                     :阴阳 => Yinyang())
 
 ####
 #### MODEL DEFINITION
 ####
 
+# MLJ model holding the hyperparameters that `MMI.fit` passes on to
+# `ParallelKMeans.kmeans`.
 mutable struct KMeans <: MMI.Unsupervised
-    algo::Symbol
+    # Algorithm instance, or a Symbol that `clean_algo` looks up in MLJDICT.
+    algo::Union{Symbol, AbstractKMeansAlg}
+    # Seeding method; `MMI.clean!` accepts "k-means++" or "random".
     k_init::String
+    # Number of clusters requested from `kmeans`.
     k::Int
     tol::Float64
     max_iters::Int
     copy::Bool
+    # Passed to `kmeans` as `n_threads`.
     threads::Int
+    # A generator, or an Int seed that `get_rng` turns into a MersenneTwister.
+    rng::Union{AbstractRNG, Int}
+    # Optional weights; `MMI.fit` ignores them (with a warning) when their length
+    # differs from the number of points.
+	weights
+    # Passed to `kmeans` as `init`.
     init
 end
 
 
+# Keyword constructor: builds the model from defaults/overrides and runs it through
+# `MMI.clean!`, emitting a warning when `clean!` returns a non-empty message.
-function KMeans(; algo=:Hamerly, k_init="k-means++",
-                k=3, tol=1e-6, max_iters=300, copy=true,
-                threads=Threads.nthreads(), init=nothing)
+function KMeans(; algo = :Hamerly, k_init = "k-means++",
+                k = 3, tol = 1e-6, max_iters = 300, copy = true,
+                threads = Threads.nthreads(), init = nothing,
+				rng = Random.GLOBAL_RNG, weights = nothing)
 
-    model   = KMeans(algo, k_init, k, tol, max_iters, copy, threads, init)
+    # Positional arguments follow the struct's field order (rng and weights come before init).
+    model   = KMeans(algo, k_init, k, tol, max_iters, copy, threads, rng, weights, init)
     message = MMI.clean!(model)
     isempty(message) || @warn message
     return model
 end
 
 
 function MMI.clean!(m::KMeans)
-    warning = String[]
+	warning = String[]
 
-    if !(m.algo ∈ keys(MLJDICT))
-        push!(warning, "Unsupported KMeans variant. Defaulting to Hamerly algorithm.")
-        m.algo = :Hamerly
-	end
+	m.algo = clean_algo(m.algo, warning)
 
     if !(m.k_init ∈ ["k-means++", "random"])
         push!(warning, "Only \"k-means++\" or \"random\" seeding algorithms are supported. Defaulting to k-means++ seeding.")
@@ -89,15 +94,23 @@ function MMI.fit(m::KMeans, verbosity::Int, X)
         DMatrix = convert(Array{Float64, 2}, MMI.matrix(X, transpose=true))
     end
 
-    # lookup available algorithms
-    algo = MLJDICT[m.algo]  # select algo
+	# setup rng
+	rng = get_rng(m.rng)
+
+	if !isnothing(m.weights) && (size(DMatrix, 2) != length(m.weights))
+		@warn "Size mismatch, number of points in X $(size(DMatrix, 2)) not equal weights length $(length(m.weights)). Weights parameter ignored."
+		weights = nothing
+	else
+
+		weights = m.weights
+	end
 
     # fit model and get results
     verbose = verbosity > 0  # Display fitting operations if verbosity > 0
-    result = ParallelKMeans.kmeans(algo, DMatrix, m.k;
-                                      n_threads = m.threads, k_init=m.k_init,
-                                      max_iters=m.max_iters, tol=m.tol, init=m.init,
-                                      verbose=verbose)
+    result = ParallelKMeans.kmeans(m.algo, DMatrix, m.k;
+                                      n_threads = m.threads, k_init = m.k_init,
+                                      max_iters = m.max_iters, tol = m.tol, init = m.init,
+                                      rng = rng, verbose = verbose, weights = weights)
 
     cluster_labels = MMI.categorical(1:m.k)
     fitresult = (centers = result.centers, labels = cluster_labels, converged = result.converged)
@@ -192,3 +205,20 @@ MMI.metadata_model(KMeans,
     weights = false,
     descr   = ParallelKMeans_Desc,
 	path	= "ParallelKMeans.KMeans")
+
+####
+#### Auxiliary functions
+####
+
+"""
+    get_rng(rng)
+
+Return the random number generator to use for fitting. An `Int` is treated as a
+seed and wrapped in a fresh `MersenneTwister`; any other value is returned as is.
+"""
+function get_rng(seed::Int)
+    return MersenneTwister(seed)
+end
+
+function get_rng(generator)
+    return generator
+end
+
+"""
+    clean_algo(algo, warning)
+
+Resolve the `algo` hyperparameter to an algorithm instance. An `AbstractKMeansAlg`
+is returned unchanged. A `Symbol` is looked up in `MLJDICT`; when the key is
+missing, a message is pushed onto `warning` and the `:Hamerly` entry is returned.
+"""
+clean_algo(algo::AbstractKMeansAlg, warning) = algo
+
+function clean_algo(algo::Symbol, warning)
+    # Known variant: nothing to report.
+    haskey(MLJDICT, algo) && return MLJDICT[algo]
+
+    push!(warning, "Unsupported KMeans variant. Defaulting to Hamerly algorithm.")
+    return MLJDICT[:Hamerly]
+end