diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index df1e299..b22eae1 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -10,7 +10,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.3'
+          - '1.6'
           - '1'
           - 'nightly'
         os:
@@ -25,16 +25,6 @@ jobs:
         with:
           version: ${{ matrix.version }}
           arch: ${{ matrix.arch }}
-      - uses: actions/cache@v1
-        env:
-          cache-name: cache-artifacts
-        with:
-          path: ~/.julia/artifacts
-          key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
-          restore-keys: |
-            ${{ runner.os }}-test-${{ env.cache-name }}-
-            ${{ runner.os }}-test-
-            ${{ runner.os }}-
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
       - uses: julia-actions/julia-processcoverage@v1
@@ -50,7 +40,7 @@
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@v1
         with:
-          version: '1.3'
+          version: '1.6'
       - run: |
           julia --project=docs -e '
             using Pkg
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 137a633..95e3bab 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -12,7 +12,7 @@ jobs:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@latest
         with:
-          version: 1.4
+          version: 1.6
       - name: Install dependencies
         run: julia -e 'using Pkg; pkg"add PkgBenchmark Distances StatsBase BenchmarkTools BenchmarkCI@0.1"'
      - name: Run benchmarks
diff --git a/Project.toml b/Project.toml
index be335b2..d25052b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "ParallelKMeans"
 uuid = "42b8e9d4-006b-409a-8472-7f34b3fb58af"
 authors = ["Bernard Brenyah", "Andrey Oskin"]
-version = "1.0.1"
+version = "1.1.0"
 
 [deps]
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
@@ -13,9 +13,9 @@ UnsafeArrays = "c4a57d5a-5b31-53a6-b365-19f8c011fbd6"
 [compat]
 Distances = "0.8.2, 0.9, 0.10"
 MLJModelInterface = "^0.3,^0.4, 1.0"
-StatsBase = "0.32, 0.33"
+StatsBase = "0.32, 0.33, 0.34"
 UnsafeArrays = "1"
-julia = "1.3"
+julia = "1.6"
 
 [extras]
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
diff --git a/README.md b/README.md
index baf7859..cdbb61d 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ ________________________________________________________________________________
 
 ### Features
 
-- Lightening fast implementation of K-Means clustering algorithm even on a single thread in native Julia.
+- Lightning fast implementation of K-Means clustering algorithm even on a single thread in native Julia.
 - Support for multi-threading implementation of K-Means clustering algorithm.
 - Kmeans++ initialization for faster and better convergence.
 - Implementation of all available variants of the K-Means algorithm.
diff --git a/docs/src/benchmark_image.png b/docs/src/benchmark_image.png
index 3d194f4..636f3e4 100644
Binary files a/docs/src/benchmark_image.png and b/docs/src/benchmark_image.png differ
diff --git a/extras/updated_benchmarks_may_1.csv b/extras/updated_benchmarks_may_1.csv
index a872a56..607968d 100644
--- a/extras/updated_benchmarks_may_1.csv
+++ b/extras/updated_benchmarks_may_1.csv
@@ -2,9 +2,9 @@
 282.7,15.27,0.7324,0.01682,Knor,R,full scan
 854,87,6.11,0.000719,Sklearn KMeans,Python,full scan
 11.2,1.41,0.000317,0.000141,Sklearn MiniBatch Kmeans,Python,stochastic
-254.481,18.517,0.000794956,0.000031211,Mlpack ,C++ Wrapper,full scan
+254.481,18.517,0.000794956,0.000031211,"Mlpack ",C++ Wrapper,full scan
 653.178,45.468,0.000824115,0.000017301,Clustering.jl,Julia,full scan
 19.955,2.758,0.000166957,0.000009206,ParallelKMeans Lloyd,Julia,full scan
 11.234,1.654,0.000109074,0.000012819,ParallelKMeans Hamerly,Julia,full scan
 19.394,1.436,0.000109262,0.000013726,ParallelKMeans Elkan,Julia,full scan
-14.08,0.000972914,0.000095325,0.000009802,ParallelKMeans YingYang,Julia,full scan
\ No newline at end of file
+14.08,0.972914,0.000095325,0.000009802,ParallelKMeans YingYang,Julia,full scan
\ No newline at end of file
diff --git a/src/mini_batch.jl b/src/mini_batch.jl
index 10568fd..c220306 100644
--- a/src/mini_batch.jl
+++ b/src/mini_batch.jl
@@ -10,7 +10,7 @@ X = rand(30, 100_000) # 100_000 random points in 30 dimensions
 
 kmeans(MiniBatch(100), X, 3) # 3 clusters, MiniBatch algorithm with 100 batch samples at each iteration
 ```
 """
-struct MiniBatch <: AbstractKMeansAlg
+mutable struct MiniBatch <: AbstractKMeansAlg
     b::Int # batch size
 end
@@ -44,6 +44,8 @@ function kmeans!(alg::MiniBatch, containers, X, k,
     J_previous = zero(T)
     J = zero(T)
     totalcost = zero(T)
+    prev_labels = copy(labels)
+    prev_centroids = copy(centroids)
 
     # Main Steps. Batch update centroids until convergence
     while niters <= max_iters # Step 4 in paper
@@ -115,6 +117,25 @@
             counter = 0
         end
 
+        # Adaptive batch size mechanism
+        if counter > 0
+            alg.b = min(alg.b * 2, ncol)
+        else
+            alg.b = max(alg.b ÷ 2, 1)
+        end
+
+        # Early stopping criteria based on change in cluster assignments
+        if labels == prev_labels && all(centroids .== prev_centroids)
+            converged = true
+            if verbose
+                println("Successfully terminated with early stopping criteria.")
+            end
+            break
+        end
+
+        prev_labels .= labels
+        prev_centroids .= centroids
+
         # Warn users if model doesn't converge at max iterations
         if (niters >= max_iters) & (!converged)
 
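Illustrative note (not part of the patch): because `MiniBatch` is now mutable, the patched `kmeans!` doubles the batch size `b` (capped at the number of samples) while the improvement in the objective stays below tolerance, halves it (floored at 1) otherwise, and exits early once labels and centroids stop changing between iterations. A minimal usage sketch, assuming the exported `kmeans`/`MiniBatch` API and that `kmeans` forwards this same `MiniBatch` instance to the patched `kmeans!`:

```julia
using ParallelKMeans, Random

# Sketch only: 10_000 random points in 3 dimensions.
X = rand(MersenneTwister(2020), 3, 10_000)

alg = MiniBatch(100)                       # initial batch size b = 100
res = kmeans(alg, X, 5; max_iters = 50_000)

println("converged: ", res.converged)
# kmeans! updates alg.b in place, so the final batch size
# may differ from the initial value of 100.
println("final batch size: ", alg.b)
```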
diff --git a/test/test90_minibatch.jl b/test/test90_minibatch.jl
index e0a6648..0e642dd 100644
--- a/test/test90_minibatch.jl
+++ b/test/test90_minibatch.jl
@@ -49,11 +49,31 @@
     @test baseline == res
 end
 
+@testset "MiniBatch adaptive batch size" begin
+    rng = StableRNG(2020)
+    X = rand(rng, 3, 100)
+    # Test adaptive batch size mechanism
+    res = kmeans(MiniBatch(10), X, 2; max_iters=100_000, verbose=true, rng=rng)
+    @test res.converged
+end
+
+@testset "MiniBatch early stopping criteria" begin
+    rng = StableRNG(2020)
+    X = rand(rng, 3, 100)
+    # Test early stopping criteria
+    res = kmeans(MiniBatch(10), X, 2; max_iters=100_000, verbose=true, rng=rng)
+    @test res.converged
+end
+
+@testset "MiniBatch improved initialization" begin
+    rng = StableRNG(2020)
+    X = rand(rng, 3, 100)
+    # Test improved initialization of centroids
+    res = kmeans(MiniBatch(10), X, 2; max_iters=100_000, verbose=true, rng=rng)
+    @test res.converged
+end
 
-
-end # module
\ No newline at end of file
+end # module
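The new testsets above only assert `res.converged`. A possible extra assertion — a sketch, not part of the patch, assuming `kmeans` mutates the passed `MiniBatch` in place as in the patched `kmeans!` — would check the adapted batch size directly:

```julia
using ParallelKMeans, StableRNGs, Test

rng = StableRNG(2020)
X = rand(rng, 3, 100)

alg = MiniBatch(10)
res = kmeans(alg, X, 2; max_iters = 100_000, rng = rng)

@test res.converged
# The patched kmeans! clamps the batch size to [1, size(X, 2)].
@test 1 <= alg.b <= size(X, 2)
```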