Skip to content

Commit cf417c7

Browse files
committed
Kmeans from scratch
1 parent e3c2ce0 commit cf417c7

File tree

1 file changed

+48
-49
lines changed

1 file changed

+48
-49
lines changed

Kmeans_image_compression.ipynb

Lines changed: 48 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
},
1010
{
1111
"cell_type": "code",
12-
"execution_count": null,
12+
"execution_count": 23,
1313
"metadata": {},
1414
"outputs": [],
1515
"source": [
@@ -21,7 +21,7 @@
2121
},
2222
{
2323
"cell_type": "code",
24-
"execution_count": null,
24+
"execution_count": 27,
2525
"metadata": {},
2626
"outputs": [],
2727
"source": [
@@ -32,33 +32,31 @@
3232
},
3333
{
3434
"cell_type": "code",
35-
"execution_count": 25,
35+
"execution_count": 28,
3636
"metadata": {},
3737
"outputs": [
3838
{
39-
"name": "stdout",
40-
"output_type": "stream",
41-
"text": [
42-
"[[0. 0.]]\n",
43-
"[1.84207953 4.6075716 ]\n"
39+
"ename": "FileNotFoundError",
40+
"evalue": "[Errno 2] No such file or directory: '../datasets/kmeans_image_compression.npy'",
41+
"output_type": "error",
42+
"traceback": [
43+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
44+
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
45+
"Cell \u001b[0;32mIn[28], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m temp1 \u001b[38;5;241m=\u001b[39m \u001b[43mload_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m temp1\n",
46+
"Cell \u001b[0;32mIn[27], line 2\u001b[0m, in \u001b[0;36mload_data\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_data\u001b[39m():\n\u001b[0;32m----> 2\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m../datasets/kmeans_image_compression.npy\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m data\n",
47+
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/numpy/lib/npyio.py:427\u001b[0m, in \u001b[0;36mload\u001b[0;34m(file, mmap_mode, allow_pickle, fix_imports, encoding, max_header_size)\u001b[0m\n\u001b[1;32m 425\u001b[0m own_fid \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 426\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 427\u001b[0m fid \u001b[38;5;241m=\u001b[39m stack\u001b[38;5;241m.\u001b[39menter_context(\u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mos_fspath\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 428\u001b[0m own_fid \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 430\u001b[0m \u001b[38;5;66;03m# Code to distinguish from NumPy binary files and pickles.\u001b[39;00m\n",
48+
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../datasets/kmeans_image_compression.npy'"
4449
]
4550
}
4651
],
4752
"source": [
48-
"temp = load_data()\n",
49-
"\n",
50-
"temmm = np.zeros((1, 2))\n",
51-
"\n",
52-
"print(temmm)\n",
53-
"\n",
54-
"temmm = temp[0]\n",
55-
"\n",
56-
"print(temmm)"
53+
"temp1 = load_data()\n",
54+
"temp1"
5755
]
5856
},
5957
{
6058
"cell_type": "code",
61-
"execution_count": 17,
59+
"execution_count": 6,
6260
"metadata": {},
6361
"outputs": [],
6462
"source": [
@@ -79,6 +77,21 @@
7977
" centroids.append([x, y])\n",
8078
" return centroids\n",
8179
"\n",
80+
"\n",
81+
"\n",
def calculate_centroids(X, centroids):
    """Assign every sample in ``X`` to the index of its nearest centroid.

    Parameters
    ----------
    X : array-like, shape (n_samples, ...)
        Samples, one per entry along the first axis.
    centroids : array-like, shape (k, ...)
        Candidate centroids. May be an ndarray or a plain list of points
        (e.g. one built up with ``list.append``); it is converted with
        ``np.asarray`` so ``.shape`` is never required of the caller.

    Returns
    -------
    numpy.ndarray, shape (n_samples,), integer dtype
        ``result[i]`` is the index of the centroid with the smallest
        Euclidean distance to ``X[i]``. Ties go to the lowest index
        (``np.argmin`` semantics). The dtype is integral so the labels can
        be used directly as indices, e.g. ``centroids[result[i]]``.

    Raises
    ------
    ValueError
        If ``X`` is non-empty but no centroids were supplied.
    """
    # Work in float so integer inputs (e.g. uint8 pixel values) cannot wrap
    # around when subtracted.
    X = np.asarray(X, dtype=float)
    centroids = np.asarray(centroids, dtype=float)

    n_samples = X.shape[0] if X.ndim else 0
    # Integer labels: a float label cannot be used to index an array or list.
    X_centroids = np.zeros(n_samples, dtype=np.intp)
    if n_samples == 0:
        return X_centroids
    if centroids.ndim == 0 or centroids.shape[0] == 0:
        raise ValueError("calculate_centroids requires at least one centroid")

    # Flatten everything after the first axis so each sample/centroid is one
    # feature vector; the norm of the flattened difference equals the norm
    # np.linalg.norm would give for the unflattened difference.
    samples = X.reshape(n_samples, -1)
    centers = centroids.reshape(centroids.shape[0], -1)

    for i in range(n_samples):
        # Distances from sample i to every centroid in one vectorised call.
        distance = np.linalg.norm(centers - samples[i], axis=1)
        X_centroids[i] = np.argmin(distance)
    return X_centroids
92+
"\n",
93+
"\n",
94+
"\n",
8295
"def calulate_loss(data, centroids):\n",
8396
" \n",
8497
" return\n",
@@ -94,25 +107,20 @@
94107
},
95108
{
96109
"cell_type": "code",
97-
"execution_count": 28,
110+
"execution_count": 7,
98111
"metadata": {},
99112
"outputs": [
100113
{
101-
"name": "stdout",
102-
"output_type": "stream",
103-
"text": [
104-
"-0.24512712766170175 8.203398153359817\n"
105-
]
106-
},
107-
{
108-
"ename": "AttributeError",
109-
"evalue": "'list' object has no attribute 'shape'",
114+
"ename": "FileNotFoundError",
115+
"evalue": "[Errno 2] No such file or directory: '../datasets/kmeans_image_compression.npy'",
110116
"output_type": "error",
111117
"traceback": [
112118
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
113-
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
114-
"Cell \u001b[0;32mIn[28], line 35\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(X\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m]):\n\u001b[1;32m 34\u001b[0m distance \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m---> 35\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m j \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mlen\u001b[39m(\u001b[43mcentroids\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshape\u001b[49m[\u001b[38;5;241m0\u001b[39m])):\n\u001b[1;32m 36\u001b[0m norm \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mlinalg\u001b[38;5;241m.\u001b[39mnorm(X[i] \u001b[38;5;241m-\u001b[39m centroids[j])\n\u001b[1;32m 37\u001b[0m distance\u001b[38;5;241m.\u001b[39mappend(norm)\n",
115-
"\u001b[0;31mAttributeError\u001b[0m: 'list' object has no attribute 'shape'"
119+
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
120+
"Cell \u001b[0;32mIn[7], line 25\u001b[0m\n\u001b[1;32m 23\u001b[0m iterations \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m100\u001b[39m\n\u001b[1;32m 24\u001b[0m cost_array \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m---> 25\u001b[0m X \u001b[38;5;241m=\u001b[39m \u001b[43mload_data\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;241m1\u001b[39m, K\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m1\u001b[39m):\n\u001b[1;32m 28\u001b[0m summ \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
121+
"Cell \u001b[0;32mIn[5], line 2\u001b[0m, in \u001b[0;36mload_data\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_data\u001b[39m():\n\u001b[0;32m----> 2\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m../datasets/kmeans_image_compression.npy\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m data\n",
122+
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/numpy/lib/npyio.py:427\u001b[0m, in \u001b[0;36mload\u001b[0;34m(file, mmap_mode, allow_pickle, fix_imports, encoding, max_header_size)\u001b[0m\n\u001b[1;32m 425\u001b[0m own_fid \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 426\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 427\u001b[0m fid \u001b[38;5;241m=\u001b[39m stack\u001b[38;5;241m.\u001b[39menter_context(\u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mos_fspath\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 428\u001b[0m own_fid \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 430\u001b[0m \u001b[38;5;66;03m# Code to distinguish from NumPy binary files and pickles.\u001b[39;00m\n",
123+
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../datasets/kmeans_image_compression.npy'"
116124
]
117125
}
118126
],
@@ -144,19 +152,13 @@
144152
"X = load_data()\n",
145153
"\n",
146154
"for k in range(1, K+1):\n",
155+
" summ = 0\n",
147156
" lowest_cost = 100000\n",
148157
" for iteration in range(iterations):\n",
149158
" centroids = initialize_centroids(X, k)\n",
150159
" X_centroids = np.zeros(X.shape[0])\n",
151160
" while(True):\n",
152-
" for i in range(X.shape[0]):\n",
153-
" distance = []\n",
154-
" for j in range(len(centroids.shape[0])):\n",
155-
" norm = np.linalg.norm(X[i] - centroids[j])\n",
156-
" distance.append(norm)\n",
157-
"\n",
158-
" X_centroids[i] = np.argmin(distance)\n",
159-
"\n",
161+
" X_centroids = calculate_centroids(X, centroids)\n",
160162
" ## calculate mean of each cluster\n",
161163
" means = np.zeros(range(K))\n",
162164
" for kk in range(K):\n",
@@ -173,15 +175,12 @@
173175
" else:\n",
174176
" centroids = means\n",
175177
"\n",
176-
" # ## calculate cost of the final centroids\n",
177-
" # for i in range(X.shape[0]):\n",
178-
" # distance = []\n",
179-
" # for j in range(len(centroids.shape[0])):\n",
180-
" # norm = np.linalg.norm(X[i] - centroids[j])\n",
181-
" # distance.append(norm)\n",
182-
" \n",
183-
" \n",
184-
"\n"
178+
" ## calculate cost of the final centroids\n",
179+
" X_centroids = calculate_centroids(X, centroids)\n",
180+
" for i in range(len(X_centroids)):\n",
181+
" summ += np.linalg.norm(X[i] - centroids[X_centroids[i]])\n",
182+
"\n",
183+
" print(summ)"
185184
]
186185
},
187186
{
@@ -194,7 +193,7 @@
194193
],
195194
"metadata": {
196195
"kernelspec": {
197-
"display_name": "Python 3",
196+
"display_name": "Python 3 (ipykernel)",
198197
"language": "python",
199198
"name": "python3"
200199
},

0 commit comments

Comments
 (0)