Clustering
Clustering seeks to group data into clusters based on their properties and then allow us to predict which cluster a new member belongs to.
We’ll use a dataset generator that is part of scikit-learn called make_moons.
This generates data that falls into 2 different sets with a shape that looks like half-moons.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
def generate_data(n_samples=200, noise=0.2, random_state=None):
    """Generate the two-moons sample data set.

    Thin wrapper around ``sklearn.datasets.make_moons``.

    Parameters
    ----------
    n_samples : int, optional
        Total number of points to generate (default 200).
    noise : float, optional
        Standard deviation of the Gaussian noise added to the points
        (default 0.2).
    random_state : int or None, optional
        Seed forwarded to ``make_moons``.  The default ``None`` draws a
        different sample on every call.

    Returns
    -------
    x : numpy.ndarray
        Point coordinates, one ``(x, y)`` pair per row.
    v : numpy.ndarray
        Label (0 or 1) of the half-moon each point belongs to.
    """
    # make_moons already returns NumPy arrays, so the points and labels can
    # be handed back directly without copying them one by one.
    xvec, val = datasets.make_moons(
        n_samples, noise=noise, random_state=random_state
    )
    return np.asarray(xvec), np.asarray(val)
# Draw the sample once: x holds the point coordinates and v the labels
x, v = generate_data()
Let’s look at a point and its value
# Show the coordinates of the first point together with its label
print(f"x = {x[0]}, value = {v[0]}")
x = [-0.79059875 0.63876912], value = 0
Now let’s plot the data
def plot_data(x, v):
    """Scatter-plot the points in `x`, colored by their value in `v`.

    Each entry of `x` is indexed as ``(x-coordinate, y-coordinate)``.
    Returns the matplotlib figure that holds the plot.
    """
    figure, axes = plt.subplots()

    # split the points into separate coordinate lists for scatter()
    horizontal = [point[0] for point in x]
    vertical = [point[1] for point in x]

    axes.scatter(horizontal, vertical, c=v, s=40, cmap="viridis")
    # equal axis scaling so the half-moon shapes are not distorted
    axes.set_aspect("equal")
    return figure
# Plot the generated sample and keep a handle to the figure
fig = plot_data(x, v)
data:image/s3,"s3://crabby-images/aca2c/aca2c18a0b687d004e7db106f81db86b8246d5e0" alt="../_images/f94d2524a495915748a571e7fe7cd07e21bf011af3641f8d60665f1548903319.png"
We want to partition this domain into 2 regions, such that when we come in with a new point, we know which group it belongs to.
First we set up and train our network
# Keras 3 no longer ships the `keras.layers.core` submodule, so the layers
# are imported from the public `keras.layers` namespace.  The optimizer is
# taken from the same `keras` package to avoid mixing it with
# `tensorflow.keras`.
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import RMSprop
2025-02-20 15:25:17.629025: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-20 15:25:17.632251: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-20 15:25:17.638786: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
E0000 00:00:1740065117.651935 4685 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740065117.656120 4685 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-20 15:25:17.671965: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[7], line 2
1 from keras.models import Sequential
----> 2 from keras.layers.core import Dense, Dropout, Activation
3 from tensorflow.keras.optimizers import RMSprop
ModuleNotFoundError: No module named 'keras.layers.core'
from keras import Input

# Small fully connected binary classifier: 2 inputs -> 50 -> 20 -> 1
model = Sequential()
# Declare the input shape with an explicit Input object; passing `input_dim`
# to the first Dense layer is deprecated in Keras 3.
model.add(Input(shape=(2,)))
model.add(Dense(50, activation="relu"))
model.add(Dense(20, activation="relu"))
# A single sigmoid unit yields a value between 0 and 1 for the two classes
model.add(Dense(1, activation="sigmoid"))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[8], line 2
1 model = Sequential()
----> 2 model.add(Dense(50, input_dim=2, activation="relu"))
3 model.add(Dense(20, activation="relu"))
4 model.add(Dense(1, activation="sigmoid"))
NameError: name 'Dense' is not defined
# RMSprop optimizer with its default settings
rms = RMSprop()
# Binary cross-entropy matches the single sigmoid output; accuracy is
# tracked as an additional metric.
model.compile(
    optimizer=rms,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[9], line 1
----> 1 rms = RMSprop()
2 model.compile(loss='binary_crossentropy',
3 optimizer=rms, metrics=['accuracy'])
NameError: name 'RMSprop' is not defined
from IPython.display import SVG
# `keras.utils.vis_utils` does not exist in Keras 3; `model_to_dot` is
# available from the public `keras.utils` namespace.
from keras.utils import model_to_dot

# Render the network graph (with layer shapes) as an inline SVG
SVG(model_to_dot(model, show_shapes=True, dpi=65).create(prog="dot", format="svg"))
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[10], line 2
1 from IPython.display import SVG
----> 2 from keras.utils.vis_utils import model_to_dot
4 SVG(model_to_dot(model, show_shapes=True, dpi=65).create(prog='dot', format='svg'))
ModuleNotFoundError: No module named 'keras.utils.vis_utils'
We seem to need a lot of epochs here to get a good result
# Number of training epochs
epochs = 100
# Train on all points in batches of 50; the value returned by fit() is kept
# in `results`
results = model.fit(x, v, batch_size=50, epochs=epochs)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[11], line 2
1 epochs = 100
----> 2 results = model.fit(x, v, batch_size=50, epochs=epochs)
File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py:122, in filter_traceback.<locals>.error_handler(*args, **kwargs)
119 filtered_tb = _process_traceback_frames(e.__traceback__)
120 # To get the full stack trace, call:
121 # `keras.config.disable_traceback_filtering()`
--> 122 raise e.with_traceback(filtered_tb) from None
123 finally:
124 del filtered_tb
File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/trainers/trainer.py:1049, in Trainer._assert_compile_called(self, method_name)
1047 else:
1048 msg += f"calling `{method_name}()`."
-> 1049 raise ValueError(msg)
ValueError: You must call `compile()` before using the model.
# Evaluate on the same points that were used for training
score = model.evaluate(x, v, verbose=0)
# score[0] is printed as the score (loss) and score[1] as the accuracy
print(f"score = {score[0]}")
print(f"accuracy = {score[1]}")
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[12], line 1
----> 1 score = model.evaluate(x, v, verbose=0)
2 print(f"score = {score[0]}")
3 print(f"accuracy = {score[1]}")
File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py:122, in filter_traceback.<locals>.error_handler(*args, **kwargs)
119 filtered_tb = _process_traceback_frames(e.__traceback__)
120 # To get the full stack trace, call:
121 # `keras.config.disable_traceback_filtering()`
--> 122 raise e.with_traceback(filtered_tb) from None
123 finally:
124 del filtered_tb
File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/trainers/trainer.py:1049, in Trainer._assert_compile_called(self, method_name)
1047 else:
1048 msg += f"calling `{method_name}()`."
-> 1049 raise ValueError(msg)
ValueError: You must call `compile()` before using the model.
Let’s look at a prediction. We need to feed in a single point as an array of shape (N, 2), where N is the number of points
# Predict for the single point (-2, 2), passed as a 2-D array with one row
res = model.predict(np.array([[-2, 2]]))
# Bare expression so the notebook displays the raw prediction
res
2025-02-20 15:25:19.819856: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[13], line 1
----> 1 res = model.predict(np.array([[-2, 2]]))
2 res
File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py:122, in filter_traceback.<locals>.error_handler(*args, **kwargs)
119 filtered_tb = _process_traceback_frames(e.__traceback__)
120 # To get the full stack trace, call:
121 # `keras.config.disable_traceback_filtering()`
--> 122 raise e.with_traceback(filtered_tb) from None
123 finally:
124 del filtered_tb
File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/models/sequential.py:164, in Sequential.build(self, input_shape)
162 return
163 if not self._layers:
--> 164 raise ValueError(
165 f"Sequential model {self.name} cannot be built because it has "
166 "no layers. Call `model.add(layer)`."
167 )
168 if isinstance(self._layers[0], InputLayer):
169 if self._layers[0].batch_shape != input_shape:
ValueError: Sequential model sequential cannot be built because it has no layers. Call `model.add(layer)`.
We see that we get a floating point number. We will need to convert this to 0 or 1 by rounding.
Let’s plot the partitioning
# Grid resolution: M samples along x and N samples along y
M, N = 128, 128

# Bounds of the rectangular region that will be classified
xmin, xmax = -1.75, 2.5
ymin, ymax = -1.25, 1.75

# Evenly spaced sample coordinates along each axis
xpt = np.linspace(xmin, xmax, num=M)
ypt = np.linspace(ymin, ymax, num=N)
To make the prediction go faster, we want to feed in a vector of these points, of the form:
[[xpt[0], ypt[0]],
[xpt[0], ypt[1]],
...
]
We can see that this packs them into the vector
# Build every (x, y) combination of the grid coordinates.  With "ij"
# indexing the x index varies slowest, so row i*N + j holds (xpt[i], ypt[j]).
grid_x, grid_y = np.meshgrid(xpt, ypt, indexing="ij")
pairs = np.stack((grid_x, grid_y), axis=-1).reshape(-1, 2)
# Bare expression so the notebook displays the first pair
pairs[0]
array([-1.75, -1.25])
Now we do the prediction. We will get a vector out, which we reshape to match the original domain.
# Predict all grid points in one call, then fold the flat output into an
# (M, N) array.  reshape() is used instead of assigning to `res.shape`,
# which NumPy discourages.
res = model.predict(pairs, verbose=0).reshape(M, N)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[16], line 1
----> 1 res = model.predict(pairs, verbose=0)
2 res.shape = (M, N)
File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py:122, in filter_traceback.<locals>.error_handler(*args, **kwargs)
119 filtered_tb = _process_traceback_frames(e.__traceback__)
120 # To get the full stack trace, call:
121 # `keras.config.disable_traceback_filtering()`
--> 122 raise e.with_traceback(filtered_tb) from None
123 finally:
124 del filtered_tb
File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/models/sequential.py:164, in Sequential.build(self, input_shape)
162 return
163 if not self._layers:
--> 164 raise ValueError(
165 f"Sequential model {self.name} cannot be built because it has "
166 "no layers. Call `model.add(layer)`."
167 )
168 if isinstance(self._layers[0], InputLayer):
169 if self._layers[0].batch_shape != input_shape:
ValueError: Sequential model sequential cannot be built because it has no layers. Call `model.add(layer)`.
Finally, round to 0 or 1
# Threshold the predictions at 0.5: values above it become 1, the rest 0
domain = (res > 0.5).astype(int)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[17], line 1
----> 1 domain = np.where(res > 0.5, 1, 0)
NameError: name 'res' is not defined
and we can plot the data
fig, ax = plt.subplots()

# Draw the predicted regions as a faint background image.  The transpose
# puts the second array axis (y) along the image rows.
ax.imshow(
    domain.T,
    extent=[xmin, xmax, ymin, ymax],
    origin="lower",
    alpha=0.25,
)

# Overlay the sample points.  Note that this rebinds xpt and ypt, which
# previously held the grid coordinates.
xpt = [point[0] for point in x]
ypt = [point[1] for point in x]
ax.scatter(xpt, ypt, c=v, s=40, cmap="viridis")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[18], line 2
1 fig, ax = plt.subplots()
----> 2 ax.imshow(domain.T, origin="lower",
3 extent=[xmin, xmax, ymin, ymax], alpha=0.25)
4 xpt = [q[0] for q in x]
5 ypt = [q[1] for q in x]
NameError: name 'domain' is not defined
data:image/s3,"s3://crabby-images/307bc/307bc31fa09c7b75741211028bdece697ba2f459" alt="../_images/9e13561b06f5d2c575e2e9d0ab28f3ca3efd7ae4d5738c0cd619a55149090489.png"