Clustering

Clustering#

Clustering seeks to group data into clusters based on their properties and then allows us to predict which cluster a new member belongs to.

We’ll use a dataset generator that is part of scikit-learn called make_moons. This generates data that falls into 2 different sets with a shape that looks like half-moons.

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
def generate_data(n_samples=200, noise=0.2, random_state=None):
    """Generate a two-class "half-moons" dataset.

    Thin wrapper around ``sklearn.datasets.make_moons``.

    Parameters
    ----------
    n_samples : int, optional
        Total number of points to generate (default 200).
    noise : float, optional
        Standard deviation of the Gaussian noise added to the points
        (default 0.2).
    random_state : int or None, optional
        Seed forwarded to ``make_moons``; pass an int to get the same
        points on every call.  The default, None, leaves the data
        unseeded, exactly as before.

    Returns
    -------
    x : numpy.ndarray
        Point coordinates, one ``(x, y)`` pair per row.
    v : numpy.ndarray
        Class label (0 or 1) of each point.
    """
    xvec, val = datasets.make_moons(n_samples, noise=noise,
                                    random_state=random_state)

    # make_moons already returns NumPy arrays, so no per-point copying
    # or re-encoding is needed.
    return np.asarray(xvec), np.asarray(val)
# x: the generated points, v: the value that goes with each point
x, v = generate_data()

Let’s look at a point and its value

# show the first point together with its value
print(f"x = {x[0]}, value = {v[0]}")
x = [-0.79059875  0.63876912], value = 0

Now let’s plot the data

def plot_data(x, v):
    """Scatter-plot the points in `x`, colored by their values in `v`.

    Each entry of `x` is indexed as ``(horizontal, vertical)``.  The
    newly created figure is returned.
    """
    horizontal, vertical = [], []
    for point in x:
        horizontal.append(point[0])
        vertical.append(point[1])

    figure, axes = plt.subplots()
    axes.scatter(horizontal, vertical, s=40, c=v, cmap="viridis")
    # same scale on both axes
    axes.set_aspect("equal")
    return figure
# draw the generated points, colored by their value
fig = plot_data(x, v)
../_images/f94d2524a495915748a571e7fe7cd07e21bf011af3641f8d60665f1548903319.png

We want to partition this domain into 2 regions, such that when we come in with a new point, we know which group it belongs to.

First we set up and train our network

# `keras.layers.core` is not importable in current Keras releases; the
# layers are exported from the public `keras.layers` package.  Everything
# is imported from `keras` itself so that we never mix `keras` and
# `tensorflow.keras` objects in one model.
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import RMSprop
2025-02-20 15:25:17.629025: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-20 15:25:17.632251: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-20 15:25:17.638786: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
E0000 00:00:1740065117.651935    4685 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740065117.656120    4685 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-20 15:25:17.671965: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[7], line 2
      1 from keras.models import Sequential
----> 2 from keras.layers.core import Dense, Dropout, Activation
      3 from tensorflow.keras.optimizers import RMSprop

ModuleNotFoundError: No module named 'keras.layers.core'
# Keras 3 asks for an explicit Input object as the first entry of a
# Sequential model instead of the legacy `input_dim=` layer argument.
from keras import Input

model = Sequential()
model.add(Input(shape=(2,)))  # two input values per sample
model.add(Dense(50, activation="relu"))
model.add(Dense(20, activation="relu"))
# a single sigmoid unit: one output value per sample
model.add(Dense(1, activation="sigmoid"))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 2
      1 model = Sequential()
----> 2 model.add(Dense(50, input_dim=2, activation="relu"))
      3 model.add(Dense(20, activation="relu"))
      4 model.add(Dense(1, activation="sigmoid"))

NameError: name 'Dense' is not defined
# RMSprop with its default settings, a binary cross-entropy loss, and
# accuracy reported as an extra metric.
rms = RMSprop()
model.compile(
    optimizer=rms,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 1
----> 1 rms = RMSprop()
      2 model.compile(loss='binary_crossentropy',
      3               optimizer=rms, metrics=['accuracy'])

NameError: name 'RMSprop' is not defined
from IPython.display import SVG
# `keras.utils.vis_utils` is not importable in current Keras releases;
# model_to_dot is exported from the public `keras.utils` namespace.
from keras.utils import model_to_dot

# render the layer graph (with tensor shapes) as an inline SVG
SVG(model_to_dot(model, show_shapes=True, dpi=65).create(prog='dot', format='svg'))
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[10], line 2
      1 from IPython.display import SVG
----> 2 from keras.utils.vis_utils import model_to_dot
      4 SVG(model_to_dot(model, show_shapes=True, dpi=65).create(prog='dot', format='svg'))

ModuleNotFoundError: No module named 'keras.utils.vis_utils'

We seem to need a lot of epochs here to get a good result

epochs = 100  # number of training epochs passed to fit()
# train on all points, 50 at a time; `results` keeps whatever fit() returns
results = model.fit(x, v, batch_size=50, epochs=epochs)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[11], line 2
      1 epochs = 100
----> 2 results = model.fit(x, v, batch_size=50, epochs=epochs)

File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py:122, in filter_traceback.<locals>.error_handler(*args, **kwargs)
    119     filtered_tb = _process_traceback_frames(e.__traceback__)
    120     # To get the full stack trace, call:
    121     # `keras.config.disable_traceback_filtering()`
--> 122     raise e.with_traceback(filtered_tb) from None
    123 finally:
    124     del filtered_tb

File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/trainers/trainer.py:1049, in Trainer._assert_compile_called(self, method_name)
   1047 else:
   1048     msg += f"calling `{method_name}()`."
-> 1049 raise ValueError(msg)

ValueError: You must call `compile()` before using the model.
# evaluate on the same points that were used for training
score = model.evaluate(x, v, verbose=0)
# entries 0 and 1 of the result are reported as the score and the accuracy
print(f"score = {score[0]}")
print(f"accuracy = {score[1]}")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[12], line 1
----> 1 score = model.evaluate(x, v, verbose=0)
      2 print(f"score = {score[0]}")
      3 print(f"accuracy = {score[1]}")

File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py:122, in filter_traceback.<locals>.error_handler(*args, **kwargs)
    119     filtered_tb = _process_traceback_frames(e.__traceback__)
    120     # To get the full stack trace, call:
    121     # `keras.config.disable_traceback_filtering()`
--> 122     raise e.with_traceback(filtered_tb) from None
    123 finally:
    124     del filtered_tb

File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/trainers/trainer.py:1049, in Trainer._assert_compile_called(self, method_name)
   1047 else:
   1048     msg += f"calling `{method_name}()`."
-> 1049 raise ValueError(msg)

ValueError: You must call `compile()` before using the model.

Let’s look at a prediction. We need to feed in a single point as an array of shape (N, 2), where N is the number of points

# a batch holding the single point (-2, 2)
res = model.predict(np.array([[-2, 2]]))
res
2025-02-20 15:25:19.819856: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[13], line 1
----> 1 res = model.predict(np.array([[-2, 2]]))
      2 res

File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py:122, in filter_traceback.<locals>.error_handler(*args, **kwargs)
    119     filtered_tb = _process_traceback_frames(e.__traceback__)
    120     # To get the full stack trace, call:
    121     # `keras.config.disable_traceback_filtering()`
--> 122     raise e.with_traceback(filtered_tb) from None
    123 finally:
    124     del filtered_tb

File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/models/sequential.py:164, in Sequential.build(self, input_shape)
    162     return
    163 if not self._layers:
--> 164     raise ValueError(
    165         f"Sequential model {self.name} cannot be built because it has "
    166         "no layers. Call `model.add(layer)`."
    167     )
    168 if isinstance(self._layers[0], InputLayer):
    169     if self._layers[0].batch_shape != input_shape:

ValueError: Sequential model sequential cannot be built because it has no layers. Call `model.add(layer)`.

We see that we get a floating point number. We will need to convert this to 0 or 1 by rounding.

Let’s plot the partitioning

# number of grid samples along x (M) and along y (N)
M = 128
N = 128

# bounds of the rectangular region that will be classified
xmin = -1.75
xmax = 2.5
ymin = -1.25
ymax = 1.75

# evenly spaced coordinates along each axis
xpt = np.linspace(xmin, xmax, M)
ypt = np.linspace(ymin, ymax, N)

To make the prediction go faster, we want to feed in a vector of these points, of the form:

[[xpt[0], ypt[0]],
 [xpt[0], ypt[1]],
 ...
 [xpt[M-1], ypt[N-1]]
]

We can see that this packs them into the vector

# meshgrid + transpose + reshape gives one (x, y) row for every grid
# point, ordered with y varying fastest: row i*N + j is (xpt[i], ypt[j])
pairs = np.array(np.meshgrid(xpt, ypt)).T.reshape(-1, 2)
pairs[0]
array([-1.75, -1.25])

Now we do the prediction. We will get a vector out, which we reshape to match the original domain.

# predict for every row of `pairs`, then lay the values back out on the
# M x N grid
res = model.predict(pairs, verbose=0).reshape(M, N)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[16], line 1
----> 1 res = model.predict(pairs, verbose=0)
      2 res.shape = (M, N)

File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py:122, in filter_traceback.<locals>.error_handler(*args, **kwargs)
    119     filtered_tb = _process_traceback_frames(e.__traceback__)
    120     # To get the full stack trace, call:
    121     # `keras.config.disable_traceback_filtering()`
--> 122     raise e.with_traceback(filtered_tb) from None
    123 finally:
    124     del filtered_tb

File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/site-packages/keras/src/models/sequential.py:164, in Sequential.build(self, input_shape)
    162     return
    163 if not self._layers:
--> 164     raise ValueError(
    165         f"Sequential model {self.name} cannot be built because it has "
    166         "no layers. Call `model.add(layer)`."
    167     )
    168 if isinstance(self._layers[0], InputLayer):
    169     if self._layers[0].batch_shape != input_shape:

ValueError: Sequential model sequential cannot be built because it has no layers. Call `model.add(layer)`.

Finally, round to 0 or 1

# 1 where the network output is above 0.5, 0 everywhere else
domain = (res > 0.5).astype(int)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[17], line 1
----> 1 domain = np.where(res > 0.5, 1, 0)

NameError: name 'res' is not defined

and we can plot the data

fig, ax = plt.subplots()
# `domain` is indexed [x, y]; transpose it so that rows run along y for
# imshow, and draw it semi-transparently as the background
ax.imshow(domain.T, origin="lower",
          extent=[xmin, xmax, ymin, ymax], alpha=0.25)
# NOTE: this rebinds xpt / ypt from the grid coordinates to the
# coordinates of the data points
xpt = [q[0] for q in x]
ypt = [q[1] for q in x]

# overlay the data points, colored by their value
ax.scatter(xpt, ypt, s=40, c=v, cmap="viridis")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[18], line 2
      1 fig, ax = plt.subplots()
----> 2 ax.imshow(domain.T, origin="lower",
      3           extent=[xmin, xmax, ymin, ymax], alpha=0.25)
      4 xpt = [q[0] for q in x]
      5 ypt = [q[1] for q in x]

NameError: name 'domain' is not defined
../_images/9e13561b06f5d2c575e2e9d0ab28f3ca3efd7ae4d5738c0cd619a55149090489.png