Mini Project: Customer Segmentation using Clustering

Problem Type

Unsupervised Learning (Clustering)

Objective

Segment mall customers into meaningful groups for targeted marketing strategies.

Dataset

Mall Customers dataset (Mall_Customers.csv): 200 customer records with five columns — CustomerID, Gender, Age, Annual Income (k$), and Spending Score (1-100).

Workflow

  1. Data cleaning and understanding
  2. Visualization and feature exploration
  3. Preprocessing and scaling
  4. Train 3 clustering approaches: K-Means, Agglomerative Clustering (scikit-learn), and Hierarchical Clustering (SciPy ward linkage) — note the latter two are implementations of the same hierarchical method, so their scores are expected to be nearly identical
  5. Evaluate and pick the best algorithm
  6. Conclude learnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

from scipy.cluster.hierarchy import linkage, fcluster

1. Data Loading

# Load the raw Mall Customers dataset from the working directory.
data = pd.read_csv("Mall_Customers.csv")
# Preview the first five rows to confirm the file loaded as expected.
data.head()
CustomerID Gender Age Annual Income (k$) Spending Score (1-100)
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40
data.shape
(200, 5)
# Work on a defensive copy of the raw frame and normalise the column
# names by trimming any stray leading/trailing whitespace.
clean_data = data.copy()
clean_data.columns = [name.strip() for name in clean_data.columns]

2. Data Cleaning

clean_data.isnull().sum()
CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64
# Count fully duplicated rows; cast to a plain int for clean printing.
duplicate_count = int(clean_data.duplicated().sum())
print("Duplicate rows:", duplicate_count)
Duplicate rows: 0
# Drop duplicates (a no-op here, as the count above is 0) and reset the
# index so row labels stay contiguous.
clean_data = clean_data.drop_duplicates().reset_index(drop=True)
print("Shape after cleaning:", clean_data.shape)
Shape after cleaning: (200, 5)
clean_data.describe()
CustomerID Age Annual Income (k$) Spending Score (1-100)
count 200.000000 200.000000 200.000000 200.000000
mean 100.500000 38.850000 60.560000 50.200000
std 57.879185 13.969007 26.264721 25.823522
min 1.000000 18.000000 15.000000 1.000000
25% 50.750000 28.750000 41.500000 34.750000
50% 100.500000 36.000000 61.500000 50.000000
75% 150.250000 49.000000 78.000000 73.000000
max 200.000000 70.000000 137.000000 99.000000

3. Exploratory Data Analysis

# Four-panel EDA: three univariate distributions plus the key
# income-vs-spending scatter that motivates the feature choice later.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

hist_specs = [
    ("Age", axes[0, 0], "Age Distribution"),
    ("Annual Income (k$)", axes[0, 1], "Annual Income Distribution"),
    ("Spending Score (1-100)", axes[1, 0], "Spending Score Distribution"),
]
for column, axis, title in hist_specs:
    sns.histplot(clean_data[column], bins=20, kde=True, ax=axis)
    axis.set_title(title)

sns.scatterplot(data=clean_data, x="Annual Income (k$)", y="Spending Score (1-100)", ax=axes[1, 1])
axes[1, 1].set_title("Income vs Spending")

plt.tight_layout()
plt.show()

4. Data Preprocessing

# Cluster on the two features that show clear grouping structure in the
# EDA scatter plot; CustomerID, Gender, and Age are excluded here.
feature_columns = ["Annual Income (k$)", "Spending Score (1-100)"]
X = clean_data[feature_columns].copy()
X.head()
Annual Income (k$) Spending Score (1-100)
0 15 39
1 15 81
2 16 6
3 16 77
4 17 40
# Standardise to zero mean / unit variance so the distance-based
# clustering algorithms weight income and spending score equally.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled.shape
(200, 2)

5. Model Selection

# Sweep k over 2..10, fitting K-Means and Agglomerative Clustering at
# each value; record silhouette scores for both, plus K-Means inertia
# for the elbow plot below.
k_values = list(range(2, 11))
kmeans_scores = []
agg_scores = []
kmeans_inertia = []

for n_clusters in k_values:
    km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    km_labels = km.fit_predict(X_scaled)
    hac_labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(X_scaled)

    kmeans_scores.append(silhouette_score(X_scaled, km_labels))
    agg_scores.append(silhouette_score(X_scaled, hac_labels))
    kmeans_inertia.append(km.inertia_)

# The best k for each algorithm is the one maximising silhouette score.
best_k_kmeans = k_values[int(np.argmax(kmeans_scores))]
best_k_agg = k_values[int(np.argmax(agg_scores))]

print(f"Best k for K-Means: {best_k_kmeans} with silhouette score {max(kmeans_scores):.4f}")
Best k for K-Means: 5 with silhouette score 0.5547
# Elbow plot: inertia falls steeply up to the elbow, then flattens.
plt.figure(figsize=(6, 4))
plt.plot(k_values, kmeans_inertia, marker="o")
plt.title("K-Means Elbow Method")
plt.xlabel("k")
plt.ylabel("Inertia")
plt.show()

# Silhouette comparison of both algorithms across the same k sweep.
plt.figure(figsize=(6, 4))
for series, series_label in ((kmeans_scores, "KMeans"), (agg_scores, "Agglomerative")):
    plt.plot(k_values, series, marker="o", label=series_label)
plt.title("Silhouette Score vs k")
plt.xlabel("k")
plt.ylabel("Silhouette Score")
plt.legend()
plt.show()

# Tabulate the silhouette scores side by side for every candidate k.
score_table = pd.DataFrame({"k": k_values})
score_table["KMeans Silhouette"] = kmeans_scores
score_table["Agglomerative Silhouette"] = agg_scores
score_table
k KMeans Silhouette Agglomerative Silhouette
0 2 0.321271 0.384234
1 3 0.466585 0.461048
2 4 0.493907 0.492551
3 5 0.554657 0.553809
4 6 0.539880 0.538676
5 7 0.528149 0.519795
6 8 0.455215 0.430862
7 9 0.457085 0.437690
8 10 0.443171 0.433901
# Both sweeps agree on the same optimum (k = 5 per the outputs above).
print("Best k for KMeans:", best_k_kmeans)
print("Best k for Agglomerative:", best_k_agg)
Best k for KMeans: 5
Best k for Agglomerative: 5

6. Hierarchical Clustering

# Ward-linkage hierarchical clustering via SciPy: build the linkage
# matrix once, then cut the dendrogram at every candidate cluster count.
hier_k_values = list(range(2, 11))
hier_scores = []
hier_labels_list = []
hier_linkage_matrix = linkage(X_scaled, method="ward")
for cluster_count in hier_k_values:
    # fcluster labels start at 1; shift to 0-based to match scikit-learn.
    cut_labels = fcluster(hier_linkage_matrix, t=cluster_count, criterion="maxclust") - 1
    hier_labels_list.append(cut_labels)
    hier_scores.append(silhouette_score(X_scaled, cut_labels))

# Keep the best cut (by silhouette) and its labels for later reuse.
best_hier_index = int(np.argmax(hier_scores))
best_hier_k = hier_k_values[best_hier_index]
best_hier_score = hier_scores[best_hier_index]
hierarchical_labels = hier_labels_list[best_hier_index]
hier_table = pd.DataFrame({"k": hier_k_values, "silhouette": hier_scores})
hier_table
k silhouette
0 2 0.384234
1 3 0.461048
2 4 0.492551
3 5 0.553809
4 6 0.538676
5 7 0.519795
6 8 0.430862
7 9 0.437690
8 10 0.433901
# Report the best dendrogram cut found by the SciPy sweep.
print("Best Hierarchical k:", best_hier_k)
print("Best Hierarchical silhouette:", best_hier_score)
Best Hierarchical k: 5
Best Hierarchical silhouette: 0.5538089226688662

7. Final Comparison

# Refit each scikit-learn algorithm at its best k; the hierarchical
# score is reused from the SciPy sweep above.
kmeans_model = KMeans(n_clusters=best_k_kmeans, random_state=42, n_init=10).fit(X_scaled)
kmeans_labels = kmeans_model.labels_

agg_model = AgglomerativeClustering(n_clusters=best_k_agg).fit(X_scaled)
agg_labels = agg_model.labels_

kmeans_score = silhouette_score(X_scaled, kmeans_labels)
agg_score = silhouette_score(X_scaled, agg_labels)
hier_score = best_hier_score

# One row per algorithm: chosen k and final silhouette score.
comparison = pd.DataFrame({
    "Algorithm": ["KMeans", "Agglomerative", "Hierarchical"],
    "Parameter (K)": [best_k_kmeans, best_k_agg, best_hier_k],
    "Silhouette Score": [kmeans_score, agg_score, hier_score],
})

best_algorithm = comparison.loc[comparison["Silhouette Score"].idxmax(), "Algorithm"]

print(comparison)
print("Best algorithm:", best_algorithm)
       Algorithm  Parameter (K)  Silhouette Score
0         KMeans              5          0.554657
1  Agglomerative              5          0.553809
2   Hierarchical              5          0.553809
Best algorithm: KMeans
# Re-select the winner defensively: ignore NaN silhouette scores and
# fall back to KMeans if no valid scores remain.
valid_comparison = comparison.dropna(subset=["Silhouette Score"]).reset_index(drop=True)
if valid_comparison.empty:
    best_algorithm = "KMeans"
else:
    best_row = valid_comparison["Silhouette Score"].idxmax()
    best_algorithm = valid_comparison.loc[best_row, "Algorithm"]
best_algorithm
'KMeans'

8. Cluster Visualisation

# Attach all three label sets to the cleaned data for plotting.
clustered = clean_data.copy()
clustered["KMeans"] = kmeans_labels
clustered["Agglomerative"] = agg_labels
clustered["Hierarchical"] = hierarchical_labels

# Side-by-side scatter plots of the three labelings in the
# income/spending feature space.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
panel_specs = [
    ("KMeans", "K-Means Clusters"),
    ("Agglomerative", "Agglomerative Clusters"),
    ("Hierarchical", "Hierarchical Clusters"),
]
for ax, (column, title) in zip(axes, panel_specs):
    sns.scatterplot(
        data=clustered,
        x="Annual Income (k$)",
        y="Spending Score (1-100)",
        hue=column,
        palette="tab10",
        ax=ax,
        s=55,
    )
    ax.set_title(title)
    ax.legend(title=column, bbox_to_anchor=(1.02, 1), loc="upper left")

plt.tight_layout()
plt.show()

9. Cluster Summary

# Map the winning algorithm name to its label array; the dict default
# covers the remaining "Hierarchical" case.
label_lookup = {"KMeans": kmeans_labels, "Agglomerative": agg_labels}
final_labels = label_lookup.get(best_algorithm, hierarchical_labels)

clustered["BestCluster"] = final_labels
clustered[["Annual Income (k$)", "Spending Score (1-100)", "BestCluster"]]
Annual Income (k$) Spending Score (1-100) BestCluster
0 15 39 4
1 15 81 2
2 16 6 4
3 16 77 2
4 17 40 4
... ... ... ...
195 120 79 1
196 126 28 3
197 126 74 1
198 137 18 3
199 137 83 1

200 rows × 3 columns

# Per-cluster means of the profiling features for the winning
# segmentation, rounded for readability.
cluster_summary = clustered.groupby("BestCluster")[["Age", "Annual Income (k$)", "Spending Score (1-100)"]].mean().round(2)
cluster_summary
Age Annual Income (k$) Spending Score (1-100)
BestCluster
0 42.72 55.30 49.52
1 32.69 86.54 82.13
2 25.27 25.73 79.36
3 41.11 88.20 17.11
4 45.22 26.30 20.91
# Customer counts per cluster, sorted by cluster id.
cluster_counts = clustered["BestCluster"].value_counts().sort_index()
cluster_counts
BestCluster
0    81
1    39
2    22
3    35
4    23
Name: count, dtype: int64
# Summarise the final model choice.
print("Final Decision")
print("Best Algorithm:", best_algorithm)
Final Decision
Best Algorithm: KMeans

Reasons:

  1. It gave the highest silhouette score among the 3 algorithms.
  2. It performs well on compact, roughly spherical, clearly separated clusters — the structure visible in the income-vs-spending scatter plot.
  3. It is simple, fast, and easy to explain in a mini project.

Conclusion and Learning

This mini project follows the full required flow: cleaning, visualization, preprocessing, model training, evaluation, best model selection, and learning conclusion. Notably, all three methods agreed on k = 5, and K-Means won by only a narrow silhouette margin (0.5547 vs 0.5538) — the two hierarchical variants are the same ward-linkage method, which explains their matching scores.