Week 4: K-Means Cluster Analysis
1.- Python Source Code
# -*- coding: utf-8 -*-
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.decomposition import PCA
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
os.chdir("/home/fas-uae/Documents/curso/w4")
# Load the data
data = pd.read_csv("../tree_addhealth.csv")
# Upper-case all DataFrame column names
data.columns = map(str.upper, data.columns)
# Data Management
dataset = data.dropna()
# Subset clustering variables
cluster=dataset[['ALCEVR1','MAREVER1','ALCPROBS1','DEVIANT1','VIOL1','DEP1',
'ESTEEM1','SCHCONN1','PARACTV', 'PARPRES','FAMCONCT']]
cluster.describe()
# Standardize clustering variables to have mean=0 and sd=1
clustervar=cluster.copy()
clustervar['ALCEVR1']=preprocessing.scale(clustervar['ALCEVR1'].astype('float64'))
clustervar['ALCPROBS1']=preprocessing.scale(clustervar['ALCPROBS1'].astype('float64'))
clustervar['MAREVER1']=preprocessing.scale(clustervar['MAREVER1'].astype('float64'))
clustervar['DEP1']=preprocessing.scale(clustervar['DEP1'].astype('float64'))
clustervar['ESTEEM1']=preprocessing.scale(clustervar['ESTEEM1'].astype('float64'))
clustervar['VIOL1']=preprocessing.scale(clustervar['VIOL1'].astype('float64'))
clustervar['DEVIANT1']=preprocessing.scale(clustervar['DEVIANT1'].astype('float64'))
clustervar['FAMCONCT']=preprocessing.scale(clustervar['FAMCONCT'].astype('float64'))
clustervar['SCHCONN1']=preprocessing.scale(clustervar['SCHCONN1'].astype('float64'))
clustervar['PARACTV']=preprocessing.scale(clustervar['PARACTV'].astype('float64'))
clustervar['PARPRES']=preprocessing.scale(clustervar['PARPRES'].astype('float64'))
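# (Not part of the original assignment: the same standardization could be done
# for all columns at once, e.g.
#   clustervar = DataFrame(preprocessing.scale(cluster.astype('float64')),
#                          index=cluster.index, columns=cluster.columns)
# Quick sanity check that every column now has mean ~0 and sd ~1:)
print(clustervar.describe().loc[['mean', 'std']].round(2))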
# Split data into train and test sets
cluster_train, cluster_test = train_test_split(clustervar, test_size=.3, random_state=328)
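# (Optional check, not in the original assignment: number of observations in each partition.)
print('Training observations:', cluster_train.shape[0], '- Test observations:', cluster_test.shape[0])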
# k-means cluster analysis for 1-9 clusters
clusters=range(1,10)
meandist=[]
for k in clusters:
    model=KMeans(n_clusters=k)
    model.fit(cluster_train)
    clusassign=model.predict(cluster_train)
    meandist.append(sum(np.min(cdist(cluster_train, model.cluster_centers_, 'euclidean'), axis=1))
                    / cluster_train.shape[0])
'''
Plot the average distance of observations from their cluster centroid
to use the Elbow Method to identify the number of clusters to choose
'''
plt.plot(clusters, meandist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting K with the Elbow Method')
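# (Alternative elbow criterion, a sketch not used in the original assignment:
# KMeans stores the within-cluster sum of squares in its inertia_ attribute,
# which could be plotted against k in the same way as the average distance above.)
inertias = [KMeans(n_clusters=k).fit(cluster_train).inertia_ for k in clusters]
print('Within-cluster sum of squares for k=1..9:', [round(i, 1) for i in inertias])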
# Interpret 3 cluster solution
model3=KMeans(n_clusters=3)
model3.fit(cluster_train)
clusassign=model3.predict(cluster_train)
pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(cluster_train)
plt.scatter(x=plot_columns[:,0], y=plot_columns[:,1], c=model3.labels_,)
plt.xlabel('Canonical variable 1')
plt.ylabel('Canonical variable 2')
plt.title('Scatterplot of Canonical Variables for 3 Clusters')
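# (Optional check, not in the original assignment: proportion of the variance in
# the 11 standardized clustering variables captured by the two plotted components.)
print('Variance explained by the 2 components:', pca_2.explained_variance_ratio_)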
'''
BEGIN multiple steps to merge cluster assignment with clustering variables to examine
cluster variable means by cluster
'''
# Create a unique identifier variable from the index for the
# cluster training data to merge with the cluster assignment variable
cluster_train.reset_index(level=0, inplace=True)
# Create a list that has the new index variable
clusterList=list(cluster_train['index'])
# Create a list of cluster assignments
labels=list(model3.labels_)
# Combine index variable list with cluster assignment list into a dictionary
newList=dict(zip(clusterList, labels))
newList
# Convert newList dictionary to a dataframe
newCluster=DataFrame.from_dict(newList, orient='index')
newCluster
# Rename the cluster assignment column
newCluster.columns = ['cluster']
# Now do the same for the cluster assignment variable
# create a unique identifier variable from the index for the
# cluster assignment dataframe
# to merge with cluster training data
newCluster.reset_index(level=0, inplace=True)
# Merge the cluster assignment dataframe with the cluster training variable dataframe
# by the index variable
merged_train=pd.merge(cluster_train, newCluster, on='index')
merged_train.head(n=100)
# Cluster frequencies
merged_train.cluster.value_counts()
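# (Equivalent shortcut, not part of the original assignment steps: model3.labels_
# is in the same row order as cluster_train, so the cluster assignment can be
# attached directly as a new column; the frequencies should match the merge-based result.)
merged_train_alt = cluster_train.assign(cluster=model3.labels_)
print(merged_train_alt.cluster.value_counts())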
'''
FINALLY multiple steps to merge cluster assignment with clustering variables to examine
cluster variable means by cluster
'''
# FINALLY calculate clustering variable means by cluster
clustergrp = merged_train.groupby('cluster').mean()
print ('Clustering variable means by cluster')
print(clustergrp)
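# (Note, not in the original comments: because the clustering variables were
# standardized, these means are expressed in standard-deviation units relative
# to the overall sample mean.)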
# Validate clusters in training data by examining cluster differences in GPA using ANOVA
# first have to merge GPA with clustering variables and cluster assignment data
gpa_data=dataset['GPA1']
# Split GPA data into train and test sets
gpa_train, gpa_test = train_test_split(gpa_data, test_size=.3, random_state=328)
gpa_train1=pd.DataFrame(gpa_train)
gpa_train1.reset_index(level=0, inplace=True)
merged_train_all=pd.merge(gpa_train1, merged_train, on='index')
sub1 = merged_train_all[['GPA1', 'cluster']].dropna()
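# C(cluster) tells the formula interface to treat the cluster assignment as a categorical factor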
gpamod = smf.ols(formula='GPA1 ~ C(cluster)', data=sub1).fit()
print (gpamod.summary())
print ('Means for GPA by cluster')
m1= sub1.groupby('cluster').mean()
print (m1)
print ('Standard deviations for GPA by cluster')
m2= sub1.groupby('cluster').std()
print (m2)
mc1 = multi.MultiComparison(sub1['GPA1'], sub1['cluster'])
res1 = mc1.tukeyhsd()
print(res1.summary())
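# (Note, not in the original comments: tukeyhsd() reports, for each pair of clusters,
# the difference in mean GPA and whether the hypothesis of equal means is rejected
# at the chosen family-wise error rate.)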
# Plot clusters
plt.show()
2.- Output
Figure 1: Scatterplot of canonical variables for the 3-cluster solution
Figure 2: Console output of the k-means cluster analysis
3.- Interpretation
After the analysis, three types of adolescents can be distinguished across the clusters:
Cluster 0 - most likely to have used alcohol
Cluster 1 - most problematic adolescents
Cluster 2 - least problematic adolescents
As part of the analysis, it was observed that adolescents in cluster 1 (the most problematic group) had the lowest mean GPA, 2.4147, while adolescents in cluster 2 (the least problematic group) had the highest mean GPA, 3.001.
In addition, the post-hoc test showed that the clusters differed significantly in mean GPA. Looking at the standard deviations of GPA by cluster, the difference between cluster 0 and cluster 2 is much smaller than the difference of either with cluster 1.
Part of these results can also be seen in the graph above.