Assignment 4 - K-Means Cluster Analysis
Data from GapMinder, containing development indicators such as life expectancy, annual income per person, and employment rate, are used to group countries with similar profiles.
Considerations:
Ten cluster variables (all quantitative): alcohol consumption per adult, breast cancer new cases per 100,000 females, CO2 emissions, female employment rate, Internet use rate, life expectancy at birth, residential electricity consumption per person, suicide rate, employment rate, and urbanisation rate.
One validation variable: Gross Domestic Product per capita.
All clustering variables were standardized to mean = 0 and standard deviation = 1.
The data were divided into a training set (75%) and a test set (25%).
The k-means algorithm was run for k = 1 to 4 clusters to determine the optimal number of clusters.
The elbow curve suggests 3 clusters.
Clustering variable means by cluster:

              index  alcconsumption  breastcancerper100th  co2emissions
cluster
0        108.968750        0.820648              0.963347     -0.003070
1         99.380952       -0.729134             -0.886842     -0.077616
2        100.585366       -0.358717             -0.443235     -0.170846

         femaleemployrate  internetuserate  lifeexpectancy
cluster
0                0.177131         1.111704        0.779555
1                0.920481        -1.049155       -1.284006
2               -0.741121        -0.395236       -0.013668

         relectricperperson  suicideper100th  employrate  urbanrate
cluster
0                  0.698480         0.559731   -0.084920   0.685688
1                 -0.629038         0.073251    0.944097  -1.196415
2                 -0.271877        -0.516922   -0.598068   0.089599
Code (Python)
# -*- coding: utf-8 -*-
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.cluster import KMeans
""" Data Management """
# Import data
data = pd.read_csv("gapminder.csv")
data.describe()
# Select variables of interest
data = data.drop(['polityscore', 'armedforcesrate', 'hivrate', 'oilperperson'], axis=1)
# Drop incomplete observations
data_clean = data.dropna()
# Subset clustering variables
cluster = data_clean.drop(['country'], axis=1)
cluster.describe()
# Convert variables to numeric
# convert_objects was removed from pandas; pd.to_numeric is the current equivalent
for col in cluster.columns:
    cluster[col] = pd.to_numeric(cluster[col], errors='coerce')
# Drop incomplete observations
cluster = cluster.dropna()
# Standardize clustering variables to have mean=0 and sd=1
clustervar = cluster.copy()
for col in clustervar.columns:
    clustervar[col] = preprocessing.scale(clustervar[col].astype('float64'))
# Drop annual income per person to validate clustering
clustervar = clustervar.drop(['incomeperperson'], axis=1)
# Split data into train and test sets
clus_train, clus_test = train_test_split(clustervar, test_size=.25, random_state=123)
# k-means cluster analysis for 1-4 clusters
from scipy.spatial.distance import cdist
clusters = range(1, 5)
meandist = []

for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(clus_train)
    clusassign = model.predict(clus_train)
    # mean distance of each observation to its nearest cluster centroid
    meandist.append(sum(np.min(cdist(clus_train, model.cluster_centers_, 'euclidean'), axis=1))
                    / clus_train.shape[0])
""" Plot the average distance of observations from their cluster centroid and use the Elbow Method to identify the number of clusters to choose """
plt.plot(clusters, meandist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')
plt.show()
# Interpret 3 cluster solution
model3 = KMeans(n_clusters=3)
model3.fit(clus_train)
clusassign = model3.predict(clus_train)
# Plot clusters
from sklearn.decomposition import PCA
pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(clus_train)
plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=model3.labels_)
plt.xlabel('Canonical variable 1')
plt.ylabel('Canonical variable 2')
plt.title('Scatterplot of Canonical Variables for 3 Clusters')
plt.show()
""" BEGIN multiple steps to merge cluster assignment with clustering variables to examine cluster variable means by cluster """
# Create a unique identifier variable from the index for the cluster training data to merge with the cluster assignment variable
clus_train.reset_index(level=0, inplace=True)
# Create a list that has the new index variable
cluslist = list(clus_train['index'])
# Create a list of cluster assignments
labels = list(model3.labels_)
# Combine index variable list with cluster assignment list into a dictionary
newlist = dict(zip(cluslist, labels))
newlist
# Convert newlist dictionary to a dataframe
newclus = DataFrame.from_dict(newlist, orient='index')
newclus
# Rename the cluster assignment column
newclus.columns = ['cluster']
# Do the same for the cluster assignment variable: create a unique identifier
# variable from the index of the cluster assignment dataframe to merge with
# the cluster training data
newclus.reset_index(level=0, inplace=True)
# Merge the cluster assignment dataframe with the cluster training variable dataframe by the index variable
merged_train = pd.merge(clus_train, newclus, on='index')
merged_train.head(n=100)
# Cluster frequencies
merged_train.cluster.value_counts()
""" END multiple steps to merge cluster assignment with clustering variables to examine cluster variable means by cluster """
# Calculate clustering variable means by cluster
clustergrp = merged_train.groupby('cluster').mean()
print("Clustering variable means by cluster")
print(clustergrp)
# Validate clusters in training data by examining cluster differences
# Merge income per person (the validation variable) with the clustering
# variables and cluster assignment data; the gpa_* names below are kept
# from the course example, which validated with GPA
gpa_data = cluster['incomeperperson']
# Split the validation variable into train and test sets
gpa_train, gpa_test = train_test_split(gpa_data, test_size=.25, random_state=123)  # .25 to match the split of the clustering variables
gpa_train1 = pd.DataFrame(gpa_train)
gpa_train1.reset_index(level=0, inplace=True)
merged_train_all = pd.merge(gpa_train1, merged_train, on='index')
sub1 = merged_train_all[['incomeperperson', 'cluster']].dropna()
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
gpamod = smf.ols(formula='incomeperperson ~ C(cluster)', data=sub1).fit()
print(gpamod.summary())
print('means for annual income per person by cluster')
m1 = sub1.groupby('cluster').mean()
print(m1)
print('standard deviations for annual income per person by cluster')
m2 = sub1.groupby('cluster').std()
print(m2)
mc1 = multi.MultiComparison(sub1['incomeperperson'], sub1['cluster'])
res1 = mc1.tukeyhsd()
print(res1.summary())
Assignment 3 - LASSO regression
I use the GapMinder data set to identify which explanatory variables best predict life expectancy across countries.
Variables for LASSO regression analysis:
Response variable (quantitative): life expectancy (lifeexpectancy).
Explanatory variables (all quantitative): incomeperperson, alcconsumption, co2emissions, femaleemployrate, hivrate, internetuserate, oilperperson, employrate, breastcancerper100th, urbanrate.
Considerations:
Training set: 70% of the available data points
Test set: the remaining 30% of the data points
K-fold cross-validation (a sketch of this setup follows below)
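A minimal sketch of this setup in Python, assuming the gapminder.csv file and the column names listed above; LassoLarsCV is one common way to pair LASSO with k-fold cross-validation, though the exact assignment code is not shown in this post:

# Sketch of the LASSO setup described above; the file name and column
# names are assumed from the variable list, not taken from the assignment
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV

predictors = ['incomeperperson', 'alcconsumption', 'co2emissions',
              'femaleemployrate', 'hivrate', 'internetuserate',
              'oilperperson', 'employrate', 'breastcancerper100th',
              'urbanrate']

data = pd.read_csv('gapminder.csv')
# Coerce to numeric and drop incomplete observations
df = data[predictors + ['lifeexpectancy']].apply(pd.to_numeric, errors='coerce').dropna()

# Standardize predictors so the LASSO penalty treats them comparably
X = preprocessing.scale(df[predictors].astype('float64'))
y = df['lifeexpectancy']

# 70% training set, 30% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=123)

# LASSO with k-fold cross-validation (10 folds here) to choose the penalty
model = LassoLarsCV(cv=10, precompute=False).fit(X_train, y_train)

# Coefficients shrunk exactly to zero drop out of the model
print(dict(zip(predictors, model.coef_)))
print('test R-square:', model.score(X_test, y_test))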
Results:
Machine Learning for Data Analysis - HPForest
Second exercise of the course.
I used the course example, modifying the level of the target TREG1 to "interval":
PROC HPFOREST;
   target TREG1 / level=interval;
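For anyone following along in Python rather than SAS, a rough scikit-learn analogue of an HPFOREST run with an interval (continuous) target is a random forest regressor; the sketch below uses placeholder data, since the course's TREG1 dataset is not reproduced in this post:

# Rough Python analogue of PROC HPFOREST with level=interval: a random
# forest regressor (scikit-learn, not the SAS procedure); X and y below
# are placeholders for the course's predictors and the TREG1 target
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 5))            # placeholder predictors
y = 2 * X[:, 0] + rng.normal(size=200)   # placeholder interval target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)
forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)

print('test R-square:', forest.score(X_test, y_test))
print('feature importances:', forest.feature_importances_)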
The results of the program are as follows:
Decision tree
I used part of the SAS code from the course, modifying two parameters:
proc hpsplit seed=17777;
prune REDUCEDERROR;
The decision tree obtained is shown below, with the following results from the HPSPLIT procedure:
Results:
Using the REDUCEDERROR pruning method produces a tree with only 4 split levels, limiting the size of the tree while reducing the error rate. The misclassification rate is 8.15%, with an average square error (ASE) of 5.43% for a 252-leaf tree.
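scikit-learn has no REDUCEDERROR pruning option, but for readers working in Python a comparable way to grow a small tree is to cap the depth and apply cost-complexity pruning (a different pruning criterion than the one used above); the data below is a stand-in, not the course dataset:

# Python sketch of growing and pruning a classification tree; scikit-learn
# uses cost-complexity pruning (ccp_alpha), not REDUCEDERROR, and max_depth
# caps the number of split levels; the dataset is a stand-in
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17777)

# Cap the tree at 4 split levels and prune weak branches
tree = DecisionTreeClassifier(max_depth=4, ccp_alpha=0.005, random_state=17777)
tree.fit(X_train, y_train)

print('misclassification rate:', 1 - tree.score(X_test, y_test))
print('number of leaves:', tree.get_n_leaves())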
Coursera Machine Learning Course
First day, getting acquainted with the course. The video translations could be better.
The Python part is heavy going for those of us without much programming experience. The SAS part is more enjoyable.