akanksha8goyal - Tumblr blog

akanksha8goyal · 5 years ago

Text

K-means cluster analysis for gapminder data set

To study the association between the alcohol consumption rate with various explanatory variables present in the data set, I tried to K-means cluster analysis.

Code:-

from pandas import Series, DataFrame

import pandas as pd

import numpy as np

import matplotlib.pylab as plt

from sklearn.model_selection import train_test_split

from sklearn import preprocessing

from sklearn.cluster import KMeans

pd.set_option('display.max_columns', None)

data = pd.read_csv("gapminder.csv")

# Data Management

data_clean = data.dropna()

# subset clustering variables

cluster=data_clean[['urbanrate','incomeperperson','employrate','femaleemployrate','internetuserate','lifeexpectancy','co2emissions']]

cluster.describe()

# standardize clustering variables to have mean=0 and sd=1

clustervar=cluster.copy()

clustervar['urbanrate']=preprocessing.scale(clustervar['urbanrate'].astype('float64'))

clustervar['incomeperperson']=preprocessing.scale(clustervar['incomeperperson'].astype('float64'))

clustervar['employrate']=preprocessing.scale(clustervar['employrate'].astype('float64'))

clustervar['femaleemployrate']=preprocessing.scale(clustervar['femaleemployrate'].astype('float64'))

clustervar['internetuserate']=preprocessing.scale(clustervar['internetuserate'].astype('float64'))

clustervar['lifeexpectancy']=preprocessing.scale(clustervar['lifeexpectancy'].astype('float64'))

clustervar['co2emissions']=preprocessing.scale(clustervar['co2emissions'].astype('float64'))

# split data into train and test sets

clus_train, clus_test = train_test_split(clustervar, test_size=.3, random_state=123)

# k-means cluster analysis for 1-9 clusters

from scipy.spatial.distance import cdist

clusters=range(1,10)

meandist=[]

for k in clusters:

model=KMeans(n_clusters=k)

model.fit(clus_train)

clusassign=model.predict(clus_train)

meandist.append(sum(np.min(cdist(clus_train, model.cluster_centers_, 'euclidean'), axis=1))

/ clus_train.shape[0])

"""

Plot average distance from observations from the cluster centroid

to use the Elbow Method to identify number of clusters to choose

"""

plt.plot(clusters, meandist)

plt.xlabel('Number of clusters')

plt.ylabel('Average distance')

plt.title('Selecting k with the Elbow Method')

# Interpret 4 cluster solution

model4=KMeans(n_clusters=4)

model4.fit(clus_train)

clusassign=model4.predict(clus_train)

# plot clusters

from sklearn.decomposition import PCA

pca_2 = PCA(2)

plot_columns = pca_2.fit_transform(clus_train)

plt.scatter(x=plot_columns[:,0], y=plot_columns[:,1], c=model4.labels_,)

plt.xlabel('Canonical variable 1')

plt.ylabel('Canonical variable 2')

plt.title('Scatterplot of Canonical Variables for 4 Clusters')

plt.show()

"""

BEGIN multiple steps to merge cluster assignment with clustering variables to examine

cluster variable means by cluster

"""

# create a unique identifier variable from the index for the

# cluster training data to merge with the cluster assignment variable

clus_train.reset_index(level=0, inplace=True)

# create a list that has the new index variable

cluslist=list(clus_train['index'])

# create a list of cluster assignments

labels=list(model4.labels_)

# combine index variable list with cluster assignment list into a dictionary

newlist=dict(zip(cluslist, labels))

newlist

# convert newlist dictionary to a dataframe

newclus=DataFrame.from_dict(newlist, orient='index')

newclus

# rename the cluster assignment column

newclus.columns = ['cluster']

# now do the same for the cluster assignment variable

# create a unique identifier variable from the index for the

# cluster assignment dataframe

# to merge with cluster training data

newclus.reset_index(level=0, inplace=True)

# merge the cluster assignment dataframe with the cluster training variable dataframe

# by the index variable

merged_train=pd.merge(clus_train, newclus, on='index')

merged_train.head(n=100)

# cluster frequencies

merged_train.cluster.value_counts()

"""

END multiple steps to merge cluster assignment with clustering variables to examine

cluster variable means by cluster

"""

# FINALLY calculate clustering variable means by cluster

clustergrp = merged_train.groupby('cluster').mean()

print ("Clustering variable means by cluster")

print(clustergrp)

# validate clusters in training data by examining cluster differences in alcohol comsumption using ANOVA

# first have to merge alcconsumption with clustering variables and cluster assignment data

alcconsumption_data=data_clean['alcconsumption']

# split alcconsumption data into train and test sets

alcconsumption_train, alcconsumption_test = train_test_split(alcconsumption_data, test_size=.3, random_state=123)

alcconsumption_train1=pd.DataFrame(alcconsumption_train)

alcconsumption_train1.reset_index(level=0, inplace=True)

merged_train_all=pd.merge(alcconsumption_train1, merged_train, on='index')

sub1 = merged_train_all[['alcconsumption', 'cluster']].dropna()

import statsmodels.formula.api as smf

import statsmodels.stats.multicomp as multi

Alcmod = smf.ols(formula='alcconsumption ~ C(cluster)', data=sub1).fit()

print (Alcmod.summary())

print ('means for alcohol consumption by cluster')

m1= sub1.groupby('cluster').mean()

print (m1)

print ('standard deviations for alcohol consumption by cluster')

m2= sub1.groupby('cluster').std()

print (m2)

mc1 = multi.MultiComparison(sub1['alcconsumption'], sub1['cluster'])

res1 = mc1.tukeyhsd()

print(res1.summary())

Output:-

urbanrate incomeperperson employrate femaleemployrate \

count 56.000000 56.000000 56.000000 56.000000

mean 68.072143 12982.654643 57.491072 47.044643

std 16.572755 12712.681023 7.559035 10.781329

min 27.140000 558.062877 41.099998 18.200001

25% 60.935000 2532.598585 52.175000 41.999999

50% 68.570000 6219.692968 58.150002 48.299999

75% 78.210000 25373.478548 62.325000 54.374999

max 100.000000 39972.352770 76.000000 68.900002

internetuserate lifeexpectancy co2emissions

count 56.000000 56.000000 5.600000e+01

mean 52.464245 75.497446 1.676553e+10

std 26.218205 5.787174 4.687420e+10

min 3.700003 52.797000 2.262553e+08

25% 33.049632 73.091750 1.859310e+09

50% 48.980090 75.766500 3.852409e+09

75% 77.533598 80.578250 1.135942e+10

max 93.277508 83.394000 3.342210e+11

Clustering variable means by cluster

level_0 index urbanrate incomeperperson employrate \

cluster

0 22.500000 139.100000 0.010585 -0.484829 0.153370

1 21.545455 74.818182 -0.316240 0.049204 -0.257292

2 15.000000 18.500000 0.526534 -0.025579 0.376630

3 15.900000 187.700000 0.163775 0.358347 -0.313843

femaleemployrate internetuserate lifeexpectancy co2emissions

cluster

0 -0.029448 -0.158977 -0.334814 -0.225292

1 -0.351321 -0.043499 0.065862 -0.151002

2 0.535148 0.010265 0.086601 -0.195021

3 -0.039743 0.210920 -0.194123 0.460261

OLS Regression Results

==============================================================================

Dep. Variable: alcconsumption R-squared: 0.029

Model: OLS Adj. R-squared: -0.054

Method: Least Squares F-statistic: 0.3481

Date: Thu, 19 Nov 2020 Prob (F-statistic): 0.791

Time: 14:30:22 Log-Likelihood: -118.47

No. Observations: 39 AIC: 244.9

Df Residuals: 35 BIC: 251.6

Df Model: 3

Covariance Type: nonrobust

===================================================================================

coef std err t P>|t| [0.025 0.975]

-----------------------------------------------------------------------------------

Intercept 9.0500 1.685 5.372 0.000 5.630 12.470

C(cluster)[T.1] 0.6691 2.328 0.287 0.775 -4.056 5.395

C(cluster)[T.2] 2.3275 2.527 0.921 0.363 -2.803 7.458

C(cluster)[T.3] 0.1050 2.383 0.044 0.965 -4.732 4.942

==============================================================================

Omnibus: 1.070 Durbin-Watson: 1.643

Prob(Omnibus): 0.586 Jarque-Bera (JB): 0.961

Skew: -0.171 Prob(JB): 0.619

Kurtosis: 2.311 Cond. No. 4.76

==============================================================================

Warnings:

[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

means for alcohol consumption by cluster

alcconsumption

cluster

0 9.050000

1 9.719091

2 11.377500

3 9.155000

standard deviations for alcohol consumption by cluster

alcconsumption

cluster

0 5.955690

1 5.962259

2 3.629230

3 5.016013

Multiple Comparison of Means - Tukey HSD, FWER=0.05

===================================================

group1 group2 meandiff p-adj lower upper reject

---------------------------------------------------

0 1 0.6691 0.9 -5.6086 6.9467 False

0 2 2.3275 0.7714 -4.4876 9.1426 False

0 3 0.105 0.9 -6.3204 6.5304 False

1 2 1.6584 0.9 -5.0176 8.3345 False

1 3 -0.5641 0.9 -6.8417 5.7136 False

2 3 -2.2225 0.7943 -9.0376 4.5926 False

Discussion:-

A k-means cluster analysis was conducted to identify underlying subgroups of countries rates based on their similarity of responses on 7 variables that represent characteristics that could have an impact on their alcohol consumption rates. Clustering variables included all quantitative variables. All clustering variables were standardized to have a mean of 0 and a standard deviation of 1.

Data were randomly split into a training set that included 70% of the observations and a test set that included 30% of the observations. A series of k-means cluster analyses were conducted on the training data specifying k=1-9 clusters, using Euclidean distance. The variance in the clustering variables that was accounted for by the clusters (r-square) was plotted for each of the nine cluster solutions in an elbow curve to provide guidance for choosing the number of clusters to interpret.

The elbow curve was inconclusive, suggesting that the 2, 4 ,6 and 8-cluster solutions might be interpreted. The results below are for an interpretation of the 4-cluster solution.

Canonical discriminant analyses was used to reduce the 7 clustering variable down a few variables that accounted for most of the variance in the clustering variables. A scatterplot of the first two canonical variables by cluster indicated that the observations in clusters 2 and 3 were densely packed with relatively low within cluster variance, and did not overlap very much with the other clusters. Cluster 1 and 3 was generally distinct with observations had greater spread suggesting higher within cluster variance. The results of this plot suggest that the best cluster solution may have fewer than 4 clusters, so it will be especially important to also evaluate the cluster solutions with fewer than 4 clusters.

The means on the clustering variables showed that compared to the other clusters, countries in the third cluster, cluster 2, had the highest urbanization rate as well as employment rate, female employment rate and highest income per person as compared to others. On the other hand in the first cluster, cluster 0, countries have positive urbanization rate and employment rate but negative association with all other variables. The second cluster, cluster 1, appears to have least urban rate and least female employment rate.

In order to externally validate the clusters, an Analysis of Variance (ANOVA) was conducted to test for significant differences between the clusters on alcohol consumption rates. A tukey test was used for post hoc comparisons between the clusters. Results indicated no significant differences between the clusters on alcohol consumption (F = 0.3481, p=0.791). The tukey post hoc comparisons showed no significant differences between clusters on alcohol consumption. Countries in cluster 3 had the highest alcohol consumption (mean=11.38, sd=3.63), and cluster 0 had the lowest alcohol consumption (mean=9.05, sd=5.96).

0 notes

akanksha8goyal · 5 years ago

Text

Lasso Regression for gapminder data set

To study the association between the alcohol consumption rate with various explanatory variables present in the data set, I tried to run Lasso regression model.

Code:-

import pandas

import numpy as np

import matplotlib.pylab as plt

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LassoLarsCV

#from pandas import series, DataFrame

#Load the dataset

data= pandas.read_csv("gapminder.csv")

# Data management

data_clean = data.dropna()

#select predictor and traget variable as separate data sets

predvar=data_clean[['urbanrate','incomeperperson','employrate','femaleemployrate','internetuserate','lifeexpectancy','co2emissions',]]

target = data_clean.alcconsumption

# standardize predictors to have mean=0 and sd=1

predictors=predvar.copy()

from sklearn import preprocessing

predictors['urbanrate']=preprocessing.scale(predictors['urbanrate'].astype('float64'))

predictors['incomeperperson']=preprocessing.scale(predictors['incomeperperson'].astype('float64'))

predictors['employrate']=preprocessing.scale(predictors['employrate'].astype('float64'))

predictors['femaleemployrate']=preprocessing.scale(predictors['femaleemployrate'].astype('float64'))

predictors['internetuserate']=preprocessing.scale(predictors['internetuserate'].astype('float64'))

predictors['lifeexpectancy']=preprocessing.scale(predictors['lifeexpectancy'].astype('float64'))

predictors['co2emissions']=preprocessing.scale(predictors['co2emissions'].astype('float64'))

#split data into train and test sets

pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, target, test_size=.3, random_state=123)

#specify the lasso regression model

model=LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)

#print variable names and regression coefficients

dict(zip(predictors.columns, model.coef_))

#plot coefficient progression

m_log_alphas = -np.log10(model.alphas_)

ax=plt.gca()

plt.plot(m_log_alphas, model.coef_path_.T)

plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV')

plt.ylabel('Regression Coefficients')

plt.xlabel('-log(alpha)')

plt.title('Regression Coefficients Progression for Lasso Paths')

#plot mean square error for each fold

m_log_alphascv = -np.log10(model.cv_alphas_)

plt.figure()

plt.plot(m_log_alphascv, model.mse_path_, ':')

plt.plot(m_log_alphascv, model.mse_path_.mean(axis=-1), 'k', label='Average across the folds',linewidth=2)

plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV')

plt.legend()

plt.ylabel('Mean squared error')

plt.xlabel('-log(alpha)')

plt.title('Mean squared error on each fold')

#MSE from training and test data

from sklearn.metrics import mean_squared_error

train_error = mean_squared_error(tar_train, model.predict(pred_train))

test_error = mean_squared_error(tar_test, model.predict(pred_test))

print('training data MSE')

print(train_error)

print('test data MSE')

print(test_error)

#R-squared from training and test data

rsquared_train=model.score(pred_train,tar_train)

rsquared_test=model.score(pred_test,tar_test)

print('training data R-squared')

print(rsquared_train)

print('test data R-squared')

print(rsquared_test)

Results:-

{'urbanrate': 0.0,

'incomeperperson': -0.8783843514566742,

'employrate': -3.9380320202232673,

'femaleemployrate': 4.747274967297162,

'internetuserate': 1.2788329920485118,

'lifeexpectancy': 0.0,

'co2emissions': 0.0}

training data MSE

11.753146478431066

test data MSE

10.874834857283794

training data R-squared

0.5519324076835691

test data R-squared

0.622681147524026

Discussion:-

· Out of 7 explanatory variables taken into account for studying the relationship with Lasso regression model, three predictors are showing a value of zero. Female employ rate and Employ rate are having highest regression coefficients values and were most strongly associated with alcohol consumption rate, followed by internet use rate and income per person.

· Female employ rate and internet use rate were positively associated with alcohol consumption rate and employ rate and income per person were negatively associated.

· After plotting the progression of the regression coefficients we see that the regression coefficients female employ rate, the red line had the largest regression coefficient. It was therefore entered into the model first, followed by employ rate, the blue line at step two.

· In internet use rate, the green line at step three and so on. After plotting the curve showing the change in the mean square error for change in the penalty parameter alpha at each step in the selection process. We that that it decreases rapidly and then levels off to a point at which adding more predictors doesn’t lead to much reduction in the mean square error.

· As expected the selected model was somewhat accurate in predicting alcohol consumption rate in the test data and the test mean square error was pretty close to the training mean square error.

· The R-square values were 0.55 and 0.62 indicating that the selected model explained 55% and 62% of the variance in alcohol consumption rate for the training and test sets respectively.

0 notes

akanksha8goyal · 5 years ago

Text

Creating random forest for gapminder data set

To study the association between the alcohol consumption rate with various explanatory variables present in the data set, I tried to create a random forest. Firstly as my response variable was quantitive I binned it two binary categories around its mean with 0 indicating lower than mean alcohol consumption and 1 indicating higher than mean level of alcohol consumption.

Code:-

#from pandas import series, Dataframes

import pandas

import numpy

import os

import matplotlib.pyplot as plt

import sklearn

import graphviz

from sklearn.model_selection import train_test_split

#from sklearn.cross_validation import train_test_split

from sklearn.tree import DecisionTreeClassifier

#from sklearn.metrics import Classification_report

import sklearn.metrics

# Feature Importance

from sklearn import datasets

import sklearn.ensemble

from sklearn.ensemble import ExtraTreesClassifier

os.chdir("E:\Akon")

#Load the dataset

GM_data=pandas.read_csv("gapminder.csv")

data_clean=GM_data.dropna()

data_clean.dtypes

data_clean.describe()

# calculating mean

mean1=data_clean['alcconsumption'].mean()

print(mean1)

# categorical response variable creation

def Alcom(row):

if row ['alcconsumption']<=mean1:

return 0

if row ['alcconsumption']>mean1:

return 1

data_clean['Alcom']=data_clean.apply(lambda row:Alcom(row),axis=1)

#split into raining and testing sets

predictors=data_clean[['urbanrate','incomeperperson','employrate','lifeexpectancy','internetuserate','femaleemployrate','oilperperson','hivrate','co2emissions']]

targets=data_clean.Alcom

pred_train,pred_test,tar_train,tar_test=train_test_split(predictors, targets,test_size=.4)

pred_train.shape

pred_test.shape

tar_train.shape

tar_test.shape

#Build model on training data

from sklearn.ensemble import RandomForestClassifier

classifier=RandomForestClassifier(n_estimators=25)

classifier=classifier.fit(pred_train, tar_train)

predictions=classifier.predict(pred_test)

sklearn.metrics.confusion_matrix(tar_test, predictions)

sklearn.metrics.accuracy_score(tar_test, predictions)

#fit an extra trees model to the data

model = ExtraTreesClassifier()

model.fit(pred_train,tar_train)

#display the relative importance of each attribute

print(model.feature_importances_)

##Running a different number of trees and see the effect on the accuracy

trees=range(25)

accuracy=numpy.zeros(25)

for idx in range(len(trees)):

classifier=RandomForestClassifier(n_estimators=idx+1)

classifier=classifier.fit(pred_train, tar_train)

predictions=classifier.predict(pred_test)

accuracy[idx]=sklearn.metrics.accuracy_score(tar_test,predictions)

plt.cla()

plt.plot(trees, accuracy)

Output:-

country object

incomeperperson float64

alcconsumption float64

armedforcesrate float64

breastcancerper100th float64

co2emissions float64

femaleemployrate float64

hivrate float64

internetuserate float64

lifeexpectancy float64

oilperperson float64

polityscore float64

relectricperperson float64

suicideper100th float64

employrate float64

urbanrate float64

dtype: object

incomeperperson alcconsumption ... employrate urbanrate

count 56.000000 56.000000 ... 56.000000 56.000000

mean 12982.654643 9.429107 ... 57.491072 68.072143

std 12712.681023 5.266750 ... 7.559035 16.572755

min 558.062877 0.050000 ... 41.099998 27.140000

25% 2532.598585 6.417500 ... 52.175000 60.935000

50% 6219.692968 10.035000 ... 58.150002 68.570000

75% 25373.478548 13.135000 ... 62.325000 78.210000

max 39972.352770 19.150000 ... 76.000000 100.000000

[8 rows x 15 columns]

mean1=data_clean['alcconsumption'].mean()

print(mean1)

9.429107142857145

pred_train.shape

(33, 9)

pred_test.shape

(23, 9)

tar_train.shape

(33,)

tar_test.shape

(23,)

array([[6, 5],

[3, 9]], dtype=int64)

accuracy_score

0.6521739130434783

model.feature

[0.1152204 0.11197636 0.11566871 0.12852731 0.24883679 0.12058165

0.0876851 0.03774745 0.03375624]

Discussion:-

Starting with the dtype function it is showing the type of each variable with country being object, others being float64 and created binary variable Alcom as interger64.

The describe function has given the count, mean, standard deviation and other features of the variables.

The mean of alcohol consumption is coming out to be 9.429 and the data is bifurcated on the basis of this value in binary.

The shape of Training predictor is coming out to be (33, 9) implying that the number of observations under training that is 60% of the data is 33 with 9 explanatory variables.

The shape of Target predictor is coming out to be (23, 9) implying that the number of observations under target set that is 40% of the data is 23 with 9 explanatory variables.

The diagonal 6 and 9 represents the number of true negative for non-drinkers and the number of true positives respectively.3 represents the number of false negatives, classifying regular drinkers as not regular drinkers. And the 5 on the top right represents the number of false positives, classifying a non-regular drinker as a regular drinker.

The accuracy score is approximately 0.65 which suggests that the random forest model is 65 % accurate and it has classified 65% of the sample correctly as either regular or not regular drinkers.

We can that the variable with highest importance score at 0.25 is internet use rate and the variable with lowest importance score is CO2 emissions at 0.03.

The correct classification rate for the random forest was 65% and as we can that there are 6 trees with accuracy around or above 65%, so we cannot say confidently that it may be appropriate to interpret a single decision tree for this data.

0 notes

akanksha8goyal · 5 years ago

Text

Creating decision tree for gapminder data set

To study the association between the alcohol consumption rate with various explanatory variables present in the data set, I tried to create a decision tree. Firstly as my response variable was quantitative I binned it two binary categories around its mean with 0 indicating lower than mean alcohol consumption and 1 indicating higher than mean level of alcohol consumption.

Code:-

from pandas import series, Dataframes

import pandas

import numpy

import os

import matplotlib.pyplot as plt

import sklearn

import graphviz

from sklearn.model_selection import train_test_split

from sklearn import tree

from sklearn.tree import DecisionTreeClassifier

#from sklearn.metrics import Classification_report

import sklearn.metrics

os.chdir("E:\Akon")

#Load the dataset

GM_data=pandas.read_csv("gapminder.csv")

data_clean=GM_data.dropna()

data_clean.dtypes

data_clean.describe()

# calculating mean

mean1=data_clean['alcconsumption'].mean()

print(mean1)

# categorical response variable creation

def Alcom(row):

if row ['alcconsumption']<=mean1:

return 0

if row ['alcconsumption']>mean1:

return 1

data_clean['Alcom']=data_clean.apply(lambda row:Alcom(row),axis=1)

## Modelling and prediction

#split into raining and testing sets

predictors=data_clean[['urbanrate','incomeperperson','employrate','lifeexpectancy','internetuserate','femaleemployrate']]

targets=data_clean.Alcom

pred_train,pred_test,tar_train,tar_test=train_test_split(predictors, targets,test_size=.4)

pred_train.shape

pred_test.shape

tar_train.shape

tar_test.shape

#Build model on training data

classifier=DecisionTreeClassifier()

Classifier=classifier.fit(pred_train, tar_train)

predictions=classifier.predict(pred_test)

sklearn.metrics.confusion_matrix(tar_test, predictions)

sklearn.metrics.accuracy_score(tar_test, predictions)

#Displaying the decision tree

from sklearn import tree

#from stringIO import StringIO

from io import StringIO

from IPython.display import Image

out=StringIO()

tree.export_graphviz(classifier, out_file=out)

import pydotplus

graph=pydotplus.graph_from_dot_data(out.getvalue())

Img=(Image(graph.create_png()))

display(Img)

Output:-

Out[3]:

country object

incomeperperson float64

alcconsumption float64

armedforcesrate float64

breastcancerper100th float64

co2emissions float64

femaleemployrate float64

hivrate float64

internetuserate float64

lifeexpectancy float64

oilperperson float64

polityscore float64

relectricperperson float64

suicideper100th float64

employrate float64

urbanrate float64

Alcom int64

incomeperperson alcconsumption ... urbanrate Alcom

count 56.000000 56.000000 ... 56.000000 56.000000

mean 12982.654643 9.429107 ... 68.072143 0.607143

std 12712.681023 5.266750 ... 16.572755 0.492805

min 558.062877 0.050000 ... 27.140000 0.000000

25% 2532.598585 6.417500 ... 60.935000 0.000000

50% 6219.692968 10.035000 ... 68.570000 1.000000

75% 25373.478548 13.135000 ... 78.210000 1.000000

max 39972.352770 19.150000 ... 100.000000 1.000000

[8 rows x 16 columns]

mean=9.429107142857145

pred_train.shape

Out[7]: (33, 6)

pred_test.shape

Out[8]: (23, 6)

array([[5, 3],

[7, 8]], dtype=int64)

Out[14]: 0.5652173913043478

Discussion:-

Starting with the dtype function it is showing the type of each variable with country being object, others being float64 and created binary variable Alcom as interger64.

The describe function has given the count, mean, standard deviation and other features of the variables.

The mean of alcohol consumption is coming out to be 9.429 and the data is bifurcated on the basis of this value in binary.

The shape of Training predictor is coming out to be (33, 6) implying that the number of observations under training that is 60% of the data is 33 with 6 explanatory variables.

The shape of Target predictor is coming out to be (23, 6) implying that the number of observations under target set that is 40% of the data is 23 with 6 explanatory variables.

The diagonal 5 and 8 represents the number of true negative for non-drinkers and the number of true positives respectively.7 represents the number of false negatives, classifying regular drinkers as not regular drinkers. And the 3 on the top right represents the number of false positives, classifying a non-regular drinker as a regular drinker.

The accuracy score is approximately 0.565 which suggests that the decision tree model is 56.5 % accurate and it has classified 56.5% of the sample correctly as either regular or not regular drinkers.

In the first decision tree my binary variable alcom is the target and I have used 6 explanatory variables. The resulting tree starts with a split on X [4] which is internet use rate. If value of internet use rate is smaller than 41.59 with sample size of 33 it moves to the left side of the split.

From this node, another split is made on income per person, such that among those countries with less internet use rate in the first split and also less income per person none is a non-regular drinker and only one is a regular drinker.

To the right we see that the countries with high income per person that is X [2] greater than 41.75, all 8 countries are regular drinkers.

0 notes

akanksha8goyal · 5 years ago

Text

Logistic regression model for examining association in gapminder dataset

For studying the association between urbanization rate and alcohol consumption levels for different countries in the gapminder dataset by considering (binning) the response variable alcohol consumption in two categories with value 0 and 1 getting segregated at the alcohol consumption level of 10, where 0 suggests low consumption level and 1 showing high consumption level.

The major explanatory variable urbanization rate being quantitative, logistic regression model was used. Other independent variables like income per person and employment rate were also considered one by one. The odds ratios and confidence intervals were calculated.

Code for the same is mentioned below:-

import pandas

import numpy

import seaborn

import matplotlib.pyplot as plt

import statsmodels.api as sm

import statsmodels.formula.api as smf

data=pandas.read_csv('gapminder.csv', low_memory=False)

# bug fix for display formats to avoid run time errors

pandas.set_option('display.float_format',lambda x:'%f'%x)

#setting variables you be working with numeric

data['urbanrate']=data['urbanrate'].convert_objects(convert_numeric=True)

data['alcconsumption']=data['alcconsumption'].convert_objects(convert_numeric=True)

data['incomeperperson']=data['incomeperperson'].convert_objects(convert_numeric=True)

data['employrate']=data['employrate'].convert_objects(convert_numeric=True)

##deletion of missing values

sub1=data[['urbanrate','alcconsumption','incomeperperson','employrate']].dropna()

# categorical response variable creation

def Alc(row):

if row ['alcconsumption']<=10:

return 0

if row ['alcconsumption']>10:

return 1

sub1['Alc']=sub1.apply(lambda row:Alc(row),axis=1)

## Logistic regression

Ireg1= smf.logit(formula='Alc ~ urbanrate', data=sub1).fit()

print(Ireg1.summary())

#odds ratio

print("Odds ratio")

print(numpy.exp(Ireg1.params))

##odd ratios with 95% confidence intervals

params=Ireg1.params

conf=Ireg1.conf_int()

conf['OR']=params

conf.columns=['Lower CI', 'Upper CI', 'OR']

print(numpy.exp(conf))

## Logistic regression with adding incomeperperson

Ireg2= smf.logit(formula='Alc ~ urbanrate + incomeperperson', data=sub1).fit()

print(Ireg2.summary())

#odds ratio

print("Odds ratio")

print(numpy.exp(Ireg2.params))

##odd ratios with 95% confidence intervals

params=Ireg2.params

conf=Ireg1.conf_int()

conf['OR']=params

conf.columns=['Lower CI', 'Upper CI', 'OR']

print(numpy.exp(conf))

## Logistic regression with adding employrate

Ireg3= smf.logit(formula='Alc ~ employrate', data=sub1).fit()

print(Ireg3.summary())

#odds ratio

print("Odds ratio")

print(numpy.exp(Ireg3.params))

##odd ratios with 95% confidence intervals

params=Ireg3.params

conf=Ireg3.conf_int()

conf['OR']=params

conf.columns=['Lower CI', 'Upper CI', 'OR']

print(numpy.exp(conf))

Output:-

Logit Regression Results

==============================================================

Dep. Variable: Alc No. Observations: 162

Model: Logit Df Residuals: 160

Method: MLE Df Model: 1

Date: Wed, 19 Aug 2020 Pseudo R-squ.: 0.06705

Time: 18:36:31 Log-Likelihood: -83.418

converged: True LL-Null: -89.413

LLR p-value: 0.0005350

==============================================================

coef std err z P>|z| [0.025 0.975]

------------------------------------------------------------------------------

Intercept -2.9616 0.620 -4.774 0.000 -4.178 -1.746

urbanrate 0.0304 0.009 3.255 0.001 0.012 0.049

==============================================================

Odds ratio

Intercept 0.051738

urbanrate 1.030852

dtype: float64

Lower CI Upper CI OR

Intercept 0.015336 0.174540 0.051738

urbanrate 1.012162 1.049888 1.030852

Logit Regression Results

==============================================================

Dep. Variable: Alc No. Observations: 162

Model: Logit Df Residuals: 159

Method: MLE Df Model: 2

Date: Wed, 19 Aug 2020 Pseudo R-squ.: 0.08950

Time: 18:44:31 Log-Likelihood: -81.411

converged: True LL-Null: -89.413

LLR p-value: 0.0003347

==============================================================

coef std err z P>|z| [0.025 0.975]

-----------------------------------------------------------------------------------

Intercept -2.5273 0.638 -3.959 0.000 -3.779 -1.276

urbanrate 0.0173 0.011 1.537 0.124 -0.005 0.039

incomeperperson 4.046e-05 2.05e-05 1.974 0.048 2.8e-07 8.06e-05

==============================================================

Odds ratio

Intercept 0.079874

urbanrate 1.017460

incomeperperson 1.000040

dtype: float64

Lower CI Upper CI OR

Intercept 0.015336 0.174540 0.079874

urbanrate 1.012162 1.049888 1.017460

Logit Regression Results

==============================================================

Dep. Variable: Alc No. Observations: 162

Model: Logit Df Residuals: 160

Method: MLE Df Model: 1

Date: Wed, 19 Aug 2020 Pseudo R-squ.: 0.04966

Time: 18:44:31 Log-Likelihood: -84.973

converged: True LL-Null: -89.413

LLR p-value: 0.002883

==============================================================

coef std err z P>|z| [0.025 0.975]

------------------------------------------------------------------------------

Intercept 2.0949 1.133 1.849 0.064 -0.125 4.315

employrate -0.0563 0.020 -2.835 0.005 -0.095 -0.017

==============================================================

Odds ratio

Intercept 8.124459

employrate 0.945228

dtype: float64

Lower CI Upper CI OR

Intercept 0.882212 74.819692 8.124459

employrate 0.909129 0.982760 0.945228

Results:-

From the first logistic regression model between Alcohol consumption, classified as new categorical variable Alc and urbanrate the p value comes out to be 0.00053 stating that our regression is significant. The odds ratio comes out to be 1.03 with confidence intervals of 1.01 and 1.049 stating that we can say with 95% confidence that the odd ratios fall between 1.01 and 1.049.As our odd ratio is very closer to 1 we can emphasize that it’s likely that our model would be statistically non-significant.

After adding incomperperson along with urbanrate the p value comes out to be 0.00034 stating that our regression still remains significant. However the odds ratio for urbanrate decreases to be 1.01 with confidence intervals of 1.01 and 1.049 and odds ratio for incomeperperson comes out to be 1. We can say with 95% confidence that the odd ratios fall between 1.01 and 1.049.But as our odd ratio remains very closer to 1 we can emphasize that it’s likely that our model would be statistically non-significant.

After testing for employrate the p value comes out to be 0.0028 stating that our regression still remains significant. The odds ratio for employrate comes out to be 0.945 with confidence intervals of 0.909 and 0.983. We can say with 95% confidence that the odd ratios fall between 0.909 and 0.983.But again as our odd ratio remains very closer to 1 we can emphasize that it’s likely that our model would be statistically non-significant.

After the quantitative response variable alcohol consumption we found out the association between it and other independent variables remained statistically non-significant only with no prominent confounding observed.

0 notes

akanksha8goyal · 5 years ago

Text

Multiple regression model for examining association in gapminder dataset.

For studying the association between urbanization rate and alcohol consumption levels for different countries in the gapminder dataset various tools were used.

As the response variable alcohol consumption and the major explanatory variable urbanization rate both are quantitative, linear regression model was used first and foremost. After that for getting more accurate fitting polynomial regression was done, followed by multiple regression analysis taking into consideration other independent variables like income per person and employment rate.

Code for the same is mentioned below:-

import pandas

import numpy

import seaborn

import matplotlib.pyplot as plt

import statsmodels.api as sm

import statsmodels.formula.api as smf

data=pandas.read_csv('gapminder.csv', low_memory=False)

# bug fix for display formats to avoid run time errors

pandas.set_option('display.float_format',lambda x:'%f'%x)

#setting variables you be working with numeric

data['urbanrate']=data['urbanrate'].convert_objects(convert_numeric=True)

data['alcconsumption']=data['alcconsumption'].convert_objects(convert_numeric=True)

data['incomeperperson']=data['incomeperperson'].convert_objects(convert_numeric=True)

data['employrate']=data['employrate'].convert_objects(convert_numeric=True)

##deletion of missing values

sub1=data[['urbanrate','alcconsumption','incomeperperson','employrate']].dropna()

####Polynomial Regression###

#### Linear Scatterplot

scat1=seaborn.regplot(x="urbanrate",y="alcconsumption",scatter=True, data=sub1)

plt.xlabel('Urbanization Rate')

plt.ylabel('Alcohol Consumption')

plt.title('Scatterplot for the Association between urbanization rate and alcohol consumption')

##fitting second order polynomial

#running 2 scatterplots together to get both linear and second order fit lines

scat1=seaborn.regplot(x="urbanrate",y="alcconsumption",scatter=True, order=2, data=sub1)

plt.xlabel('Urbanization Rate')

plt.ylabel('Alcohol Consumption')

print(scat1)

# Centering the quantitative IVs for regression analysis

sub1['urbanrate_c']=(sub1['urbanrate']-sub1['urbanrate'].mean())

sub1['incomeperperson_c']=(sub1['incomeperperson']-sub1['incomeperperson'].mean())

sub1['employrate_c']=(sub1['employrate']-sub1['employrate'].mean())

#linear regression analysis

print("OLS regrression model for the association between urbanization rate and alcohol consumption")

reg1=smf.ols('alcconsumption ~ urbanrate_c', data=sub1).fit()

print(reg1.summary())

#quadratic polynomial regression analysis

reg2=smf.ols('alcconsumption ~ urbanrate_c+ I(urbanrate_c**2)',data=sub1).fit()

print(reg2.summary())

##Evaluating model fit

reg3=smf.ols('alcconsumption ~ urbanrate_c + I(urbanrate_c**2) + incomeperperson_c + employrate_c',data=sub1).fit()

print(reg3.summary())

#Q-Q plot for mormalilty

fig4=sm.qqplot(reg3.resid, line='r')

#simple plot of residuals

stdres=pandas.DataFrame(reg3.resid_pearson)

plt.plot(stdres, 'o' , ls='None')

l=plt.axhline(y=0, color='r')

plt.ylabel('Standardized Residual')

plt.xlabel('Observation Number')

#additional regression diagnostic plots

fig2 = plt.figure(figsize=(12,8))

fig2=sm.graphics.plot_regress_exog(reg3, "incomeperperson_c", fig=fig2)

#leverage plot

fig3=sm.graphics.influence_plot(reg3, size=8)

print(fig3)

Output:-

OLS Regression Results

=============================================================

Dep. Variable: alcconsumption R-squared: 0.086

Model: OLS Adj. R-squared: 0.080

Method: Least Squares F-statistic: 15.03

Date: Thu, 13 Aug 2020 Prob (F-statistic): 0.000154

Time: 17:37:10 Log-Likelihood: -482.69

No. Observations: 162 AIC: 969.4

Df Residuals: 160 BIC: 975.5

Df Model: 1

Covariance Type: nonrobust

============================================================

coef std err t P>|t| [0.025 0.975]

-------------------------------------------------------------------------------

Intercept 6.8124 0.376 18.097 0.000 6.069 7.556

urbanrate_c 0.0650 0.017 3.876 0.000 0.032 0.098

=============================================================

Omnibus: 11.341 Durbin-Watson: 1.979

Prob(Omnibus): 0.003 Jarque-Bera (JB): 11.712

Skew: 0.640 Prob(JB): 0.00286

Kurtosis: 3.309 Cond. No. 22.5

=============================================================Warnings:

[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

OLS Regression Results

=============================================================

Dep. Variable: alcconsumption R-squared: 0.096

Model: OLS Adj. R-squared: 0.085

Method: Least Squares F-statistic: 8.454

Date: Thu, 13 Aug 2020 Prob (F-statistic): 0.000324

Time: 17:37:10 Log-Likelihood: -481.77

No. Observations: 162 AIC: 969.5

Df Residuals: 159 BIC: 978.8

Df Model: 2

Covariance Type: nonrobust

============================================================

coef std err t P>|t| [0.025 0.975]

---------------------------------------------------------------------------------------

Intercept 7.3077 0.526 13.890 0.000 6.269 8.347

urbanrate_c 0.0619 0.017 3.672 0.000 0.029 0.095

I(urbanrate_c ** 2) -0.0010 0.001 -1.344 0.181 -0.002 0.000

============================================================

Omnibus: 10.885 Durbin-Watson: 1.906

Prob(Omnibus): 0.004 Jarque-Bera (JB): 11.184

Skew: 0.628 Prob(JB): 0.00373

Kurtosis: 3.283 Cond. No. 1.01e+03

============================================================Warnings:

[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

[2] The condition number is large, 1.01e+03. This might indicate that there are

strong multicollinearity or other numerical problems.

OLS Regression Results

==============================================================

Dep. Variable: alcconsumption R-squared: 0.130

Model: OLS Adj. R-squared: 0.108

Method: Least Squares F-statistic: 5.874

Date: Thu, 13 Aug 2020 Prob (F-statistic): 0.000198

Time: 17:37:10 Log-Likelihood: -478.66

No. Observations: 162 AIC: 967.3

Df Residuals: 157 BIC: 982.8

Df Model: 4

Covariance Type: nonrobust

============================================================

coef std err t P>|t| [0.025 0.975]

---------------------------------------------------------------------------------------

Intercept 7.5850 0.546 13.883 0.000 6.506 8.664

urbanrate_c 0.0268 0.023 1.175 0.242 -0.018 0.072

I(urbanrate_c ** 2) -0.0015 0.001 -1.925 0.056 -0.003 4e-05

incomeperperson_c 0.0001 4.7e-05 2.473 0.014 2.34e-05 0.000

employrate_c -0.0082 0.041 -0.199 0.843 -0.089 0.073

=============================================================

Omnibus: 15.843 Durbin-Watson: 1.925

Prob(Omnibus): 0.000 Jarque-Bera (JB): 17.427

Skew: 0.741 Prob(JB): 0.000164

Kurtosis: 3.623 Cond. No. 1.54e+04

=============================================================

Warnings:

[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

[2] The condition number is large, 1.54e+04. This might indicate that there are

strong multicollinearity or other numerical problems.

Q-Q plot:-

Residuals’ plot:-

Regression diagnostic plots

Leverage plot

Summary results:-

I plotted the graph taking the order of my primary explanatory variable urbanization rate both 1 as well as 2.As it is not much clear from the graph that the best fit is order=1 /linear or 2/quadratic we will look further at the p values of linear and polynomial regression models.

The p value from linear regression model between alcconsumption:- response variable and urbanrate:- explanatory variable is coming out to be 0.00154 which is less than our cut off value of 0.05 stating a significant relationship between the two with the F statistic value of 15.03 and r squared value of 0.086. The intercept is coming to be 6.8124 and slope is coming to be 0.065 stating appositive linear relationship. We can predict the changes in response variable with an accuracy of 8.6%.

From polynomial regression model with order = 2 we see that the p value is coming to be 0.181 with a negative beta value of 0.0010 stating that there is no statistically significant relationship between the urbanization rate and alcohol consumption levels. So the best fit will be linear only.

Taking into considerations other explanatory variables i.e. income per person and employment rate we did run a multiple regression model giving results as:-The overall p value is coming to be 0.000198 with r squared value 0.13. The intercept came out to be 7.5850.

Variable p-value Beta value

Urbanrate 0.242 0.0268

Urbanrate**2 0.056 -0.0015

Incomeperperson 0.014 0.0001

Employrate 0.843 -0.0082

We see that after adding the other variables the p value related to our major explanatory variable has crossed the shifted the cutoff value of 0.05 and relationship between the primary explanatory variable and response variable has become statistically insignificant with no evidence now to reject the null hypothesis, thus incomperperson is a confounder. It confounds the relationship between the alcohol consumption levels and urbanization rate.

The Q-Q plot shows that the residuals are following a straight line, deviating at the lower and higher quantities, meaning the curvilinear association that I observed in my scatter plot may not be fully estimated by the quadratic term.

From the residual plot e can see that there is one observation which has three or more standard deviations from the mean, stating we are having one extreme outlier.

From the leverage plot we can see that we have 2 outliers with residual value greater than 2 with one having leverage value close to 0.02 and other having leverage value of 0.07. Both of which are close to zero stating having no high leverage outliers.

0 notes

akanksha8goyal · 5 years ago

Text

Testing linear regression model

Testing the association between the urbanization rate and alcohol consumption levels in the quantitative gapminder datset. As the explanatory variable- urbanization rate as well as the response variable variable alcohol consumption both are quantitative, scatter plot was created with urbanrate on x axis and alcconsumption on y axis.

Basic Linear regression model for quantitative variable-Person correlation was used for testing the association.

Faurther for centering the explanatory variable, mean was calculated and subtracted from all the observations to create a new variable urban with the centered mean close to zero.

Code:-

import pandas

import numpy

import seaborn

import matplotlib.pyplot as plt

import statsmodels.api

import statsmodels.formula.api as smf

data=pandas.read_csv('gapminder.csv', low_memory=False)

# bug fix for display formats to avoid run time errors

pandas.set_option('display.float_format',lambda x:'%f'%x)

#setting variables you be working with numeric

data['urbanrate']=data['urbanrate'].convert_objects(convert_numeric=True)

data['alcconsumption']=data['alcconsumption'].convert_objects(convert_numeric=True)

#### Basic Linear regression

scat1=seaborn.regplot(x="urbanrate",y="alcconsumption",scatter=True, data=data)

plt.xlabel('Urbanization Rate')

plt.ylabel('Alcohol Consumption')

plt.title('Scatterplot for the Association between urbanization rate and alcohol consumption')

print(scat1)

print("OLS regrression model for the association between urbanization rate and alcohol consumption")

reg1=smf.ols('alcconsumption ~ urbanrate', data=data).fit()

print(reg1.summary())

##Centering the explanatory variable

print('mean')

mean1=data['urbanrate'].mean()

print(mean1)

data['urban']=data['urbanrate']-mean1

c1=data['urban'].value_counts(sort=False)

print(c1)

print('mean of centered observations')

mean2=data['urban'].mean()

print(mean2)

Output of the above code:-

OLS regrression model for the association between urbanization rate and alcohol consumption

OLS Regression Results

==============================================================

Dep. Variable: alcconsumption R-squared: 0.075

Model: OLS Adj. R-squared: 0.070

Method: Least Squares F-statistic: 14.73

Date: Thu, 30 Jul 2020 Prob (F-statistic): 0.000171

Time: 00:33:16 Log-Likelihood: -543.98

No. Observations: 183 AIC: 1092.

Df Residuals: 181 BIC: 1098.

Df Model: 1

Covariance Type: nonrobust

==============================================================

coef std err t P>|t| [0.025 0.975]

------------------------------------------------------------------------------

Intercept 3.4899 0.915 3.814 0.000 1.684 5.295

urbanrate 0.0591 0.015 3.838 0.000 0.029 0.089

==============================================================

Omnibus: 10.025 Durbin-Watson: 1.958

Prob(Omnibus): 0.007 Jarque-Bera (JB): 10.257

Skew: 0.573 Prob(JB): 0.00592

Kurtosis: 3.178 Cond. No. 155.

==============================================================

Warnings:

[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

mean

56.76935960591131

-19.949360 1

18.150640 1

10.390640 1

32.150640 1

27.770640 1

23.690640 1

-44.229360 1

-34.229360 1

17.150640 1

-30.309360 1

-17.389360 1

-28.389360 1

-38.809360 1

36.550640 1

-29.469360 1

-19.929360 2

-15.349360 1

8.150640 1

31.670640 1

-22.329360 1

5.910640 1

20.710640 1

-9.729360 1

-15.569360 1

16.690640 1

-26.309360 1

20.590640 1

-28.929360 2

-25.889360 1

-18.909360 1

1.170640 1

29.790640 1

10.130640 1

26.750640 1

-32.009360 1

37.490640 1

17.730640 1

3.790640 1

4.570640 2

34.890640 1

-27.249360 1

20.770640 1

-42.449360 1

35.490640 1

35.910640 1

13.130640 1

-31.249360 1

-43.549360 1

24.930640 1

-12.929360 1

-7.989360 1

-26.129360 1

0.410640 1

-24.189360 1

-14.769360 1

-9.989360 1

-35.169360 1

20.350640 1

-20.489360 1

-26.929360 1

Name: urban, Length: 194, dtype: int64

mean of centered observations

1.8446109445594718e-14

Discussion :-

From the scatter plot we can see that there is a positive linear relationship between urbanization rate and alcohol consumption levels.

With the help of regression model we find out that F-statistic is coming to be 14.73 with a p value of 0.000171 which is less than the beta value of 0.05 thus we can say that we can reject the null hypothesis.

The r-squared value is 0.075 stating we can predict about 7.5% variability in the response variable alcohol consumption if we know the urbanization rate.

the linear equation for the same to calculate the predicted value of Alcohol consumption level can be calculated with slope = 0.0591 and intercept=3.49, thus alcconsumption=3.49+(0.0591*urbanrate)

For centering the quantitative variable the mean was calculated to be 56.77 which was subtracted from all the observations with centered mean coming out to be almost zero.

0 notes

akanksha8goyal · 5 years ago

Text

Writing about your data:-Gapminder dataset

Sample:-

Gapminder covers data for all 192 UN members, combining data for Serbia and Montenegro. It includes data for 24 other areas, generating a total of 215 areas. It has over 200 indicators, including gross Domestic product, total employment rate, and estimated HIV prevalence, the different countries being the unique identifiers. It basically seeks to increase the use and understanding of statistics about social, economic, and environmental development at local, national, and global levels. The data analytic sample for this study included countries with per capita income above zero.

Procedure:-

GapMinder collects its data from a handful of sources, including the Institute for Health Metrics and Evaluation, US Census Bureau’s International Database, United Nations Statistics Division, and the World Bank. With the source written in front of each of the data item in the codebook.

Measures:-

For the study correlation between urban rate and alcohol consumption which if is further impacted by the income per person?

Alcconsumption which represents the 2008 alcohol consumption per adult (age 15+) in litres. It is basically the recorded and estimated average alcohol consumption, adult (15+) per capita consumption in litres pure alcohol. Source of data being WHO.

Urbanrate representing the 2008 urban population (% of total). Urban population refers to people living in urban areas as defined by national statistical offices (calculated using World Bank population estimates and urban ratios from the United Nations World Urbanization Prospects).Source being World Bank.

Incomeperperson is the 2010 Gross Domestic Product per capita in constant 2000 US$. It is the inflation but not the differences in the cost of living between countries have been taken into account. Source of data being the World Bank Work Development Indicators.

As the complete data was quantitative Pearson Correlation coefficient was the statistical test used earlier to manage the data.Although for using Chi Square test and ANOVA test categorization of explanatory variable urbanization rate and even response variable alcohol consumption was done.

0 notes

akanksha8goyal · 5 years ago

Text

Studying moderation in Correlation Coefficient

For studying the association between urbanization rate and alcohol consumption with a third variable income per person while using Pearson correlation-statistical data analysis tool.

To explore this income per person was categorized in three categories-low with income less than 800, medium with income between 800-1000 and high above 1000 using dummy codes 0,1,2.

For more clarity scatter plot was made for all three categories between urbanization rate and alcohol consumption.Correlation coefficients and associated value were been derived for an actual interpretation.

Code:-

import pandas

import numpy

import seaborn

import scipy.stats

import matplotlib.pyplot as plt

data=pandas.read_csv('gapminder.csv', low_memory=False)

# bug fix for display formats to avoid run time errors

pandas.set_option('display.float_format',lambda x:'%f'%x)

#setting variables you be working with numeric

data['urbanrate']=data['urbanrate'].convert_objects(convert_numeric=True)

data['alcconsumption']=data['alcconsumption'].convert_objects(convert_numeric=True)

data['incomeperperson']=data['incomeperperson'].convert_objects(convert_numeric=True)

#subset data to countries with per capita income is above zero

sub1=data[(data['incomeperperson']>0)]

data_clean=sub1.dropna()

print('association between urbanization rate and alcohol consumption')

print(scipy.stats.pearsonr(data_clean['urbanrate'], data_clean['alcconsumption']))

def income (row):

if row['incomeperperson']<=800:

return 0

elif row['incomeperperson']<=1000:

return 1

elif row['incomeperperson']>1000:

return 2

data_clean['income']=data_clean.apply(lambda row: income(row), axis=1)

c1=data_clean['income'].value_counts(sort=False, dropna=False)

print(c1)

sub2=data_clean[(data_clean['income']==0)]

sub3=data_clean[(data_clean['income']==1)]

sub4=data_clean[(data_clean['income']==2)]

print('association between urbanization rate and alcohol consumption for low income countries')

print(scipy.stats.pearsonr(sub2['urbanrate'], sub2['alcconsumption']))

print('association between urbanization rate and alcohol consumption for middle income countries')

print(scipy.stats.pearsonr(sub3['urbanrate'], sub3['alcconsumption']))

print('association between urbanization rate and alcohol consumption for higher income countries')

print(scipy.stats.pearsonr(sub4['urbanrate'], sub4['alcconsumption']))

scat1=seaborn.regplot(x="urbanrate",y="alcconsumption",fit_reg=True, data=sub2)

plt.xlabel('Urban Rate')

plt.ylabel('Alcohol Consumption')

plt.title('Scatterplot for the Association between urbanization rate and alcohol consumption for low income countries')

scat2=seaborn.regplot(x="urbanrate",y="alcconsumption",fit_reg=True, data=sub3)

plt.xlabel('Urban rate')

plt.ylabel('Alcohol Consumption')

plt.title('Scatterplot for the Association between urbanization rate and alcohol consumption for medium income countries')

scat3=seaborn.regplot(x="urbanrate",y="alcconsumption",fit_reg=True, data=sub4)

plt.xlabel('Urban rate')

plt.ylabel('Alcohol Consumption')

plt.title('Scatterplot for the Association between urbanization rate and alcohol consumption for higher income countries')

Output of the same:-

Association between urbanization rate and alcohol consumption

(0.2761979380401515, 0.00019008936389288417)

0 51

1 3

2 124

Name: income, dtype: int64

Association between urbanization rate and alcohol consumption for low income countries

(-0.04431335727423586, 0.7574988134710285)

Association between urbanization rate and alcohol consumption for middle income countries

(-0.6201162152980627, 0.574170870811378)

Association between urbanization rate and alcohol consumption for higher income countries

(0.14937396859897265, 0.09775917225379815)

Findings:-

As we can see from the Pearson correlation analysis the correlation coefficient between urbanization rate and alcohol consumption is coming to be 0.276 with a small p value of 0.0001, telling us that the relationship between these two is statistically significant but association is weak.

However after dividing into 3 categories based on the income per person for seeing its moderation effect we see that

For low income countries the correlation coefficient is -0.04 and p value is coming to be 0.76.Stating a negative but very weak linear relationship as well as no significant correlation.

For medium income countries correlation coefficient is -0.62 and associated p value is 0.57.Stating a negative and strong linear relationship but no significant correlation.

For high income companies correlation coefficient is coming to be 0.15 and associated p value is 0.098. Stating a positive linear relationship again weak with no significant correlation.

All the categorized p values are coming to be greater than the cut off value of 0.05 stating there is as such no significant effect of income levels was seen on the alcohol consumption levels depending on the urbanization rates.

Same is visible with the scatter plots also, we can say that the per capita income does not influence the association between the urbanization rates and alcohol consumption levels.

1 note · View note

akanksha8goyal · 5 years ago

Text

Pearson Coefficient correlation for studying Gapminder data

For studying the association between urbanization rate and alcohol consumption and if further impacted by the per capita income, Pearson correlation-statistical data analysis tool was used.

Scatter plot was made for both between urbanization rate and alcohol consumption and per capita income and alcohol consumption for having an initial understanding. Correlation coefficients and associated value were been derived for an actual interpretation.

Code:-

import pandas

import numpy

import seaborn

import scipy

import matplotlib.pyplot as plt

data=pandas.read_csv('gapminder.csv', low_memory=False)

# bug fix for display formats to avoid run time errors

pandas.set_option('display.float_format',lambda x:'%f'%x)

#setting variables you be working with numeric

data['urbanrate']=data['urbanrate'].convert_objects(convert_numeric=True)

data['alcconsumption']=data['alcconsumption'].convert_objects(convert_numeric=True)

data['incomeperperson']=data['incomeperperson'].convert_objects(convert_numeric=True)

#subset data to countries with per capita income is above zero

sub1=data[(data['incomeperperson']>0)]

scat1=seaborn.regplot(x="urbanrate",y="alcconsumption",fit_reg=True, data=sub1)

plt.xlabel('Urban Rate')

plt.ylabel('Alcohol Consumption')

plt.title('Scatterplot for the Association between urbanization rate and alcohol consumption')

scat2=seaborn.regplot(x="incomeperperson",y="alcconsumption",fit_reg=True, data=sub1)

plt.xlabel('Income per person')

plt.ylabel('Alcohol Consumption')

plt.title('Scatterplot for the Association between per capita income and alcohol consumption')

data_clean=data.dropna()

print('association between urbanization rate and alcohol consumption')

print(scipy.stats.pearsonr(data_clean['urbanrate'], data_clean['alcconsumption']))

print('association between income per person and alcohol consumption')

print(scipy.stats.pearsonr(data_clean['incomeperperson'], data_clean['alcconsumption']))

Output of the same:-

association between urbanization rate and alcohol consumption

(0.2761979380401515, 0.00019008936389288417)

association between income per person and alcohol consumption

(0.29352309338840077, 6.995342377562454e-05)

Findings:-

As we can see from the Pearson correlation analysis, the correlation coefficient between urbanization rate and alcohol consumption is coming to be 0.276 with a small p value of 0.0001, telling us that the relationship between these two is statistically significant.

The correlation coefficient between income per person and alcohol consumption is coming to be 0.293 with a very small p value, telling us that the relationship between these two is also statistically significant.

However the r values are closer to zero so the linear relationship is bit weak.

Squaring the correlation coefficients - Firstly between urbanization rate and alcohol consumption the r squared value is coming to be 0.076.This could be interpreted that if we know the urban rate we can predict 7.6% of the variability we will see in the alcohol consumption.

Between income per person and alcohol consumption the r squared value is coming to be 0.086, we can say that if we know the income per person we can predict the 8.6% of the variability we’ll in the alcohol consumption.

0 notes

akanksha8goyal · 5 years ago

Text

Chi-Square analysis for studying Gapminder data

For studying the correlation between urbanization rate and alcohol consumption statistical data analysis tool- Chi-Square test was done.

As the data given in Gapminder dataset is all quantitative and Chi-Square test requires a quantitative explanatory as well as response variable. Three category new variable urban was created to do the analysis and two categories of response variable alcohol consumption was created and post hoc test was done afterwards for further understanding.

Code:-

import pandas

import numpy

import scipy.stats

data=pandas.read_csv('gapminder.csv',low_memory=False)

#bug fix for display formats to avoid run time errors

pandas.set_option('display.float_format',lambda x:'%f'%x)

#setting variables to numeric

data['urbanrate']=pandas.to_numeric(data['urbanrate'],errors='coerce')

data['alcconsumption']=pandas.to_numeric(data['alcconsumption'],errors='coerce')

data['incomeperperson']=pandas.to_numeric(data['incomeperperson'],errors='coerce')

#subset data with countries having per capita income greater than zero

sub1=data[(data['incomeperperson']>0)]

#subset variable in new data frame

sub2=data[['urbanrate','alcconsumption','incomeperperson']].dropna()

#new variable creation

def urban(row):

if row['urbanrate']<=33:

return 0

if row['urbanrate']>33 and row['urbanrate']<=66:

return 1

if row['urbanrate']>66:

return 2

sub2['urban']=sub2.apply(lambda row:urban(row),axis=1)

# frequency distribution for new varibale urban

print ('counts for urban')

c1=sub2['urban'].value_counts(sort=False)

print(c1)

#new variable creation

def alc(row):

if row['alcconsumption']<=15:

return 8

if row['alcconsumption']>15:

return 9

sub2['alc']=sub2.apply(lambda row:alc(row),axis=1)

# frequency distribution for new varibale alc

print ('counts for alc')

c2=sub2['alc'].value_counts(sort=False)

print(c2)

#contingency table of observed counts

ct1=pandas.crosstab(sub2['alc'],sub2['urban'])

print(ct1)

#column percentages

colsum=ct1.sum(axis=0)

colpct=ct1/colsum

print(colpct)

#chi-square

print('chi-square value, p value, expected counts')

cs1=scipy.stats.chi2_contingency(ct1)

print(cs1)

#post hoc tests

recode1={0:0,1:1}

sub2['comp0v1']=sub2['urban'].map(recode1)

#contingency table of observed counts

ct2=pandas.crosstab(sub2['alc'],sub2['comp0v1'])

print(ct2)

#column percentages

colsum=ct2.sum(axis=0)

colpct=ct2/colsum

print(colpct)

#chi-square

print('chi-square value, p value, expected counts')

cs2=scipy.stats.chi2_contingency(ct2)

print(cs2)

recode2={0:0,2:2}

sub2['comp0v2']=sub2['urban'].map(recode2)

#contingency table of observed counts

ct3=pandas.crosstab(sub2['alc'],sub2['comp0v2'])

print(ct3)

#column percentages

colsum=ct3.sum(axis=0)

colpct=ct3/colsum

print(colpct)

#chi-square

print('chi-square value, p value, expected counts')

cs3=scipy.stats.chi2_contingency(ct3)

print(cs3)

recode3={1:1,2:2}

sub2['comp1v2']=sub2['urban'].map(recode3)

#contingency table of observed counts

ct4=pandas.crosstab(sub2['alc'],sub2['comp1v2'])

print(ct4)

#column percentages

colsum=ct4.sum(axis=0)

colpct=ct4/colsum

print(colpct)

#chi-square

print('chi-square value, p value, expected counts')

cs4=scipy.stats.chi2_contingency(ct4)

print(cs4)

Output of the same:-

counts for urban

0 39

1 73

2 66

Name: urban, dtype: int64

counts for alc

8 167

9 11

Name: alc, dtype: int64

urban 0 1 2

alc

8 38 71 58

9 1 2 8

urban 0 1 2

alc

8 0.974359 0.972603 0.878788

9 0.025641 0.027397 0.121212

chi-square value, p value, expected counts

(6.387805461801927, 0.041011501349333256, 2, array([[36.58988764, 68.48876404, 61.92134831],

[ 2.41011236, 4.51123596, 4.07865169]]))

comp0v1 0.000000 1.000000

alc

8 38 71

9 1 2

comp0v1 0.000000 1.000000

alc

8 0.974359 0.972603

9 0.025641 0.027397

chi-square value, p value, expected counts

(0.3129126748581316, 0.5758983274108751, 1, array([[37.95535714, 71.04464286],

[ 1.04464286, 1.95535714]]))

comp0v2 0.000000 2.000000

alc

8 38 58

9 1 8

comp0v2 0.000000 2.000000

alc

8 0.974359 0.878788

9 0.025641 0.121212

chi-square value, p value, expected counts

(1.767782998251748, 0.1836566886054906, 1, array([[35.65714286, 60.34285714],

[ 3.34285714, 5.65714286]]))

comp1v2 1.000000 2.000000

alc

8 71 58

9 2 8

comp1v2 1.000000 2.000000

alc

8 0.972603 0.878788

9 0.027397 0.121212

chi-square value, p value, expected counts

(3.272059355903733, 0.070469111212184, 1, array([[67.74820144, 61.25179856],

[ 5.25179856, 4.74820144]]))

Findings:-

As we can see from the Chi-square analysis test the p value is coming to be 0.0410 which is less than the cut off value of f 0.05 so we can reject the null hypothesis stating that there is no correlation between the urbanization rate and alcohol consumption but as the complete data was quantitative and categories were created ,which are more than two (explanatory variable) so Post HOC test was conducted for more clear understanding.

Adjusted p value for the test comes out to be 0.017 i.e 0.05 divided by 3 ( combinations of explanatory variables). The p value of 3 combinations came out to be 0.57, 0.18 and 0.07 which are all greater than adjusted p value implying that all means are equal and no significant correlation exits between the groups created in the two variables urbanization rate and alcohol consumption.

0 notes

akanksha8goyal · 5 years ago

Text

ANOVA analysis for studying Gapminder data

For studying the correlation between urbanization rate and alcohol consumption statistical data analysis tool- ANOVA- F test was done.

As the data given in Gapminder dataset is all quantitative and F test requires a quantitative explanatory variable. Four category new variable urban was created to do the analysis and post hoc test was done afterwards for further understanding.

Code:-

# -*- coding: utf-8 -*-

"""

Created on Sat Jun 27 02:10:24 2020

@author: abc

"""

import pandas

import numpy

import statsmodels.formula.api as smf

import statsmodel.stats.multicomp as multi

data=pandas.read_csv('gapminder.csv', low_memory=False)

# bug fix for display formats to avoid run time errors

pandas.set_option('display.float_format',lambda x:'%f'%x)

#setting variables you be working with numeric

data['urbanrate']=data['urbanrate'].convert_objects(convert_numeric=True)

data['alcconsumption']=data['alcconsumption'].convert_objects(convert_numeric=True)

data['incomeperperson']=data['incomeperperson'].convert_objects(convert_numeric=True)

#subset data to young adults

#subset data to countries with per capita income is above zero

sub1=data[(data['incomeperperson']>0)]

#subset variable in new dataframe

sub2=data[['urbanrate','alcconsumption','incomeperperson']].dropna()

#new variable creation

def urban(row):

if row['urbanrate']<=25:

return 1

if row['urbanrate']>25 and row['urbanrate']<=50:

return 2

if row['urbanrate']>50 and row['urbanrate']<=75:

return 3

if row['urbanrate']>75:

return 4

sub2['urban']=sub2.apply(lambda row:urban(row), axis=1)

# frequency distribution for new variable urban

print('counts for urban')

c2=sub2['urban'].value_counts(sort=False)

print(c2)

#using ols function for calculating the F-statistic and associated p value

model1=smf.ols(formula='alcconsumption ~ C(urban)',data=sub2)

results1=model1.fit()

print (results1.summary())

# post hoc test

mc1=multi.MultiComparison(sub2['alcconsumption'], sub2['urban'])

res1=mc1.tukeyhsd()

print(res1.summary())

Output of the same:-

counts for urban

1 20

2 54

3 67

4 37

Name: urban, dtype: int64

OLS Regression Results

==============================================================

Dep. Variable: alcconsumption R-squared: 0.100

Model: OLS Adj. R-squared: 0.085

Method: Least Squares F-statistic: 6.477

Date: Sun, 28 Jun 2020 Prob (F-statistic): 0.000351

Time: 18:06:31 Log-Likelihood: -525.60

No. Observations: 178 AIC: 1059.

Df Residuals: 174 BIC: 1072.

Df Model: 3 Covariance Type: nonrobust

=============================================================

coef std err t P>|t| [0.025 0.975]

---------------------------------------------------------------------------------

Intercept 4.6780 1.048 4.462 0.000 2.609 6.747

C(urban)[T.2] 0.5377 1.227 0.438 0.662 -1.885 2.960

C(urban)[T.3] 3.7410 1.195 3.131 0.002 1.383 6.099

C(urban)[T.4] 2.9736 1.301 2.285 0.024 0.405 5.542

=============================================================

Omnibus: 9.609 Durbin-Watson: 1.866

Prob(Omnibus): 0.008 Jarque-Bera (JB): 9.595

Skew: 0.542 Prob(JB): 0.00825

Kurtosis: 3.342 Cond. No. 7.03

=============================================================

Warnings:

[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Multiple Comparison of Means - Tukey HSD,FWER=0.05

============================================

group1 group2 meandiff lower upper reject

--------------------------------------------

1 2 0.5377 -2.6463 3.7218 False

1 3 3.741 0.6415 6.8404 True

1 4 2.9736 -0.4023 6.3496 False

2 3 3.2032 0.9787 5.4277 True

2 4 2.4359 -0.1601 5.0318 False

3 4 -0.7673 -3.2588 1.7241 False

--------------------------------------------

Findings:-

As we can see from the F analysis test the p value is coming to be 0.000351 which is much less than the cut off value f 0.05 so we can clearly reject the null hypothesis stating that there is no correlation between the urbanization rate and alcohol consumption.

Further for more clear understanding Post HOC test was conducted, results of which states that the mean of group 3 is significantly different from group 1 and group 2 implying not all means are equal and we have a significant correlation between the two variables.

0 notes

akanksha8goyal · 5 years ago

Text

Describing associations visually

For getting more clear results and seeing the data more clearly data management tools are used and further uni variate and bi variate graphs were plotted.

The research question: Studying the association between between the urbanization rate and alcohol consumption, further if impacted by the per per income.

Code for plotting the graphs:

Output display of the above code:-

Plotting uni variate graphs for all three variables we need to study the association between, i.e. urbanization rate, alcohol consumption and income per person.

describe urbanization rate

count 189.000000

mean 56.333757

std 23.726551

min 10.400000

25% 36.820000

50% 57.280000

75% 73.920000

max 100.000000

Name: urbanrate, dtype: float64

describe alcohol consumption

count 179.000000

mean 6.840950

std 4.900727

min 0.050000

25% 2.730000

50% 6.120000

75% 10.035000

max 23.010000

Name: alcconsumption, dtype: float64

describe income per person

count 190.000000

mean 8740.966076

std 14262.809083

min 103.775857

25% 748.245151

50% 2553.496056

75% 9379.891165

max 105147.437697

Name: incomeperperson, dtype: float64

Bivariate graphs:-

Summary:-

From the first univariate graph of urbanization rate we can see that the mean rate is about 56% in the 189 countries studied with a standard deviation of +- 24%. Minimum value of urbanrate remaining to be about 10% going to the maximum of 100%. Also we can see that the graph is slightly right skewed i.e more variability below 50% as compared to above 50%.

From the second univariate graph of alcohol consumption (pure alcohol in litres) we can predict that the mean consumption is about 7 litres in the 179 countries studied with a standard deviation of +- 4.9 litres. Minimum consumption value remaining to be about 0.05 litres going to the maximum of 23 litres. The graph is left skewed.

From the third univariate graph of income per person in US$ we can see that the mean per capita income is about 8741 dollars in the 190 countries studied with a standard deviation of +- 14286.8 dollars. Minimum income value remaining to be about 104 dollars going to the maximum of 105147 dollars. The graph is again left skewed with more number of countries having lower per capita income.

Plotting bivariate graphs to study association between urbanization rate and alcohol consumption, urbanization rate and income per person, and alcohol consumption and income per person.

Plotting urbanization rate as an explanatory variable on x axis and alcohol consumption as a response variable on y axis we can see from the below graphs that the increasing rate of urbanization the alcohol consumption is also increasing with a high positive linear correlation can be seen in the first bivariate graph.

For further studying plotting urbanization rate as an explanatory variable on x axis and per capita income as a response variable on y axis we see from that the increasing rate of urbanization is supporting the increased per capita income however the variability in this graph is bit more as compared to the previous one, there is a positive linear correlation between these two as can be seen in the second bivariate graph.

We plotted the income per person on the x axis as the explanatory variable against alcohol consumption on the y axis as the response variable to see the correlation between these two but the graph generated has high degree of variation.

0 notes

akanksha8goyal · 5 years ago

Text

Data management with python

For getting more clear results and seeing the data more clearly data management tools are used.

The research question: Studying the association between between the urbanization rate and alcohol consumption, further if impacted by the per per income.

Applying some of the data management steps: creating a new variable for urbanization rate and splitting alcohol consumption into quartiles.

Code:-

# -*- coding: utf-8 -*-

"""

Created on Sun Jun 14 00:18:35 2020

@author: Akanksha Goyal

"""

import pandas

import numpy

# any additional libraries wouls be imported here

data=pandas.read_csv('gapminder.csv', low_memory=False)

# bug fix for display formats to avoid run time errors

pandas.set_option('display.float_format',lambda x:'%f'%x)

# seting variables to numeric

data ['urbanrate']=data['urbanrate'].convert_objects(convert_numeric=True)

data ['alcconsumption']=data['alcconsumption'].convert_objects(convert_numeric=True)

data ['incomeperperson']=data['incomeperperson'].convert_objects(convert_numeric=True)

#subset data to countries with per capita income is above zero

sub1=data[(data['incomeperperson']>0)]

#make a copy of my new subsetted data

sub2=sub1.copy()

#subset variable in new dataframe

sub3=data[['urbanrate','alcconsumption','incomeperperson']]

#new variable creation

def urban(row):

if row['urbanrate']<=25:

return 1

if row['urbanrate']>25 and row['urbanrate']<=50:

return 2

if row['urbanrate']>50 and row['urbanrate']<=75:

return 3

if row['urbanrate']>75:

return 4

sub3['urban']=sub3.apply(lambda row:urban(row), axis=1)

# frequency distribution for new variable urban

print('counts for urban')

c2=sub3['urban'].value_counts(sort=False)

print(c2)

print('percentage for urban')

p1=sub3['urban'].value_counts(sort=False, normalize=True)

print(p1)

# categorize quantitative variable based on quartile spilt

print('alcohol consumption- 4 categories-quartiles')

sub2['alcconsumption']=pandas.qcut(sub2.alcconsumption,4,labels=["1=0%tile","2=25%tile","3=50%tile","4=75%tile"])

c3=sub2['alcconsumption'].value_counts(sort=False, dropna=True)

print(c3)

#crosstabs evaluation

print(pandas.crosstab(sub2['alcconsumption'],sub2['alcconsumption']))

#frequency distribution for Alcohol consumption

print('counts on alcohol consumption')

c4=sub2['alcconsumption'].value_counts(sort=False)

print(c4)

print('percentage on alcohol consumption')

p2=sub2['alcconsumption'].value_counts(sort=False, normalize=True)

print(p2)

Results from Above code:-

. Creating new variable urban for urbanization rate with value 1 for rate 0 to 25%, value 2 for rate 25 to 50, value 3 for rate 50 to 75 and value 4 for rate 75 to 100.

. Result came out to be:- 22 countries have urban value 1, 59 with 2, 74 with 3 and 48 with 4 showing that maximum countries have an urbanization rate ranging from 50 to 75%.

. Categorizing alcohol consumption in four quartiles labels 1=0%tile 2=25%tile 3=50%tile 4=75%tile showing that alcohol consumption in liters is almost equally divided in all the four quartiles.

Used crosstab to verify which data in coming in which quartile.

Output Display:-

runfile('E:/Akon/untitled3.py', wdir='E:/Akon')

counts for urban

1.000000 22

2.000000 59

3.000000 74

4.000000 48

Name: urban, dtype: int64

percentage for urban

1.000000 0.108374

2.000000 0.290640

3.000000 0.364532

4.000000 0.236453

Name: urban, dtype: float64

alcohol consumption- 4 categories-quartiles

1=0%tile 45

2=25%tile 45

3=50%tile 44

4=75%tile 45

Name: alcconsumption, dtype: int64

alcconsumption 1=0%tile 2=25%tile 3=50%tile 4=75%tile

alcconsumption

1=0%tile 45 0 0 0

2=25%tile 0 45 0 0

3=50%tile 0 0 44 0

4=75%tile 0 0 0 45

counts on alcohol consumption

1=0%tile 45

2=25%tile 45

3=50%tile 44

4=75%tile 45

Name: alcconsumption, dtype: int64

percentage on alcohol consumption

1=0%tile 0.251397

2=25%tile 0.251397

3=50%tile 0.245810

4=75%tile 0.251397

Name: alcconsumption, dtype: float64

0 notes

akanksha8goyal · 5 years ago

Text

Running my first python program

Running my first program with Python for establishing the association between the urbanization rates and alcohol consumption, To know if it is further impacted by the income per person.

Code for display of three variables as frequency tables and selecting columns and rows:

# -*- coding: utf-8 -*-

"""

Created on Sun Jun 7 23:11:51 2020

@author: Akanksha Goyal

"""

import pandas

import numpy

pandas.set_option('display.float_format',lambda x:'%f'%x)

data=pandas.read_csv('gapminder.csv',low_memory=False)

print(len(data)) # number of observations (rows)

print(len(data.columns)) # number of variables (columns)

#setting variables we will be working with to numeric

data['urbanrate']=data['urbanrate'].convert_objects(convert_numeric=True)

data['alcconsumption']=data['alcconsumption'].convert_objects(convert_numeric=True)

data['incomeperperson']=data['incomeperperson'].convert_objects(convert_numeric=True)

print ('counts for urbanrate i.e. 2008 urban population')

c1=data['urbanrate'].value_counts(sort=False)

print (c1)

print('percentage for urbanrate')

p1=data['urbanrate'].value_counts(sort=False,normalize=True)

print(p1)

print ('alcconsumption i.e. 2008 alcohol consumption per adult , liters')

c2=data['alcconsumption'].value_counts(sort=False)

print (c2)

print('percentage for alcconsumption')

p2=data['alcconsumption'].value_counts(sort=False,normalize=True)

print(p2)

print ('counts for incomeperserson i.e. 2010 gross domestic product per capita')

c3=data['incomeperperson'].value_counts(sort=False)

print (c3)

print('percentage for incomeperperson')

p3=data['incomeperperson'].value_counts(sort=False,normalize=True)

print(p3)

#subset data to countries with urbanization is atleast greater than 30% and per capita income is above zero

sub1=data[(data['urbanrate']>=30)& (data['incomeperperson']>0)]

#make a copy of my new subsetted data

sub2=sub1.copy()

#frequency distribution on new sub2 dataframe

print('counts for urbanization rate')

c4=sub2['urbanrate'].value_counts(sort=False)

print(c4)

print('percentage for urbanization rate')

p4=sub2['urbanrate'].value_counts(sort=False,normalize=True)

print(p4)

print('counts for alcohol consumption')

c5=sub2['alcconsumption'].value_counts(sort=False)

print(c5)

print('percentage for alcohol consumption')

p5=sub2['alcconsumption'].value_counts(sort=False,normalize=True)

print(p5)

print('counts for per capita income')

c6=sub2['incomeperperson'].value_counts(sort=False)

print(c6)

print('percentage for per capita income')

p6=sub2['incomeperperson'].value_counts(sort=False,normalize=True)

print(p6)

The output display for above code is given as below:

In the data file selected a total of 213 rows are there indicating the number of observations and 16 columns are there indicating the number of variables. 3 variables selected are urbanrate, alcconsumption and incomeperperson. Below we can see the count of various variables as the output of program and their frequency distribution. Missing data have been removed in the count as well as frequency calculation as shown after every result. Count shows how often the variable is occurring like the urban rate of 100% is occurring 6 times with a normalized frequency of 0.029557.

Also after imposing conditions on rows of urbanization rate greater than and equal to 30 and per capita income greater than 0 the results become refined.

print(len(data)) # number of observations (rows)

print(len(data.columns)) # number of variables (columns)

213

counts for urbanrate i.e. 2008 urban population

92.000000 1

100.000000 6

74.500000 1

73.500000 1

17.000000 1

61.000000 1

67.500000 1

41.000000 1

56.420000 1

57.940000 1

65.580000 2

88.920000 1

95.640000 1

41.420000 1

26.680000 1

92.260000 1

23.000000 1

42.000000 1

51.920000 1

66.900000 1

24.040000 1

41.760000 1

66.500000 1

30.460000 1

86.680000 1

42.720000 1

30.640000 1

60.740000 1

36.520000 1

39.380000 1

63.860000 1

59.580000 1

86.960000 1

43.440000 1

60.700000 1

77.200000 1

72.840000 1

43.840000 1

64.780000 1

24.760000 1

36.280000 1

66.480000 1

50.020000 1

17.240000 1

33.320000 1

46.840000 1

77.360000 1

78.420000 1

48.780000 1

91.660000 1

63.260000 1

60.560000 1

28.080000 1

67.160000 1

21.600000 1

56.020000 1

57.180000 1

73.920000 1

25.460000 1

28.380000 1

Name: urbanrate, Length: 194, dtype: int64

percentage for urbanrate

92.000000 0.004926

100.000000 0.029557

74.500000 0.004926

73.500000 0.004926

17.000000 0.004926

61.000000 0.004926

67.500000 0.004926

41.000000 0.004926

56.420000 0.004926

57.940000 0.004926

65.580000 0.009852

88.920000 0.004926

95.640000 0.004926

41.420000 0.004926

26.680000 0.004926

92.260000 0.004926

23.000000 0.004926

42.000000 0.004926

51.920000 0.004926

66.900000 0.004926

24.040000 0.004926

41.760000 0.004926

66.500000 0.004926

30.460000 0.004926

86.680000 0.004926

42.720000 0.004926

30.640000 0.004926

60.740000 0.004926

36.520000 0.004926

39.380000 0.004926

63.860000 0.004926

59.580000 0.004926

86.960000 0.004926

43.440000 0.004926

60.700000 0.004926

77.200000 0.004926

72.840000 0.004926

43.840000 0.004926

64.780000 0.004926

24.760000 0.004926

36.280000 0.004926

66.480000 0.004926

50.020000 0.004926

17.240000 0.004926

33.320000 0.004926

46.840000 0.004926

77.360000 0.004926

78.420000 0.004926

48.780000 0.004926

91.660000 0.004926

63.260000 0.004926

60.560000 0.004926

28.080000 0.004926

67.160000 0.004926

21.600000 0.004926

56.020000 0.004926

57.180000 0.004926

73.920000 0.004926

25.460000 0.004926

28.380000 0.004926

Name: urbanrate, Length: 194, dtype: float64

alcconsumption i.e. 2008 alcohol consumption per adult , liters

15.000000 1

5.250000 1

3.990000 1

9.750000 1

0.500000 1

9.500000 1

6.560000 1

5.000000 1

4.990000 1

4.430000 1

11.010000 1

5.120000 1

7.790000 1

1.870000 1

5.920000 2

0.920000 1

3.020000 1

6.990000 1

12.050000 1

12.020000 1

3.610000 1

12.480000 1

0.280000 1

8.680000 1

0.520000 1

13.310000 1

11.410000 1

0.340000 2

9.720000 1

4.390000 1

0.560000 1

7.300000 1

1.320000 1

6.420000 1

3.880000 1

10.620000 1

9.860000 1

8.550000 1

0.650000 1

10.710000 1

12.840000 1

1.290000 1

3.390000 2

10.080000 1

2.270000 1

9.460000 1

8.170000 1

1.030000 1

5.050000 1

6.660000 1

3.110000 1

7.320000 1

2.760000 1

1.640000 1

0.050000 1

16.300000 1

5.210000 1

0.320000 1

9.480000 1

8.690000 1

Name: alcconsumption, Length: 180, dtype: int64

percentage for alcconsumption

15.000000 0.005348

5.250000 0.005348

3.990000 0.005348

9.750000 0.005348

0.500000 0.005348

9.500000 0.005348

6.560000 0.005348

5.000000 0.005348

4.990000 0.005348

4.430000 0.005348

11.010000 0.005348

5.120000 0.005348

7.790000 0.005348

1.870000 0.005348

5.920000 0.010695

0.920000 0.005348

3.020000 0.005348

6.990000 0.005348

12.050000 0.005348

12.020000 0.005348

3.610000 0.005348

12.480000 0.005348

0.280000 0.005348

8.680000 0.005348

0.520000 0.005348

13.310000 0.005348

11.410000 0.005348

0.340000 0.010695

9.720000 0.005348

4.390000 0.005348

0.560000 0.005348

7.300000 0.005348

1.320000 0.005348

6.420000 0.005348

3.880000 0.005348

10.620000 0.005348

9.860000 0.005348

8.550000 0.005348

0.650000 0.005348

10.710000 0.005348

12.840000 0.005348

1.290000 0.005348

3.390000 0.010695

10.080000 0.005348

2.270000 0.005348

9.460000 0.005348

8.170000 0.005348

1.030000 0.005348

5.050000 0.005348

6.660000 0.005348

3.110000 0.005348

7.320000 0.005348

2.760000 0.005348

1.640000 0.005348

0.050000 0.005348

16.300000 0.005348

5.210000 0.005348

0.320000 0.005348

9.480000 0.005348

8.690000 0.005348

Name: alcconsumption, Length: 180, dtype: float64

counts for incomeperserson i.e. 2010 gross domestic product per capita

5188.900935 1

8614.120219 1

39972.352768 1

279.180453 1

161.317137 1

11894.464075 1

1036.830725 1

9106.327234 1

11744.834167 1

2231.993335 1

1392.411829 1

2146.358593 1

10480.817203 1

595.874535 1

5348.597192 1

10749.419238 1

12729.454400 1

19630.540547 1

554.879840 1

9175.796015 1

2025.282665 1

1975.551906 1

268.331790 1

5634.003948 1

37662.751250 1

25249.986061 1

558.062877 1

1728.020976 1

2923.144355 1

6105.280743 1

1784.071284 1

557.947513 1

103.775857 1

1144.102193 1

285.224449 1

2636.787800 1

17092.460004 1

31993.200694 1

22275.751661 1

1326.741757 1

4189.436587 1

5332.238591 1

1232.794137 1

338.266391 1

268.259450 1

495.734247 1

6334.105194 1

12505.212545 1

269.892881 1

6238.537506 1

1324.194906 1

6243.571318 1

27595.091347 1

5011.219456 1

1258.762596 1

377.421113 1

2344.896916 1

25306.187193 1

4180.765821 1

25575.352623 1

Name: incomeperperson, Length: 190, dtype: int64

percentage for incomeperperson

5188.900935 0.005263

8614.120219 0.005263

39972.352768 0.005263

279.180453 0.005263

161.317137 0.005263

11894.464075 0.005263

1036.830725 0.005263

9106.327234 0.005263

11744.834167 0.005263

2231.993335 0.005263

1392.411829 0.005263

2146.358593 0.005263

10480.817203 0.005263

595.874535 0.005263

5348.597192 0.005263

10749.419238 0.005263

12729.454400 0.005263

19630.540547 0.005263

554.879840 0.005263

9175.796015 0.005263

2025.282665 0.005263

1975.551906 0.005263

268.331790 0.005263

5634.003948 0.005263

37662.751250 0.005263

25249.986061 0.005263

558.062877 0.005263

1728.020976 0.005263

2923.144355 0.005263

6105.280743 0.005263

1784.071284 0.005263

557.947513 0.005263

103.775857 0.005263

1144.102193 0.005263

285.224449 0.005263

2636.787800 0.005263

17092.460004 0.005263

31993.200694 0.005263

22275.751661 0.005263

1326.741757 0.005263

4189.436587 0.005263

5332.238591 0.005263

1232.794137 0.005263

338.266391 0.005263

268.259450 0.005263

495.734247 0.005263

6334.105194 0.005263

12505.212545 0.005263

269.892881 0.005263

6238.537506 0.005263

1324.194906 0.005263

6243.571318 0.005263

27595.091347 0.005263

5011.219456 0.005263

1258.762596 0.005263

377.421113 0.005263

2344.896916 0.005263

25306.187193 0.005263

4180.765821 0.005263

25575.352623 0.005263

Name: incomeperperson, Length: 190, dtype: float64

After applying subsets:

counts for urbanization rate

92.000000 1

100.000000 5

74.500000 1

73.500000 1

61.000000 1

67.500000 1

66.960000 1

41.000000 1

72.840000 1

43.440000 1

65.580000 2

88.920000 1

95.640000 1

56.420000 1

52.360000 1

94.260000 1

42.000000 1

66.500000 1

51.920000 1

71.400000 1

67.160000 1

37.760000 1

30.460000 1

86.680000 1

42.720000 1

73.480000 1

39.380000 1

88.740000 1

81.460000 1

61.340000 2

77.200000 1

82.420000 1

60.140000 1

36.840000 2

54.240000 1

63.860000 1

38.580000 1

86.960000 1

77.880000 1

60.700000 1

73.200000 1

43.840000 1

94.220000 1

30.640000 1

56.740000 1

50.020000 1

33.320000 1

46.840000 1

77.360000 1

78.420000 1

36.280000 1

63.260000 1

56.560000 1

81.820000 1

59.580000 1

66.900000 1

56.020000 1

73.920000 1

35.420000 1

30.880000 1

Name: urbanrate, Length: 148, dtype: int64

percentage for urbanization rate

92.000000 0.006452

100.000000 0.032258

74.500000 0.006452

73.500000 0.006452

61.000000 0.006452

67.500000 0.006452

66.960000 0.006452

41.000000 0.006452

72.840000 0.006452

43.440000 0.006452

65.580000 0.012903

88.920000 0.006452

95.640000 0.006452

56.420000 0.006452

52.360000 0.006452

94.260000 0.006452

42.000000 0.006452

66.500000 0.006452

51.920000 0.006452

71.400000 0.006452

67.160000 0.006452

37.760000 0.006452

30.460000 0.006452

86.680000 0.006452

42.720000 0.006452

73.480000 0.006452

39.380000 0.006452

88.740000 0.006452

81.460000 0.006452

61.340000 0.012903

77.200000 0.006452

82.420000 0.006452

60.140000 0.006452

36.840000 0.012903

54.240000 0.006452

63.860000 0.006452

38.580000 0.006452

86.960000 0.006452

77.880000 0.006452

60.700000 0.006452

73.200000 0.006452

43.840000 0.006452

94.220000 0.006452

30.640000 0.006452

56.740000 0.006452

50.020000 0.006452

33.320000 0.006452

46.840000 0.006452

77.360000 0.006452

78.420000 0.006452

36.280000 0.006452

63.260000 0.006452

56.560000 0.006452

81.820000 0.006452

59.580000 0.006452

66.900000 0.006452

56.020000 0.006452

73.920000 0.006452

35.420000 0.006452

30.880000 0.006452

Name: urbanrate, Length: 148, dtype: float64

counts for alcohol consumption

15.000000 1

9.750000 1

9.460000 1

9.480000 1

9.500000 1

5.000000 1

1.050000 1

4.990000 1

9.860000 1

11.010000 1

5.120000 1

7.790000 1

3.990000 1

5.920000 2

0.920000 1

3.020000 1

6.990000 1

0.200000 1

12.020000 1

12.480000 1

1.870000 1

8.680000 1

7.080000 1

9.720000 1

6.560000 1

4.980000 1

14.430000 1

17.470000 1

0.690000 1

5.210000 1

0.650000 1

2.560000 1

10.410000 1

12.210000 1

0.520000 1

6.530000 1

6.420000 1

12.110000 1

8.550000 1

16.300000 1

0.560000 1

12.840000 1

1.290000 1

3.390000 1

10.080000 1

2.270000 1

10.710000 1

8.170000 1

1.030000 1

11.410000 1

6.660000 1

3.110000 1

12.140000 1

2.760000 1

17.240000 1

0.050000 1

0.340000 1

4.960000 1

0.320000 1

14.940000 1

Name: alcconsumption, Length: 144, dtype: int64

percentage for alcohol consumption

15.000000 0.006897

9.750000 0.006897

9.460000 0.006897

9.480000 0.006897

9.500000 0.006897

5.000000 0.006897

1.050000 0.006897

4.990000 0.006897

9.860000 0.006897

11.010000 0.006897

5.120000 0.006897

7.790000 0.006897

3.990000 0.006897

5.920000 0.013793

0.920000 0.006897

3.020000 0.006897

6.990000 0.006897

0.200000 0.006897

12.020000 0.006897

12.480000 0.006897

1.870000 0.006897

8.680000 0.006897

7.080000 0.006897

9.720000 0.006897

6.560000 0.006897

4.980000 0.006897

14.430000 0.006897

17.470000 0.006897

0.690000 0.006897

5.210000 0.006897

0.650000 0.006897

2.560000 0.006897

10.410000 0.006897

12.210000 0.006897

0.520000 0.006897

6.530000 0.006897

6.420000 0.006897

12.110000 0.006897

8.550000 0.006897

16.300000 0.006897

0.560000 0.006897

12.840000 0.006897

1.290000 0.006897

3.390000 0.006897

10.080000 0.006897

2.270000 0.006897

10.710000 0.006897

8.170000 0.006897

1.030000 0.006897

11.410000 0.006897

6.660000 0.006897

3.110000 0.006897

12.140000 0.006897

2.760000 0.006897

17.240000 0.006897

0.050000 0.006897

0.340000 0.006897

4.960000 0.006897

0.320000 0.006897

14.940000 0.006897

Name: alcconsumption, Length: 144, dtype: float64

counts for per capita income

5188.900935 1

8614.120219 1

39972.352768 1

1621.177078 1

11894.464075 1

1036.830725 1

9106.327234 1

11744.834167 1

2231.993335 1

1392.411829 1

3745.649852 1

5348.597192 1

10749.419238 1

12729.454400 1

19630.540547 1

554.879840 1

9175.796015 1

1975.551906 1

268.331790 1

5634.003948 1

37662.751250 1

25249.986061 1

595.874535 1

5332.238591 1

2923.144355 1

6105.280743 1

372.728414 1

591.067944 1

5184.709328 1

21087.394125 1

5330.401612 1

18982.269285 1

105147.437697 1

2425.471293 1

5528.363114 1

103.775857 1

28033.489283 1

285.224449 1

2636.787800 1

17092.460004 1

31993.200694 1

22275.751661 1

1326.741757 1

4189.436587 1

1232.794137 1

2668.020519 1

948.355952 1

6334.105194 1

12505.212545 1

269.892881 1

6238.537506 1

1324.194906 1

6243.571318 1

27595.091347 1

5011.219456 1

1258.762596 1

2344.896916 1

25306.187193 1

4180.765821 1

25575.352623 1

Name: incomeperperson, Length: 155, dtype: int64

percentage for per capita income

5188.900935 0.006452

8614.120219 0.006452

39972.352768 0.006452

1621.177078 0.006452

11894.464075 0.006452

1036.830725 0.006452

9106.327234 0.006452

11744.834167 0.006452

2231.993335 0.006452

1392.411829 0.006452

3745.649852 0.006452

5348.597192 0.006452

10749.419238 0.006452

12729.454400 0.006452

19630.540547 0.006452

554.879840 0.006452

9175.796015 0.006452

1975.551906 0.006452

268.331790 0.006452

5634.003948 0.006452

37662.751250 0.006452

25249.986061 0.006452

595.874535 0.006452

5332.238591 0.006452

2923.144355 0.006452

6105.280743 0.006452

372.728414 0.006452

591.067944 0.006452

5184.709328 0.006452

21087.394125 0.006452

5330.401612 0.006452

18982.269285 0.006452

105147.437697 0.006452

2425.471293 0.006452

5528.363114 0.006452

103.775857 0.006452

28033.489283 0.006452

285.224449 0.006452

2636.787800 0.006452

17092.460004 0.006452

31993.200694 0.006452

22275.751661 0.006452

1326.741757 0.006452

4189.436587 0.006452

1232.794137 0.006452

2668.020519 0.006452

948.355952 0.006452

6334.105194 0.006452

12505.212545 0.006452

269.892881 0.006452

6238.537506 0.006452

1324.194906 0.006452

6243.571318 0.006452

27595.091347 0.006452

5011.219456 0.006452

1258.762596 0.006452

2344.896916 0.006452

25306.187193 0.006452

4180.765821 0.006452

25575.352623 0.006452

Name: incomeperperson, Length: 155, dtype: float64

After applying the subsets the count of percapita income reduced to 155 from 190.

0 notes