Don't wanna be here? Send us removal request.
Text
# -*- coding: utf-8 -*-
"""
Decision Tree Assignment, which aims to predict the probability of a citizen
having a negative outlook on life based on their ethnicity, social status and
income.
"""
#######################################################
# 1. import library & data
#######################################################
# 1.1 import library
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import sklearn.metrics
from pandas import Series, DataFrame
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# 1.2 Working directory
# NOTE: machine-specific absolute path; the CSV below is resolved relative to it.
os.chdir("C:/Users/amishr12/Desktop/data/Mat/AI/ML")

# 1.3 Load the survey data
# low_memory=False makes pandas read the file in one pass instead of chunks.
data = pd.read_csv(
    'ool_pds.csv',
    sep=',',
    low_memory=False,
)

# Quick look at the raw frame; these bare expressions only display a result
# in an interactive console and print nothing when run as a script.
data.dtypes
data.describe()
# 1.4 Cast the working variables to numeric / categorical dtypes
# Each raw survey column is copied into a short, readable column name.

# 1.4.1 Dependent variable
data['ool'] = data['W1_F1'].astype('category')  # factor variable

# 1.4.2 Explanatory variables
# errors='coerce' turns any non-numeric entry into NaN instead of raising.
data['inc'] = pd.to_numeric(data['PPINCIMP'], errors='coerce')  # quantitative - household income
data['soc'] = pd.to_numeric(data['W1_P2'], errors='coerce')     # factor (ordered) - social class
data['ethm'] = data['PPETHM'].astype('category')                # factor (unordered) - ethnicity
data['sex'] = data['PPGENDER'].astype('category')               # factor (binary) - gender
data['edu'] = pd.to_numeric(data['PPEDUCAT'], errors='coerce')  # factor (ordered) - education
data['age'] = pd.to_numeric(data['PPAGE'], errors='coerce')     # quantitative - age
data['unemp'] = data['W1_P11'].astype('category')               # factor (binary) - unemployment
#######################################################
# 2. Make and implement data management decisions
#######################################################
# 2.1 Subset for selected variables
# Take an explicit copy: the subset is modified below, and assigning into a
# frame derived from `data` without copying raises SettingWithCopyWarning.
s_data = data[['ool', 'inc', 'soc', 'ethm', 'sex', 'edu', 'age', 'unemp']].copy()
s_data.dtypes
s_data.describe()  # describe quantitative var.

# 2.2 Coding out missing data (-1 = missing)
# No missing value for inc & ethm & sex & edu & age
for column in ('ool', 'soc', 'unemp'):
    s_data[column] = s_data[column].replace(-1, np.nan)

# Drop every row with a missing value. The result is copied as well because
# the recoding steps that follow assign new columns into `data_clean`.
data_clean = s_data.dropna().copy()

data_clean.dtypes
data_clean.describe()
# 2.3 Recode variables
# Each mapping gets its own name so that one recode can never be applied to
# the wrong column by accident.

# 2.3.1 Recoding values for ool to be more intuitive
ool_labels = {1: 'positive', 2: 'neutral', 3: 'negative'}
print(data_clean["ool"].value_counts(sort=False))  # before recoding
data_clean['ool'] = data_clean['ool'].map(ool_labels)
print(data_clean["ool"].value_counts(sort=False))  # after recoding

# 2.3.2 Recode inc to be quantitative
# Maps each income bracket code (1-19) to a single dollar amount.
inc_amounts = {
    1: 2500, 2: 6250, 3: 8750, 4: 11250, 5: 13750, 6: 17500, 7: 22500,
    8: 27500, 9: 32500, 10: 37500, 11: 45000, 12: 55000, 13: 67500,
    14: 80000, 15: 92500, 16: 112500, 17: 137500, 18: 162500, 19: 200000,
}
print(data_clean["inc"].value_counts(sort=False))  # before recoding
data_clean['inc'] = data_clean['inc'].map(inc_amounts)
print(data_clean["inc"].value_counts(sort=False))  # after recoding

# 2.3.3 Recode soc to start with 0
print(data_clean["soc"].value_counts(sort=False))  # before recoding
data_clean["soc"] = data_clean["soc"] - 1
print(data_clean["soc"].value_counts(sort=False))  # after recoding

# 2.3.4 Recode sex to have 0 = male | 1 = female
sex_codes = {1: 0, 2: 1}
data_clean['sex'] = data_clean['sex'].map(sex_codes)
data_clean['sex'] = data_clean['sex'].astype('category')
print(data_clean["sex"].value_counts(sort=False))

# 2.3.5 Recode unemp to have 0 = employ | 1 = unemploy
unemp_codes = {1: 1, 2: 0}
data_clean['unemp'] = data_clean['unemp'].map(unemp_codes)
data_clean['unemp'] = data_clean['unemp'].astype('category')
print(data_clean["unemp"].value_counts(sort=False))

# 2.4 Check the data
print(data_clean.dtypes)
print(data_clean.describe())
# 2.5 Set prediction and target variable
predictors = data_clean[['inc', 'soc', 'ethm', 'sex', 'edu', 'age', 'unemp']]
targets = data_clean['ool']

# 2.6 Split into training and testing sets (60% train / 40% test)
# random_state pins the shuffle so the split, and every figure computed from
# it, is the same on each run.
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors, targets, test_size=.4, random_state=123)

# Shapes of the four pieces; these bare expressions only display a result in
# an interactive console and print nothing when run as a script.
pred_train.shape
pred_test.shape
tar_train.shape
tar_test.shape
##############################################################################
# 3. Perform Analysis
##############################################################################
# 3.1 Base Model
# The baseline always predicts the most frequent class of the training set.
# Its accuracy is that class's share of the test set; in the recorded run the
# majority class was 'positive' (about 0.54 of train, about 0.55 of test).
# 3.1.1 shows the training frequencies, 3.1.2 the test frequencies.
for title, labels in (
    ('Training Set Frequency Table', tar_train),
    ('Test Set Frequency Table', tar_test),
):
    print(title)
    print(labels.value_counts(sort=False, normalize=True))
# 3.2 Decision Tree Model
# 3.2.1 Build model on training data
# max_leaf_nodes=5 keeps the tree small enough to read; random_state makes
# the fitted tree reproducible from run to run.
classifier = DecisionTreeClassifier(max_leaf_nodes=5, random_state=123)
classifier = classifier.fit(pred_train, tar_train)

# 3.2.2 Checking the result - training set
# Confusion-matrix rows are true classes and columns are predicted classes.
prediction_train = classifier.predict(pred_train)
print("Decision Tree - Training Set Result: Confusion Matrix & Accuracy")
print(sklearn.metrics.confusion_matrix(tar_train, prediction_train))
print(sklearn.metrics.accuracy_score(tar_train, prediction_train))

# 3.2.3 Checking the result - test set
# Compare this accuracy with the majority-class baseline from section 3.1.
prediction_test = classifier.predict(pred_test)
print("Decision Tree - Test Set Result: Confusion Matrix & Accuracy")
print(sklearn.metrics.confusion_matrix(tar_test, prediction_test))
print(sklearn.metrics.accuracy_score(tar_test, prediction_test))
# 3.2.4 Displaying the decision tree
from io import StringIO

import pydotplus
from IPython.display import Image
from sklearn import tree

# Export the fitted tree as Graphviz DOT text into an in-memory buffer.
out = StringIO()
tree.export_graphviz(
    classifier,
    out_file=out,
    # Plain lists: some scikit-learn versions reject ndarray / Index here.
    feature_names=list(pred_train.columns),
    # Take the labels from the fitted model so they always line up with the
    # order the classifier uses internally.
    class_names=[str(label) for label in classifier.classes_],
    filled=True,
    rounded=True,
)

# Render the DOT text to PNG. The image is shown only when this is the last
# expression evaluated in an IPython / Jupyter cell.
graph = pydotplus.graph_from_dot_data(out.getvalue())
Image(graph.create_png())
Output:
runfile('C:/Users/amishr12/.spyder-py3/temp.py', wdir='C:/Users/amishr12/.spyder-py3') C:/Users/amishr12/.spyder-py3/temp.py:56: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy s_data['ool'] = s_data['ool'].replace(-1, np.nan) C:/Users/amishr12/.spyder-py3/temp.py:57: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy s_data['soc'] = s_data['soc'].replace(-1, np.nan) C:/Users/amishr12/.spyder-py3/temp.py:58: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy s_data['unemp'] = s_data['unemp'].replace(-1, np.nan) C:/Users/amishr12/.spyder-py3/temp.py:69: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data_clean['ool']= data_clean['ool'].map(recode1) C:/Users/amishr12/.spyder-py3/temp.py:77: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data_clean['inc']= data_clean['inc'].map(recode1) C:/Users/amishr12/.spyder-py3/temp.py:82: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data_clean["soc"] = data_clean["soc"] -1 C:/Users/amishr12/.spyder-py3/temp.py:87: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data_clean['sex']= data_clean['sex'].map(recode1) C:/Users/amishr12/.spyder-py3/temp.py:88: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data_clean['sex'] = data_clean['sex'].astype('category') C:/Users/amishr12/.spyder-py3/temp.py:93: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data_clean['unemp']= data_clean['unemp'].map(recode1) C:/Users/amishr12/.spyder-py3/temp.py:94: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data_clean['unemp'] = data_clean['unemp'].astype('category') -1    0 1   1188 2   764 3   215 Name: ool, dtype: int64 negative   215 positive   1188 neutral    764 Name: ool, dtype: int64 2    54 4    66 6    90 8   133 10   119 12   173 14   125 16   193 18   62 1   116 3    57 5    59 7   103 9   103 11   154 13   221 15   137 17   121 19   81 Name: inc, dtype: int64 8750    57 37500   119 17500    90 13750    59 55000   173 112500   193 67500   221 22500   103 6250    54 80000   125 137500   121 92500   137 2500    116 162500   62 27500   133 11250    66 200000   81 32500   103 45000   154 Name: inc, dtype: int64 3.0   939 1.0   251 2.0   743 4.0   210 5.0   24 Name: soc, dtype: int64 2.0   939 0.0   251 1.0   743 3.0   210 4.0   24 Name: soc, dtype: int64 0   982 1   1185 Name: sex, dtype: int64 0.0   1323 1.0   844 Name: unemp, dtype: int64 ool     object inc     int64 soc    float64 ethm   category sex    category edu     int64 age     int64 unemp   category dtype: object         inc      soc      edu      age count   2167.000000  2167.000000  2167.000000  2167.000000 mean   63082.602677   1.544532   2.824642   48.983849 std   49380.053076   0.860397   0.969851   16.425444 min    2500.000000   0.000000   1.000000   18.000000 25%   22500.000000   1.000000   2.000000   36.000000 50%   55000.000000   2.000000   3.000000   51.000000 75%   92500.000000   2.000000   4.000000   62.000000 max   200000.000000   4.000000   4.000000   81.000000 Training Set Frequency Table negative   0.096154 positive   0.543846 neutral   0.360000 Name: ool, dtype: float64 Test Set Frequency Table positive   0.554787 neutral   0.341407 negative   0.103806 Name: ool, dtype: float64 Decision Tree - Training Set Result: Confusion Matrix & Accuracy [[  0  68  57] [  0 190 278] [  0 150 557]] 0.5746153846153846 Decision Tree - Test Set Result: Confusion 
Matrix & Accuracy [[  0  50  40] [  0  96 200] [  0 111 370]] 0.5374855824682814
1 note
·
View note