amishr12-blog
amishr12-blog
Untitled
1 post
Don't wanna be here? Send us a removal request.
amishr12-blog · 5 years ago
Text
# -*- coding: utf-8 -*-
"""
Decision Tree assignment: predict the probability of a citizen having a
negative life outlook based on their ethnicity, social status and income.
"""
#######################################################
# 1. Import libraries & data
#######################################################

# 1.1 Libraries
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics

# 1.2 Working directory: the relative CSV path below is resolved against it.
os.chdir("C:/Users/amishr12/Desktop/data/Mat/AI/ML")

# 1.3 Load the raw data set.
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
data = pd.read_csv('ool_pds.csv', low_memory=False, sep=',')

# Bare expressions: they only show something when evaluated interactively
# (e.g. pasted into a console); they print nothing when the file is run.
data.dtypes
data.describe()
# 1.4 Cast the working variables to numeric or categorical dtypes.
# Each entry is (new column, source column, cast to category?).
# The trailing notes are the author's description of each variable.
_column_spec = [
    # 1.4.1 Dependent variable
    ('ool', 'W1_F1', True),        # factor variable
    # 1.4.2 Explanatory variables
    ('inc', 'PPINCIMP', False),    # quantitative - household income
    ('soc', 'W1_P2', False),       # factor (ordered) - social class
    ('ethm', 'PPETHM', True),      # factor (unordered) - ethnicity
    ('sex', 'PPGENDER', True),     # factor (binary) - gender
    ('edu', 'PPEDUCAT', False),    # factor (ordered) - education
    ('age', 'PPAGE', False),       # quantitative - age
    ('unemp', 'W1_P11', True),     # factor (binary) - unemployment
]

for _new_name, _source_name, _as_category in _column_spec:
    if _as_category:
        data[_new_name] = data[_source_name].astype('category')
    else:
        # errors='coerce' turns values that cannot be parsed into NaN.
        data[_new_name] = pd.to_numeric(data[_source_name], errors='coerce')
#######################################################
# 2. Make and implement data management decisions
#######################################################

# 2.1 Subset for selected variables
# Take an explicit copy: selecting columns from `data` yields an object that
# pandas tracks as a slice of the original frame, and assigning columns on it
# raises SettingWithCopyWarning. With .copy() the subset is an independent
# DataFrame that can be modified safely.
s_data = data[['ool', 'inc', 'soc', 'ethm', 'sex', 'edu', 'age', 'unemp']].copy()

# Bare expressions: only visible when evaluated interactively.
s_data.dtypes
s_data.describe()  # describes the quantitative variables
# 2.2 Code out missing data (-1 = missing)
# Author's note: inc, ethm, sex, edu and age contain no missing values, so
# only the three columns below need the -1 -> NaN replacement.
for _column in ('ool', 'soc', 'unemp'):
    s_data[_column] = s_data[_column].replace(-1, np.nan)
# Drop every row that has a missing value in any selected variable.
# The explicit .copy() makes data_clean an independent DataFrame, so the
# column assignments performed while recoding do not raise
# SettingWithCopyWarning.
data_clean = s_data.dropna().copy()

# Bare expressions: only visible when evaluated interactively.
data_clean.dtypes
data_clean.describe()
# 2.3 Recode variables
# 2.3.1 Replace the numeric ool codes with readable labels
outlook_labels = {1: 'positive', 2: 'neutral', 3: 'negative'}

ool_counts_before = data_clean["ool"].value_counts(sort=False)
print(ool_counts_before)  # before recoding

data_clean['ool'] = data_clean['ool'].map(outlook_labels)

ool_counts_after = data_clean["ool"].value_counts(sort=False)
print(ool_counts_after)  # after recoding
# 2.3.2 Recode inc to be quantitative
# Income codes 1..19 are replaced by one amount per code.
# NOTE(review): the amounts look like representative values of income
# brackets - confirm against the survey codebook.
_inc_amounts = [
    2500, 6250, 8750, 11250, 13750, 17500, 22500, 27500, 32500, 37500,
    45000, 55000, 67500, 80000, 92500, 112500, 137500, 162500, 200000,
]
inc_lookup = dict(enumerate(_inc_amounts, start=1))  # {1: 2500, ..., 19: 200000}

print(data_clean["inc"].value_counts(sort=False))  # before recoding
data_clean['inc'] = data_clean['inc'].map(inc_lookup)
print(data_clean["inc"].value_counts(sort=False))  # after recoding
# 2.3.3 Shift soc down by one so that its lowest code is 0
soc_counts_before = data_clean["soc"].value_counts(sort=False)
print(soc_counts_before)  # before recoding

data_clean["soc"] = data_clean["soc"].sub(1)

soc_counts_after = data_clean["soc"].value_counts(sort=False)
print(soc_counts_after)  # after recoding
# 2.3.4 Recode sex: code 1 -> 0 and code 2 -> 1
# (author's legend: 0 = male | 1 = female)
sex_codes = dict([(1, 0), (2, 1)])

recoded_sex = data_clean['sex'].map(sex_codes)
data_clean['sex'] = recoded_sex
# Re-assert the categorical dtype after the mapping.
data_clean['sex'] = data_clean['sex'].astype('category')

print(data_clean["sex"].value_counts(sort=False))
# 2.3.5 Recode unemp: code 1 stays 1 and code 2 -> 0
# (author's legend: 0 = employ | 1 = unemploy)
unemp_codes = dict([(1, 1), (2, 0)])

recoded_unemp = data_clean['unemp'].map(unemp_codes)
data_clean['unemp'] = recoded_unemp
# Re-assert the categorical dtype after the mapping.
data_clean['unemp'] = data_clean['unemp'].astype('category')

print(data_clean["unemp"].value_counts(sort=False))
# 2.4 Check the data: column dtypes first, then summary statistics
for _summary in (data_clean.dtypes, data_clean.describe()):
    print(_summary)
# 2.5 Set prediction and target variable
predictors = data_clean[['inc', 'soc', 'ethm', 'sex', 'edu', 'age', 'unemp']]
targets = data_clean['ool']

# 2.6 Split into training and testing sets (40% of the rows go to the test set)
# random_state pins the shuffle so the same rows land in each set on every
# run; without it the frequency tables, confusion matrices and accuracy
# figures change from run to run and cannot be compared.
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors, targets, test_size=.4, random_state=123)

# Bare expressions: only visible when evaluated interactively.
pred_train.shape
pred_test.shape
tar_train.shape
tar_test.shape
##############################################################################
# 3. Perform Analysis
##############################################################################

# 3.1 Base Model
# 3.1.1 Create base model
# The baseline always predicts the most frequent training class
# ('positive' in the author's run).
train_class_share = tar_train.value_counts(sort=False, normalize=True)
print('Training Set Frequency Table')
print(train_class_share)

# 3.1.2 Base model accuracy -> always predict positive
# The share of that class in the test set is the baseline accuracy
# (about 0.55 in the author's run).
test_class_share = tar_test.value_counts(sort=False, normalize=True)
print('Test Set Frequency Table')
print(test_class_share)
# 3.2 Decision Tree Model
# 3.2.1 Build model on training data
# max_leaf_nodes=5 keeps the tree small enough to read when plotted.
# random_state makes the fit deterministic: scikit-learn randomly permutes
# the features at each split, so the fitted tree can otherwise differ
# between runs on identical data.
classifier = DecisionTreeClassifier(max_leaf_nodes=5, random_state=123)
classifier = classifier.fit(pred_train, tar_train)
# See help(DecisionTreeClassifier) for the remaining options.
# 3.2.3 Checking the result - training set
prediction_train = classifier.predict(pred_train)

train_confusion = sklearn.metrics.confusion_matrix(tar_train, prediction_train)
train_accuracy = sklearn.metrics.accuracy_score(tar_train, prediction_train)

print("Decision Tree - Training Set Result: Confusion Matrix & Accuracy")
print(train_confusion)
print(train_accuracy)  # author's note: model accuracy ~ 0.58
# 3.2.4 Checking the result - test set
prediction_test = classifier.predict(pred_test)

test_confusion = sklearn.metrics.confusion_matrix(tar_test, prediction_test)
test_accuracy = sklearn.metrics.accuracy_score(tar_test, prediction_test)

print("Decision Tree - Test Set Result: Confusion Matrix & Accuracy")
print(test_confusion)
print(test_accuracy)  # author's note: model accuracy = 0.56
# 3.2.5 Displaying the decision tree
from io import StringIO

from IPython.display import Image, display
import pydotplus
from sklearn import tree

# Export the fitted tree as Graphviz DOT text into an in-memory buffer.
out = StringIO()
tree.export_graphviz(
    classifier,
    out_file=out,
    feature_names=pred_train.columns.values,
    # Take the labels from the fitted model so they are always in the order
    # the classifier uses internally, instead of relying on a hand-typed
    # list staying in sync with the target recoding.
    class_names=[str(label) for label in classifier.classes_],
    filled=True,
    rounded=True,
)

# Render the DOT text to PNG and show it. display() is called explicitly
# because a bare Image(...) expression is only rendered when it is the last
# expression evaluated interactively, not when the file is run as a script.
graph = pydotplus.graph_from_dot_data(out.getvalue())
display(Image(graph.create_png()))
Output:
runfile('C:/Users/amishr12/.spyder-py3/temp.py', wdir='C:/Users/amishr12/.spyder-py3') C:/Users/amishr12/.spyder-py3/temp.py:56: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy s_data['ool'] = s_data['ool'].replace(-1, np.nan) C:/Users/amishr12/.spyder-py3/temp.py:57: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy s_data['soc'] = s_data['soc'].replace(-1, np.nan) C:/Users/amishr12/.spyder-py3/temp.py:58: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy s_data['unemp'] = s_data['unemp'].replace(-1, np.nan) C:/Users/amishr12/.spyder-py3/temp.py:69: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data_clean['ool']= data_clean['ool'].map(recode1) C:/Users/amishr12/.spyder-py3/temp.py:77: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data_clean['inc']= data_clean['inc'].map(recode1) C:/Users/amishr12/.spyder-py3/temp.py:82: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data_clean["soc"] = data_clean["soc"] -1 C:/Users/amishr12/.spyder-py3/temp.py:87: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data_clean['sex']= data_clean['sex'].map(recode1) C:/Users/amishr12/.spyder-py3/temp.py:88: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data_clean['sex'] = data_clean['sex'].astype('category') C:/Users/amishr12/.spyder-py3/temp.py:93: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data_clean['unemp']= data_clean['unemp'].map(recode1) C:/Users/amishr12/.spyder-py3/temp.py:94: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data_clean['unemp'] = data_clean['unemp'].astype('category') -1       0 1    1188 2     764 3     215 Name: ool, dtype: int64 negative     215 positive    1188 neutral      764 Name: ool, dtype: int64 2      54 4      66 6      90 8     133 10    119 12    173 14    125 16    193 18     62 1     116 3      57 5      59 7     103 9     103 11    154 13    221 15    137 17    121 19     81 Name: inc, dtype: int64 8750       57 37500     119 17500      90 13750      59 55000     173 112500    193 67500     221 22500     103 6250       54 80000     125 137500    121 92500     137 2500      116 162500     62 27500     133 11250      66 200000     81 32500     103 45000     154 Name: inc, dtype: int64 3.0    939 1.0    251 2.0    743 4.0    210 5.0     24 Name: soc, dtype: int64 2.0    939 0.0    251 1.0    743 3.0    210 4.0     24 Name: soc, dtype: int64 0     982 1    1185 Name: sex, dtype: int64 0.0    1323 1.0     844 Name: unemp, dtype: int64 ool        object inc         int64 soc       float64 ethm     category sex      category edu         int64 age         int64 unemp    category dtype: object                inc          soc          edu          age count    2167.000000  2167.000000  2167.000000  2167.000000 mean    63082.602677     1.544532     2.824642    48.983849 std     49380.053076     0.860397     0.969851    16.425444 min      2500.000000     0.000000     1.000000    18.000000 25%     22500.000000     1.000000     2.000000    36.000000 50%     55000.000000     2.000000     3.000000    51.000000 75%     92500.000000     2.000000     4.000000    62.000000 max    200000.000000     4.000000     4.000000    81.000000 Training Set Frequency Table negative    0.096154 positive    0.543846 neutral     0.360000 Name: ool, dtype: float64 Test Set Frequency Table positive    0.554787 neutral     0.341407 negative    0.103806 
Name: ool, dtype: float64 Decision Tree - Training Set Result: Confusion Matrix & Accuracy [[  0  68  57] [  0 190 278] [  0 150 557]] 0.5746153846153846 Decision Tree - Test Set Result: Confusion Matrix & Accuracy [[  0  50  40] [  0  96 200] [  0 111 370]] 0.5374855824682814
1 note · View note