codingroockie
Untitled
codingroockie · 2 years ago
Cluster program without sklearn
I had problems with sklearn's KMeans function, so I wrote the clustering program without it.
codingroockie · 2 years ago
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoLarsCV
AH_data = pd.read_csv("car_usage.csv")
data_clean = AH_data.dropna().copy()
# recode V6 from 1/2 to a 0/1 indicator, stored as V8
# (note: the model below still uses the raw V6 column)
recode1 = {1: 1, 2: 0}
data_clean['V8'] = data_clean['V6'].map(recode1)
data_clean.dtypes
data_clean.describe()
"""
Modeling and Prediction
"""
#Split into training and testing sets
predictors = data_clean[['DATA_USAGE','V6','AMERICAN','MUSCLE','SPORT','4X4','SEDAN',
'HIGHLANDER','CROMED1','turbo1','transmision1','ac1','gpsl','polarized1','crashed1',
'TICKETS1','STATES','MECHANICS1','GAS1','VA1','BOXED1','BOUGHT','ELECTRIC','PARPRES']]
targets = data_clean.TARGET
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.4)
pred_train.shape
pred_test.shape
tar_train.shape
tar_test.shape
# lasso regression with 10-fold cross-validation to pick alpha
model1 = LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)
# regression coefficient for each predictor
print(dict(zip(predictors.columns, model1.coef_)))
# plot coefficient progression
m_log_alphas = -np.log10(model1.alphas_)
ax = plt.gca()
plt.plot(m_log_alphas, model1.coef_path_.T)
plt.axvline(-np.log10(model1.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')
plt.show()
# plot mean square error for each fold
m_log_alphascv = -np.log10(model1.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, model1.mse_path_, ':')
plt.plot(m_log_alphascv, model1.mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model1.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
plt.show()
# MSE from training and test data
train_error = mean_squared_error(tar_train, model1.predict(pred_train))
test_error = mean_squared_error(tar_test, model1.predict(pred_test))
print('training data MSE')
print(train_error)
print('test data MSE')
print(test_error)
# R-square from training and test data
rsquared_train = model1.score(pred_train, tar_train)
rsquared_test = model1.score(pred_test, tar_test)
print('training data R-square')
print(rsquared_train)
print('test data R-square')
print(rsquared_test)
[Plots: lasso coefficient paths and fold-wise mean squared error versus -log(alpha)]
Several coefficients shrink to zero, reducing the complexity of the model.
codingroockie · 2 years ago
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  # used in the commented-out baseline below
import sklearn.metrics
AH_data = pd.read_csv("car_usage.csv")
data_clean = AH_data.dropna()
data_clean.dtypes
data_clean.describe()
"""
Modeling and Prediction
"""
#Split into training and testing sets
predictors = data_clean[['DATA_USAGE','V6','AMERICAN','MUSCLE','SPORT','4X4','SEDAN',
'HIGHLANDER','CROMED1','turbo1','transmision1','ac1','gpsl','polarized1','crashed1',
'TICKETS1','STATES','MECHANICS1','GAS1','VA1','BOXED1','BOUGHT','ELECTRIC','PARPRES']]
targets = data_clean.TARGET
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.4)
pred_train.shape
pred_test.shape
tar_train.shape
tar_test.shape
#Build model on training data
#classifier=DecisionTreeClassifier()
#classifier=classifier.fit(pred_train,tar_train)
#predictions=classifier.predict(pred_test)
#print(predictions)
#sklearn.metrics.confusion_matrix(tar_test,predictions)
#print(sklearn.metrics.accuracy_score(tar_test, predictions))
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
classifier = RandomForestClassifier(n_estimators=25)
classifier = classifier.fit(pred_train, tar_train)
predictions = classifier.predict(pred_test)
#print(predictions)
print(sklearn.metrics.confusion_matrix(tar_test, predictions))
print(sklearn.metrics.accuracy_score(tar_test, predictions))
# fit an extra-trees model to estimate each predictor's relative importance
model = ExtraTreesClassifier()
model.fit(pred_train, tar_train)
print(model.feature_importances_)
# re-fit the forest with 1 to 25 trees and record test accuracy for each size
trees = range(25)
accuracy = np.zeros(25)
for idx in range(len(trees)):
    classifier = RandomForestClassifier(n_estimators=idx + 1)
    classifier = classifier.fit(pred_train, tar_train)
    predictions = classifier.predict(pred_test)
    accuracy[idx] = sklearn.metrics.accuracy_score(tar_test, predictions)
plt.cla()
plt.plot(trees, accuracy)
plt.show()
[Plot: test accuracy as the number of trees grows from 1 to 25]
Output:
accuracy = 0.8442622950819673
feature importances =
[0.02592401 0.01445604 0.02320977 0.0173445  0.00758759 0.00689442
 0.06145175 0.05076942 0.04913949 0.11451924 0.01962096 0.0173606
 0.02639438 0.05672182 0.05597494 0.05127223 0.01665533 0.07227021
 0.0699886  0.06857387 0.01013534 0.05616487 0.05673264 0.05083798]
The strongest influence comes from the variable with importance 0.11, which is CROMED1. Accuracy also improves as more trees are added to the forest, peaking at roughly 23 trees.
codingroockie · 2 years ago
First assignment
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import sklearn.metrics
AH_data = pd.read_csv("tree_addhealth.csv")
data_clean = AH_data.dropna()
data_clean.dtypes
data_clean.describe()
"""
Modeling and Prediction
"""
#Split into training and testing sets
predictors = data_clean[['BIO_SEX','HISPANIC','WHITE','BLACK','NAMERICAN','ASIAN',
'age','ALCEVR1','ALCPROBS1','marever1','cocever1','inhever1','cigavail','DEP1',
'ESTEEM1','VIOL1','PASSIST','DEVIANT1','SCHCONN1','GPA1','EXPEL1','FAMCONCT','PARACTV',
'PARPRES']]
targets = data_clean.TREG1
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.4)
pred_train.shape
pred_test.shape
tar_train.shape
tar_test.shape
#Build model on training data
classifier = DecisionTreeClassifier()
classifier = classifier.fit(pred_train, tar_train)
predictions = classifier.predict(pred_test)
#print(predictions)
print(sklearn.metrics.confusion_matrix(tar_test, predictions))
print(sklearn.metrics.accuracy_score(tar_test, predictions))