statsmaths
Data Analysis
8 posts
Assignment 4 - Data Visualization
import pandas
import numpy
import seaborn
import matplotlib.pyplot as plt
data = pandas.read_csv('gapminder.csv', low_memory=False)
data = data.replace(r'^\s*$', numpy.nan, regex=True)
#setting variables you will be working with to numeric
data['internetuserate'] = pandas.to_numeric(data['internetuserate'])
data['urbanrate'] = pandas.to_numeric(data['urbanrate'])
data['incomeperperson'] = pandas.to_numeric(data['incomeperperson'])
data['hivrate'] = pandas.to_numeric(data['hivrate'])
desc1 = data['urbanrate'].describe()
#The output below shows descriptive statistics for the two variables used in this part of the analysis
print (desc1)
count 203.000000
mean 56.769360
std 23.844933
min 10.400000
25% 36.830000
50% 57.940000
75% 74.210000
max 100.000000
desc2 = data['internetuserate'].describe()
print (desc2)
count 192.000000
mean 35.632716
std 27.780285
min 0.210066
25% 9.999604
50% 31.810121
75% 56.416046
max 95.638113
#basic scatterplot: Q->Q
scat1 = seaborn.regplot(x="urbanrate", y="internetuserate", fit_reg=False, data=data)
plt.xlabel('Urban Rate')
plt.ylabel('Internet Use Rate')
plt.title('Scatterplot for the Association Between Urban Rate and Internet Use Rate')
scat2 = seaborn.regplot(x="urbanrate", y="internetuserate", data=data)
#scatterplot graph image cannot be attached here
plt.xlabel('Urban Rate')
plt.ylabel('Internet Use Rate')
plt.title('Scatterplot for the Association Between Urban Rate and Internet Use Rate')
scat3 = seaborn.regplot(x="incomeperperson", y="internetuserate", data=data)
#scatterplot graph image cannot be attached here
plt.xlabel('Income per Person')
plt.ylabel('Internet Use Rate')
plt.title('Scatterplot for the Association Between Income per Person and Internet Use Rate')
scat4 = seaborn.regplot(x="incomeperperson", y="hivrate", data=data)
plt.xlabel('Income per Person')
plt.ylabel('HIV Rate')
plt.title('Scatterplot for the Association Between Income per Person and HIV Rate')
#quartile split (use qcut function & ask for 4 groups - gives you quartile split)
print ('Income per person - 4 categories - quartiles')
data['INCOMEGRP4']=pandas.qcut(data.incomeperperson, 4, labels=["1=25th%tile","2=50%tile","3=75%tile","4=100%tile"])
c10 = data['INCOMEGRP4'].value_counts(sort=False, dropna=True)
print(c10)
1=25th%tile 48
2=50%tile 47
3=75%tile 47
4=100%tile 48
#bivariate bar graph C->Q
seaborn.catplot(x='INCOMEGRP4', y='hivrate', data=data, kind="bar", ci=None)
#bar chart image cannot be attached here
plt.xlabel('income group')
plt.ylabel('mean HIV rate')
c11= data.groupby('INCOMEGRP4').size()
print (c11)
INCOMEGRP4
1=25th%tile 48
2=50%tile 47
3=75%tile 47
4=100%tile 48
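Since the bar chart image could not be attached, the group means behind it can be printed as a numeric check. This is a minimal sketch, assuming the INCOMEGRP4 column created above; it simply averages hivrate within each income quartile.
mean_hiv = data.groupby('INCOMEGRP4')['hivrate'].mean()
print (mean_hiv)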
Summary:
We examined four variables from the Gapminder dataset - internet use rate, urban rate, income per person, and HIV rate - and computed descriptive statistics for each. We then plotted the variables against one another to look for associations. The scatterplot of urban rate against internet use rate shows a positive relationship between the two.
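As a follow-up, the strength of that positive association could be quantified with a Pearson correlation. This is a minimal sketch, assuming scipy is available (it is not imported in the code above) and dropping rows that are missing either value first.
import scipy.stats
sub = data[['urbanrate', 'internetuserate']].dropna()
r, p = scipy.stats.pearsonr(sub['urbanrate'], sub['internetuserate'])
print ('Pearson r:', r, 'p value:', p)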
Assignment 3
import pandas
import numpy
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
data = pandas.read_csv('gapminder.csv', low_memory=False)
print (len(data)) #number of observations (rows)
213
#setting variables you will be working with to numeric
data['urbanrate'] = pandas.to_numeric(data['urbanrate'], errors='coerce')
data['femaleemployrate'] = pandas.to_numeric(data['femaleemployrate'], errors='coerce')
data['lifeexpectancy'] = pandas.to_numeric(data['lifeexpectancy'], errors='coerce')
#creating three new variables for data management operations - urban, fer & le
data['urban']=data['urbanrate']
data['fer']=data['femaleemployrate']
data['le']=data['lifeexpectancy']
#data management for urbanrate
def urban (row):
    if row['urbanrate'] >= 75:
        return 3
    if 25 <= row['urbanrate'] < 75:
        return 2
    if row['urbanrate'] < 25:
        return 1
data['urban'] = data.apply(lambda row: urban (row),axis=1)
#frequency distribution for the new variable urban
c1 = data['urban'].value_counts(sort=False)
print (c1)
1.0 22
2.0 133
3.0 48
#data management for femaleemployrate
def fer (row):
    if row['femaleemployrate'] < 20:
        return 1
    if 20 <= row['femaleemployrate'] < 40:
        return 2
    if 40 <= row['femaleemployrate'] < 60:
        return 3
    if 60 <= row['femaleemployrate'] < 80:
        return 4
    if row['femaleemployrate'] >= 80:
        return 5
data['fer'] = data.apply(lambda row: fer (row),axis=1)
#frequency distribution for the new variable fer
c2 = data['fer'].value_counts(sort=False)
print (c2)
2.0 45
3.0 96
4.0 26
5.0 4
1.0 7
#data management for le
def le (row):
    if row['lifeexpectancy'] < 45:
        return 1
    if 45 <= row['lifeexpectancy'] < 55:
        return 2
    if 55 <= row['lifeexpectancy'] < 65:
        return 3
    if 65 <= row['lifeexpectancy'] < 75:
        return 4
    if row['lifeexpectancy'] >= 75:
        return 5
data['le'] = data.apply(lambda row: le (row),axis=1)
#frequency distribution for the new variable le
c3 = data['le'].value_counts(sort=False)
print (c3)
2.0 24
5.0 65
4.0 76
3.0 26
Summary:
In this module for data management I have collapsed the responses for urbanrate, femaleemployrate, and lifeexpectancy to create three new variables which are: urban, fer, and le.
For urban, the most commonly endorsed response was 2 (62.44%), meaning that most countries have an urban rate of 25% - 75%.
For fer, the most commonly endorsed response was 3 (45.07%), meaning that almost half of the countries have a female employment rate of 40% - 60%.
For le, the most commonly endorsed response was 4 (35.68%), meaning that the life expectancy in about one-third of the countries is 65 – 75 years old.
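The percentages quoted above can be reproduced directly from the recoded columns. A minimal sketch, assuming the urban, fer, and le variables created earlier and using all 213 rows (missing values included) as the denominator:
for col in ['urban', 'fer', 'le']:
    pct = data[col].value_counts(sort=False, dropna=False) / len(data) * 100
    print (pct.round(2))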
Assignment 2 (week 2)
import pandas
import numpy
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
data = pandas.read_csv('gapminder.csv', low_memory=False)
print (len(data)) #number of observations (rows)
213
print (len(data.columns)) # number of variables (columns)
16
#setting variables you will be working with to numeric
data['internetuserate'] = pandas.to_numeric(data['internetuserate'], errors='coerce')
data['urbanrate'] = pandas.to_numeric(data['urbanrate'], errors='coerce')
data['incomeperperson'] = pandas.to_numeric(data['incomeperperson'], errors='coerce')
#adding titles (i.e. frequency distributions) for each variable
print ('counts for internetuserate - 2010 Internet users (per 100 people)')
c1 = data['internetuserate'].value_counts(sort=False)
print (c1)
counts for internetuserate - 2010 Internet users (per 100 people)
81.000000 1
66.000000 1
45.000000 1
65.000000 1
2.100213 1
..
36.000335 1
41.000128 1
33.616683 1
60.119707 1
1.699985 1
Name: internetuserate, Length: 192, dtype: int64
print ('percentages for internetuserate - 2010 Internet users (per 100 people)')
p1 = data['internetuserate'].value_counts(sort=False, normalize=True)
print (p1)
percentages for internetuserate - 2010 Internet users (per 100 people)
81.000000 0.005208
66.000000 0.005208
45.000000 0.005208
65.000000 0.005208
2.100213 0.005208
...
36.000335 0.005208
41.000128 0.005208
33.616683 0.005208
60.119707 0.005208
1.699985 0.005208
Name: internetuserate, Length: 192, dtype: float64
print ('counts for urbanrate - 2008 urban population (% of total)')
c2 = data['urbanrate'].value_counts(sort=False)
print(c2)
counts for urbanrate - 2008 urban population (% of total)
92.00 1
100.00 6
74.50 1
73.50 1
17.00 1
..
56.02 1
57.18 1
73.92 1
25.46 1
28.38 1
Name: urbanrate, Length: 194, dtype: int64
print ('percentages for urbanrate - 2008 urban population (% of total)')
p2 = data['urbanrate'].value_counts(sort=False, normalize=True)
print (p2)
percentages for urbanrate - 2008 urban population (% of total)
92.00 0.004926
100.00 0.029557
74.50 0.004926
73.50 0.004926
17.00 0.004926
...
56.02 0.004926
57.18 0.004926
73.92 0.004926
25.46 0.004926
28.38 0.004926
Name: urbanrate, Length: 194, dtype: float64
print ('counts for incomeperperson - 2010 GDP per capita in constant 2000 US $')
c3 = data['incomeperperson'].value_counts(sort=False, dropna=False)
print(c3)
counts for incomeperperson - 2010 GDP per capita in constant 2000 US $
NaN 23
8614.120219 1
39972.352768 1
279.180453 1
161.317137 1
..
377.421113 1
2344.896916 1
25306.187193 1
4180.765821 1
25575.352623 1
Name: incomeperperson, Length: 191, dtype: int64
print ('percentages for incomeperperson - 2010 GDP per capita in constant 2000 US $')
p3 = data['incomeperperson'].value_counts(sort=False, normalize=True)
print (p3)
percentages for incomeperperson - 2010 GDP per capita in constant 2000 US $
5188.900935 0.005263
8614.120219 0.005263
39972.352768 0.005263
279.180453 0.005263
161.317137 0.005263
...
377.421113 0.005263
2344.896916 0.005263
25306.187193 0.005263
4180.765821 0.005263
25575.352623 0.005263
Name: incomeperperson, Length: 190, dtype: float64
Summary: The data set used here is 'gapminder.csv'. I have considered three variables of interest:
internetuserate - 2010 Internet users (per 100 people)
urbanrate - 2008 urban population (% of total)
incomeperperson - 2010 GDP per capita in constant 2000 US $
The program was written and run in Python, and the output is shown above. First I printed the number of observations (rows) in the data set, which is 213, and then the number of variables (columns), which is 16.
Next, as required by the assignment, I ran frequency distributions (counts and percentages) for each of the three variables; the output is shown above.
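Because these variables are continuous, almost every value occurs only once, so the raw frequency tables above are mostly counts of 1. A minimal sketch of a more readable distribution, binning one variable first with pandas.cut (the five equal-width bins and the internetgrp name are illustrative choices, not part of the assignment):
data['internetgrp'] = pandas.cut(data['internetuserate'], 5)
print (data['internetgrp'].value_counts(sort=False))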
Assignment 1 - Data Management & Visualization
1. Data Set Selected: I would like to work on the Gapminder data set, which contains data for all 192 UN members and seeks to increase the use and understanding of statistics about social, economic, and environmental development at local, national, and global levels.
2. Research Question: Determine the association (if any) between the rate of internet users and the rate of the country's population living in urban settings.
Variables of interest from the codebook for the above stated hypothesis - Internetuserate & Urbanrate
Variables Type - Both are ordinal variables
3. Second hypothesis: Is there any association between the rate of internet users and income per person for a region/country?
Variables of interest from the codebook for the above stated hypothesis - Internetuserate & Incomeperperson
Variables Type - Both are ordinal variables
4. Literature review of hypothesis #1: IJERPH | Free Full-Text | Association between Urban Upbringing and Compulsive Internet Use in Japan: A Cross-Sectional, Multilevel Study with Retrospective Recall (mdpi.com)
5. From the above literature review, the findings as are below:
Growing up in a large city was significantly associated with higher Compulsive Internet Use Scale (CIUS) scores (γ = 1.65, Standard Error (SE) = 0.45) and Mild CIU + Severe CIU (Exp(γ) = 1.44; 95% Confidence Interval (CI) (1.04–2.00)) compared to growing up in a small municipality
Statistical Interactions - Assignment 4
Objective:
The goal of this assignment is to analyze a dataset ("diet_exercise.csv") to reveal any association that may exist between weight loss and diet. Here the categorical explanatory variable is "Diet", with two levels - Diet A and Diet B - and the quantitative response variable is weight loss. The analysis uses the ANOVA method, and the first part of the results below shows a p value of 0.00133, which is less than 0.05, indicating a statistically significant association between the two variables.
As part of this exercise we also introduced a moderator variable called "exercise", which has two levels - Cardio and Weights - and re-ran the same test within each level. The results are below.
Results:
The association between diet and weight loss was much stronger for subjects doing cardio exercise (p value of 2.00e-12) than for those using weights (p value of 0.00442). This illustrates how the apparent relationship between two variables can depend on a third, moderating variable.
Code Snippet & Output:
import numpy
import pandas
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
import seaborn
import matplotlib.pyplot as plt
data = pandas.read_csv('diet_exercise.csv', low_memory=False)
model1 = smf.ols(formula='WeightLoss ~ C(Diet)', data=data).fit()
print (model1.summary())
sub1 = data[['WeightLoss', 'Diet']].dropna()
m1= sub1.groupby('Diet').mean()
print ("means for WeightLoss by Diet A vs. B")
print (m1)
means for WeightLoss by Diet A vs. B
WeightLoss
Diet
A 14.655
B 9.320
st1= sub1.groupby('Diet').std()
print ("standard deviation for mean WeightLoss by Diet A vs. B")
print (st1)
standard deviation for mean WeightLoss by Diet A vs. B
WeightLoss
Diet
A 6.302086
B 2.779360
# bivariate bar graph
# seaborn.factorplot was renamed in newer seaborn versions; catplot is the equivalent
seaborn.catplot(x="Diet", y="WeightLoss", data=data, kind="bar", ci=None)
plt.xlabel('Diet Type')
plt.ylabel('Mean Weight Loss in pounds')
sub2=data[(data['Exercise']=='Cardio')]
sub3=data[(data['Exercise']=='Weights')]
model2 = smf.ols(formula='WeightLoss ~ C(Diet)', data=sub2).fit()
print ('association between diet and weight loss for those using Cardio exercise')
print (model2.summary())
model3 = smf.ols(formula='WeightLoss ~ C(Diet)', data=sub3).fit()
print ('association between diet and weight loss for those using Weights exercise')
print (model3.summary())
m3= sub2.groupby('Diet')['WeightLoss'].mean()
print ("means for WeightLoss by Diet A vs. B for CARDIO")
print (m3)
m4 = sub3.groupby('Diet')['WeightLoss'].mean()
print ("Means for WeightLoss by Diet A vs. B for WEIGHTS")
print (m4)
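Subsetting by exercise type is one way to probe moderation; an alternative is to fit the interaction directly in a single model. A minimal sketch, not part of the original assignment, assuming the same WeightLoss, Diet, and Exercise columns:
model4 = smf.ols(formula='WeightLoss ~ C(Diet) * C(Exercise)', data=data).fit()
print (model4.summary())
A significant Diet:Exercise interaction term would confirm that the effect of diet on weight loss differs by exercise type.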
Assignment 3 - Calculating a Correlation Coefficient (r):
Objective:
For this assignment we needed to generate correlation coefficients among the variables of interest. In this study we were interested in whether there is any association between 'Urban rate' and 'Internet use rate', and between 'Income per person' and 'Internet use rate'. All three variables are quantitative. Python was used to conduct the analysis.
Results:
1) Urban rate & internet use rate: r (correlation coefficient) = 0.62 and p value = 4.540316299446745e-21. The r value indicates a moderately strong positive relationship, and the very small p value tells us that the association between urbanization rate and internet use rate is statistically significant.
2) Income per person & internet use rate: r (correlation coefficient) = 0.75 and p value = 3.067338031233027e-34. The r value indicates a strong positive relationship, and the very small p value tells us that the association between income per person and internet use rate is statistically significant.
Code Snippet:
import pandas
import numpy
import seaborn
import scipy
import matplotlib.pyplot as plt
data = pandas.read_csv('gapminder.csv', low_memory=False)
#setting variables you will be working with to numeric
data['internetuserate'] = pandas.to_numeric(data['internetuserate'], errors='coerce')
data['urbanrate'] = pandas.to_numeric(data['urbanrate'], errors='coerce')
data['incomeperperson'] = pandas.to_numeric(data['incomeperperson'], errors='coerce')
data['incomeperperson']=data['incomeperperson'].replace(' ', numpy.nan)
scat1 = seaborn.regplot(x="urbanrate", y="internetuserate", fit_reg=True, data=data)
plt.xlabel('Urban Rate')
plt.ylabel('Internet Use Rate')
plt.title('Scatterplot for the Association Between Urban Rate and Internet Use Rate')
scat2 = seaborn.regplot(x="incomeperperson", y="internetuserate", fit_reg=True, data=data)
plt.xlabel('Income per Person')
plt.ylabel('Internet Use Rate')
plt.title('Scatterplot for the Association Between Income per Person and Internet Use Rate')
data_clean=data.dropna()
print ('association between urbanrate and internetuserate')
print (scipy.stats.pearsonr(data_clean['urbanrate'], data_clean['internetuserate']))
association between urbanrate and internetuserate
(0.6244640029489794, 4.540316299446745e-21)
print ('association between incomeperperson and internetuserate')
print (scipy.stats.pearsonr(data_clean['incomeperperson'], data_clean['internetuserate']))
association between incomeperperson and internetuserate
(0.7507274333051273, 3.067338031233027e-34)
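Squaring r gives the proportion of variability shared between each pair of variables. A minimal sketch reusing the data_clean frame defined above:
r1, p1 = scipy.stats.pearsonr(data_clean['urbanrate'], data_clean['internetuserate'])
r2, p2 = scipy.stats.pearsonr(data_clean['incomeperperson'], data_clean['internetuserate'])
print ('r-squared for urbanrate vs internetuserate:', r1**2)
print ('r-squared for incomeperperson vs internetuserate:', r2**2)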
Assignment 2
Objective:
For this assignment a chi-square test of independence was run in Python to analyze the association between two categorical variables - nicotine dependence and days smoked per month. The dataset 'nesarc.csv' was used to see whether there is an association between the proportion with nicotine dependence and smoking frequency for young adults in the 18 to 25 age bracket - in other words, whether nicotine dependence increases as the number of days smoked per month increases. The explanatory variable, days smoked per month, had six levels: 1, 2.5, 6, 14, 22, and 30. We also ran post-hoc analyses comparing nicotine dependence between pairs of smoking frequency groups.
Results:
The results show that the proportion with nicotine dependence rises as days smoked per month increase, and the overall chi-square test is significant. The chi-square value, p value, and expected counts for each comparison are shown below.
Code Snippet & Output:
import pandas
import numpy
import scipy.stats
import seaborn
import matplotlib.pyplot as plt
data = pandas.read_csv('nesarc.csv', low_memory=False)
# new code setting variables you will be working with to numeric
data['TAB12MDX'] = pandas.to_numeric(data['TAB12MDX'], errors='coerce')
data['CHECK321'] = pandas.to_numeric(data['CHECK321'], errors='coerce')
data['S3AQ3B1'] = pandas.to_numeric(data['S3AQ3B1'], errors='coerce')
data['S3AQ3C1'] = pandas.to_numeric(data['S3AQ3C1'], errors='coerce')
data['AGE'] = pandas.to_numeric(data['AGE'], errors='coerce')
#subset data to young adults age 18 to 25 who have smoked in the past 12 months
sub1=data[(data['AGE']>=18) & (data['AGE']<=25) & (data['CHECK321']==1)]
#make a copy of my new subsetted data
sub2 = sub1.copy()
# recode missing values to python missing (NaN)
sub2['S3AQ3B1']=sub2['S3AQ3B1'].replace(9, numpy.nan)
sub2['S3AQ3C1']=sub2['S3AQ3C1'].replace(99, numpy.nan)
#recoding values for S3AQ3B1 into a new variable, USFREQMO
recode1 = {1: 30, 2: 22, 3: 14, 4: 6, 5: 2.5, 6: 1}
sub2['USFREQMO']= sub2['S3AQ3B1'].map(recode1)
# contingency table of observed counts
ct1=pandas.crosstab(sub2['TAB12MDX'], sub2['USFREQMO'])
print (ct1)
USFREQMO 1.0 2.5 6.0 14.0 22.0 30.0
TAB12MDX
0 64 53 69 59 41 521
1 7 12 19 32 27 799
# column percentages
colsum=ct1.sum(axis=0)
colpct=ct1/colsum
print(colpct)
USFREQMO 1.0 2.5 6.0 14.0 22.0 30.0
TAB12MDX
0 0.901408 0.815385 0.784091 0.648352 0.602941 0.394697
1 0.098592 0.184615 0.215909 0.351648 0.397059 0.605303
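The same column percentages can be obtained in a single step; a minimal sketch using crosstab's normalize option on the same sub2 frame:
print (pandas.crosstab(sub2['TAB12MDX'], sub2['USFREQMO'], normalize='columns'))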
# chi-square
print ('chi-square value, p value, expected counts')
cs1= scipy.stats.chi2_contingency(ct1)
print (cs1)
chi-square value, p value, expected counts
(165.27320708055845, 7.436364208390599e-34, 5, array([[ 33.64474457, 30.80152672, 41.70052848, 43.1221374 , 32.22313564, 625.50792719], [ 37.35525543, 34.19847328, 46.29947152, 47.8778626 , 35.77686436, 694.49207281]]))
# set variable types
sub2["USFREQMO"] = sub2["USFREQMO"].astype('category')
sub2['TAB12MDX'] = pandas.to_numeric(sub2['TAB12MDX'], errors='coerce')
# graph percent with nicotine dependence within each smoking frequency group
seaborn.catplot(x="USFREQMO", y="TAB12MDX", data=sub2, kind="bar", ci=None)
plt.xlabel('Days smoked per month')
plt.ylabel('Proportion Nicotine Dependent')
recode2 = {1: 1, 2.5: 2.5}
sub2['COMP1v2']= sub2['USFREQMO'].map(recode2)
ct2=pandas.crosstab(sub2['TAB12MDX'], sub2['COMP1v2'])
print (ct2)
COMP1v2 1.0 2.5
TAB12MDX
0 64 53
1 7 12
# column percentages
colsum=ct2.sum(axis=0)
colpct=ct2/colsum
print(colpct)
COMP1v2 1.0 2.5
TAB12MDX
0 0.901408 0.815385
1 0.098592 0.184615
print ('chi-square value, p value, expected counts')
cs2= scipy.stats.chi2_contingency(ct2)
print (cs2)
chi-square value, p value, expected counts
(1.4348930637007287, 0.2309675448977717, 1, array([[61.08088235, 55.91911765], [ 9.91911765, 9.08088235]]))
recode3 = {1: 1, 6: 6}
sub2['COMP1v6']= sub2['USFREQMO'].map(recode3)
# contingency table of observed counts
ct3=pandas.crosstab(sub2['TAB12MDX'], sub2['COMP1v6'])
print (ct3)
COMP1v6 1.0 6.0
TAB12MDX
0 64 69
1 7 19
# column percentages
colsum=ct3.sum(axis=0)
colpct=ct3/colsum
print(colpct)
COMP1v6 1.0 6.0
TAB12MDX
0 0.901408 0.784091
1 0.098592 0.215909
print ('chi-square value, p value, expected counts')
cs3= scipy.stats.chi2_contingency(ct3)
print (cs3)
chi-square value, p value, expected counts
(3.142840191220936, 0.07626090198286821, 1, array([[59.38993711, 73.61006289], [11.61006289, 14.38993711]]))
recode4 = {1: 1, 14: 14}
sub2['COMP1v14']= sub2['USFREQMO'].map(recode4)
# contingency table of observed counts
ct4=pandas.crosstab(sub2['TAB12MDX'], sub2['COMP1v14'])
print (ct4)
COMP1v14 1.0 14.0
TAB12MDX
0 64 59
1 7 32
# column percentages
colsum=ct4.sum(axis=0)
colpct=ct4/colsum
print(colpct)
COMP1v14 1.0 14.0
TAB12MDX
0 0.901408 0.648352
1 0.098592 0.351648
print ('chi-square value, p value, expected counts')
cs4= scipy.stats.chi2_contingency(ct4)
print (cs4)
chi-square value, p value, expected counts
(12.622564075461572, 0.00038111819882681824, 1, array([[53.90740741, 69.09259259], [17.09259259, 21.90740741]]))
recode5 = {1: 1, 22: 22}
sub2['COMP1v22']= sub2['USFREQMO'].map(recode5)
# contingency table of observed counts
ct5=pandas.crosstab(sub2['TAB12MDX'], sub2['COMP1v22'])
print (ct5)
COMP1v22 1.0 22.0
TAB12MDX
0 64 41
1 7 27
# column percentages
colsum=ct5.sum(axis=0)
colpct=ct5/colsum
print(colpct)
COMP1v22 1.0 22.0
TAB12MDX
0 0.901408 0.602941
1 0.098592 0.397059
print ('chi-square value, p value, expected counts')
cs5= scipy.stats.chi2_contingency(ct5)
print (cs5)
chi-square value, p value, expected counts
(15.169488833230059, 9.827865291318501e-05, 1, array([[53.63309353, 51.36690647], [17.36690647, 16.63309353]]))
recode6 = {1: 1, 30: 30}
sub2['COMP1v30']= sub2['USFREQMO'].map(recode6)
# contingency table of observed counts
ct6=pandas.crosstab(sub2['TAB12MDX'], sub2['COMP1v30'])
print (ct6)
COMP1v30 1.0 30.0
TAB12MDX
0 64 521
1 7 799
# column percentages
colsum=ct6.sum(axis=0)
colpct=ct6/colsum
print(colpct)
COMP1v30 1.0 30.0
TAB12MDX
0 0.901408 0.394697
1 0.098592 0.605303
print ('chi-square value, p value, expected counts')
cs6= scipy.stats.chi2_contingency(ct6)
print (cs6)
chi-square value, p value, expected counts
(68.92471874488487, 1.0229460827061155e-16, 1, array([[ 29.85981308, 555.14018692], [ 41.14018692, 764.85981308]]))
recode7 = {2.5: 2.5, 6: 6}
sub2['COMP2v6']= sub2['USFREQMO'].map(recode7)
# contingency table of observed counts
ct7=pandas.crosstab(sub2['TAB12MDX'], sub2['COMP2v6'])
print (ct7)
COMP2v6 2.5 6.0
TAB12MDX
0 53 69
1 12 19
# column percentages
colsum=ct7.sum(axis=0)
colpct=ct7/colsum
print(colpct)
COMP2v6 2.5 6.0
TAB12MDX
0 0.815385 0.784091
1 0.184615 0.215909
print ('chi-square value, p value, expected counts')
cs7=scipy.stats.chi2_contingency(ct7)
print (cs7)
chi-square value, p value, expected counts
(0.07430561076945266, 0.7851679729700605, 1, array([[51.83006536, 70.16993464], [13.16993464, 17.83006536]]))
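Because several pairwise chi-square tests were run, each p value should be compared to a Bonferroni-adjusted threshold rather than 0.05. A minimal sketch, assuming all 15 possible pairwise comparisons among the six smoking-frequency groups:
num_comparisons = 15  # 6 groups taken 2 at a time
adjusted_alpha = 0.05 / num_comparisons
print ('Bonferroni-adjusted alpha:', adjusted_alpha)
Against this threshold (about 0.0033), the 1 vs 14, 1 vs 22, and 1 vs 30 comparisons above remain significant, while 1 vs 2.5, 1 vs 6, and 2.5 vs 6 do not.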
Assignment 1
import numpy
import pandas
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
data = pandas.read_csv('nesarc.csv', low_memory=False)
#setting variables you will be working with to numeric
data['S3AQ3B1'] = pandas.to_numeric(data['S3AQ3B1'], errors='coerce')
data['S3AQ3C1'] = pandas.to_numeric(data['S3AQ3C1'], errors='coerce')
data['CHECK321'] = pandas.to_numeric(data['CHECK321'], errors='coerce')
#subset data to young adults age 18 to 25 who have smoked in the past 12 months
sub1=data[(data['AGE']>=18) & (data['AGE']<=25) & (data['CHECK321']==1)]
#SETTING MISSING DATA
sub1['S3AQ3B1']=sub1['S3AQ3B1'].replace(9, numpy.nan)
sub1['S3AQ3C1']=sub1['S3AQ3C1'].replace(99, numpy.nan)
#recoding number of days smoked in the past month
recode1 = {1: 30, 2: 22, 3: 14, 4: 5, 5: 2.5, 6: 1}
sub1['USFREQMO']= sub1['S3AQ3B1'].map(recode1)
#converting new variable USFREQMO to numeric
sub1['USFREQMO'] = pandas.to_numeric(sub1['USFREQMO'], errors='coerce')
# Creating a secondary variable multiplying the days smoked/month and the number of cig/per day
sub1['NUMCIGMO_EST']=sub1['USFREQMO'] * sub1['S3AQ3C1']
sub1['NUMCIGMO_EST']= pandas.to_numeric(sub1['NUMCIGMO_EST'], errors='coerce')
ct1 = sub1.groupby('NUMCIGMO_EST').size()
print (ct1)
NUMCIGMO_EST
1.0       29
2.0       14
2.5       11
3.0       12
4.0        2          
..
1050.0     1
1200.0    29
1800.0     2
2400.0     1
2940.0     1
Length: 66, dtype: int64
# using ols function for calculating the F-statistic and associated p value
model1 = smf.ols(formula='NUMCIGMO_EST ~ C(MAJORDEPLIFE)', data=sub1)
results1 = model1.fit()
print (results1.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:           NUMCIGMO_EST   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     3.550
Date:                Wed, 26 Jan 2022   Prob (F-statistic):             0.0597
Time:                        17:58:07   Log-Likelihood:                -11934.
No. Observations:                1697   AIC:                         2.387e+04
Df Residuals:                    1695   BIC:                         2.388e+04
Df Model:                           1
Covariance Type:            nonrobust
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept              312.8380      7.747     40.381      0.000     297.643     328.033
C(MAJORDEPLIFE)[T.1]    28.5370     15.146      1.884      0.060      -1.169      58.243
==============================================================================
Omnibus:                      673.875   Durbin-Watson:                   1.982
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             5043.141
Skew:                           1.672   Prob(JB):                         0.00
Kurtosis:                      10.755   Cond. No.                         2.46
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
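The F statistic and its p value can also be pulled out of the fitted model programmatically; a minimal sketch using the results1 object above:
print ('F-statistic:', results1.fvalue)
print ('Prob (F-statistic):', results1.f_pvalue)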
sub2 = sub1[['NUMCIGMO_EST', 'MAJORDEPLIFE']].dropna()
print ('means for numcigmo_est by major depression status')
m1= sub2.groupby('MAJORDEPLIFE').mean()
print (m1)
means for numcigmo_est by major depression status              
NUMCIGMO_EST
MAJORDEPLIFE              
0               312.837989
1               341.375000
print ('standard deviations for numcigmo_est by major depression status')
sd1 = sub2.groupby('MAJORDEPLIFE').std()
print (sd1)
standard deviations for numcigmo_est by major depression status
              NUMCIGMO_EST
MAJORDEPLIFE              
0             269.002344
1               288.495118
sub3 = sub1[['NUMCIGMO_EST', 'ETHRACE2A']].dropna()
model2 = smf.ols(formula='NUMCIGMO_EST ~ C(ETHRACE2A)', data=sub3).fit()
print (model2.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:           NUMCIGMO_EST   R-squared:                       0.055
Model:                            OLS   Adj. R-squared:                  0.052
Method:                 Least Squares   F-statistic:                     24.40
Date:                Wed, 26 Jan 2022   Prob (F-statistic):           1.18e-19
Time:                        18:01:14   Log-Likelihood:                -11888.
No. Observations:                1697   AIC:                         2.379e+04
Df Residuals:                    1692   BIC:                         2.381e+04
Df Model:                           4
Covariance Type:            nonrobust
=====================================================================================
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept           368.7865      8.229     44.814      0.000     352.646     384.927
C(ETHRACE2A)[T.2]  -109.5127     20.189     -5.424      0.000    -149.111     -69.914
C(ETHRACE2A)[T.3]   -57.7984     42.038     -1.375      0.169    -140.250      24.653
C(ETHRACE2A)[T.4]  -124.5279     36.033     -3.456      0.001    -195.201     -53.854
C(ETHRACE2A)[T.5]  -149.0283     16.795     -8.873      0.000    -181.969    -116.087
==============================================================================
Omnibus:                      712.397   Durbin-Watson:                   1.994
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             6548.614
Skew:                           1.717   Prob(JB):                         0.00
Kurtosis:                      11.990   Cond. No.                         6.72
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
print ('means for numcigmo_est by ethnic group')
m2= sub3.groupby('ETHRACE2A').mean()
print (m2)
means for numcigmo_est by ethnic group
NUMCIGMO_EST
ETHRACE2A              
1            368.786528
2            259.273810
3            310.988095
4            244.258621
5            219.758258
print ('standard deviations for numcigmo_est by ethnic group')
sd2 = sub3.groupby('ETHRACE2A').std()
print (sd2)
standard deviations for numcigmo_est by ethnic group
           NUMCIGMO_EST
ETHRACE2A              
1            281.430730
2            278.677392
3            260.116964
4            195.076441
5            220.859365
mc1 = multi.MultiComparison(sub3['NUMCIGMO_EST'], sub3['ETHRACE2A'])
res1 = mc1.tukeyhsd()
print(res1.summary())
Multiple Comparison of Means - Tukey HSD, FWER=0.05
=========================================================
group1 group2  meandiff  p-adj     lower      upper  reject
---------------------------------------------------------
     1      2 -109.5127  0.001 -164.6441   -54.3814   True
     1      3  -57.7984 0.6251 -172.5914    56.9945  False
     1      4 -124.5279 0.0051 -222.9229   -26.1329   True
     1      5 -149.0283  0.001   -194.89  -103.1665   True
     2      3   51.7143 0.7555  -71.6021   175.0307  False
     2      4  -15.0152    0.9  -123.233    93.2026  False
     2      5  -39.5156 0.4492 -103.8025    24.7714  False
     3      4  -66.7295 0.7058 -214.5437    81.0848  False
     3      5  -91.2298 0.2269 -210.6902    28.2305  False
     4      5  -24.5004    0.9 -128.3027     79.302  False
---------------------------------------------------------
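As a cross-check on the first ANOVA, the same F statistic and p value can be obtained from scipy's one-way ANOVA on the two depression groups. A minimal sketch, assuming the sub2 frame created above (scipy is not imported in the original code):
import scipy.stats
grp0 = sub2[sub2['MAJORDEPLIFE']==0]['NUMCIGMO_EST']
grp1 = sub2[sub2['MAJORDEPLIFE']==1]['NUMCIGMO_EST']
f, p = scipy.stats.f_oneway(grp0, grp1)
print ('F statistic:', f, 'p value:', p)
With two groups this should reproduce the F = 3.550 and p = 0.0597 reported in the regression output above.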