Code 3
"""
Created on Thu Jun 17 21:01:53 2021
@author: MariaRomo
"""
import pandas
import numpy
import seaborn
import matplotlib.pyplot as plt
# Display settings: print every column and row, and render floats in plain
# fixed-point notation.
pandas.set_option('display.max_columns', None)
pandas.set_option('display.max_rows', None)
pandas.set_option('display.float_format', lambda value: '%f' % value)

# Load the NESARC data set.
data = pandas.read_csv('nesarc_pds.csv', low_memory=False)

# Make sure every variable used in the analysis is numeric.
for column in ('TAB12MDX', 'CHECK321', 'S3AQ3B1', 'S3AQ3C1', 'AGE'):
    data[column] = pandas.to_numeric(data[column])

# Keep respondents aged 18 to 25 (both ends included) with CHECK321 == 1,
# then work on an independent copy so the recodes below leave `data` alone.
in_age_range = data['AGE'].between(18, 25)
check321_is_one = data['CHECK321'] == 1
sub1 = data[in_age_range & check321_is_one]
sub2 = sub1.copy()

# Replace the codes 9 (S3AQ3B1) and 99 (S3AQ3C1) with NaN.
sub2['S3AQ3B1'] = sub2['S3AQ3B1'].replace(9, numpy.nan)
sub2['S3AQ3C1'] = sub2['S3AQ3C1'].replace(99, numpy.nan)

# USFREQ: S3AQ3B1 reverse-coded (1 <-> 6, 2 <-> 5, 3 <-> 4).
recode1 = {code: 7 - code for code in range(1, 7)}
sub2['USFREQ'] = sub2['S3AQ3B1'].map(recode1)

# USFREQMO: S3AQ3B1 category mapped to a numeric per-month value
# (presumably smoking days per month -- confirm against the codebook).
recode2 = {
    1: 30,
    2: 22,
    3: 14,
    4: 5,
    5: 2.5,
    6: 1,
}
sub2['USFREQMO'] = sub2['S3AQ3B1'].map(recode2)

# Estimated cigarettes per month = per-month frequency * S3AQ3C1.
sub2['NUMCIGMO_EST'] = sub2['USFREQMO'] * sub2['S3AQ3C1']
# Univariate graphs.
#
# countplot and distplot draw on the *current* matplotlib Axes. Without an
# explicit new figure for each chart, running this file as a script paints the
# histogram on top of the bar chart and overwrites its label and title, so
# each chart gets its own figure here.

# Bar chart of TAB12MDX, treated as a categorical variable.
sub2["TAB12MDX"] = sub2["TAB12MDX"].astype('category')
plt.figure()
seaborn.countplot(x="TAB12MDX", data=sub2)
plt.xlabel('Nicotine Dependence past 12 months')
plt.title('Nicotine Dependence in the Past 12 Months Among Young Adult Smokers in the NESARC Study')

# Histogram of the estimated cigarettes per month (missing values dropped).
# NOTE(review): distplot is deprecated in newer seaborn releases (histplot is
# the replacement) -- confirm the installed version before switching.
plt.figure()
seaborn.distplot(sub2["NUMCIGMO_EST"].dropna(), kde=False)
plt.xlabel('Number of Cigarettes per Month')
plt.title('Estimated Number of Cigarettes per Month among Young Adult Smokers in the NESARC Study')
# Descriptive statistics and frequency tables for the analysis subset (sub2).

# Estimated cigarettes per month: summary statistics and counts per value.
print('describe number of cigarettes smoked per month')
desc1 = sub2['NUMCIGMO_EST'].describe()
print(desc1)
c2 = sub2.groupby('NUMCIGMO_EST').size()
print(c2)

# TAB12MDX: summary statistics, counts and percentages.
print('describe nicotine dependence')
desc2 = sub2['TAB12MDX'].describe()
print(desc2)
c1 = sub2.groupby('TAB12MDX').size()
print(c1)
# The counts come from sub2, so the percentage denominator must be the size
# of sub2. Dividing by len(data) (the full sample) would understate every
# percentage. Rows with a missing value are not counted by groupby but are
# still part of the denominator.
p1 = c1 * 100 / len(sub2)
print(p1)

# Estimated cigarettes per month: counts again, followed by percentages.
print(c2)
p2 = c2 * 100 / len(sub2)
print(p2)
# Packs per month: estimated cigarettes per month divided by 20
# (presumably 20 cigarettes per pack).
sub2['PACKSPERMONTH'] = sub2['NUMCIGMO_EST'].div(20)
c2 = sub2.groupby('PACKSPERMONTH').size()
print(c2)

# Bin packs per month into the intervals (0, 5], (5, 10], (10, 20],
# (20, 30] and (30, 147]; values outside these bins become NaN.
pack_bin_edges = [0, 5, 10, 20, 30, 147]
sub2['PACKCATEGORY'] = pandas.cut(sub2['PACKSPERMONTH'], bins=pack_bin_edges)
# Store the binned variable with categorical dtype.
sub2['PACKCATEGORY'] = sub2['PACKCATEGORY'].astype('category')

print('pack category counts')
c7 = sub2['PACKCATEGORY'].value_counts(sort=False, dropna=True)
print(c7)

print('describe PACKCATEGORY')
desc3 = sub2['PACKCATEGORY'].describe()
print(desc3)

# TAB12MDX must be numeric again so it can be plotted on the y axis.
sub2['TAB12MDX'] = pandas.to_numeric(sub2['TAB12MDX'])

# Bivariate bar chart (C->Q): pack category on x, TAB12MDX on y, no error bars.
seaborn.catplot(x="PACKCATEGORY", y="TAB12MDX", data=sub2, kind="bar", ci=None)
plt.xlabel('Packs per Month')
plt.ylabel('Proportion Nicotine Dependent')
Univariate: Number of cigarettes per month
Bivariate graph: Relationship between packs per month and the proportion of nicotine-dependent respondents
0 notes
Code 2
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 9 18:27:23 2021
@author: MariaRomo
"""
import pandas
import numpy
# Load the NESARC data set.
data = pandas.read_csv('nesarc_pds.csv', low_memory=False)

# Convert every column name to upper case.
data.columns = [str.upper(name) for name in data.columns]

# Print floats in plain fixed-point notation (per the original note, this
# was added to avoid a runtime error).
pandas.set_option('display.float_format', lambda value: '%f' % value)

# Set the variables used in this analysis to numeric.
for column in ('TAB12MDX', 'CHECK321', 'S3AQ3B1', 'S3AQ3C1', 'AGE'):
    data[column] = pandas.to_numeric(data[column])

# Filter the data: respondents aged 18 to 25 (both ends included) with
# CHECK321 == 1.
in_age_range = data['AGE'].between(18, 25)
check321_is_one = data['CHECK321'] == 1
sub1 = data[in_age_range & check321_is_one]

# Independent copy of the new subset.
sub2 = sub1.copy()
def _print_counts(column, **count_options):
    """Print the 'counts for <column>' heading and the frequency table.

    The table is computed from ``sub2[column]`` with ``sort=False`` plus any
    extra ``value_counts`` options, printed, and returned.
    """
    print('counts for ' + column)
    table = sub2[column].value_counts(sort=False, **count_options)
    print(table)
    return table


# S3AQ3B1: tabulate, replace code 9 with NaN, then tabulate again with the
# NaN row included.
c5 = _print_counts('S3AQ3B1')
sub2['S3AQ3B1'] = sub2['S3AQ3B1'].replace(9, numpy.nan)
c6 = _print_counts('S3AQ3B1', dropna=False)

# S3AQ3C1: same steps, removing the 99 code.
c7 = _print_counts('S3AQ3C1')
sub2['S3AQ3C1'] = sub2['S3AQ3C1'].replace(99, numpy.nan)
c8 = _print_counts('S3AQ3C1', dropna=False)
# Where S2AQ3 is not 9 and S2AQ8A is missing, set S2AQ8A to 11.
# This edits `data`; `sub2` is a copy made earlier and is not affected.
fill_s2aq8a = (data['S2AQ3'] != 9) & data['S2AQ8A'].isnull()
data.loc[fill_s2aq8a, 'S2AQ8A'] = 11

# USFREQ: S3AQ3B1 reverse-coded (1 <-> 6, 2 <-> 5, 3 <-> 4).
recode1 = {code: 7 - code for code in range(1, 7)}
sub2['USFREQ'] = sub2['S3AQ3B1'].map(recode1)

# USFREQMO: S3AQ3B1 category mapped to a numeric per-month value
# (presumably smoking days per month -- confirm against the codebook).
recode2 = {
    1: 30,
    2: 22,
    3: 14,
    4: 5,
    5: 2.5,
    6: 1,
}
sub2['USFREQMO'] = sub2['S3AQ3B1'].map(recode2)

print('counts for USFREQMO')
c9 = sub2['USFREQMO'].value_counts(sort=False, dropna=False)
print(c9)

# Secondary variable: per-month frequency multiplied by S3AQ3C1.
sub2['NUMCIGMO_EST'] = sub2['USFREQMO'] * sub2['S3AQ3C1']

# Show the first 25 rows of the columns involved to check the computation.
check_columns = ['IDNUM', 'S3AQ3C1', 'USFREQMO', 'NUMCIGMO_EST']
sub3 = sub2[check_columns]
a = sub3.head(25)
print(a)

# Split AGE into the groups (17, 20], (20, 22] and (22, 25], then
# cross-tabulate the groups against AGE to verify the split.
sub2['AGEGROUP3'] = pandas.cut(sub2['AGE'], bins=[17, 20, 22, 25])
print(pandas.crosstab(index=sub2['AGEGROUP3'], columns=sub2['AGE']))
Output:
runfile('/Users/MariaRomo/Documents/NESARC/EjercicioMR2.py', wdir='/Users/MariaRomo/Documents/NESARC')
counts for S3AQ3B1
9.000000 3
6.000000 71
3.000000 91
2.000000 68
5.000000 65
1.000000 1320
4.000000 88
Name: S3AQ3B1, dtype: int64
counts for S3AQ3B1
NaN 3
6.000000 71
3.000000 91
2.000000 68
5.000000 65
1.000000 1320
4.000000 88
Name: S3AQ3B1, dtype: int64
counts for S3AQ3C1
15.000000 99
98.000000 1
9.000000 6
20.000000 365
30.000000 38
19.000000 1
17.000000 2
80.000000 1
24.000000 1
11.000000 3
28.000000 1
35.000000 1
16.000000 5
99.000000 9
60.000000 2
6.000000 60
10.000000 387
3.000000 114
40.000000 30
12.000000 25
8.000000 42
13.000000 7
25.000000 13
2.000000 111
7.000000 45
27.000000 1
18.000000 3
5.000000 163
1.000000 83
14.000000 3
4.000000 84
Name: S3AQ3C1, dtype: int64
counts for S3AQ3C1
NaN 9
15.000000 99
98.000000 1
9.000000 6
20.000000 365
30.000000 38
19.000000 1
17.000000 2
80.000000 1
24.000000 1
11.000000 3
28.000000 1
35.000000 1
16.000000 5
60.000000 2
6.000000 60
10.000000 387
3.000000 114
40.000000 30
12.000000 25
8.000000 42
13.000000 7
25.000000 13
2.000000 111
7.000000 45
27.000000 1
18.000000 3
5.000000 163
1.000000 83
14.000000 3
4.000000 84
Name: S3AQ3C1, dtype: int64
counts for USFREQMO
NaN 3
2.500000 65
30.000000 1320
22.000000 68
5.000000 88
1.000000 71
14.000000 91
Name: USFREQMO, dtype: int64
IDNUM S3AQ3C1 USFREQMO NUMCIGMO_EST
20 21 3.000000 30.000000 90.000000
76 77 3.000000 22.000000 66.000000
102 103 10.000000 30.000000 300.000000
121 122 10.000000 30.000000 300.000000
135 136 20.000000 30.000000 600.000000
149 150 5.000000 30.000000 150.000000
154 155 8.000000 30.000000 240.000000
173 174 1.000000 30.000000 30.000000
177 178 10.000000 30.000000 300.000000
183 184 20.000000 30.000000 600.000000
187 188 2.000000 5.000000 10.000000
209 210 3.000000 30.000000 90.000000
219 220 5.000000 14.000000 70.000000
222 223 1.000000 30.000000 30.000000
278 279 98.000000 30.000000 2940.000000
336 337 20.000000 30.000000 600.000000
363 364 20.000000 30.000000 600.000000
398 399 2.000000 22.000000 44.000000
412 413 5.000000 30.000000 150.000000
417 418 20.000000 30.000000 600.000000
508 509 30.000000 30.000000 900.000000
511 512 1.000000 2.500000 2.500000
519 520 20.000000 30.000000 600.000000
522 523 10.000000 30.000000 300.000000
529 530 4.000000 30.000000 120.000000
AGEGROUP3
(17, 20] 161 200 221 0 0 0 0 0
(20, 22] 0 0 0 239 228 0 0 0
(22, 25] 0 0 0 0 0 231 241 185
Summary:
In this code I ran frequency distributions on all the variables to inspect the output and check that each variable was doing what I expected.
I recoded variables, coded out missing data, created secondary variables, and aggregated, grouped and split variables.
0 notes
Code
Code:
import pandas
import numpy
# Load the NESARC data set.
data = pandas.read_csv('nesarc_pds.csv', low_memory=False)

# Convert every column name to upper case.
data.columns = [str.upper(name) for name in data.columns]

# Print floats in plain fixed-point notation (per the original note, this
# was added to avoid a runtime error).
pandas.set_option('display.float_format', lambda value: '%f' % value)

row_count, column_count = data.shape
print(row_count)     # number of rows (observations)
print(column_count)  # number of columns (variables)

# Checking the format of a variable. As a bare expression this displays
# nothing when the file runs as a script; wrap it in print() to see it.
data['ETHRACE2A'].dtype

# Set the variables used in this analysis to numeric.
for column in ('TAB12MDX', 'CHECK321', 'S3AQ3B1', 'S3AQ3C1', 'AGE'):
    data[column] = pandas.to_numeric(data[column])
# Frequency distributions (counts and percentages) for each variable in the
# full data set.


def _print_value_counts(heading, column, **count_options):
    """Print *heading*, then the frequency table of ``data[column]``.

    The table is computed with ``sort=False`` plus any extra ``value_counts``
    options (``normalize``, ``dropna``), printed, and returned.
    """
    print(heading)
    table = data[column].value_counts(sort=False, **count_options)
    print(table)
    return table


c1 = _print_value_counts(
    'counts for TAB12MDX - nicotine dependence in the past 12 months, yes = 1',
    'TAB12MDX')
p1 = _print_value_counts(
    'percentages for TAB12MDX nicotine dependence in the past 12 months, yes = 1',
    'TAB12MDX', normalize=True)

c2 = _print_value_counts(
    'counts for CHECK321 - smoked in the past year, yes = 1',
    'CHECK321')
p2 = _print_value_counts(
    'percentages for CHECK321 smoked in the past year, yes = 1',
    'CHECK321', normalize=True)

c3 = _print_value_counts(
    'counts for S3AQ3B1 - usual frequency when smoked cigarettes',
    'S3AQ3B1')
p3 = _print_value_counts(
    'percentages for S3AQ3B1 usual frequency when smoked cigarettes',
    'S3AQ3B1', normalize=True)

# dropna=False keeps the missing values (NaN) in the table.
c4 = _print_value_counts(
    'counts for S3AQ3C1 - usual quantity when smoke cigarettes',
    'S3AQ3C1', dropna=False)
p4 = _print_value_counts(
    'percentages for S3AQ3C1 usual quantity when smoke cigarettes',
    'S3AQ3C1', dropna=False, normalize=True)

# Grouping is another route to the same frequency distributions: group sizes
# give the counts, and scaling by the number of rows gives percentages.
print('counts for TAB12MDX - nicotine dependence in the past 12 months, yes = 1')
ct1 = data.groupby('TAB12MDX').size()
print(ct1)
print('percentages for TAB12MDX nicotine dependence in the past 12 months, yes = 1')
pt1 = ct1 * 100 / len(data)
print(pt1)
# Subset the data: respondents aged 18 to 25 (both ends included) with
# CHECK321 == 1, stored as an independent copy.
young_smokers = data['AGE'].between(18, 25) & (data['CHECK321'] == 1)
sub1 = data[young_smokers]
sub2 = sub1.copy()

# Frequency distributions inside the subset, to confirm the filter worked.
subset_age = sub2['AGE']
print('counts for AGE ')
c5 = subset_age.value_counts(sort=False)
print(c5)
print('percentages for AGE')
p5 = subset_age.value_counts(sort=False, normalize=True)
print(p5)

subset_check321 = sub2['CHECK321']
print('counts for CHECK321 - smoked in the past year, yes = 1')
c6 = subset_check321.value_counts(sort=False)
print(c6)
print('percentages for CHECK321 smoked in the past year, yes = 1')
p6 = subset_check321.value_counts(sort=False, normalize=True)
print(p6)
Output:
runfile('/Users/MariaRomo/Documents/NESARC/EjercicioMR1.py', wdir='/Users/MariaRomo/Documents/NESARC')
43093
3010
counts for TAB12MDX - nicotine dependence in the past 12 months, yes = 1
0 38131
1 4962
Name: TAB12MDX, dtype: int64
percentages for TAB12MDX nicotine dependence in the past 12 months, yes = 1
0 0.884854
1 0.115146
Name: TAB12MDX, dtype: float64
counts for CHECK321 - smoked in the past year, yes = 1
2.000000 8078
9.000000 22
1.000000 9913
Name: CHECK321, dtype: int64
percentages for CHECK321 smoked in the past year, yes = 1
2.000000 0.448454
9.000000 0.001221
1.000000 0.550325
Name: CHECK321, dtype: float64
counts for S3AQ3B1 - usual frequency when smoked cigarettes
4.000000 747
2.000000 460
5.000000 409
9.000000 102
3.000000 687
6.000000 772
1.000000 14836
Name: S3AQ3B1, dtype: int64
percentages for S3AQ3B1 usual frequency when smoked cigarettes
4.000000 0.041470
2.000000 0.025537
5.000000 0.022706
9.000000 0.005663
3.000000 0.038139
6.000000 0.042858
1.000000 0.823627
Name: S3AQ3B1, dtype: float64
counts for S3AQ3C1 - usual quantity when smoke cigarettes
NaN 25080
28.000000 3
4.000000 573
60.000000 241
8.000000 299
7.000000 269
24.000000 7
22.000000 10
33.000000 1
50.000000 106
2.000000 884
34.000000 1
15.000000 851
5.000000 1070
98.000000 15
99.000000 262
40.000000 993
9.000000 49
55.000000 2
75.000000 2
39.000000 1
20.000000 5366
19.000000 5
29.000000 3
25.000000 155
45.000000 8
21.000000 1
35.000000 30
23.000000 2
14.000000 25
30.000000 909
70.000000 12
27.000000 2
37.000000 2
3.000000 923
6.000000 463
18.000000 59
17.000000 22
11.000000 23
10.000000 3077
57.000000 1
1.000000 934
66.000000 1
12.000000 230
13.000000 34
80.000000 47
16.000000 40
Name: S3AQ3C1, dtype: int64
percentages for S3AQ3C1 usual quantity when smoke cigarettes
NaN 0.581997
28.000000 0.000070
4.000000 0.013297
60.000000 0.005593
8.000000 0.006938
7.000000 0.006242
24.000000 0.000162
22.000000 0.000232
33.000000 0.000023
50.000000 0.002460
2.000000 0.020514
34.000000 0.000023
15.000000 0.019748
5.000000 0.024830
98.000000 0.000348
99.000000 0.006080
40.000000 0.023043
9.000000 0.001137
55.000000 0.000046
75.000000 0.000046
39.000000 0.000023
20.000000 0.124521
19.000000 0.000116
29.000000 0.000070
25.000000 0.003597
45.000000 0.000186
21.000000 0.000023
35.000000 0.000696
23.000000 0.000046
14.000000 0.000580
30.000000 0.021094
70.000000 0.000278
27.000000 0.000046
37.000000 0.000046
3.000000 0.021419
6.000000 0.010744
18.000000 0.001369
17.000000 0.000511
11.000000 0.000534
10.000000 0.071404
57.000000 0.000023
1.000000 0.021674
66.000000 0.000023
12.000000 0.005337
13.000000 0.000789
80.000000 0.001091
16.000000 0.000928
Name: S3AQ3C1, dtype: float64
counts for TAB12MDX - nicotine dependence in the past 12 months, yes = 1
TAB12MDX
0 38131
1 4962
dtype: int64
percentages for TAB12MDX nicotine dependence in the past 12 months, yes = 1
TAB12MDX
0 88.485369
1 11.514631
dtype: float64
counts for AGE
18 161
19 200
20 221
21 239
22 228
23 231
24 241
25 185
Name: AGE, dtype: int64
percentages for AGE
18 0.094373
19 0.117233
20 0.129543
21 0.140094
22 0.133646
23 0.135404
24 0.141266
25 0.108441
Name: AGE, dtype: float64
counts for CHECK321 - smoked in the past year, yes = 1
1.000000 1706
Name: CHECK321, dtype: int64
percentages for CHECK321 smoked in the past year, yes = 1
1.000000 1.000000
Name: CHECK321, dtype: float64
0 notes
Association between smoking behavior and nicotine dependence
I'm a student and I want to understand the methodology and analysis with the support of the university. This is the reason why I decided to follow the example given.
Data set selected: NESARC
Question and hypothesis:
How much does a person need to smoke to become nicotine dependent?
The more individuals smoke, the more likely they are to have nicotine dependence.
Variables:
Topic: Nicotine dependence
Nicotine dependence in the last 12 months
Nicotine dependence - Lifetime
2nd Topic: Smoking behavior
Cigarette smoking status
Usual frequency when smoke cigarettes
Usual quantity when smoke cigarettes
Literature
In the research I carried out, I found a study that focuses on quantifying cigarette consumption by individuals who are nicotine dependent and by those who have a psychiatric disorder. I find the relationship between psychiatric disorders and cigarette consumption interesting: we might expect people with a psychiatric disorder to behave differently in almost every respect because of their illness, yet when it comes to nicotine addiction both groups behave in the same way. I therefore suppose there must be a pattern in people's behavior that leads the brain to demand this stimulus, even in the presence of a psychiatric disorder. One of the conclusions of the article is that 70% of the cigarettes consumed in the United States are consumed by nicotine-dependent and psychiatrically ill individuals.
Another article I read deals more with the chemistry of how nicotine works, but what I found interesting was the link it explains between that chemistry and the smoking behavior nicotine can trigger in people. It establishes that genetic variation in CYP2A6, a gene involved in nicotine metabolism, exerts differential effects during the different stages of smoking; this information can give more detail about a person's behavior before they become nicotine dependent.
Reference
Grant BF, Hasin DS, Chou SP, Stinson FS, Dawson DA. Nicotine Dependence and Psychiatric Disorders in the United States: Results From the National Epidemiologic Survey on Alcohol and Related Conditions. Arch Gen Psychiatry. 2004;61(11):1107–1115. https://jamanetwork.com/journals/jamapsychiatry/article-abstract/482090
Viba Malaiyandi BSc, Edward M. Sellers MD, PhD, Rachel F. Tyndale PhD. Implications of CYP2A6 Genetic Variation for Smoking Behaviors and Nicotine Dependence. Perspectives in Clinical Pharmacology (2005) 77, 145–158; doi: 10.1016/j.clpt.2004.10.011
1 note
·
View note