thedatacondominium - Tumblr blog

thedatacondominium · 1 year ago

Text

Analyzing the Relationship between CO2 Emissions and the Manufacturing Sector (Creating the Graphs)

import pandas as pd

import streamlit as st

import numpy as np

import seaborn

import matplotlib.pyplot as plt

@st.cache_data
def get_data():
    """Load the country table from ``data_csv.csv``.

    Only the "Country", "Manufacturing GDP" and "emmissions" columns are
    kept; the last two are renamed to ``manugdp`` and ``emmission``.
    The function is wrapped in Streamlit's ``st.cache_data`` decorator.

    Returns:
        pandas.DataFrame: the three selected columns under their new names.
    """
    wanted_columns = ["Country", "Manufacturing GDP", "emmissions"]
    frame = pd.read_csv("data_csv.csv")[wanted_columns]

    # The row (index) labels -- not the column names -- are passed through
    # str(...).lower(), exactly as the axis="rows" rename did.
    # NOTE(review): this looks adapted from a column-lowercasing snippet;
    # confirm whether stringifying the index is actually intended.
    frame = frame.rename(index=lambda label: str(label).lower())

    return frame.rename(
        columns={"Manufacturing GDP": "manugdp", "emmissions": "emmission"}
    )

# Load the country table through the cached loader.
data = get_data()

st.header("Original Data")
st.markdown("we have the data for %i countries" % (len(data)))

# Optional view of the unfiltered table.
raw = st.checkbox("Show raw data", False)
if raw:
    st.write(data)

st.header("Different GDP Catagories")

# Each selectable label maps to the pandas query that picks its rows.
# A lookup table replaces four near-identical if/elif branches, and the
# selection is stored in `gdp_range` so the builtin `range` is not shadowed.
gdp_range_queries = {
    "Less than $1 billion": "manugdp<=1000000000",
    "between $1 billion and $500 billion": "manugdp>1000000000 and manugdp<=500000000000",
    "Between $500 billion and $1 trillion": "manugdp>500000000000 and manugdp<=1000000000000",
    "Above $1 trillion": "manugdp>1000000000000",
}
gdp_range = st.selectbox("Ranges of GDP", list(gdp_range_queries))

# Show the size and the rows of the chosen range.
output = data.query(gdp_range_queries[gdp_range])
st.markdown("This category has %i countries" % (len(output)))
st.write(output)

st.subheader("We now remove the 0 values and replace them by NaN")

# Swap every 0 for NaN in both measurement columns.
data['manugdp'] = data["manugdp"].replace(0, np.nan)
data['emmission'] = data["emmission"].replace(0, np.nan)

# Optional view of the table after the replacement.
raw_nan = st.checkbox("Show raw data with 0 replaced with NaN", False)
if raw_nan:
    st.markdown("we have the data for %i countries" % (len(data)))
    st.write(data)

st.subheader("Countries in each category of Manufacturing GDP")

# Streamlit markdown treats text between a pair of "$" as inline LaTeX, so
# list items holding two dollar amounts would be rendered as math. Escaping
# each "$" keeps the legend as plain text.
st.markdown("1. Less than \\$1 billion \n 2. between \\$1 billion and \\$500 billion \n 3. Between \\$500 billion and \\$1 trillion \n 4. Above \\$1 trillion \n 5. Data missing")

def gdp_category_division(row):
    """Return the manufacturing-GDP category code for one row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a
            ``'manugdp'`` entry.

    Returns:
        int: 1 if the value is at most 1e9, 2 if at most 5e11, 3 if at
        most 1e12, 4 if above 1e12, and 5 when no comparison holds
        (every comparison against NaN is false, so NaN lands here).
    """
    gdp = row['manugdp']

    # Thresholds are checked in ascending order, so each test only needs
    # the upper bound; the lower bound is implied by the previous miss.
    if gdp <= 1000000000:
        return 1
    if gdp <= 500000000000:
        return 2
    if gdp <= 1000000000000:
        return 3
    if gdp > 1000000000000:
        return 4
    return 5

# Label every row with its GDP category code.
data['gdpCategory'] = data.apply(gdp_category_division, axis=1)

# Raw count per category code.
st.write(data["gdpCategory"].value_counts(sort=False))

# Bin the codes once, then show absolute and relative frequencies.
gdp_category_bins = pd.cut(data['gdpCategory'], [0, 1, 2, 3, 4, 5])
st.write(gdp_category_bins.value_counts(sort=False))
st.write(gdp_category_bins.value_counts(sort=False, normalize=True))

st.subheader("Countries in each category of emmissions")

st.markdown("6. Less than 1 Mt/yr \n 7. 1Mt/yr to 500Mt/yr \n 8. 500Mt/yr to 1000Mt/yr \n 9. Above 1000Mt/yr \n 10. Data missing")

def emmission_category_division(row):
    """Return the emission category code for one row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with an
            ``'emmission'`` entry.

    Returns:
        int: 6 if the value is at most 1, 7 if at most 500, 8 if at most
        1000, 9 if above 1000, and 10 when no comparison holds (every
        comparison against NaN is false, so NaN lands here).
    """
    emission = row['emmission']

    # Thresholds are checked in ascending order, so each test only needs
    # the upper bound; the lower bound is implied by the previous miss.
    if emission <= 1:
        return 6
    if emission <= 500:
        return 7
    if emission <= 1000:
        return 8
    if emission > 1000:
        return 9
    return 10

# Label every row with its emission category code.
data['emmissionCategory'] = data.apply(emmission_category_division, axis=1)

st.write(data["emmissionCategory"].value_counts(sort=False))

# Joint distribution of GDP category against emission category.
st.write(pd.crosstab(data["gdpCategory"], data["emmissionCategory"]))

# Quartile membership counts for each measurement column.
quartile_labels = ["25%tile", "50%tile", "75%tile", "100%tile"]
gdp_quartiles = pd.qcut(data["manugdp"], 4, labels=quartile_labels)
st.write(gdp_quartiles.value_counts(sort=False))
emission_quartiles = pd.qcut(data["emmission"], 4, labels=quartile_labels)
st.write(emission_quartiles.value_counts(sort=False))

# Store the GDP category codes as a pandas categorical column.
data["gdpCategory"] = data["gdpCategory"].astype("category")

st.subheader("Plot for number of countries in each category of GDP data")

# countplot is axes-level and draws on the current figure, so a fresh one is
# opened first. Labels are set on the returned Axes rather than through
# pyplot's implicit "current axes" state.
plt.figure()
gdp_category_graph = seaborn.countplot(x="gdpCategory", data=data)
gdp_category_graph.set_xlabel("GDP categories for different countries")
gdp_category_graph.set_ylabel("Number of countries")
gdp_category_graph.set_title("Manufacturing GDP categories for 2021")
st.pyplot(gdp_category_graph.figure)
# Release the figure once rendered; otherwise it stays registered with
# pyplot and a new one piles up each time the script runs.
plt.close(gdp_category_graph.figure)

st.subheader("Plot for number of countries in each category of emission data")

plt.figure()
emmission_category_graph = seaborn.countplot(x="emmissionCategory", data=data)
emmission_category_graph.set_xlabel("emmission")
emmission_category_graph.set_title("CO2 Emissions for 2021")
st.pyplot(emmission_category_graph.figure)
plt.close(emmission_category_graph.figure)

st.subheader("General statistics for Manufacturing GDP")
st.write(data["manugdp"].describe())

st.subheader("General statistics for CO2 emissions")
st.write(data["emmission"].describe())

# Rescale manufacturing GDP by 1e9 (the axis labels below read "$ billion").
data["manugdp"] = data["manugdp"] / 1000000000

# Bin the rescaled GDP on the edges 0, 1, 500, 1000 and 10000.
data["gdp_category"] = pd.cut(data.manugdp, [0, 1, 500, 1000, 10000])
data["gdp_category"] = data["gdp_category"].astype("category")

st.write(data["gdp_category"].value_counts(sort=False, dropna=True))
st.write(data["gdp_category"].describe())

# catplot is figure-level and builds its own figure, so no plt.figure() call
# precedes it (one would only leave an empty, never-shown figure behind).
# NOTE(review): `ci` is deprecated in recent seaborn releases in favour of
# `errorbar`; kept as-is because the installed version is not visible here.
gdp_cat_emission = seaborn.catplot(x="gdp_category", y="emmission", data=data, kind="bar", ci=None)
gdp_cat_emission.set_axis_labels(
    "GDP in 2021 (in $Billion)",
    "Mean of Emissions released by countries in this range in Mt CO2/yr",
)
st.pyplot(gdp_cat_emission.figure)
# Release the figure once rendered so figures do not accumulate.
plt.close(gdp_cat_emission.figure)

st.subheader("Plot between Manufacturing GDP and Emissions")

# regplot is axes-level: open a figure for it, then label the returned Axes.
plt.figure()
scat_plt = seaborn.regplot(x="manugdp", y="emmission", data=data)
scat_plt.set_xlabel("Manufacturing GDP in $ billion")
scat_plt.set_ylabel("Emissions in Mt CO2/yr")
st.pyplot(scat_plt.figure)
plt.close(scat_plt.figure)

# Quartile label of manufacturing GDP for each row.
data["manu_percentile"] = pd.qcut(data.manugdp, 4, labels=["25%tile", "50%tile", "75%tile", "100%tile"])

scat_plt1 = seaborn.catplot(x="manu_percentile", y="emmission", data=data, kind="bar", ci=None)
scat_plt1.set_axis_labels("Manufacturing GDP percentile", "Mean Emissions in Mt CO2/yr")
st.pyplot(scat_plt1.figure)
plt.close(scat_plt1.figure)

# Optional view of the table including every derived category column.
raw_category = st.checkbox("Show raw data with emmission and GDP categories", False)
if raw_category:
    st.markdown("we have the data for %i countries" % (len(data)))
    st.write(data)

######CODE ENDS HERE#####

I first created the univariate graphs: the number of countries falling in each category of Manufacturing GDP and of emissions.

To recall from my previous posts, the category number along with the category:

GDP below $1 billion

GDP between $1 billion and $500 billion

GDP between $500 billion and $1 trillion

GDP above $1 trillion

GDP Data Missing

Emissions below 1Mt/yr

Emissions between 1Mt/yr and 500Mt/yr

Emissions between 500Mt/yr and 1000Mt/yr

Emissions above 1000Mt/yr

Emission data missing

I also calculated the general statistics for both the variables

I further made a graph of mean emissions in each category of GDP

I also made a scatterplot between the GDPs and Emissions

0 notes

thedatacondominium · 1 year ago

Text

Analyzing the Relationship between CO2 Emissions and the Manufacturing Sector (Manipulating the Data)

In the previous post, I dropped the zero values. In this post, I have added them back and converted them to missing values using numpy.nan.

import pandas as pd

import streamlit as st

import numpy as np

@st.cache_data
def get_data():
    """Load the country table from ``data_csv.csv``.

    Only the "Country", "Manufacturing GDP" and "emmissions" columns are
    kept; the last two are renamed to ``manugdp`` and ``emmission``.
    The function is wrapped in Streamlit's ``st.cache_data`` decorator.

    Returns:
        pandas.DataFrame: the three selected columns under their new names.
    """
    wanted_columns = ["Country", "Manufacturing GDP", "emmissions"]
    frame = pd.read_csv("data_csv.csv")[wanted_columns]

    # The row (index) labels -- not the column names -- are passed through
    # str(...).lower(), exactly as the axis="rows" rename did.
    # NOTE(review): this looks adapted from a column-lowercasing snippet;
    # confirm whether stringifying the index is actually intended.
    frame = frame.rename(index=lambda label: str(label).lower())

    return frame.rename(
        columns={"Manufacturing GDP": "manugdp", "emmissions": "emmission"}
    )

# Load the country table through the cached loader.
data = get_data()

st.header("Original Data")
st.markdown("we have the data for %i countries" % (len(data)))

# Optional view of the unfiltered table.
raw = st.checkbox("Show raw data", False)
if raw:
    st.write(data)

st.header("Different GDP Catagories")

# Each selectable label maps to the pandas query that picks its rows.
# A lookup table replaces four near-identical if/elif branches, and the
# selection is stored in `gdp_range` so the builtin `range` is not shadowed.
gdp_range_queries = {
    "Less than $1 billion": "manugdp<=1000000000",
    "between $1 billion and $500 billion": "manugdp>1000000000 and manugdp<=500000000000",
    "Between $500 billion and $1 trillion": "manugdp>500000000000 and manugdp<=1000000000000",
    "Above $1 trillion": "manugdp>1000000000000",
}
gdp_range = st.selectbox("Ranges of GDP", list(gdp_range_queries))

# Show the size and the rows of the chosen range.
output = data.query(gdp_range_queries[gdp_range])
st.markdown("This category has %i countries" % (len(output)))
st.write(output)

st.subheader("We now remove the 0 values and replace them by NaN")

# Swap every 0 for NaN in both measurement columns.
data['manugdp'] = data["manugdp"].replace(0, np.nan)
data['emmission'] = data["emmission"].replace(0, np.nan)

# Optional view of the table after the replacement.
raw_nan = st.checkbox("Show raw data with 0 replaced with NaN", False)
if raw_nan:
    st.markdown("we have the data for %i countries" % (len(data)))
    st.write(data)

st.subheader("Countries in each category of Manufacturing GDP")

# Streamlit markdown treats text between a pair of "$" as inline LaTeX, so
# list items holding two dollar amounts would be rendered as math. Escaping
# each "$" keeps the legend as plain text.
st.markdown("1. Less than \\$1 billion \n 2. between \\$1 billion and \\$500 billion \n 3. Between \\$500 billion and \\$1 trillion \n 4. Above \\$1 trillion \n 5. Data missing")

def gdp_category_division(row):
    """Return the manufacturing-GDP category code for one row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a
            ``'manugdp'`` entry.

    Returns:
        int: 1 if the value is at most 1e9, 2 if at most 5e11, 3 if at
        most 1e12, 4 if above 1e12, and 5 when no comparison holds
        (every comparison against NaN is false, so NaN lands here).
    """
    gdp = row['manugdp']

    # Thresholds are checked in ascending order, so each test only needs
    # the upper bound; the lower bound is implied by the previous miss.
    if gdp <= 1000000000:
        return 1
    if gdp <= 500000000000:
        return 2
    if gdp <= 1000000000000:
        return 3
    if gdp > 1000000000000:
        return 4
    return 5

# Label every row with its GDP category code.
data['gdpCategory'] = data.apply(gdp_category_division, axis=1)

# Raw count per category code.
st.write(data["gdpCategory"].value_counts(sort=False))

# Bin the codes once, then show absolute and relative frequencies.
gdp_category_bins = pd.cut(data['gdpCategory'], [0, 1, 2, 3, 4, 5])
st.write(gdp_category_bins.value_counts(sort=False))
st.write(gdp_category_bins.value_counts(sort=False, normalize=True))

st.subheader("Countries in each category of emmissions")

st.markdown("6. Less than 1 Mt/yr \n 7. 1Mt/yr to 500Mt/yr \n 8. 500Mt/yr to 1000Mt/yr \n 9. Above 1000Mt/yr \n 10. Data missing")

def emmission_category_division(row):
    """Return the emission category code for one row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with an
            ``'emmission'`` entry.

    Returns:
        int: 6 if the value is at most 1, 7 if at most 500, 8 if at most
        1000, 9 if above 1000, and 10 when no comparison holds (every
        comparison against NaN is false, so NaN lands here).
    """
    emission = row['emmission']

    # Thresholds are checked in ascending order, so each test only needs
    # the upper bound; the lower bound is implied by the previous miss.
    if emission <= 1:
        return 6
    if emission <= 500:
        return 7
    if emission <= 1000:
        return 8
    if emission > 1000:
        return 9
    return 10

# Label every row with its emission category code.
data['emmissionCategory'] = data.apply(emmission_category_division, axis=1)

st.write(data["emmissionCategory"].value_counts(sort=False))

# Joint distribution of GDP category against emission category.
st.write(pd.crosstab(data["gdpCategory"], data["emmissionCategory"]))

# Quartile membership counts for each measurement column.
quartile_labels = ["25%tile", "50%tile", "75%tile", "100%tile"]
gdp_quartiles = pd.qcut(data["manugdp"], 4, labels=quartile_labels)
st.write(gdp_quartiles.value_counts(sort=False))
emission_quartiles = pd.qcut(data["emmission"], 4, labels=quartile_labels)
st.write(emission_quartiles.value_counts(sort=False))

# Optional view of the table including both category columns.
raw_category = st.checkbox("Show raw data with emmission and GDP categories", False)
if raw_category:
    st.markdown("we have the data for %i countries" % (len(data)))
    st.write(data)

######CODE ENDS HERE#####

I have created different categories for both emissions and the GDP

The number of countries in each category is given below

Less than $1 billion

between $1 billion and $500 billion

Between $500 billion and $1 trillion

Above $1 trillion

Data missing

From the table below, we can observe that the majority of the countries have a manufacturing GDP between $1 billion and $ 500 billion. Also, note that data for 17 countries is not available.

Less than 1 Mt/yr (6)

1Mt/yr to 500Mt/yr (7)

500Mt/yr to 1000Mt/yr (8)

Above 1000Mt/yr (9)

Data missing (10)

From the data below, we can observe that we have no missing data. Moreover, the maximum number of countries fall in category 7, emitting between 1Mt CO2/yr to 500Mt CO2/yr

A distribution of countries in particular emission and GDP category

1, 2, 3, 4, 5 represent the categories of GDP, while 6, 7, 8, 9 are the emission categories.

We can observe that the data supports our earlier hypothesis: the higher the manufacturing GDP, the higher the emissions.

Our raw data:

0 notes

thedatacondominium · 1 year ago

Text

Analyzing the Relationship between CO2 Emissions and the Manufacturing Sector (The Data)

import pandas as pd

import streamlit as st

@st.cache_data()
def get_data():
    """Load the country table from ``data_csv.csv`` without incomplete rows.

    Only the "Country", "Manufacturing GDP" and "emmissions" columns are
    kept, rows with a missing value in any of them are dropped, and the
    last two columns are renamed to ``manugdp`` and ``emmission``.
    The function is wrapped in Streamlit's ``st.cache_data`` decorator.

    Returns:
        pandas.DataFrame: the three selected columns under their new names.
    """
    wanted_columns = ["Country", "Manufacturing GDP", "emmissions"]
    frame = pd.read_csv("data_csv.csv")[wanted_columns].dropna()

    # The row (index) labels -- not the column names -- are passed through
    # str(...).lower(), exactly as the axis="rows" rename did.
    # NOTE(review): this looks adapted from a column-lowercasing snippet;
    # confirm whether stringifying the index is actually intended.
    frame = frame.rename(index=lambda label: str(label).lower())

    return frame.rename(
        columns={"Manufacturing GDP": "manugdp", "emmissions": "emmission"}
    )

# Load the country table through the cached loader.
data = get_data()

st.header("Original Data")
st.markdown("we have the data for %i countries" % (len(data)))

# Optional view of the unfiltered table.
raw = st.checkbox("Show raw data", False)
if raw:
    st.write(data)

st.header("Different GDP Catagories")

# Each selectable label maps to the pandas query that picks its rows.
# A lookup table replaces four near-identical if/elif branches, and the
# selection is stored in `gdp_range` so the builtin `range` is not shadowed.
gdp_range_queries = {
    "Less than $1 billion": "manugdp<=1000000000",
    "between $1 billion and $500 billion": "manugdp>1000000000 and manugdp<=500000000000",
    "Between $500 billion and $1 trillion": "manugdp>500000000000 and manugdp<=1000000000000",
    "Above $1 trillion": "manugdp>1000000000000",
}
gdp_range = st.selectbox("Ranges of GDP", list(gdp_range_queries))

# Show the size and the rows of the chosen range.
output = data.query(gdp_range_queries[gdp_range])
st.markdown("This category has %i countries" % (len(output)))
st.write(output)

#####CODE ENDS HERE

I have used the streamlit library for better data visualization. I have dropped countries with any missing data, be it GDP or emissions. Finally we get the data for 175 countries. Part of the data is shown below.

Due to the nature of the data, values of the GDP and emissions of each country are unique and have a frequency of 1. So, although I may not use it later, I have divided the GDP into different ranges as shown, for a general idea.

The table for each category shows how many countries it contains.

0 notes