# Install geopandas
!pip install geopandas==0.13.2
Requirement already satisfied: geopandas==0.13.2 in /usr/local/lib/python3.10/dist-packages (0.13.2)
# Necessary libraries are imported
import os
import numpy as np
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
from mpl_toolkits.axes_grid1 import make_axes_locatable
path1 = "https://raw.githubusercontent.com/DTAMBU/Data_Analysis/main/res/gdp-per-capita-worldbank.csv"
# Load the file into a DataFrame
df1 = pd.read_csv(path1, encoding='latin1', sep=',')
print(df1.sample())
       Entity Code  Year  GDP per capita, PPP (constant 2017 international $)
932  Cambodia  KHM  2013                                             3197.532
# Columns are renamed for easier reading
df1 = df1.rename(columns={'Entity': 'Country', 'GDP per capita, PPP (constant 2017 international $)': 'GDP'})
display(df1.sample(1))
| | Country | Code | Year | GDP |
---|---|---|---|---|
924 | Cambodia | KHM | 2005 | 2124.8374 |
# Get a statistical summary
df1.describe()
| | Year | GDP |
---|---|---|
count | 6346.000000 | 6346.000000 |
mean | 2005.937914 | 18068.999300 |
std | 9.126633 | 20193.547754 |
min | 1990.000000 | 436.376400 |
25% | 1998.000000 | 3673.792850 |
50% | 2006.000000 | 10486.217500 |
75% | 2014.000000 | 26409.074000 |
max | 2021.000000 | 157602.480000 |
# Get general information
df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6346 entries, 0 to 6345
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Country  6346 non-null   object
 1   Code     5930 non-null   object
 2   Year     6346 non-null   int64
 3   GDP      6346 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 198.4+ KB
# Calculate the number of unique values
unique_values_Countries = df1['Country'].nunique()
print('Unique values of Countries:', unique_values_Countries)
unique_values_Codes = df1['Code'].nunique()
print('Unique values of Codes:', unique_values_Codes)
Unique values of Countries: 208
Unique values of Codes: 195
# Select the rows of the DataFrame where the 'Code' column has null values and print the counts according to 'Country' column
null_rows_df1 = df1[df1['Code'].isnull()]
print(null_rows_df1['Country'].value_counts().sort_values(ascending=False))
East Asia and Pacific (WB)           32
Europe and Central Asia (WB)         32
European Union (27)                  32
High-income countries                32
Latin America and Caribbean (WB)     32
Low-income countries                 32
Lower-middle-income countries        32
Middle East and North Africa (WB)    32
Middle-income countries              32
North America (WB)                   32
South Asia (WB)                      32
Sub-Saharan Africa (WB)              32
Upper-middle-income countries        32
Name: Country, dtype: int64
# Remove rows that contain null values
df1 = df1.dropna()
# Calculate the number of unique values
unique_values_Countries = df1['Country'].nunique()
print('Unique values of Countries:', unique_values_Countries)
unique_values_Codes = df1['Code'].nunique()
print('Unique values of Codes:', unique_values_Codes)
Unique values of Countries: 195
Unique values of Codes: 195
# Check if there is any null value in the DataFrame
df1.isnull().values.any()
False
# Define the path to the file
geo_path = "https://raw.githubusercontent.com/DTAMBU/Data_Analysis/main/res/ne_110m_admin_0_countries.zip"
# Load the local file into a GeoDataFrame
world = gpd.read_file(geo_path)
# Exclude Antarctica
world = world[world['NAME'] != 'Antarctica']
# Get the unique years in the DataFrame
years = np.sort(df1['Year'].unique())
# Filter df to only contain country codes that exist in world['BRK_A3']
df = df1[df1['Code'].isin(world['BRK_A3'])]
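The `isin` filter silently drops any GDP rows whose ISO code has no geometry in the Natural Earth layer. A minimal, illustrative sketch (using the `df1`, `df`, and `world` objects defined above; the `unmatched` name is ours) to see what gets lost:

# Codes present in the GDP data but absent from the Natural Earth layer
unmatched = sorted(set(df1['Code']) - set(world['BRK_A3']))
print('Codes without a matching geometry:', unmatched)
print('Rows dropped by the filter:', len(df1) - len(df))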
# Define a function to plot a map
def plot_map(ax, year, df, world):
    # Select the data for a specific year
    df_year = df[df['Year'] == year]
    # Align 'world' and 'df_year' on the country code
    world_year = world.set_index('BRK_A3').join(df_year.set_index('Code'))
    # Draw the country boundaries
    world.boundary.plot(ax=ax, color='black', linewidth=0.5)  # Adjust the thickness of the lines here
    # Remove the numerical labels on both axes
    ax.set_xticks([])
    ax.set_yticks([])
    # Create a divider for the current axis and reserve space for the colorbar
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="10%", pad=0.1)
    # Build the colorbar on a logarithmic GDP scale
    sm = plt.cm.ScalarMappable(cmap='coolwarm', norm=plt.Normalize(vmin=np.log(world_year['GDP']).min(), vmax=np.log(world_year['GDP']).max()))
    sm._A = []
    cbar = ax.figure.colorbar(sm, cax=cax, format='')
    # Set the colorbar ticks and convert the labels back to the original (non-logarithmic) scale
    ticks = np.linspace(np.log(world_year['GDP']).min(), np.log(world_year['GDP']).max(), num=5)  # Adjust 'num' according to your needs
    cbar.set_ticks(ticks)
    cbar.set_ticklabels(np.exp(ticks).astype(int))
    # Draw the choropleth
    world_year.plot(ax=ax, column=np.log(world_year['GDP']), legend=False, cmap='coolwarm')
    # Add a title to the plot
    ax.set_title(f'{year} GDP')
# Apply the function to each year
fig, axs = plt.subplots(len(years), 1, figsize=(10, len(years)*3))
# Ensure that axs is always a 1-D array, even when there is only one year
axs = np.atleast_1d(axs).ravel()
# Iterate only over the axes that correspond to the existing years
for ax, year in zip(axs, years):
    plot_map(ax, year, df, world)
# Automatically adjust the space between the subplots
plt.tight_layout()
# Show the plot
plt.show()
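As an aside, the colorbar above is assembled by hand from log-transformed GDP values. Matplotlib's `LogNorm` can produce the same logarithmic colour scale more directly; below is a minimal, illustrative alternative (the `plot_map_lognorm` name is ours, and it assumes geopandas 0.13 forwards the `norm` and `missing_kwds` keywords to matplotlib, which it documents):

from matplotlib.colors import LogNorm

def plot_map_lognorm(ax, year, df, world):
    # Same join as plot_map, but let LogNorm handle the logarithmic colour scale and ticks
    df_year = df[df['Year'] == year]
    world_year = world.set_index('BRK_A3').join(df_year.set_index('Code'))
    world.boundary.plot(ax=ax, color='black', linewidth=0.5)
    world_year.plot(ax=ax, column='GDP', cmap='coolwarm',
                    norm=LogNorm(vmin=world_year['GDP'].min(), vmax=world_year['GDP'].max()),
                    legend=True, missing_kwds={'color': 'lightgrey'})
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(f'{year} GDP')

# Example use: fig, ax = plt.subplots(figsize=(10, 5)); plot_map_lognorm(ax, years[-1], df, world)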
# Ignore FutureWarning
warnings.filterwarnings('ignore', category=FutureWarning)
# Filter the DataFrame to include only the countries you are interested in
df_filtered = df1[df1['Code'].isin(['CAN', 'CHN', 'ESP', 'ARG'])]
# Create a figure
plt.figure(figsize=(10, 5))
# Create the line plot on the figure
sns.lineplot(data=df_filtered, x='Year', y='GDP', hue='Code')
# Add a title to the plot
plt.title('GDP of Countries Over the Years')
plt.xlabel('Year')
plt.ylabel('GDP')
# Show the figure
plt.show()
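Since `df_filtered` keeps the full country names, the legend can show names instead of ISO codes by switching the `hue` column. A minimal variant of the same plot:

# Same plot, but label the lines with country names instead of ISO codes
plt.figure(figsize=(10, 5))
sns.lineplot(data=df_filtered, x='Year', y='GDP', hue='Country')
plt.title('GDP of Countries Over the Years')
plt.xlabel('Year')
plt.ylabel('GDP')
plt.show()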
# Create a list of the codes of all countries
countries = df1['Code'].unique()
# Filter the DataFrame to include only the countries you are interested in
df_filtered = df1[df1['Code'].isin(countries)]
# Find the range of years for each country
years_range_per_country = {country: (df_filtered[df_filtered['Code'] == country]['Year'].min(), df_filtered[df_filtered['Code'] == country]['Year'].max()) for country in countries}
# Create lists of the year ranges and the country codes
years_ranges = list(years_range_per_country.values())
countries = list(years_range_per_country.keys())
# Sort in reverse alphabetical order so the countries read alphabetically from top to bottom of the chart
countries, years_ranges = zip(*sorted(zip(countries, years_ranges), reverse=True))
# Create a figure and an axis with a custom size
fig, ax = plt.subplots(figsize=(10, 40))
# Draw each range of years for each country
for i in range(len(countries)):
    ax.barh(i, years_ranges[i][1] - years_ranges[i][0], left=years_ranges[i][0])
# Set the labels of the y-axis
ax.set_yticks(range(len(countries)))
ax.set_yticklabels(countries)
# Set the labels of the x-axis
plt.xlabel('Years')
# Set the title
plt.title('Range of Years for Each Country')
# Show the plot
plt.show()
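The dictionary comprehension above filters the DataFrame once per country; the same minimum and maximum years can be obtained in a single pass with a groupby. A minimal equivalent sketch (the `year_ranges` name is ours):

# Compute the first and last available year per country code in one groupby
year_ranges = df1.groupby('Code')['Year'].agg(['min', 'max'])
print(year_ranges.head())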
# Define the path to the file
path2 = "https://raw.githubusercontent.com/DTAMBU/Data_Analysis/main/res/TB_burden_countries_2024-01-13.csv"
# Load the file into a DataFrame
df2 = pd.read_csv(path2, sep=',')
display(df2.sample(1))
| | country | iso2 | iso3 | iso_numeric | g_whoregion | year | e_pop_num | e_inc_100k | e_inc_100k_lo | e_inc_100k_hi | ... | cfr | cfr_lo | cfr_hi | cfr_pct | cfr_pct_lo | cfr_pct_hi | c_newinc_100k | c_cdr | c_cdr_lo | c_cdr_hi |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
733 | Burkina Faso | BF | BFA | 854 | AFR | 2020 | 21522626 | 46.0 | 30.0 | 66.0 | ... | 0.22 | 0.12 | 0.35 | 22.0 | 12.0 | 35.0 | 26.0 | 58.0 | 40.0 | 89.0 |
1 rows × 50 columns
# Select only desired columns
df2 = df2[['iso3', 'year', 'e_inc_100k','cfr_pct']]
# Rename the columns for easier reading
df2 = df2.rename(columns={'iso3': 'Code', 'year': 'Year', 'e_inc_100k': 'Incidence', 'cfr_pct': 'Fatality%'})
display(df2.sample(5))
| | Code | Year | Incidence | Fatality% |
---|---|---|---|---|
4046 | SVN | 2013 | 7.7 | 6.0 |
3918 | SRB | 2013 | 27.0 | 6.0 |
196 | ARG | 2012 | 24.0 | 8.0 |
1630 | FRA | 2007 | 11.0 | 12.0 |
2676 | MDV | 2018 | 33.0 | 7.0 |
# Get a statistical summary
df2.describe()
| | Year | Incidence | Fatality% |
---|---|---|---|
count | 4917.000000 | 4917.000000 | 4794.000000 |
mean | 2011.040065 | 123.046766 | 16.327701 |
std | 6.627437 | 184.816404 | 13.444558 |
min | 2000.000000 | 0.000000 | 0.000000 |
25% | 2005.000000 | 12.000000 | 8.000000 |
50% | 2011.000000 | 46.000000 | 11.000000 |
75% | 2017.000000 | 161.000000 | 22.000000 |
max | 2022.000000 | 1590.000000 | 100.000000 |
# Get general information
df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4917 entries, 0 to 4916
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Code       4917 non-null   object
 1   Year       4917 non-null   int64
 2   Incidence  4917 non-null   float64
 3   Fatality%  4794 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 153.8+ KB
# Select the rows of the DataFrame where the 'Fatality%' column has null values and print the counts according to 'Code' column
null_rows_df2 = df2[df2['Fatality%'].isnull()]
print(null_rows_df2['Code'].value_counts().sort_values(ascending=False))
SMR    20
MSR    18
NIU    16
TKL    15
MCO     9
VGB     8
BMU     6
KNA     6
COK     5
WLF     5
CYM     4
BRB     4
GRD     2
SXM     2
ATG     1
TCA     1
ASM     1
Name: Code, dtype: int64
# Remove rows that contain null values
df2 = df2.dropna()
# Define the path to the file
path3 = "https://raw.githubusercontent.com/DTAMBU/Data_Analysis/main/res/TB_budget_2024-01-13.csv"
# Load the file into a DataFrame
df3 = pd.read_csv(path3, sep=',')
display(df3.sample(1))
| | country | iso2 | iso3 | iso_numeric | g_whoregion | year | tx_dstb | budget_cpp_dstb | tx_mdr | budget_cpp_mdr | ... | cf_orsrvy | budget_oth | cf_oth | budget_tot | cf_tot | cf_tot_domestic | cf_tot_gf | cf_tot_usaid | cf_tot_grnt | cf_tot_sources |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
986 | Saint Vincent and the Grenadines | VC | VCT | 670 | AMR | 2020 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 rows × 43 columns
# Select only desired columns
df3 = df3[['iso3', 'year','budget_tot']]
# Rename the columns for easier reading
df3 = df3.rename(columns={'iso3': 'Code', 'year': 'Year', 'budget_tot': 'Budget'})
display(df3.sample(5))
| | Code | Year | Budget |
---|---|---|---|
648 | LSO | 2018 | 3318438.0 |
539 | ISL | 2023 | NaN |
530 | HUN | 2020 | 2100000.0 |
985 | VCT | 2019 | NaN |
414 | FJI | 2018 | 646195.0 |
# Get a statistical summary
df3.describe()
| | Year | Budget |
---|---|---|
count | 1290.000000 | 8.280000e+02 |
mean | 2020.500000 | 4.821886e+07 |
std | 1.708487 | 1.619825e+08 |
min | 2018.000000 | 0.000000e+00 |
25% | 2019.000000 | 1.200075e+06 |
50% | 2020.500000 | 6.380376e+06 |
75% | 2022.000000 | 2.665690e+07 |
max | 2023.000000 | 1.640128e+09 |
# Get general information
df3.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1290 entries, 0 to 1289
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Code    1290 non-null   object
 1   Year    1290 non-null   int64
 2   Budget  828 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 30.4+ KB
# Select the rows of the DataFrame where the 'Budget' column has null values and print the counts according to 'Code' column
null_rows_df3 = df3[df3['Budget'].isnull()]
print(null_rows_df3['Code'].value_counts().sort_values(ascending=False))
QAT    6
TTO    6
BEL    6
BMU    6
AUT    6
      ..
LVA    1
AND    1
SDN    1
PRK    1
ALB    1
Name: Code, Length: 115, dtype: int64
# Remove rows that contain null values
df3 = df3.dropna()
# Merge the DataFrames df1, df2, and df3 based on the columns 'Code' and 'Year'
combined_df = df1.merge(df2, on=['Code', 'Year']).merge(df3, on=['Code', 'Year'])
display(combined_df)
| | Country | Code | Year | GDP | Incidence | Fatality% | Budget |
---|---|---|---|---|---|---|---|
0 | Afghanistan | AFG | 2018 | 2060.6990 | 189.0 | 16.0 | 10881354.0 |
1 | Afghanistan | AFG | 2019 | 2079.9219 | 189.0 | 14.0 | 16957452.0 |
2 | Afghanistan | AFG | 2020 | 1968.3410 | 183.0 | 15.0 | 19137128.0 |
3 | Afghanistan | AFG | 2021 | 1516.3057 | 185.0 | 14.0 | 10633599.0 |
4 | Albania | ALB | 2018 | 13319.4950 | 18.0 | 2.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... |
496 | Zambia | ZMB | 2021 | 3236.7890 | 307.0 | 14.0 | 37755340.0 |
497 | Zimbabwe | ZWE | 2018 | 2399.6216 | 210.0 | 15.0 | 29583804.0 |
498 | Zimbabwe | ZWE | 2019 | 2203.3967 | 199.0 | 22.0 | 41313083.0 |
499 | Zimbabwe | ZWE | 2020 | 1990.3195 | 185.0 | 24.0 | 31504677.0 |
500 | Zimbabwe | ZWE | 2021 | 2115.1445 | 194.0 | 25.0 | 32126351.0 |
501 rows × 7 columns
# Check if there is any null value in the DataFrame
combined_df.isnull().values.any()
False
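Both merges above are inner joins, so any country-year missing from one of the three sources is dropped silently (the 5,930 country-level GDP rows end up as 501 merged rows). A quick way to see where rows fail to match, sketched here for the first join only (the `check` name is ours), is an outer merge with pandas' `indicator` option:

# Outer-merge GDP and TB incidence with an indicator column to see how many rows match
check = df1.merge(df2, on=['Code', 'Year'], how='outer', indicator=True)
print(check['_merge'].value_counts())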
# Create a pivot table from the DataFrame combined_df
# The values of 'GDP', 'Incidence', 'Fatality%', and 'Budget' are grouped by 'Year' and 'Country'
pivot1 = combined_df.pivot_table(values=['GDP', 'Incidence', 'Fatality%', 'Budget'], index=['Year', 'Country'])
display(pivot1)
Year | Country | Budget | Fatality% | GDP | Incidence |
---|---|---|---|---|---|
2018 | Afghanistan | 10881354.0 | 16.0 | 2060.6990 | 189.0 |
2018 | Albania | 0.0 | 2.0 | 13319.4950 | 18.0 |
2018 | Angola | 45881526.0 | 21.0 | 6878.5933 | 355.0 |
2018 | Argentina | 5189232.0 | 7.0 | 22747.2420 | 27.0 |
2018 | Armenia | 6828768.0 | 6.0 | 13231.4310 | 31.0 |
... | ... | ... | ... | ... | ... |
2021 | Uruguay | 4253647.0 | 12.0 | 22800.6900 | 32.0 |
2021 | Vanuatu | 527709.0 | 18.0 | 2783.0195 | 34.0 |
2021 | Vietnam | 132975703.0 | 7.0 | 10628.2190 | 165.0 |
2021 | Zambia | 37755340.0 | 14.0 | 3236.7890 | 307.0 |
2021 | Zimbabwe | 32126351.0 | 25.0 | 2115.1445 | 194.0 |
501 rows × 4 columns
# Create a pivot table from the DataFrame combined_df
# The values of 'GDP', 'Incidence', 'Fatality%', and 'Budget' are grouped by 'Country' and 'Year'
# Calculate the mean, max, and min for each group
pivot2 = combined_df.pivot_table(values=['GDP', 'Incidence', 'Fatality%', 'Budget'],
index=['Country', 'Year'],
aggfunc=['mean', 'max', 'min'])
display(pivot2)
Country | Year | Budget (mean) | Fatality% (mean) | GDP (mean) | Incidence (mean) | Budget (max) | Fatality% (max) | GDP (max) | Incidence (max) | Budget (min) | Fatality% (min) | GDP (min) | Incidence (min) |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Afghanistan | 2018 | 10881354.0 | 16.0 | 2060.6990 | 189.0 | 10881354.0 | 16.0 | 2060.6990 | 189.0 | 10881354.0 | 16.0 | 2060.6990 | 189.0 |
Afghanistan | 2019 | 16957452.0 | 14.0 | 2079.9219 | 189.0 | 16957452.0 | 14.0 | 2079.9219 | 189.0 | 16957452.0 | 14.0 | 2079.9219 | 189.0
Afghanistan | 2020 | 19137128.0 | 15.0 | 1968.3410 | 183.0 | 19137128.0 | 15.0 | 1968.3410 | 183.0 | 19137128.0 | 15.0 | 1968.3410 | 183.0
Afghanistan | 2021 | 10633599.0 | 14.0 | 1516.3057 | 185.0 | 10633599.0 | 14.0 | 1516.3057 | 185.0 | 10633599.0 | 14.0 | 1516.3057 | 185.0
Albania | 2018 | 0.0 | 2.0 | 13319.4950 | 18.0 | 0.0 | 2.0 | 13319.4950 | 18.0 | 0.0 | 2.0 | 13319.4950 | 18.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Zambia | 2021 | 37755340.0 | 14.0 | 3236.7890 | 307.0 | 37755340.0 | 14.0 | 3236.7890 | 307.0 | 37755340.0 | 14.0 | 3236.7890 | 307.0 |
Zimbabwe | 2018 | 29583804.0 | 15.0 | 2399.6216 | 210.0 | 29583804.0 | 15.0 | 2399.6216 | 210.0 | 29583804.0 | 15.0 | 2399.6216 | 210.0 |
Zimbabwe | 2019 | 41313083.0 | 22.0 | 2203.3967 | 199.0 | 41313083.0 | 22.0 | 2203.3967 | 199.0 | 41313083.0 | 22.0 | 2203.3967 | 199.0
Zimbabwe | 2020 | 31504677.0 | 24.0 | 1990.3195 | 185.0 | 31504677.0 | 24.0 | 1990.3195 | 185.0 | 31504677.0 | 24.0 | 1990.3195 | 185.0
Zimbabwe | 2021 | 32126351.0 | 25.0 | 2115.1445 | 194.0 | 32126351.0 | 25.0 | 2115.1445 | 194.0 | 32126351.0 | 25.0 | 2115.1445 | 194.0
501 rows × 12 columns
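Because each (Country, Year) pair appears exactly once in `combined_df`, the mean, max, and min above are identical by construction. The aggregates only start to differ when groups contain several rows, for example when grouping by country alone. A minimal sketch (the `pivot3` name is ours):

# Group by 'Country' only, so each group spans several years and the aggregates differ
pivot3 = combined_df.pivot_table(values=['GDP', 'Incidence', 'Fatality%', 'Budget'],
                                 index='Country',
                                 aggfunc=['mean', 'max', 'min'])
display(pivot3.head())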
# Some columns are selected
df_selected = combined_df[['Country', 'Year', 'GDP', 'Incidence', 'Fatality%', 'Budget']]
display(df_selected)
| | Country | Year | GDP | Incidence | Fatality% | Budget |
---|---|---|---|---|---|---|
0 | Afghanistan | 2018 | 2060.6990 | 189.0 | 16.0 | 10881354.0 |
1 | Afghanistan | 2019 | 2079.9219 | 189.0 | 14.0 | 16957452.0 |
2 | Afghanistan | 2020 | 1968.3410 | 183.0 | 15.0 | 19137128.0 |
3 | Afghanistan | 2021 | 1516.3057 | 185.0 | 14.0 | 10633599.0 |
4 | Albania | 2018 | 13319.4950 | 18.0 | 2.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... |
496 | Zambia | 2021 | 3236.7890 | 307.0 | 14.0 | 37755340.0 |
497 | Zimbabwe | 2018 | 2399.6216 | 210.0 | 15.0 | 29583804.0 |
498 | Zimbabwe | 2019 | 2203.3967 | 199.0 | 22.0 | 41313083.0 |
499 | Zimbabwe | 2020 | 1990.3195 | 185.0 | 24.0 | 31504677.0 |
500 | Zimbabwe | 2021 | 2115.1445 | 194.0 | 25.0 | 32126351.0 |
501 rows × 6 columns
# Group by 'Country' and calculate the mean of other columns across the years.
df_grouped1 = df_selected.groupby('Country')[['GDP', 'Incidence', 'Fatality%', 'Budget']].mean()
display(df_grouped1)
Country | GDP | Incidence | Fatality% | Budget |
---|---|---|---|---|
Afghanistan | 1906.316900 | 186.50 | 14.75 | 1.440238e+07 |
Albania | 13698.175333 | 16.00 | 2.00 | 5.430423e+05 |
Angola | 6355.465575 | 340.25 | 19.25 | 4.051907e+07 |
Argentina | 21507.850500 | 28.25 | 6.75 | 2.526782e+06 |
Armenia | 13774.949500 | 26.75 | 6.50 | 4.364570e+06 |
... | ... | ... | ... | ... |
Uzbekistan | 7082.925300 | 70.00 | 7.00 | 1.104770e+07 |
Vanuatu | 2933.806625 | 39.50 | 15.75 | 3.916390e+05 |
Vietnam | 10241.714750 | 173.25 | 7.25 | 8.626229e+07 |
Zambia | 3304.687000 | 326.25 | 24.00 | 3.889813e+07 |
Zimbabwe | 2177.120575 | 197.00 | 21.50 | 3.363198e+07 |
141 rows × 4 columns
# Divide the 'GDP' values into 5 quantile-based groups of (roughly) equal size.
bins = pd.qcut(df_grouped1['GDP'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
print(bins.value_counts())
Very Low     29
Low          28
Medium       28
High         28
Very High    28
Name: GDP, dtype: int64
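Because `pd.qcut` splits on quantiles, the five groups have nearly equal counts while the GDP cut points themselves depend on the data. A minimal sketch that also returns those quantile edges via `retbins=True` (the `edges` name is ours; the labels are the same as above):

# Recompute the bins and also return the GDP values that separate the five groups
bins, edges = pd.qcut(df_grouped1['GDP'], q=5,
                      labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'],
                      retbins=True)
print(np.round(edges, 1))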
# Select rows from the 'df_grouped1' DataFrame where the values in the 'GDP' column are 'Very Low'
df_VeryLow = df_grouped1[bins == 'Very Low']
display(df_VeryLow)
Country | GDP | Incidence | Fatality% | Budget |
---|---|---|---|---|
Afghanistan | 1906.316900 | 186.500000 | 14.750000 | 1.440238e+07 |
Benin | 3176.159175 | 54.500000 | 22.500000 | 3.558143e+06 |
Burkina Faso | 2108.700850 | 46.500000 | 21.000000 | 7.691302e+06 |
Burundi | 715.348050 | 103.333333 | 23.000000 | 6.842288e+06 |
Central African Republic | 850.420990 | 540.000000 | 32.500000 | 2.962002e+06 |
Chad | 1510.055500 | 141.250000 | 23.750000 | 4.306927e+06 |
Comoros | 3292.741300 | 35.000000 | 28.000000 | 4.929895e+05 |
Democratic Republic of Congo | 1056.463275 | 319.500000 | 19.000000 | 5.432168e+07 |
Ethiopia | 2205.354925 | 134.750000 | 16.000000 | 9.622103e+07 |
Guinea | 2565.330125 | 175.500000 | 18.250000 | 3.785495e+06 |
Guinea-Bissau | 1848.683433 | 361.000000 | 39.333333 | 2.617930e+06 |
Haiti | 3031.992500 | 167.250000 | 10.500000 | 1.177573e+07 |
Kiribati | 1976.545650 | 406.250000 | 9.500000 | 4.376662e+05 |
Lesotho | 2415.499800 | 635.750000 | 37.500000 | 1.005467e+07 |
Liberia | 1446.107225 | 308.000000 | 27.000000 | 8.166810e+06 |
Madagascar | 1510.657475 | 233.000000 | 21.000000 | 3.803587e+06 |
Malawi | 1494.215800 | 142.250000 | 30.250000 | 2.971530e+07 |
Mali | 2162.268425 | 51.500000 | 18.750000 | 6.325744e+06 |
Mozambique | 1259.458775 | 361.000000 | 11.500000 | 2.870273e+07 |
Niger | 1202.958725 | 83.000000 | 21.250000 | 4.939391e+06 |
Rwanda | 2137.110375 | 57.250000 | 15.000000 | 6.611477e+06 |
Sierra Leone | 1616.685300 | 293.500000 | 16.500000 | 1.101003e+07 |
Solomon Islands | 2531.343200 | 65.500000 | 11.000000 | 9.752508e+05 |
Somalia | 1134.146833 | 254.000000 | 27.666667 | 9.391973e+06 |
Tanzania | 2555.384525 | 230.000000 | 25.500000 | 6.383039e+07 |
Togo | 2073.316175 | 35.750000 | 11.750000 | 1.074839e+06 |
Uganda | 2230.956600 | 199.500000 | 19.750000 | 3.455637e+07 |
Vanuatu | 2933.806625 | 39.500000 | 15.750000 | 3.916390e+05 |
Zimbabwe | 2177.120575 | 197.000000 | 21.500000 | 3.363198e+07 |
# Calculate the correlation matrix for all data.
corr_matrix = df_grouped1.corr()
# Create a heatmap using seaborn.
plt.figure(figsize=(7, 7))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, square=True)
plt.show()
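GDP and Budget are strongly right-skewed, so the Pearson correlations above are sensitive to a handful of very large values. A rank-based (Spearman) matrix is a common robustness check; a minimal sketch on the same grouped data (the `corr_spearman` name is ours):

# Spearman (rank) correlation is less affected by the skew in GDP and Budget
corr_spearman = df_grouped1.corr(method='spearman')
plt.figure(figsize=(7, 7))
sns.heatmap(corr_spearman, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, square=True)
plt.show()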
# Calculate the correlation matrix for the group with 'Very Low' GDP
corr_matrix = df_VeryLow.corr()
# Create a heatmap using seaborn.
plt.figure(figsize=(7, 7))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, square=True)
plt.show()
# Assign the values from the 'bins' variable to a new column called 'GDP_Level'
df_grouped1['GDP_Level'] = bins
display(df_grouped1)
Country | GDP | Incidence | Fatality% | Budget | GDP_Level |
---|---|---|---|---|---|
Afghanistan | 1906.316900 | 186.50 | 14.75 | 1.440238e+07 | Very Low |
Albania | 13698.175333 | 16.00 | 2.00 | 5.430423e+05 | High |
Angola | 6355.465575 | 340.25 | 19.25 | 4.051907e+07 | Medium |
Argentina | 21507.850500 | 28.25 | 6.75 | 2.526782e+06 | Very High |
Armenia | 13774.949500 | 26.75 | 6.50 | 4.364570e+06 | High |
... | ... | ... | ... | ... | ... |
Uzbekistan | 7082.925300 | 70.00 | 7.00 | 1.104770e+07 | Medium |
Vanuatu | 2933.806625 | 39.50 | 15.75 | 3.916390e+05 | Very Low |
Vietnam | 10241.714750 | 173.25 | 7.25 | 8.626229e+07 | Medium |
Zambia | 3304.687000 | 326.25 | 24.00 | 3.889813e+07 | Low |
Zimbabwe | 2177.120575 | 197.00 | 21.50 | 3.363198e+07 | Very Low |
141 rows × 5 columns
# Create Boxplots
fig, axs = plt.subplots(ncols=2, figsize=(15, 5))
# Boxplot for 'Incidence' by 'GDP_Level'
sns.boxplot(x='GDP_Level', y='Incidence', data=df_grouped1, color='skyblue', ax=axs[0])
axs[0].set_ylabel("Incidence")
# Boxplot for 'Budget' by 'GDP_Level'
sns.boxplot(x='GDP_Level', y='Budget', data=df_grouped1, color='pink', ax=axs[1])
axs[1].set_ylabel("Budget")
plt.subplots_adjust(wspace=0.5)
sns.set(rc={'figure.figsize':(10,5)})
plt.show()
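The budget boxplot is dominated by a few very large national budgets, which compresses the other boxes. One option, sketched below, is a logarithmic y-axis (note that the single zero-budget entry cannot be drawn on a log scale and is clipped):

# Redraw the budget boxplot with a logarithmic y-axis so the smaller budgets remain visible
fig, ax = plt.subplots(figsize=(7, 5))
sns.boxplot(x='GDP_Level', y='Budget', data=df_grouped1, color='pink', ax=ax)
ax.set_yscale('log')  # zero budgets are clipped on a log scale
ax.set_ylabel("Budget")
plt.show()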
# Show the joint distribution using kernel density estimation
g = sns.jointplot(data=df_grouped1, x="Incidence", y="GDP", kind="kde")
plt.show()
# Relationship of each group
sns.pairplot(df_grouped1, hue="GDP_Level")
plt.show()