In [ ]:
# Install geopandas
!pip install geopandas==0.13.2
Requirement already satisfied: geopandas==0.13.2 in /usr/local/lib/python3.10/dist-packages (0.13.2)
Requirement already satisfied: fiona>=1.8.19 in /usr/local/lib/python3.10/dist-packages (from geopandas==0.13.2) (1.9.6)
Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from geopandas==0.13.2) (24.0)
Requirement already satisfied: pandas>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from geopandas==0.13.2) (1.5.3)
Requirement already satisfied: pyproj>=3.0.1 in /usr/local/lib/python3.10/dist-packages (from geopandas==0.13.2) (3.6.1)
Requirement already satisfied: shapely>=1.7.1 in /usr/local/lib/python3.10/dist-packages (from geopandas==0.13.2) (2.0.3)
Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from fiona>=1.8.19->geopandas==0.13.2) (23.2.0)
Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from fiona>=1.8.19->geopandas==0.13.2) (2024.2.2)
Requirement already satisfied: click~=8.0 in /usr/local/lib/python3.10/dist-packages (from fiona>=1.8.19->geopandas==0.13.2) (8.1.7)
Requirement already satisfied: click-plugins>=1.0 in /usr/local/lib/python3.10/dist-packages (from fiona>=1.8.19->geopandas==0.13.2) (1.1.1)
Requirement already satisfied: cligj>=0.5 in /usr/local/lib/python3.10/dist-packages (from fiona>=1.8.19->geopandas==0.13.2) (0.7.2)
Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from fiona>=1.8.19->geopandas==0.13.2) (1.16.0)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.1.0->geopandas==0.13.2) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.1.0->geopandas==0.13.2) (2023.4)
Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.1.0->geopandas==0.13.2) (1.25.2)
In [ ]:
# Necessary libraries are imported
import os
import numpy as np
import pandas as pd
In [ ]:
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
In [ ]:
import geopandas as gpd
from mpl_toolkits.axes_grid1 import make_axes_locatable

Loading and analysis of the first Dataframe¶

In [ ]:
path1  = "https://raw.githubusercontent.com/DTAMBU/Data_Analysis/main/res/gdp-per-capita-worldbank.csv"
In [ ]:
# Read the remote CSV into a DataFrame (latin-1 encoded, comma-separated)
df1 = pd.read_csv(path1, sep=',', encoding='latin1')
print(df1.sample(n=1))
       Entity Code  Year  GDP per capita, PPP (constant 2017 international $)
932  Cambodia  KHM  2013                                           3197.532  
In [ ]:
# Give the columns shorter, more readable names
new_names = {
    'Entity': 'Country',
    'GDP per capita, PPP (constant 2017 international $)': 'GDP',
}
df1 = df1.rename(columns=new_names)
display(df1.sample(1))
Country Code Year GDP
924 Cambodia KHM 2005 2124.8374
In [ ]:
# Statistical summary (count, mean, std, quartiles) of the numeric columns
df1.describe()
Out[ ]:
Year GDP
count 6346.000000 6346.000000
mean 2005.937914 18068.999300
std 9.126633 20193.547754
min 1990.000000 436.376400
25% 1998.000000 3673.792850
50% 2006.000000 10486.217500
75% 2014.000000 26409.074000
max 2021.000000 157602.480000
In [ ]:
# General information: column dtypes, non-null counts and memory usage
df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6346 entries, 0 to 6345
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  6346 non-null   object 
 1   Code     5930 non-null   object 
 2   Year     6346 non-null   int64  
 3   GDP      6346 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 198.4+ KB
In [ ]:
# Count the distinct country names and ISO codes; a mismatch between the
# two indicates rows that are not real countries (regional aggregates)
unique_values_Countries = df1['Country'].nunique()
unique_values_Codes = df1['Code'].nunique()
print(f'Unique values of Countries: {unique_values_Countries}')
print(f'Unique values of Codes: {unique_values_Codes}')
Unique values of Countries: 208
Unique values of Codes: 195
In [ ]:
# Rows whose 'Code' is missing, counted per 'Country' (descending order)
null_rows_df1 = df1.loc[df1['Code'].isna()]
country_counts = null_rows_df1['Country'].value_counts()
print(country_counts.sort_values(ascending=False))
East Asia and Pacific (WB)           32
Europe and Central Asia (WB)         32
European Union (27)                  32
High-income countries                32
Latin America and Caribbean (WB)     32
Low-income countries                 32
Lower-middle-income countries        32
Middle East and North Africa (WB)    32
Middle-income countries              32
North America (WB)                   32
South Asia (WB)                      32
Sub-Saharan Africa (WB)              32
Upper-middle-income countries        32
Name: Country, dtype: int64
In [ ]:
# Remove rows that contain null values — per the counts above, these are
# the World Bank regional aggregates (e.g. 'High-income countries') that
# carry no ISO code
df1 = df1.dropna()
In [ ]:
# Recount the distinct names and codes after dropping the aggregate rows;
# the two counts now match
unique_values_Countries = df1['Country'].nunique()
unique_values_Codes = df1['Code'].nunique()
print(f'Unique values of Countries: {unique_values_Countries}')
print(f'Unique values of Codes: {unique_values_Codes}')
Unique values of Countries: 195
Unique values of Codes: 195
In [ ]:
# Check if any null value remains in the DataFrame (expected: False)
df1.isnull().values.any()
Out[ ]:
False

Use of the Geopandas library¶

In [ ]:
# Define the path to the file
geo_path = "https://raw.githubusercontent.com/DTAMBU/Data_Analysis/main/res/ne_110m_admin_0_countries.zip"

# Load the remote file into a GeoDataFrame
world = gpd.read_file(geo_path)

# Exclude Antarctica (it dominates the map area and has no GDP data)
world = world[world['NAME'] != 'Antarctica']

# Get the unique years in the DataFrame, in ascending order
years = np.sort(df1['Year'].unique())

# Filter df to only contain country codes that exist in world['BRK_A3']
df = df1[df1['Code'].isin(world['BRK_A3'])]

# Define a function to plot a map
def plot_map(ax, year, df, world):
    """Draw a log-scaled GDP-per-capita choropleth for one year on `ax`.

    Parameters:
        ax: matplotlib Axes to draw on.
        year: the year (int) whose GDP values are mapped.
        df: DataFrame with 'Code', 'Year' and 'GDP' columns, already
            filtered to codes present in ``world['BRK_A3']``.
        world: GeoDataFrame of country geometries with a 'BRK_A3' code column.
    """
    # Select the data for a specific year.
    # Fix: use the `df` parameter instead of the global `df1`, so the
    # pre-filtering to codes present in `world` actually takes effect.
    df_year = df[df['Year'] == year]

    # Ensure that 'world' and 'df_year' are aligned; countries without data
    # for this year get NaN GDP and are left uncolored (white)
    world_year = world.set_index('BRK_A3').join(df_year.set_index('Code'))

    # Draw a map of the world
    world.boundary.plot(ax=ax, color='black', linewidth=0.5)  # Adjust the thickness of the lines here

    # Remove the numerical labels on both axes
    ax.set_xticks([])
    ax.set_yticks([])

    # Create a divider for the current axis
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="10%", pad=0.1)

    # The color scale is logarithmic so a few very rich countries do not
    # wash out the rest of the map
    log_gdp = np.log(world_year['GDP'])
    sm = plt.cm.ScalarMappable(cmap='coolwarm', norm=plt.Normalize(vmin=log_gdp.min(), vmax=log_gdp.max()))
    sm._A = []
    # Fix: attach the colorbar to this axis' own figure instead of relying
    # on a global `fig` that is only defined after this function
    cbar = ax.get_figure().colorbar(sm, cax=cax, format='')

    # Colorbar ticks are evenly spaced in log space but labeled with the
    # original (non-logarithmic) GDP values
    ticks = np.linspace(log_gdp.min(), log_gdp.max(), num=5)  # Adjust 'num' according to your needs
    cbar.set_ticks(ticks)
    cbar.set_ticklabels(np.exp(ticks).astype(int))

    world_year.plot(ax=ax, column=log_gdp, legend=False, cmap='coolwarm')

    # Add a title to the plot
    ax.set_title(f'{year} GDP')

# Apply the function to each year
fig, axs = plt.subplots(len(years), 1, figsize=(10, len(years)*3))

# Ensure that axs is always a 1-D array — plt.subplots returns a bare Axes
# (not an array) when there is a single subplot
axs = np.atleast_1d(axs)

# Iterate only over the axes that correspond to the existing years
for ax, year in zip(axs, years):
    plot_map(ax, year, df, world)

# Automatically adjust the space between the subplots
plt.tight_layout()
# Show the plot
plt.show()

Graphs as a function of time¶

  • Four countries are selected and the GDP is plotted over time. It is observed that, for example, Canada does not have data for all the years, which is why in the Geopandas graph, some countries appear white depending on the year.
In [ ]:
# Ignore FutureWarning
warnings.filterwarnings('ignore', category=FutureWarning)

# Keep only the four countries of interest
selected_codes = ['CAN', 'CHN', 'ESP', 'ARG']
df_filtered = df1[df1['Code'].isin(selected_codes)]

# GDP over time, one line per country code
plt.figure(figsize=(10, 5))
sns.lineplot(data=df_filtered, x='Year', y='GDP', hue='Code')

# Title and axis labels
plt.title('GDP of Countries Over the Years')
plt.xlabel('Year')
plt.ylabel('GDP')

# Show the figure
plt.show()
  • In the following section, it is detailed which countries do not have data for the entire range of years.
In [ ]:
# Codes of every country present in the data
countries = df1['Code'].unique()

# Filtering on the full code list keeps df_filtered identical to df1,
# but mirrors the pattern used for the four-country plot above
df_filtered = df1[df1['Code'].isin(countries)]

# First and last year with data, per country
years_range_per_country = {
    code: (df_filtered.loc[df_filtered['Code'] == code, 'Year'].min(),
           df_filtered.loc[df_filtered['Code'] == code, 'Year'].max())
    for code in countries
}

# Sort the (country, range) pairs in reverse alphabetical order so the
# bars read A→Z from top to bottom of the chart
sorted_items = sorted(years_range_per_country.items(), reverse=True)
countries = tuple(item[0] for item in sorted_items)
years_ranges = tuple(item[1] for item in sorted_items)

# Create a figure and an axis with a custom size
fig, ax = plt.subplots(figsize=(10, 40))

# One horizontal bar per country, spanning its first-to-last data year
for i, (first_year, last_year) in enumerate(years_ranges):
    ax.barh(i, last_year - first_year, left=first_year)

# Label the y-axis with the country codes
ax.set_yticks(range(len(countries)))
ax.set_yticklabels(countries)

# Set the labels of the x-axis
plt.xlabel('Years')

# Set the title
plt.title('Range of Years for Each Country')

# Show the plot
plt.show()

Loading and analysis of the second Dataframe¶

In [ ]:
# URL of the WHO tuberculosis burden CSV — second data source
path2 = "https://raw.githubusercontent.com/DTAMBU/Data_Analysis/main/res/TB_burden_countries_2024-01-13.csv"
In [ ]:
# Read the remote CSV (comma-separated) into a DataFrame
df2 = pd.read_csv(path2, sep=',')
display(df2.sample(n=1))
country iso2 iso3 iso_numeric g_whoregion year e_pop_num e_inc_100k e_inc_100k_lo e_inc_100k_hi ... cfr cfr_lo cfr_hi cfr_pct cfr_pct_lo cfr_pct_hi c_newinc_100k c_cdr c_cdr_lo c_cdr_hi
733 Burkina Faso BF BFA 854 AFR 2020 21522626 46.0 30.0 66.0 ... 0.22 0.12 0.35 22.0 12.0 35.0 26.0 58.0 40.0 89.0

1 rows × 50 columns

In [ ]:
# Keep only the columns needed for the analysis: ISO code, year,
# estimated incidence per 100k and case-fatality percentage
df2 = df2[['iso3', 'year', 'e_inc_100k','cfr_pct']]
In [ ]:
# Shorter column names for readability
column_map = {
    'iso3': 'Code',
    'year': 'Year',
    'e_inc_100k': 'Incidence',
    'cfr_pct': 'Fatality%',
}
df2 = df2.rename(columns=column_map)
display(df2.sample(5))
Code Year Incidence Fatality%
4046 SVN 2013 7.7 6.0
3918 SRB 2013 27.0 6.0
196 ARG 2012 24.0 8.0
1630 FRA 2007 11.0 12.0
2676 MDV 2018 33.0 7.0
In [ ]:
# Statistical summary of the numeric columns
df2.describe()
Out[ ]:
Year Incidence Fatality%
count 4917.000000 4917.000000 4794.000000
mean 2011.040065 123.046766 16.327701
std 6.627437 184.816404 13.444558
min 2000.000000 0.000000 0.000000
25% 2005.000000 12.000000 8.000000
50% 2011.000000 46.000000 11.000000
75% 2017.000000 161.000000 22.000000
max 2022.000000 1590.000000 100.000000
In [ ]:
# General information: dtypes, non-null counts and memory usage
df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4917 entries, 0 to 4916
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Code       4917 non-null   object 
 1   Year       4917 non-null   int64  
 2   Incidence  4917 non-null   float64
 3   Fatality%  4794 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 153.8+ KB
In [ ]:
# Select the rows of the DataFrame where the 'Fatality%' column has null values and print the counts according to 'Code' column
null_rows_df2 = df2[df2['Fatality%'].isnull()]
print(null_rows_df2['Code'].value_counts().sort_values(ascending=False))
SMR    20
MSR    18
NIU    16
TKL    15
MCO     9
VGB     8
BMU     6
KNA     6
COK     5
WLF     5
CYM     4
BRB     4
GRD     2
SXM     2
ATG     1
TCA     1
ASM     1
Name: Code, dtype: int64
In [ ]:
# Remove rows that contain null values (territories with no fatality estimate)
df2 = df2.dropna()

Loading and analysis of the third Dataframe¶

In [ ]:
# URL of the WHO tuberculosis budget CSV — third data source
path3 = "https://raw.githubusercontent.com/DTAMBU/Data_Analysis/main/res/TB_budget_2024-01-13.csv"
In [ ]:
# Read the remote CSV (comma-separated) into a DataFrame
df3 = pd.read_csv(path3, sep=',')
display(df3.sample(n=1))
country iso2 iso3 iso_numeric g_whoregion year tx_dstb budget_cpp_dstb tx_mdr budget_cpp_mdr ... cf_orsrvy budget_oth cf_oth budget_tot cf_tot cf_tot_domestic cf_tot_gf cf_tot_usaid cf_tot_grnt cf_tot_sources
986 Saint Vincent and the Grenadines VC VCT 670 AMR 2020 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

1 rows × 43 columns

In [ ]:
# Keep only the columns needed: ISO code, year and total budget
df3 = df3[['iso3', 'year','budget_tot']]
In [ ]:
# Shorter column names for readability
df3 = df3.rename(columns={'iso3': 'Code',
                          'year': 'Year',
                          'budget_tot': 'Budget'})
display(df3.sample(5))
Code Year Budget
648 LSO 2018 3318438.0
539 ISL 2023 NaN
530 HUN 2020 2100000.0
985 VCT 2019 NaN
414 FJI 2018 646195.0
In [ ]:
# Statistical summary of the numeric columns
df3.describe()
Out[ ]:
Year Budget
count 1290.000000 8.280000e+02
mean 2020.500000 4.821886e+07
std 1.708487 1.619825e+08
min 2018.000000 0.000000e+00
25% 2019.000000 1.200075e+06
50% 2020.500000 6.380376e+06
75% 2022.000000 2.665690e+07
max 2023.000000 1.640128e+09
In [ ]:
# General information: dtypes, non-null counts and memory usage
df3.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1290 entries, 0 to 1289
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Code    1290 non-null   object 
 1   Year    1290 non-null   int64  
 2   Budget  828 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 30.4+ KB
In [ ]:
# Select the rows of the DataFrame where the 'Budget' column has null values and print the counts according to 'Code' column
null_rows_df3 = df3[df3['Budget'].isnull()]
print(null_rows_df3['Code'].value_counts().sort_values(ascending=False))
QAT    6
TTO    6
BEL    6
BMU    6
AUT    6
      ..
LVA    1
AND    1
SDN    1
PRK    1
ALB    1
Name: Code, Length: 115, dtype: int64
In [ ]:
# Remove rows that contain null values (countries with no reported budget)
df3 = df3.dropna()

Merge of the Dataframes¶

In [ ]:
# Merge the DataFrames df1, df2, and df3 based on the columns 'Code' and 'Year'.
# Both merges are inner joins (pandas default), so only (Code, Year) pairs
# present in all three sources survive.
combined_df = df1.merge(df2, on=['Code', 'Year']).merge(df3, on=['Code', 'Year'])
display(combined_df)
Country Code Year GDP Incidence Fatality% Budget
0 Afghanistan AFG 2018 2060.6990 189.0 16.0 10881354.0
1 Afghanistan AFG 2019 2079.9219 189.0 14.0 16957452.0
2 Afghanistan AFG 2020 1968.3410 183.0 15.0 19137128.0
3 Afghanistan AFG 2021 1516.3057 185.0 14.0 10633599.0
4 Albania ALB 2018 13319.4950 18.0 2.0 0.0
... ... ... ... ... ... ... ...
496 Zambia ZMB 2021 3236.7890 307.0 14.0 37755340.0
497 Zimbabwe ZWE 2018 2399.6216 210.0 15.0 29583804.0
498 Zimbabwe ZWE 2019 2203.3967 199.0 22.0 41313083.0
499 Zimbabwe ZWE 2020 1990.3195 185.0 24.0 31504677.0
500 Zimbabwe ZWE 2021 2115.1445 194.0 25.0 32126351.0

501 rows × 7 columns

In [ ]:
# Check if there is any null value in the merged DataFrame (expected: False)
combined_df.isnull().values.any()
Out[ ]:
False

Creation of Pivot Table¶

In [ ]:
# Create a pivot table from the DataFrame combined_df.
# The values of 'GDP', 'Incidence', 'Fatality%', and 'Budget' are grouped by
# 'Year' and 'Country' (default aggregation function: mean).
pivot1 = combined_df.pivot_table(values=['GDP', 'Incidence', 'Fatality%', 'Budget'], index=['Year', 'Country'])
display(pivot1)
Budget Fatality% GDP Incidence
Year Country
2018 Afghanistan 10881354.0 16.0 2060.6990 189.0
Albania 0.0 2.0 13319.4950 18.0
Angola 45881526.0 21.0 6878.5933 355.0
Argentina 5189232.0 7.0 22747.2420 27.0
Armenia 6828768.0 6.0 13231.4310 31.0
... ... ... ... ... ...
2021 Uruguay 4253647.0 12.0 22800.6900 32.0
Vanuatu 527709.0 18.0 2783.0195 34.0
Vietnam 132975703.0 7.0 10628.2190 165.0
Zambia 37755340.0 14.0 3236.7890 307.0
Zimbabwe 32126351.0 25.0 2115.1445 194.0

501 rows × 4 columns

In [ ]:
# Create a pivot table from the DataFrame combined_df
# The values of 'GDP', 'Incidence', 'Fatality%', and 'Budget' are grouped by 'Country' and 'Year'
# Calculate the mean, max, and min for each group
pivot2 = combined_df.pivot_table(values=['GDP', 'Incidence', 'Fatality%', 'Budget'],
                                 index=['Country', 'Year'],
                                 aggfunc=['mean', 'max', 'min'])
display(pivot2)
mean max min
Budget Fatality% GDP Incidence Budget Fatality% GDP Incidence Budget Fatality% GDP Incidence
Country Year
Afghanistan 2018 10881354.0 16.0 2060.6990 189.0 10881354.0 16.0 2060.6990 189.0 10881354.0 16.0 2060.6990 189.0
2019 16957452.0 14.0 2079.9219 189.0 16957452.0 14.0 2079.9219 189.0 16957452.0 14.0 2079.9219 189.0
2020 19137128.0 15.0 1968.3410 183.0 19137128.0 15.0 1968.3410 183.0 19137128.0 15.0 1968.3410 183.0
2021 10633599.0 14.0 1516.3057 185.0 10633599.0 14.0 1516.3057 185.0 10633599.0 14.0 1516.3057 185.0
Albania 2018 0.0 2.0 13319.4950 18.0 0.0 2.0 13319.4950 18.0 0.0 2.0 13319.4950 18.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
Zambia 2021 37755340.0 14.0 3236.7890 307.0 37755340.0 14.0 3236.7890 307.0 37755340.0 14.0 3236.7890 307.0
Zimbabwe 2018 29583804.0 15.0 2399.6216 210.0 29583804.0 15.0 2399.6216 210.0 29583804.0 15.0 2399.6216 210.0
2019 41313083.0 22.0 2203.3967 199.0 41313083.0 22.0 2203.3967 199.0 41313083.0 22.0 2203.3967 199.0
2020 31504677.0 24.0 1990.3195 185.0 31504677.0 24.0 1990.3195 185.0 31504677.0 24.0 1990.3195 185.0
2021 32126351.0 25.0 2115.1445 194.0 32126351.0 25.0 2115.1445 194.0 32126351.0 25.0 2115.1445 194.0

501 rows × 12 columns

Other Analyses¶

In [ ]:
# Some columns are selected
df_selected = combined_df[['Country', 'Year', 'GDP', 'Incidence', 'Fatality%', 'Budget']]
display(df_selected)
Country Year GDP Incidence Fatality% Budget
0 Afghanistan 2018 2060.6990 189.0 16.0 10881354.0
1 Afghanistan 2019 2079.9219 189.0 14.0 16957452.0
2 Afghanistan 2020 1968.3410 183.0 15.0 19137128.0
3 Afghanistan 2021 1516.3057 185.0 14.0 10633599.0
4 Albania 2018 13319.4950 18.0 2.0 0.0
... ... ... ... ... ... ...
496 Zambia 2021 3236.7890 307.0 14.0 37755340.0
497 Zimbabwe 2018 2399.6216 210.0 15.0 29583804.0
498 Zimbabwe 2019 2203.3967 199.0 22.0 41313083.0
499 Zimbabwe 2020 1990.3195 185.0 24.0 31504677.0
500 Zimbabwe 2021 2115.1445 194.0 25.0 32126351.0

501 rows × 6 columns

In [ ]:
# Group by 'Country' and calculate the mean of other columns across the years.
df_grouped1 = df_selected.groupby('Country')[['GDP', 'Incidence', 'Fatality%', 'Budget']].mean()
display(df_grouped1)
GDP Incidence Fatality% Budget
Country
Afghanistan 1906.316900 186.50 14.75 1.440238e+07
Albania 13698.175333 16.00 2.00 5.430423e+05
Angola 6355.465575 340.25 19.25 4.051907e+07
Argentina 21507.850500 28.25 6.75 2.526782e+06
Armenia 13774.949500 26.75 6.50 4.364570e+06
... ... ... ... ...
Uzbekistan 7082.925300 70.00 7.00 1.104770e+07
Vanuatu 2933.806625 39.50 15.75 3.916390e+05
Vietnam 10241.714750 173.25 7.25 8.626229e+07
Zambia 3304.687000 326.25 24.00 3.889813e+07
Zimbabwe 2177.120575 197.00 21.50 3.363198e+07

141 rows × 4 columns

In [ ]:
# Divide the values of the 'GDP' column into 5 equal-sized groups.
bins = pd.qcut(df_grouped1['GDP'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
print(bins.value_counts())
Very Low     29
Low          28
Medium       28
High         28
Very High    28
Name: GDP, dtype: int64
In [ ]:
# Select rows from the 'df_grouped1' DataFrame where the values in the 'GDP' column are 'Very Low'
df_VeryLow = df_grouped1[bins == 'Very Low']
display(df_VeryLow)
GDP Incidence Fatality% Budget
Country
Afghanistan 1906.316900 186.500000 14.750000 1.440238e+07
Benin 3176.159175 54.500000 22.500000 3.558143e+06
Burkina Faso 2108.700850 46.500000 21.000000 7.691302e+06
Burundi 715.348050 103.333333 23.000000 6.842288e+06
Central African Republic 850.420990 540.000000 32.500000 2.962002e+06
Chad 1510.055500 141.250000 23.750000 4.306927e+06
Comoros 3292.741300 35.000000 28.000000 4.929895e+05
Democratic Republic of Congo 1056.463275 319.500000 19.000000 5.432168e+07
Ethiopia 2205.354925 134.750000 16.000000 9.622103e+07
Guinea 2565.330125 175.500000 18.250000 3.785495e+06
Guinea-Bissau 1848.683433 361.000000 39.333333 2.617930e+06
Haiti 3031.992500 167.250000 10.500000 1.177573e+07
Kiribati 1976.545650 406.250000 9.500000 4.376662e+05
Lesotho 2415.499800 635.750000 37.500000 1.005467e+07
Liberia 1446.107225 308.000000 27.000000 8.166810e+06
Madagascar 1510.657475 233.000000 21.000000 3.803587e+06
Malawi 1494.215800 142.250000 30.250000 2.971530e+07
Mali 2162.268425 51.500000 18.750000 6.325744e+06
Mozambique 1259.458775 361.000000 11.500000 2.870273e+07
Niger 1202.958725 83.000000 21.250000 4.939391e+06
Rwanda 2137.110375 57.250000 15.000000 6.611477e+06
Sierra Leone 1616.685300 293.500000 16.500000 1.101003e+07
Solomon Islands 2531.343200 65.500000 11.000000 9.752508e+05
Somalia 1134.146833 254.000000 27.666667 9.391973e+06
Tanzania 2555.384525 230.000000 25.500000 6.383039e+07
Togo 2073.316175 35.750000 11.750000 1.074839e+06
Uganda 2230.956600 199.500000 19.750000 3.455637e+07
Vanuatu 2933.806625 39.500000 15.750000 3.916390e+05
Zimbabwe 2177.120575 197.000000 21.500000 3.363198e+07
In [ ]:
# Correlation matrix across all countries
corr_matrix = df_grouped1.corr()

# Visualize it as an annotated heatmap
plt.figure(figsize=(7, 7))
sns.heatmap(corr_matrix, square=True, cbar=True, annot=True, fmt=".2f", cmap='coolwarm')
plt.show()
In [ ]:
# Correlation matrix restricted to the 'Very Low' GDP group
corr_matrix = df_VeryLow.corr()

# Visualize it as an annotated heatmap
plt.figure(figsize=(7, 7))
sns.heatmap(corr_matrix, square=True, cbar=True, annot=True, fmt=".2f", cmap='coolwarm')
plt.show()
In [ ]:
# Store each country's GDP quintile label in a new 'GDP_Level' column
df_grouped1['GDP_Level'] = bins
display(df_grouped1)
GDP Incidence Fatality% Budget GDP_Level
Country
Afghanistan 1906.316900 186.50 14.75 1.440238e+07 Very Low
Albania 13698.175333 16.00 2.00 5.430423e+05 High
Angola 6355.465575 340.25 19.25 4.051907e+07 Medium
Argentina 21507.850500 28.25 6.75 2.526782e+06 Very High
Armenia 13774.949500 26.75 6.50 4.364570e+06 High
... ... ... ... ... ...
Uzbekistan 7082.925300 70.00 7.00 1.104770e+07 Medium
Vanuatu 2933.806625 39.50 15.75 3.916390e+05 Very Low
Vietnam 10241.714750 173.25 7.25 8.626229e+07 Medium
Zambia 3304.687000 326.25 24.00 3.889813e+07 Low
Zimbabwe 2177.120575 197.00 21.50 3.363198e+07 Very Low

141 rows × 5 columns

In [ ]:
# Create side-by-side boxplots of Incidence and Budget per GDP level
fig, axs = plt.subplots(ncols=2, figsize=(15, 5))

# Boxplot for 'Incidence' by 'GDP_Level'
sns.boxplot(x='GDP_Level', y='Incidence', data=df_grouped1, color='skyblue', ax=axs[0])
axs[0].set_ylabel("Incidence")

# Boxplot for 'Budget' by 'GDP_Level'
sns.boxplot(x='GDP_Level', y='Budget', data=df_grouped1, color='pink', ax=axs[1])
axs[1].set_ylabel("Budget")

# Widen the horizontal gap between the two subplots
plt.subplots_adjust(wspace=0.5)
# NOTE(review): this changes the default figure size for *subsequent*
# seaborn/matplotlib figures; it does not affect the figure created above.
sns.set(rc={'figure.figsize':(10,5)})

plt.show()
In [ ]:
# Joint distribution of Incidence vs GDP using kernel density estimation
g = sns.jointplot(data=df_grouped1, x="Incidence", y="GDP", kind="kde")
plt.show()
In [ ]:
# Pairwise relationships between the numeric columns, colored by GDP level
sns.pairplot(df_grouped1, hue="GDP_Level")
plt.show()

Conclusions¶

  • Three dataframes were imported from two different web sources. The data was analyzed, and several useful tools were demonstrated for extracting trend values both numerically and graphically.
  • A merge of the dataframes was achieved, relating variables from them and data was subdivided into labels that facilitated the interpretation of the obtained data.
  • A correspondence could be seen between the countries with the lowest GDP and tuberculosis. For example, in the correlation matrix restricted to the poorest countries, the tuberculosis incidence and fatality rates are visibly correlated.
  • It was also observed that the countries investing the most in tuberculosis control are those with a medium GDP, and that this investment appears independent of the incidence rate.
  • The Geopandas library was used as a useful tool for visualizing countries. It is an example of the many useful libraries that can be found to perform visualization tasks.
  • The power of Python was shown to perform different processes of cleaning, processing, and data analysis at different levels.