C:\Users\Soham Das\Data-analysis-projects\Covid-19-Data-Analysis\Covid-19
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# for better interactive visualization
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import iplot #to see graphs in offline mode
#to ignore the filter warnings
import warnings
warnings.filterwarnings('ignore')
# List the files available in the data directory. os.listdir() returns
# entries in arbitrary order, so sort them to keep positional lookups into
# `files` reproducible across machines and file systems.
files=sorted(os.listdir(r'C:\Users\Soham Das\Data-analysis-projects\Covid-19-Data-Analysis\Covid-19'))
files
#we got many data sets so lets create the functions
['country_wise_latest.csv', 'covid_19_clean_complete.csv', 'day_wise.csv', 'full_grouped.csv', 'usa_country_wise.csv', 'worldometer_data.csv']
# lets create a function to make our task simpler as we have to read data again & again
def read_data(path,filename):
    """Load one CSV file from a directory into a DataFrame.

    Args:
        path: Directory that contains the file.
        filename: Name of the CSV file inside ``path``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    # os.path.join handles a trailing separator on ``path`` and an empty
    # ``path`` correctly, which plain '+' concatenation with '/' does not.
    return pd.read_csv(os.path.join(path, filename))
# Directory that holds the COVID-19 CSV files; passed to read_data below.
path = r'C:\Users\Soham Das\Data-analysis-projects\Covid-19-Data-Analysis\Covid-19'
# Preview the worldometer snapshot (rendered as the cell output).
read_data(path, 'worldometer_data.csv')
#this is our dataframe
Country/Region | Continent | Population | TotalCases | NewCases | TotalDeaths | NewDeaths | TotalRecovered | NewRecovered | ActiveCases | Serious,Critical | Tot Cases/1M pop | Deaths/1M pop | TotalTests | Tests/1M pop | WHO Region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | USA | North America | 3.311981e+08 | 5032179 | NaN | 162804.0 | NaN | 2576668.0 | NaN | 2292707.0 | 18296.0 | 15194.0 | 492.0 | 63139605.0 | 190640.0 | Americas |
1 | Brazil | South America | 2.127107e+08 | 2917562 | NaN | 98644.0 | NaN | 2047660.0 | NaN | 771258.0 | 8318.0 | 13716.0 | 464.0 | 13206188.0 | 62085.0 | Americas |
2 | India | Asia | 1.381345e+09 | 2025409 | NaN | 41638.0 | NaN | 1377384.0 | NaN | 606387.0 | 8944.0 | 1466.0 | 30.0 | 22149351.0 | 16035.0 | South-EastAsia |
3 | Russia | Europe | 1.459409e+08 | 871894 | NaN | 14606.0 | NaN | 676357.0 | NaN | 180931.0 | 2300.0 | 5974.0 | 100.0 | 29716907.0 | 203623.0 | Europe |
4 | South Africa | Africa | 5.938157e+07 | 538184 | NaN | 9604.0 | NaN | 387316.0 | NaN | 141264.0 | 539.0 | 9063.0 | 162.0 | 3149807.0 | 53044.0 | Africa |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
204 | Montserrat | North America | 4.992000e+03 | 13 | NaN | 1.0 | NaN | 10.0 | NaN | 2.0 | NaN | 2604.0 | 200.0 | 61.0 | 12220.0 | NaN |
205 | Caribbean Netherlands | North America | 2.624700e+04 | 13 | NaN | NaN | NaN | 7.0 | NaN | 6.0 | NaN | 495.0 | NaN | 424.0 | 16154.0 | NaN |
206 | Falkland Islands | South America | 3.489000e+03 | 13 | NaN | NaN | NaN | 13.0 | NaN | 0.0 | NaN | 3726.0 | NaN | 1816.0 | 520493.0 | NaN |
207 | Vatican City | Europe | 8.010000e+02 | 12 | NaN | NaN | NaN | 12.0 | NaN | 0.0 | NaN | 14981.0 | NaN | NaN | NaN | Europe |
208 | Western Sahara | Africa | 5.986820e+05 | 10 | NaN | 1.0 | NaN | 8.0 | NaN | 1.0 | NaN | 17.0 | 2.0 | NaN | NaN | Africa |
209 rows × 16 columns
# Keep the worldometer snapshot in world_data and preview its first rows.
worldometer_file = 'worldometer_data.csv'
world_data = read_data(path, worldometer_file)
world_data.head()
Country/Region | Continent | Population | TotalCases | NewCases | TotalDeaths | NewDeaths | TotalRecovered | NewRecovered | ActiveCases | Serious,Critical | Tot Cases/1M pop | Deaths/1M pop | TotalTests | Tests/1M pop | WHO Region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | USA | North America | 3.311981e+08 | 5032179 | NaN | 162804.0 | NaN | 2576668.0 | NaN | 2292707.0 | 18296.0 | 15194.0 | 492.0 | 63139605.0 | 190640.0 | Americas |
1 | Brazil | South America | 2.127107e+08 | 2917562 | NaN | 98644.0 | NaN | 2047660.0 | NaN | 771258.0 | 8318.0 | 13716.0 | 464.0 | 13206188.0 | 62085.0 | Americas |
2 | India | Asia | 1.381345e+09 | 2025409 | NaN | 41638.0 | NaN | 1377384.0 | NaN | 606387.0 | 8944.0 | 1466.0 | 30.0 | 22149351.0 | 16035.0 | South-EastAsia |
3 | Russia | Europe | 1.459409e+08 | 871894 | NaN | 14606.0 | NaN | 676357.0 | NaN | 180931.0 | 2300.0 | 5974.0 | 100.0 | 29716907.0 | 203623.0 | Europe |
4 | South Africa | Africa | 5.938157e+07 | 538184 | NaN | 9604.0 | NaN | 387316.0 | NaN | 141264.0 | 539.0 | 9063.0 | 162.0 | 3149807.0 | 53044.0 | Africa |
# Load the remaining datasets by explicit file name rather than by position
# in `files`: os.listdir() gives no ordering guarantee, so files[2] etc. could
# silently point at a different CSV on another machine.
day_wise=read_data(path,'day_wise.csv')
group_data=read_data(path,'full_grouped.csv')
usa_data=read_data(path,'usa_country_wise.csv')
province_data=read_data(path,'covid_19_clean_complete.csv')
# (rows, columns) of the day-wise dataset
day_wise.shape
(188, 12)
# Inspect which columns the worldometer snapshot provides.
world_data.columns
#the output lists every column name of world_data
Index(['Country/Region', 'Continent', 'Population', 'TotalCases', 'NewCases', 'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered', 'ActiveCases', 'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop', 'TotalTests', 'Tests/1M pop', 'WHO Region'], dtype='object')
#more the size of each block more will be the count
#we can also use piechart or a bargraph for the same purpose
# One treemap per metric, built from the first 20 rows of world_data.
columns=['TotalCases','TotalDeaths','TotalRecovered','ActiveCases']
first_twenty_rows=world_data[0:20]
for metric in columns:
    treemap_title="<b>TreeMap representation of different Countries w.r.t. their {}</b>".format(metric)
    fig=px.treemap(first_twenty_rows,values=metric,path=['Country/Region'],title=treemap_title)
    fig.show()
#across all four metrics the USA's count is huge
#whenever we hear the word 'trend' try to visualise using line plot
# Line chart of the day-wise totals: one line per case category against Date.
case_categories=["Confirmed","Deaths","Recovered","Active"]
fig=px.line(day_wise,x="Date",y=case_categories,title="COVID cases-vs-date")
fig.show()
#we can say deathcount is low compared to the confirmed cases count
#the rate of recovery is moderate
#during the month of June the recovered count rises above
#the active-case count, since the older cases are recovering.
#we will call top 20 countries i.e [0:20]
# Population divided by total tests for the first 20 rows of world_data
# (a larger value means fewer tests relative to population).
leading_rows=world_data.iloc[0:20]
pop_test_ratio=leading_rows['Population']/leading_rows['TotalTests']
pop_test_ratio
0 5.245489 1 16.106896 2 62.365033 3 4.911040 4 18.852446 5 122.115932 6 13.241331 7 10.866949 8 28.269105 9 6.618696 10 32.187237 11 3.877883 12 9.589865 13 107.484026 14 134.558952 15 8.514790 16 16.613857 17 56.934398 18 9.760649 19 16.353942 dtype: float64
# Bar chart of the population-to-tests ratio for the first 20 rows of world_data.
first_twenty=world_data.iloc[0:20]
fig=px.bar(
    first_twenty,
    x='Country/Region',
    y=pop_test_ratio,
    color='Country/Region',
    title="<b>Population vs Tests done</b>",
)
fig.show()
#the bars show Population/TotalTests, so a taller bar means FEWER tests per person:
#Bangladesh has tested the smallest share of its population,
#followed by Mexico, Pakistan and India
#Bar plot of the coronavirus case counts for the first 20 countries in world_data
fig=px.bar(world_data.iloc[0:20],x='Country/Region',y=['Serious,Critical','TotalDeaths','TotalRecovered','ActiveCases','TotalCases'])
# The x axis is Country/Region, not a date, so the title must not mention time.
fig.update_layout({'title':"Coronavirus cases by country"})
fig.show()
#USA is the country which is most affected by COVID
#Brazil and India are also in critical situation
# Preview the first rows of world_data.
world_data.head()
Country/Region | Continent | Population | TotalCases | NewCases | TotalDeaths | NewDeaths | TotalRecovered | NewRecovered | ActiveCases | Serious,Critical | Tot Cases/1M pop | Deaths/1M pop | TotalTests | Tests/1M pop | WHO Region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | USA | North America | 3.311981e+08 | 5032179 | NaN | 162804.0 | NaN | 2576668.0 | NaN | 2292707.0 | 18296.0 | 15194.0 | 492.0 | 63139605.0 | 190640.0 | Americas |
1 | Brazil | South America | 2.127107e+08 | 2917562 | NaN | 98644.0 | NaN | 2047660.0 | NaN | 771258.0 | 8318.0 | 13716.0 | 464.0 | 13206188.0 | 62085.0 | Americas |
2 | India | Asia | 1.381345e+09 | 2025409 | NaN | 41638.0 | NaN | 1377384.0 | NaN | 606387.0 | 8944.0 | 1466.0 | 30.0 | 22149351.0 | 16035.0 | South-EastAsia |
3 | Russia | Europe | 1.459409e+08 | 871894 | NaN | 14606.0 | NaN | 676357.0 | NaN | 180931.0 | 2300.0 | 5974.0 | 100.0 | 29716907.0 | 203623.0 | Europe |
4 | South Africa | Africa | 5.938157e+07 | 538184 | NaN | 9604.0 | NaN | 387316.0 | NaN | 141264.0 | 539.0 | 9063.0 | 162.0 | 3149807.0 | 53044.0 | Africa |
# Count the distinct Country/Region values in world_data.
world_data['Country/Region'].nunique()
209
# Horizontal bar charts of the 20 highest countries for each metric.
# Every chart sorts by its own metric: the TotalCases chart previously relied
# on the CSV row order, unlike the other three. All titles now close the <b> tag.
top20_charts=[
    ('TotalCases',"<b>Top 20 countries of Total confirmed cases</b>"),
    ('TotalDeaths',"<b>Top 20 countries of Total deaths</b>"),
    ('ActiveCases',"<b>Top 20 countries of Total Active cases</b>"),
    ('TotalRecovered',"<b>Top 20 countries of Total Recovered</b>"),
]
for column,chart_title in top20_charts:
    top20=world_data.sort_values(by=column,ascending=False)[0:20]
    fig=px.bar(top20,y='Country/Region',x=column,color=column,text=column)
    fig.update_layout(title_text=chart_title)
    fig.show()
# Re-list the columns of world_data.
world_data.columns
Index(['Country/Region', 'Continent', 'Population', 'TotalCases', 'NewCases', 'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered', 'ActiveCases', 'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop', 'TotalTests', 'Tests/1M pop', 'WHO Region'], dtype='object')
# Country names of the first 15 rows of world_data, as an array.
world_data[0:15]['Country/Region'].values
array(['USA', 'Brazil', 'India', 'Russia', 'South Africa', 'Mexico', 'Peru', 'Chile', 'Colombia', 'Spain', 'Iran', 'UK', 'Saudi Arabia', 'Pakistan', 'Bangladesh'], dtype=object)
# Slice labels: the country names of the first 15 rows of world_data.
labels=world_data[0:15]['Country/Region'].values
cases=['TotalCases','TotalDeaths','TotalRecovered','ActiveCases']
for i in cases:
    # One pie chart (with a hole) per metric. The slices are countries, not WHO
    # regions, so the title says so; the typos in the old title are fixed too.
    fig=px.pie(world_data[0:15],values=i,names=labels,hole=0.3,title=" {} recorded for the 15 worst affected countries ".format(i))
    fig.show()
# Deaths per confirmed case for every row of world_data.
deaths_to_confirmed=world_data['TotalDeaths']/world_data['TotalCases']
fig = px.bar(world_data,x='Country/Region',y=deaths_to_confirmed)
# Every country in world_data is plotted, so the title must not claim a
# "worst affected" subset.
fig.update_layout(title={'text':"Death to confirmed ratio by country",'xanchor':'left'})
fig.show()
#hovering the mouse over the longest bar we find It's yemen
#yemen has highest Death:Confirmed ratio
# Deaths per recovered case for every row of world_data.
deaths_to_recovered=world_data['TotalDeaths']/world_data['TotalRecovered']
fig = px.bar(world_data,x='Country/Region',y=deaths_to_recovered)
# Every country in world_data is plotted, so the title must not claim a
# "worst affected" subset.
fig.update_layout(title={'text':"Death to recovered ratio by country",'xanchor':'left'})
fig.show()
#Yemen and Belgium have the highest death-to-recovered ratios, i.e. few recoveries relative to their death counts
# Tests performed per confirmed case for every row of world_data.
tests_to_confirmed=world_data['TotalTests']/world_data['TotalCases']
fig = px.bar(world_data,x='Country/Region',y=tests_to_confirmed)
# Every country in world_data is plotted, so the title must not claim a
# "worst affected" subset.
fig.update_layout(title={'text':"Tests to confirmed ratio by country",'xanchor':'left'})
fig.show()
#Laos has the highest tests-to-confirmed ratio, i.e. it ran the most tests per confirmed COVID +ve case
# Serious/critical cases divided by total deaths, row by row (NaN where a value is missing).
world_data['Serious,Critical']/world_data['TotalDeaths']
0 0.112381 1 0.084323 2 0.214804 3 0.157470 4 0.056122 ... 204 NaN 205 NaN 206 NaN 207 NaN 208 NaN Length: 209, dtype: float64
# Serious/critical cases per recorded death for every row of world_data.
serious_to_death=world_data['Serious,Critical']/world_data['TotalDeaths']
fig = px.bar(world_data,x='Country/Region',y=serious_to_death)
# Every country in world_data is plotted, so the title must not claim a
# "worst affected" subset.
fig.update_layout(title={'text':"Serious to death ratio by country",'xanchor':'left'})
fig.show()
#this ratio compares serious/critical cases with total deaths; on its own it
#does not show how likely a serious case is to end in death
#We can find Namibia at the top,
#but there are many other countries with a high serious-to-death ratio
# Preview the first rows of group_data (one row per date and country).
group_data.head()
Date | Country/Region | Confirmed | Deaths | Recovered | Active | New cases | New deaths | New recovered | WHO Region | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-01-22 | Afghanistan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Eastern Mediterranean |
1 | 2020-01-22 | Albania | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Europe |
2 | 2020-01-22 | Algeria | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Africa |
3 | 2020-01-22 | Andorra | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Europe |
4 | 2020-01-22 | Angola | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Africa |
#for creating subplots in plotly
from plotly.subplots import make_subplots
import plotly.graph_objects as go
def country_visualization(group_data,country):
    """Show one country's case-count time series as four side-by-side plots.

    Args:
        group_data: DataFrame with 'Country/Region', 'Date', 'Confirmed',
            'Deaths', 'Recovered' and 'Active' columns.
        country: 'Country/Region' value whose rows are plotted.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # Panel order, left to right; the same names serve as subplot titles,
    # trace names and column names.
    panels = ("Confirmed", "Active", "Recovered", 'Deaths')

    # Keep only the chosen country's rows and the columns that get plotted.
    country_rows = group_data[group_data['Country/Region'] == country]
    series = country_rows.loc[:, ['Date', 'Confirmed', 'Deaths', 'Recovered', 'Active']]

    fig = make_subplots(rows=1, cols=4, subplot_titles=panels)
    for column_number, metric in enumerate(panels, start=1):
        trace = go.Scatter(name=metric, x=series['Date'], y=series[metric])
        fig.add_trace(trace, row=1, col=column_number)

    fig.update_layout(height=600, width=1000, title_text="Date Vs Recorded Cases of {}".format(country))
    fig.show()
# Plot the four time series for India.
country_visualization(group_data,'India')
#all four graphs follow almost the same trend for India
# Plot the same four time series for the US.
country_visualization(group_data,'US')
#for the US we see a slight change in the trend of the death and active curves