# Loading tidyverse package
library(tidyverse)
# Loading dataset: read the tips CSV from the seaborn-data repository
tips <- read_csv(
  "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv"
)
4 Data Visualization
Learning Objectives of the Chapter
At the End of the Chapter, Students should be Able to -
Learn about the Meaning of Visualization
Understand the Importance of Visualization
Learn about Different R and Python Packages for Visualization
Gain Knowledge about Different Types of Visualization
Gain Knowledge of Interactive Visualization
4.1 What is Visualization?
To learn more about different kinds of visualization in R, you should visit - https://r-graph-gallery.com/ and https://www.kaggle.com/code/ruchiraperera/seaborn-vs-plotly-express.
4.2 Importance of Visualization
4.3 Visualization Packages in R and Python
ggplot2 is a powerful package for visualization in R. In addition, some other packages enhance the functionalities of ggplot2. These packages include - gganimate, ggthemes, ggpubr, ggridges, ggmap, ggrepel, ggExtra, ggpattern, ggcorrplot and so on.
In Python, matplotlib and seaborn are two of the powerful packages for visualization. Additionally, plotly, plotnine, altair, and bokeh are some other Python packages that enhance visualization in Python.
# Loading Necessary Python Packages
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# ggplot style: apply matplotlib's "ggplot" style sheet to subsequent figures
plt.style.use('ggplot')
# Loading dataset
tips = sns.load_dataset('tips')
4.4 ggplot - Grammar of Graphics
In ggplot, a plot consists of at least four elements -
Data - the data frame
Aesthetic Mappings - aesthetic mappings map variable from the data frame to different kinds of aesthetics such as x coordinate, y coordinate, color, shape, size and so on.
Coordinate System - the positioning of points
Geom - geoms are geometric objects such as points or lines.
You can also use the ggplot cheatsheet to learn more about ggplot. Another good source to learn more about visualization in R is The R Graph Library. Similarly, for Python, you can use The Python Graph Library.
4.5 Types of Visualization
4.5.1 Bar Diagram (Bar Plot)
4.5.1.1 One Categorical Variable
# Bar plot of the number of observations per sex.
# count() produces the column `n`, which is drawn as-is (stat = "identity").
tips |>
  count(sex) |>
  ggplot(mapping = aes(x = sex, y = n)) +
  geom_bar(stat = "identity", width = 0.5, fill = "orangered3") +
  labs(x = "Sex", y = "Total Observations")
Either of the following code snippets will also produce the same visualization.
# Alternative 1: geom_bar() counts the rows per sex itself
tips |>
  ggplot(mapping = aes(x = sex)) +
  geom_bar(width = 0.5, fill = "maroon") +
  labs(x = "Sex", y = "Total Observations")

# Alternative 2: the same plot written with stat_count()
tips |>
  ggplot(mapping = aes(x = sex)) +
  stat_count(width = 0.5, fill = "maroon") +
  labs(x = "Sex", y = "Total Observations")
# Count plot of observations per sex
sns.countplot(data = tips, x = "sex", width=0.5)
plt.xlabel('Sex')
plt.ylabel('Total Observations')
4.5.1.2 One Categorical Variable and One Continuous Variable
Barplot can also be used for two variables - both discrete (categorical) variables or one discrete (categorical) and one continuous variable. Below is bar plot for one discrete (categorical) and one continuous variable.
# Bar plot of the mean total bill per sex, with the rounded mean printed
# just above each bar (vjust = -0.2 lifts the label off the bar top).
tips |>
  group_by(sex) |>
  summarize(total_bill = mean(total_bill)) |>
  ggplot(aes(x = sex, y = total_bill)) +
  geom_col(width = 0.6, fill = "pink") +
  labs(x = "Sex", y = "Total Bill") +
  geom_text(aes(label = round(total_bill, 2)), vjust = -0.2)
The following code will produce the same results.
# Same bars without pre-aggregating: stat = "summary" with fun = "mean"
# computes the mean total bill per sex inside the layer.
tips |>
  ggplot(mapping = aes(x = sex, y = total_bill)) +
  geom_bar(
    stat = "summary", fun = "mean", position = "dodge",
    width = 0.60, fill = "pink"
  ) +
  labs(x = "Sex", y = "Total Bill")
# Bar plot of total_bill per sex (seaborn aggregates with the mean by
# default); error bars are switched off.
sns.barplot(data = tips, x = "sex", y = "total_bill",
            width = 0.5,
            errorbar = None)
plt.xlabel('Sex')
plt.ylabel('Total Bill')
The following code will add text value on the bars in barplot.
# Same bar plot, keeping the Axes so each bar can be annotated with its value
ax = sns.barplot(data = tips, x = "sex", y = "total_bill",
                 width = 0.5,
                 errorbar = None)

# Label every bar container; two decimals to match the rounded R labels
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f")
plt.xlabel('Sex')
plt.ylabel('Total Bill')
4.5.1.3 Two Categorical Variables
Below is a bar plot for both discrete (categorical) variables.
# Dodged (side-by-side) bar plot of observation counts per sex and day
tips |>
  count(sex, day) |>
  ggplot(mapping = aes(x = sex, y = n, fill = day)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "Sex", y = "Total Observations")
The following code will also produce the same visualization.
# Same dodged bar plot, letting geom_bar() count the rows (stat = "count")
tips |>
  # count(sex, day) |>
  ggplot(mapping = aes(x = sex, fill = day)) +
  geom_bar(stat = "count", position = "dodge") +
  labs(x = "Sex", y = "Total Observations", fill = "Day")
# Stacked bar plot of observation counts per sex, split by day
tips |>
  count(sex, day) |>
  ggplot(mapping = aes(x = sex, y = n, fill = day)) +
  geom_bar(stat = "identity", position = "stack") + # position = "fill"
  labs(x = "Sex", y = "Total Observations")
The following code will also produce the same visualization.
# Same stacked bar plot, letting geom_bar() count the rows (stat = "count")
tips |>
  # count(sex, day) |>
  ggplot(mapping = aes(x = sex, fill = day)) +
  geom_bar(stat = "count", position = "stack") + # position = "fill"
  labs(x = "Sex", y = "Total Observations", fill = "Day")
# Grouped count plot: observations per sex, one bar per day
sns.countplot(data = tips, x = "sex", hue = "day")
plt.xlabel('Sex')
plt.ylabel('Total Observations')
A stacked bar chart cannot be created directly using seaborn's countplot. So, we use an alternative -
# Stacked bar chart via pandas: count (sex, day) pairs, pivot days into
# columns, then plot the wide table as stacked bars.
# reset_index(name="count") names the count column explicitly instead of
# relying on the default column name.
tips[['sex', 'day']].value_counts().reset_index(name = "count") \
    .pivot(index = "sex", columns = "day", values = 'count') \
    .plot(kind = "bar", stacked = True)
plt.xticks(rotation = 360)
# Console echo of plt.xticks():
# (array([0, 1]), [Text(0, 0, 'Male'), Text(1, 0, 'Female')])
plt.xlabel("Sex")
plt.ylabel("Total Observations")
plt.legend(loc = "upper right")
4.5.2 Histogram
4.5.2.1 One Continuous Variable
# Histogram of total_bill with a fixed bin width of 2.25
tips |>
  ggplot(aes(x = total_bill)) +
  geom_histogram(binwidth = 2.25, fill = "orangered3") +
  labs(x = "Total Bill", y = "Count")
The following code will generate the same results with a little modification -
# Same histogram, with a white outline (col = "white") separating the bars
tips |>
  ggplot(aes(x = total_bill)) +
  geom_histogram(binwidth = 2.25, fill = "orangered3", col = "white") +
  labs(x = "Total Bill", y = "Count")
# Histogram of total_bill with a fixed bin width of 2.25
sns.histplot(data = tips, x = "total_bill", binwidth=2.25)
plt.xlabel("Total Bill")
plt.ylabel("Count")
4.5.2.2 One Continuous and One Categorical Variable
# Histogram of total_bill with the bar fill mapped to sex
tips |>
  ggplot(aes(x = total_bill, fill = sex)) +
  geom_histogram(binwidth = 2.25) +
  labs(x = "Total Bill")
The following code maps sex to the bar outline color instead of the fill -
# Histogram of total_bill with the `color` aesthetic mapped to sex
tips |>
  ggplot(aes(x = total_bill, color = sex)) +
  geom_histogram(binwidth = 2.25)
# Histogram of total_bill with bars colored by sex
sns.histplot(data = tips, x = "total_bill", hue = "sex", binwidth=2.25)
plt.xlabel("Total Bill")
plt.ylabel("Count")

# One histogram panel per sex, laid out as columns of a FacetGrid
sns.FacetGrid(data=tips, col="sex") \
    .map(sns.histplot, "total_bill", binwidth = 2.25)
4.5.3 Density Plot
4.5.3.1 One Continuous Variable
# Density curve of total_bill.
# `linewidth` is used for the line thickness; `size` is deprecated for
# lines in ggplot2 >= 3.4.0.
tips |>
  ggplot(aes(x = total_bill)) +
  geom_density(
    linewidth = 1, color = "orangered3"
    # adjust = 0.2
  ) +
  labs(x = "Total Bill", y = "Density")
# Kernel density estimate of total_bill
sns.kdeplot(data = tips, x = "total_bill"
            #,bw_adjust = 0.20
            )
plt.xlabel("Total Bill")
4.5.3.2 Two Continuous Variables
# Overlaid density curves of total_bill and tip.
# The two columns are selected by name (they are the first two columns of
# the tips CSV) and reshaped to long form so the variable name can be
# mapped to the line color.
tips |>
  select(total_bill, tip) |>
  pivot_longer(
    cols = everything(), names_to = "types", values_to = "values"
  ) |>
  ggplot(aes(x = values, col = types)) +
  geom_density(linewidth = 1)
# Overlaid density curves, one per column of the wide-form data
sns.kdeplot(data = tips[['total_bill', 'tip']])
plt.xlabel("Total Bill")
4.5.3.3 One Continuous Variable and One Categorical Variable
# Filled density curves of total_bill, one per sex
tips |>
  ggplot(aes(x = total_bill, fill = sex)) +
  geom_density(
    # adjust = 0.2
  ) +
  labs(x = "Total Bill", y = "Density")

# Outline-only density curves of total_bill, colored by sex
tips |>
  ggplot(aes(x = total_bill, color = sex)) +
  geom_density(
    linewidth = 1
    # adjust = 0.2
  ) +
  labs(x = "Total Bill", y = "Density")
# Density curves of total_bill, one per sex
sns.kdeplot(data = tips, x = "total_bill", hue = "sex")
plt.xlabel("Total Bill")

# Same densities stacked on top of each other
sns.kdeplot(data = tips, x = "total_bill", hue = "sex", multiple = "stack")
plt.xlabel("Total Bill")
4.5.4 Point Plot
4.5.4.1 One Categorical and One Continuous Variable
# Point plot of the mean total bill per sex, with a connecting line.
# group = 1 puts both sexes in one group so the line can join them.
# `fun` replaces the deprecated `fun.y`. The line layer originally passed
# `size` twice (1.5 and 2.1); a single width of 2.1 is kept.
tips |>
  ggplot(aes(x = sex, y = total_bill, group = 1)) +
  stat_summary(geom = "point", fun = mean, size = 2, col = "red") +
  stat_summary(geom = "line", fun = mean, linewidth = 2.1, col = "red") +
  labs(x = "Sex", y = "Total Bill")
The following code will also produce the same visualization.
# Same point plot from pre-computed means per sex
tips |>
  group_by(sex) |>
  summarize(total_bill = mean(total_bill)) |>
  ggplot(aes(x = sex, y = total_bill, group = 1)) +
  geom_point(col = "red", size = 2) +
  geom_line(col = "red", linewidth = 2.1) +
  labs(x = "Sex", y = "Total Bill")
# Point plot of total_bill per sex, without error bars
sns.pointplot(data = tips, x = "sex", y = "total_bill", errorbar=None)
plt.xlabel('Sex')
plt.ylabel("Total Bill")
4.5.4.2 Two Categorical Variables and One Continuous Variable
# Point plot of the mean total bill per sex, one line per smoker status.
# `fun` replaces the deprecated `fun.y`.
tips |>
  ggplot(aes(x = sex, y = total_bill, group = smoker, color = smoker)) +
  stat_summary(geom = "point", fun = mean) +
  stat_summary(geom = "line", fun = mean, linewidth = 1.1) +
  labs(
    x = "Sex", y = "Total Bill" # , color = "Smoker"
  )
The following code will also produce the same visualization.
# Same plot from pre-computed means per sex and smoker status.
# .groups = "drop" returns an ungrouped result.
tips |>
  group_by(sex, smoker) |>
  summarize(total_bill = mean(total_bill), .groups = "drop") |>
  ggplot(aes(x = sex, y = total_bill, group = smoker, color = smoker)) +
  geom_point() +
  geom_line(linewidth = 1.1) +
  labs(x = "Sex", y = "Total Bill")
# Point plot of total_bill per sex, one line per smoker status
sns.pointplot(data = tips, x = "sex", y = "total_bill",
              hue = "smoker", errorbar = None)
plt.xlabel("Sex")
plt.ylabel("Total Bill")
4.5.5 Box Plot
# Box plot of total_bill per sex
tips |>
  ggplot(aes(x = sex, y = total_bill)) +
  geom_boxplot(fill = "pink") +
  labs(x = "Sex", y = "Total Bill")

# Same box plot, one panel per smoker status
tips |>
  ggplot(aes(x = sex, y = total_bill)) +
  geom_boxplot(fill = "pink") +
  labs(x = "Sex", y = "Total Bill") +
  facet_wrap(~smoker)

# Same box plot on a grid: rows = time, columns = smoker
tips |>
  ggplot(aes(x = sex, y = total_bill)) +
  geom_boxplot(fill = "pink") +
  labs(x = "Sex", y = "Total Bill") +
  facet_grid(time ~ smoker)
# Box plot of total_bill per sex
sns.boxplot(data = tips, x = "sex", y = "total_bill", color = "pink")
plt.xlabel("Sex")
plt.ylabel("Total Bill")

# Same box plot, one row of panels per smoker status
sns.catplot(data = tips, x = "sex", y = "total_bill",
            color = "pink", kind = "box", row = "smoker"
            )

# Same box plot on a grid: rows = smoker, columns = time
sns.catplot(data = tips, x = "sex", y = "total_bill",
            color = "pink", kind = "box", row = "smoker"
            ,col = "time"
            )
4.5.6 Scatter Plot
# Scatter plot of tip against total_bill
tips |>
  ggplot(aes(x = total_bill, y = tip)) +
  geom_point(col = "blue") +
  labs(x = "Total Bill", y = "Tip")
# Scatter plot of tip against total_bill
sns.scatterplot(data = tips, x = "total_bill", y = "tip")
plt.xlabel("Total Bill")
plt.ylabel("Tip")
4.5.7 Regression Plot
# Scatter plot of tip against total_bill with a fitted linear-model line
tips |>
  ggplot(aes(x = total_bill, y = tip)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm", col = "orange") +
  labs(x = "Total Bill", y = "Tip")

# One linear-model line per sex; the points stay blue because the constant
# col = "blue" in geom_point() overrides the mapped color for that layer.
tips |>
  ggplot(aes(x = total_bill, y = tip, col = sex)) +
  geom_point(col = "blue") +
  geom_smooth(method = "lm") +
  labs(x = "Total Bill", y = "Tip")
# Figure-level regression plot of tip against total_bill
sns.lmplot(data = tips, x = "total_bill", y = "tip")
plt.xlabel("Total Bill")
plt.ylabel("Tip")

# Axes-level equivalent
sns.regplot(data = tips, x = "total_bill", y = "tip")
plt.xlabel("Total Bill")
plt.ylabel("Tip")

# One regression line per sex
sns.lmplot(data = tips, x = "total_bill", y = "tip", hue = "sex")
plt.xlabel("Total Bill")
plt.ylabel("Tip")
4.6 Exercises # 01
1. Download student data from the url and create a pointplot (lineplot) of students' average math score (`math.grade`) by gender (`gender`). Please note that the variable `gender` includes a label called `other` in addition to `M` and `F`; you should filter out observations with the label `other` before you create the visualization.
2. From the dataset above (question 1), compare, using a pointplot (lineplot), the average math (`math.grade`) and science score (`sciences.grade`) of different students based on gender (`gender`). You might need to use the `pivot_longer` function to reshape the data frame before visualizing the relation.
4.7 Interactive Visualization
Interactive visualization involves graphical presentation of data that permits users to engage with the visual elements directly. Unlike static visualization, interactive visualization allows users to manipulate data, explore different aspects, and customize the visualization in real time. The primary objective of interactive visualization is to make data exploration more intuitive and dynamic. The benefits of interactive visualization include - enhanced engagement, deeper insights, customization, and exploration and discovery.
library(plotly)
# Interactive bar plot: build a static ggplot, then convert it with ggplotly()
p <- ggplot(data = tips, aes(x = sex)) +
  geom_bar(width = 0.5, fill = "orangered3") +
  labs(x = "Gender", y = "Total Observations")
ggplotly(p)

# Interactive point plot of the mean total bill per time, one line per
# smoker status. `fun` replaces the deprecated `fun.y`.
p2 <- tips |>
  ggplot(aes(x = time, y = total_bill, group = smoker, color = smoker)) +
  stat_summary(geom = "point", fun = mean) +
  stat_summary(geom = "line", fun = mean, linewidth = 1.1) +
  labs(x = "Time", y = "Total Bill")
ggplotly(p2)
import plotly.express as px
# Interactive count of observations per sex
fig = px.histogram(tips, x = "sex") \
    .update_traces(marker_color = "orangered") \
    .update_xaxes(title = "Sex") \
    .update_yaxes(title = "Count")
fig.show()

# Interactive bar chart of the average total bill per sex (histfunc='avg')
px.histogram(tips, x = "sex", y = "total_bill", histfunc='avg') \
    .update_traces(marker_color = "orangered") \
    .update_xaxes(title = "Sex") \
    .update_yaxes(title = "Average Total Bill") \
    .show()

# Interactive density-normalized histogram of total_bill
px.histogram(tips, x = "total_bill", histnorm='probability density',
             width=600, height=400) \
    .update_xaxes(title = "Total Bill") \
    .update_yaxes(title = "Density")
#import plotnine as p9
from plotnine import *
import plotly.tools as tls
# Mean total bill per sex; observed=True limits the result to category
# levels that actually occur in the data.
df = tips.groupby(["sex"], observed=True)["total_bill"].agg('mean').reset_index()

# plotnine point plot of the means, joined by a line (group = 1)
(
    ggplot(df, aes(x = "sex", y = "total_bill", group = 1)) +
    geom_point(color = "blue") +
    geom_line(color = "orange", size = 1.1) +
    labs(x = "Sex", y = "Average Total Bill")
)
<Figure Size: (640 x 480)>
# Build the plotnine figure, draw it, and convert the result to an
# interactive plotly figure with mpl_to_plotly().
plotly_fig = (
    ggplot(df, aes(x = "sex", y = "total_bill", group = 1)) +
    geom_point(color = "blue") +
    geom_line(color = "orange", size = 1.1)
)
tls.mpl_to_plotly(plotly_fig.draw()).show()
# Mean total bill per sex and smoker status, rounded to two decimals;
# observed=True limits the result to category combinations that actually
# occur in the data.
df2 = tips.groupby(["sex", "smoker"], observed=True)["total_bill"] \
    .agg('mean') \
    .round(2) \
    .reset_index()

# plotnine point plot of the means, one line per smoker status
(
    ggplot(df2, aes(x = "sex", y = "total_bill", group = "smoker", color = "smoker")) +
    geom_point() +
    geom_line(size = 1.1) +
    labs(x = "Sex", y = "Average Total Bill")
)
<Figure Size: (640 x 480)>