Seaborn#

Seaborn is:

  • built on top of matplotlib

  • integrated with pandas data structures

Basic structure#

sns.<chart_type>(data=<dataset>, x=<column for the x axis>, y=<column for the y axis>, hue=<grouping variable>)

import matplotlib.pyplot as plt
import seaborn as sns

sns.barplot(x=["A", "B", "C"], y=[1,3,2])
plt.show()
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 1
----> 1 import matplotlib.pyplot as plt
      2 import seaborn as sns
      4 sns.barplot(x=["A", "B", "C"], y=[1,3,2])

ModuleNotFoundError: No module named 'matplotlib'

Chart types in seaborn#

Picture title

import seaborn as sns
import matplotlib.pyplot as plt

#loading our data
#this data represents tips in a restaurant vs other variables
tips = sns.load_dataset('tips')
print(tips.head(5))
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4

histogram (.displot())#

sns.displot(data=tips, x="total_bill")
plt.show()
../_images/604efe067808a4afd3109cec93169fee85e442b44e00ec1bf94473ca121aed74.png

scatter plot (.scatterplot())#

sns.scatterplot(data=tips, x="total_bill", y="tip")
plt.show()
../_images/7d931ee63f10531b6febe7fb272ceb3b901b40f5b3de46e935d58ab28dfa6a38.png

lm plot (linear model)#

This plot shows the relationship between two variables by drawing a fitted linear regression model on top of the scatter chart.

sns.lmplot(data=tips, x="total_bill", y="tip")
plt.show()
../_images/070018948b9a7d8a8f3d67cd3db9cfba629e8277034f9eb43b912ecbe7114bfa.png

Scatter plot with group by#

import seaborn as sns
import matplotlib.pyplot as plt

#import dataset
tipsdata = sns.load_dataset("tips")
tipsdata.head()

#show dataset
print(tipsdata.head())

#scatter plot of tip vs. total_bill, segmented by day
sns.scatterplot(data=tipsdata, x="total_bill", y="tip", hue="day", palette ="pastel")
plt.show()
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
../_images/d05dde599a1573b52b4471b318f58b04eba7469d9cf60c837625b38c9ff05042.png

heatmap#

# see the correlation among the variables
tips.corr()
total_bill tip size
total_bill 1.000000 0.675734 0.598315
tip 0.675734 1.000000 0.489299
size 0.598315 0.489299 1.000000
#heatmap of the correlations
sns.heatmap(tips.corr())
<AxesSubplot: >
../_images/e0f1be3706c8da95358cb75e317be4739afaf808670585c126c0a71e4f7e6de4.png

Several parameters can be added:

  • annot shows the correlation value in each cell

  • cmap sets the color map

  • linewidths sets the width of the lines separating the cells

  • linecolor sets the color of those lines

  • vmin, vmax set the minimum and maximum values of the color scale

  • cbar=False removes the color bar

#heatmap of the correlations
sns.heatmap(tips.corr(), annot= True, cmap='coolwarm', linewidths=5, linecolor='black',
vmin=0.5,vmax=1,cbar=False);
../_images/b058773a5c239a27db9c840ed4a2464765c31db447b88f23c289b1b2695fa5f6.png

Kernel Density Estimation (KDE)#

sns.kdeplot(data= tips, x= 'total_bill'); 

#In statistics, kernel density estimation (KDE) is the application of kernel smoothing for 
#probability density estimation, i.e., a non-parametric method to estimate the probability
#density function of a random variable based on kernels as weights.
../_images/2ff3203b680833b6b5f85f93ec685f0364de618bb249aa0d7d8c5d8f8f07d396.png

Change chart type (kind)#

Remember the image of the seaborn chart categories

Picture title

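You can only switch to a sub-category (the kind argument) that belongs to the same main category: for example, relplot accepts kind="scatter" or kind="line", while displot accepts kind="hist", kind="kde" or kind="ecdf".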

print(tips.head(5))
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
#Example

#lineplot is under relplot
sns.relplot(data=tips, x="total_bill", y="tip", kind="line")
plt.show()

#ecdf is under displot
sns.displot(data=tips, x="tip", kind="ecdf")
plt.show()
../_images/7071ca3b67053553f8e0a4da5b6126329a0a862733936cb5ddf5129ed95ae481.png ../_images/4b2ae8842cae99555134561ebb647ae296dc0fe93f0210a63e6b97ac9876d55b.png

Remove Legend, change palette & transparency#

#note that the hue argument would add a legend of sex, but legend=False removed it.
#we also changed the line transparency with alpha=0.5
sns.displot(data= tips, x= 'total_bill', hue = 'sex', kind = 'kde', legend= False, palette='dark', alpha = .5)
plt.show()
../_images/9d0835d7d97cc253ae694755523c36c56c37d65a67afe8943ba7579926eb4049.png

Group by (hue)#

The argument hue allows you to do a segmentation in the chart, just as group by in pandas

#let's use this data from the lesson above
print(tips.head(5))

sns.scatterplot(data=tips, x="total_bill", y="tip", hue="sex")
plt.show()
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
../_images/392aa92f053cfa25997e9d329fc3b7528f3776c9aa15a3e755cd7e5d714f1859.png

Multiple charts#

In this chapter you will see:

  • how to create multiple charts one over the other

  • how to create multiple charts one next to the other

import seaborn as sns
import matplotlib.pyplot as plt 

#work with the following data
tips = sns.load_dataset('tips')
tips.head(2)
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3

Combine charts (overlapping)#

You can combine charts literally writing one line of code below the other.

#first chart
sns.boxplot(data=tips,x="day",y="total_bill",hue="sex", dodge=True)

#second chart
sns.swarmplot(data=tips,x="day",y="total_bill",hue="sex", palette='dark:0', dodge=True)
#the dodge argument is for the swarm plot to segment by sex

plt.show()
../_images/972b02cc27ef2ca6a4bc01313d579c9165597c2ff1a76c331a94786cc5cf960e.png

One next to the other#

The argument col splits the data into separate charts placed side by side, one for each value of the given column.

sns.relplot(data= tips, x= 'total_bill', y = 'tip',  hue= 'day', kind= 'scatter', col = 'time');
../_images/569607af06ec14caec57cb07533b5dcb24cb8883359afd2edcaf02baa21f4c3a.png

Jointplot#

Jointplot joins two different kinds of chart: a main chart in the center and marginal charts on its top and right edges (neither overlapping nor simply one next to the other; see below for details).

import seaborn as sns
import matplotlib.pyplot as plt

#loading our data
tips = sns.load_dataset('tips')
tips.head()

#jointplot chart
sns.jointplot(data=tips, x="total_bill", y="tip", hue="sex", kind="scatter");
#with kind you edit the type of the main chart
../_images/3579e23356a7638e6414a1239fb666783c0024187d3042b7eb899b4df0f9dcd5.png

You can add more arguments to do a better analysis

marginal_ticks#

marginal_ticks=True shows the ticks of the count/density axis on the marginal (external) charts

sns.jointplot(data=tips, x="total_bill", y="tip", hue="sex", kind="kde", marginal_ticks=True);
../_images/dea849d77a53aa689cc8e5126e5e685369d4f0914b798092e9828357d238ff85.png

marginal_kws#

marginal_kws allows you to pass specific parameters to the marginal (external) charts only

sns.jointplot(data=tips, x="total_bill", y="tip", hue="sex", kind="hist",
marginal_ticks=True,     #shows a small table for the external chart
marginal_kws=dict(bins= 25, fill = True, multiple= 'dodge')  #arguments only affect the external chart
)  
<seaborn.axisgrid.JointGrid at 0x7f9de75bbd30>
../_images/a3ce6b0b590fea9c000148fa66b6d73d92863ed0b3289b6ec8447c441d934dd2.png

Modify style, palette & font (Set)#

Modify size#

plt.figure(figsize=(1,1))

sns.set()
sns.barplot(x=["A", "B", "C"], y=[1,3,2])
plt.show()
../_images/08631729945a08cf5de30c3c4d26cd05ac9b8534d1f63a20ed0591a9c34c0fc1.png

Set (modify style, palette & font)#

sns.set() allows you to modify the following settings simultaneously:

  • style

  • font

  • palette

  • font scale

sns.set(style="dark", palette="Spectral", font_scale=3)
sns.barplot(x=["A", "B", "C"], y=[1,3,2])
plt.show()
../_images/682801c9c6dae2068fc49cce577fef80db09bc44f07f01c4659e7dc6aca873ef.png

To restore the default style settings, call sns.set() with no arguments:

sns.set()
sns.barplot(x=["A", "B", "C"], y=[1,3,2])
plt.show()
../_images/4972450d95279ac1fde5fa82d7099a703ee27409199065a0105945de46aba8c1.png

Seaborn color palettes#

link:

https://seaborn.pydata.org/generated/seaborn.color_palette.html#seaborn.color_palette

some examples:

sns.color_palette("husl", 9)
sns.color_palette("Spectral", as_cmap=True)
Spectral
Spectral colormap
under
bad
over
sns.color_palette("dark:#5A9_r", as_cmap=True)
blend
blend colormap
under
bad
over
sns.color_palette("pastel")

Seaborn themes#

Link:

https://seaborn.pydata.org/generated/seaborn.set_theme.html#seaborn.set_theme

Save your chart as a png#

If you want to save a chart as an image so you can download it and use it elsewhere, call plt.savefig("name.png") (call it before plt.show()).

Chart customization#

import seaborn as sns
import matplotlib.pyplot as plt

tipsdata = sns.load_dataset("tips")
tipsdata.head()
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4

Cumulative charts#

sns.histplot(data=tipsdata, x="tip", bins = 15, cumulative=True)
plt.show()
../_images/d381c24827ae31287a7a033f8b7f70fe4a2f885e020b1adac347ced320413c50.png

Statistic charts#

The argument is stat, and the options are:

['count', 'density', 'percent', 'probability', 'frequency']

sns.histplot(data = tipsdata, x= 'tip', bins = 15, hue= 'sex', stat = 'density')
plt.show()

sns.histplot(data = tipsdata, x= 'tip', bins = 15, hue= 'sex', stat = 'frequency')
plt.show()
../_images/eefc96e256db608bfcf5c0e1fc74cbb9be0f8c74e529bb2d5b0ee88e02fa5f36.png ../_images/7c25878e66c00e8fa22eabed61109cc48d16dc997dab23ad8990d1c219cb7f38.png

Chart grouping#

The argument is multiple, and the options are:

['layer', 'stack', 'fill', 'dodge']

#first plot with stack
sns.histplot(data = tipsdata, x= 'tip', bins = 15, hue= 'sex', multiple = 'stack')
plt.show()

#second plot with dodge
sns.histplot(data = tipsdata, x= 'tip', bins = 15, hue= 'sex', multiple = 'dodge')
plt.show()
../_images/1e52822b3bee7b98d8081123b11ddb408ec00290a9335ad371d79fd00df08cad.png ../_images/f981635500f218c75cf222efcf28fa528c4826f9d105de049ca809cb99ee8d3e.png

Area below the curve#

#kde plot with the area below each curve filled (fill=True)
sns.kdeplot(data = tipsdata, x= 'tip', hue= 'sex', fill = True)
plt.show()
../_images/67bd301447c1294f37c3711fa3251d6ce238a8daeed15e197fb6e47f7347b70d.png

Chart types for categorical data#

import seaborn as sns
import matplotlib.pyplot as plt 

tips = sns.load_dataset('tips')
tips.head(2)

#the categorical variables are "sex", "smoker", "day", "time"
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3

Note that the categorical variables are “sex”, “smoker”, “day”, “time”

Catplot#

catplot is useful to work with categorical data

This chart lets you do a double group-by (double segmentation).

sns.catplot(data=tips, x="day", y="total_bill",hue="sex",dodge=True,kind="box",col="smoker")

plt.show()
../_images/8fcdd6b03f67946df29d5dbb96a2e3be1d7bbfab116931dd466022f1154e0a17.png

note that the first segmentation was in “hue” and the second in “col”

Bar plot (count)#

sns.countplot(data = tips, x="day", hue="sex");
../_images/c39add59a9d816bae4244223d1880651e36712a8396bb94ed501c43df0b38f5a.png

swarm plot(dots diagram)#

This chart is similar to stripplot; however, it arranges the points so they do not overlap, which shows the data concentration better.

sns.swarmplot(data = tips, x="day", y="total_bill", hue="sex");
../_images/266dbdba82bccbc75b0ad9b7fd9b5499c57a3a68a00c24676d051cb6e0684cec.png
sns.swarmplot(data = tips, x="day", y="total_bill", hue="sex", dodge=True);
#dodge fixes the issue of one category over the other
../_images/1dffbcf1493304549586ac9e70b82cee722642aa8aecc9a9b0ae7a46467f49ed.png

stripplot#

It looks similar to a swarm plot, but here the points are allowed to overlap, so the data looks agglomerated.

sns.stripplot(data = tips, x="day", y="total_bill", hue="sex", dodge=True);
../_images/2ea9cbf7a69b6b172213c00a1ced199f53119a29b1826fb6f30df413a9ae7a67.png

boxplot separated categories#

sns.boxplot(data=tips,x="day",y="total_bill",hue="sex", showfliers=True)     
#showfliers=True draws the outliers; set showfliers=False if you want to hide them

plt.show()
../_images/7e550d2053e23f487c2d47eb54b86bd31f8108e46914e892ecabcf25ff46997e.png

violin plot#

This plot is similar to a boxplot, but it also shows the data concentration as a kernel density estimate. By default a miniature boxplot with the quartiles is drawn inside each violin.

sns.violinplot(data=tips, x="day", y="total_bill")
plt.show()
../_images/5fd0a990d9db71c1c9f73ef3ee9555b33fe3d3eee4720c9adaf56a535db0df80.png

Boxplot + Swarmplot#

#first chart
sns.boxplot(data=tips,x="day",y="total_bill",hue="sex", dodge=True)

#second chart
sns.swarmplot(data=tips,x="day",y="total_bill",hue="sex", palette='dark:0', dodge=True, marker="<")
#the dodge argument is for the swarm plot to segment by sex

plt.show()
../_images/8b0608d7e731df4f6c0ef0196b92c51c0483a641fc0baa839b3ce512a8d83edb.png

Correlation charts#

The main chart for identifying correlations is the scatter chart, so this chapter focuses on it.

import seaborn as sns
import matplotlib.pyplot as plt 

#data to work on
tips = sns.load_dataset('tips')
tips.head(2)
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3

correlation by categories#

sns.scatterplot(data= tips, x= 'total_bill', y = 'tip', hue= 'day', palette="pastel");
../_images/1fdf0dda77341cf23ec3d63f89f74d9e1a3e952a5f9a8b0c9b479b79a06b2c01.png

lm plot with multiple categories#

sns.lmplot(data= tips, x= 'total_bill', y = 'tip', hue= 'day', palette="dark");
../_images/efd8ef31ba53e4a4d4efe12c2e49537228367d8921bc8991c4704d56b3af810f.png

This chart does not look organized because of the data. Let’s see a useful case for this kind of chart.

iris = sns.load_dataset("iris")

iris.head()

sns.lmplot(data=iris, x="sepal_length", y="petal_length", hue="species");
../_images/da26f19517b089970e708dc29dd4bc8a9d18a0ef73e06275fbed27c79589d54a.png

second segmentation in legend#

with the argument style you can change the dot shape based on another category

change dot shape#

this will also change the dots shape

sns.scatterplot(data= tips, x= 'total_bill', y = 'tip', hue= 'day', palette="pastel", style="time");
../_images/1bdf6a26a9f59baedb747228d9b5434ad249170b4efc2ae2a36818b01c8b97cf.png

change dot shape (choosing the shapes yourself)#

You just have to define a dictionary in which:

  • each key is a category value of the style variable (e.g. "Lunch")

  • each value is the marker shape: "D" for diamond, "s" for square, etc.

#define the shape dictionary
shapes = { "Lunch":"D", "Dinner":"s"}


sns.scatterplot(data= tips, x= 'total_bill', y = 'tip', hue= 'day', palette="pastel", style="time", 
markers=shapes);
../_images/013f83666eecf2e4b0abf5453e76fb3023117c51b8bf3d46d309889dd6191da9.png

change dot size based on numerical variable#

You can change the dot size based on a numerical variable. For example, here the dots are bigger when the “size” variable of this dataset is bigger.

sns.scatterplot(data= tips, x= 'total_bill', y = 'tip', hue= 'day', palette="pastel", size="size")

plt.show()
../_images/25eba70fea02ce0a630a9f9ae406739d05c2243c1c002e55039d348fc80e8209.png

Multiple correlation charts#

sns.relplot(data= tips, x= 'total_bill', y = 'tip', hue= 'day', palette="pastel", size="size", col="time", kind="scatter")

plt.show()
../_images/bad4a728ac774bed1ed718ce1fb130c2a4ef305f805cb35007737308f00e4dda.png

Move the legend (relocate)#

#make the chart bigger
plt.figure(figsize=(8,8))

sns.scatterplot(data= tips, x= 'total_bill', y = 'tip', hue= 'day', style="time", size="size")
plt.legend(loc="center", bbox_to_anchor=(1.2,0.5)) #bbox_to_anchor(xposition, yposition)

plt.show()
../_images/547e912000160f313c517b49fc00c475ba4935c067f2f88b60594996d399cc2f.png

Pairplot (correlation among all the variables)#

# see the correlation among the variables
tips.corr() #---> Muestra las variables correlacionadas entre si
total_bill tip size
total_bill 1.000000 0.675734 0.598315
tip 0.675734 1.000000 0.489299
size 0.598315 0.489299 1.000000

The pairplot function shows the pairwise relationships among all the numeric variables. For this particular dataset, the numeric ones are ["total_bill", "tip", "size"].

sns.pairplot(data=tips)
plt.show() 
../_images/c0242233a40036dc74f01612b9933330072b71cc4d2ad553a919930542b8a74b.png

Pairplot + diag_kind + hue#

You can know the correlation among all the variables given a segmentation, and also change the diagonal charts.

iris = sns.load_dataset("iris")

iris.head()

sns.pairplot(data=iris, hue="species", palette="inferno", diag_kind="kde");
../_images/56e70d333cd98e6b3d906c0a284d9bfe1461c801ef2dc1632b7de65ffad1f8f1.png

Pairplot corner#

The argument corner=True removes the charts above the diagonal (the upper triangle), avoiding repeated charts.

sns.pairplot(data= tips, corner=True);
../_images/cfd0631e6a3a0faf65a5fffdcbe18bac63bd5e6caa5f5572c579554eca86dd00.png

Line charts#

#loading our data

import seaborn as sns
import matplotlib.pyplot as plt 

tips = sns.load_dataset('tips')
tips.head(2)
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
sns.lineplot(data=tips, x="total_bill", y="tip", hue="time", size="size", style="day");
../_images/1c35bd2e63198ba010eba61818126377dea19d51a9a069aff5d1796ded0125f9.png

relplot#

You can do the same with relplot by choosing the chart type with the kind argument.

sns.relplot(data= tips, x= 'total_bill', y = 'tip',  hue= 'time', style= 'day', size='size', kind= 'line');
../_images/dd193c4aac9357559f2be6366a6686226af4542fe4306c1e561ae40851c344c1.png
Created in deepnote.com Created in Deepnote