Hide code cell source
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from matplotlib import cm

from sklearn import datasets
from sklearn.datasets import make_blobs

from mpl_toolkits.mplot3d import Axes3D
import scipy.ndimage

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

pd.set_option("display.max_rows", None)

Motivación: visualizar datos multidimensionales#

Clasificación y clustering#

Datos bidimensionales con etiquetas

Hide code cell source
X, y = make_blobs(n_samples=100, centers=3, n_features=2,random_state=0)
#np.unique(y)
df1 = pd.DataFrame(X,columns=['x1','x2'])
df1['label'] = y
df1.head(10)
x1 x2 label
0 2.631858 0.689365 1
1 0.080804 4.690690 0
2 3.002519 0.742654 1
3 -0.637628 4.091047 0
4 -0.072283 2.883769 0
5 0.628358 4.460136 0
6 -2.674373 2.480062 2
7 -0.577483 3.005434 2
8 2.727562 1.305125 1
9 0.341948 3.941046 0
Hide code cell source
f,axes = plt.subplots(nrows=1,ncols=2,figsize=(10,5))
ax = axes.ravel()
ax[0].plot(X[:,0],X[:,1],'.')

colors = ["#4EACC5", "#FF9C34", "#4E9A06"]
for yy in np.unique(y):
    dataSelection = y ==yy
    data = X[dataSelection,:]
    ax[1].plot(data[:,0],data[:,1],'.',c=colors[yy],label=yy)
ax[1].legend()
for a in ax:
    a.set_xlabel(r'$X_1$')
    a.set_ylabel(r'$X_2$')
    
_images/784b0b2e81e53a471437f16870cc9ace173003b4290a64a3524f5202ae9a1a31.png

Datos tridimensionales con etiquetas

Hide code cell source
X, y = make_blobs(n_samples=200, centers=4, n_features=3,random_state=4)
#np.unique(y)
df1 = pd.DataFrame(X,columns=['x1','x2','x3'])
df1['label'] = y
df1.head()
x1 x2 x3 label
0 9.863844 0.448826 9.282223 0
1 10.596115 -11.280679 -5.449909 2
2 -2.083092 7.565793 -5.662472 3
3 5.578687 5.149093 -6.176931 1
4 4.101746 3.373981 -6.774126 1
Hide code cell source
colorsDict = {0:'orange',
              1:'red', 
              2:'green', 
              3:'blue'}

fig = plt.figure(figsize=(20,10))
ax = fig.add_subplot(121, projection='3d')
for ii in df1.index:
    ax.scatter(
        df1.loc[ii,'x1'],
        df1.loc[ii,'x2'],
        df1.loc[ii,'x3'],
        marker='.',
        s=100,
        color='blue'
        #color = colorsDict[
        #    df1.loc[ii,'label']
        #]
    )
ax = fig.add_subplot(122, projection='3d')
for ii in df1.index:
    ax.scatter(
        df1.loc[ii,'x1'],
        df1.loc[ii,'x2'],
        df1.loc[ii,'x3'],
        marker='.',
        s=100,
        color = colorsDict[
            df1.loc[ii,'label']
        ]
    )
plt.show()
_images/905898764a2732cf8f0429d6bbdd011036d76ffc4de0292df4beb4640aa964a9.png

Regresión#

Hide code cell source
x = np.linspace(-10,10,100)
y = 4*np.sin(x)*np.cos(2*x)+np.sin(3*x)/3+11/6*np.cos(5*x)
f,ax = plt.subplots(nrows=1,ncols=1,figsize=(7,7))
ax.plot(x,y,'-',color='red')
ydata = y+np.random.uniform(-0.5,0.5,y.shape[0])
ax.plot(x,ydata,'.',color='blue',label='datos')
ax.legend()
<matplotlib.legend.Legend at 0x13ee1fbed10>
_images/9db7e0dc9896df02f18b75c34a5edb253455b59da61bb9919bcdebcf88324d65.png
Hide code cell source
fig, ax = plt.subplots(figsize=(10,10),subplot_kw={"projection": "3d"})
# Make data.
X = np.arange(-5, 5, 0.25)
Y = np.arange(-5, 5, 0.25)
X, Y = np.meshgrid(X, Y)
#R = np.sqrt(X**2 + Y**2)
#Z = np.sin(R)
Z=np.sin(X/3)*np.cos(Y/2)+2
Z2 = np.sin(X/3)*np.cos(Y/2)+2+np.random.uniform(-0.5,0.5,(Z.shape[0],Z.shape[1]))

# Plot the surface.
surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
                       linewidth=0, antialiased=False)
ax.scatter(X,Y,Z2,s=2,label='datos')
ax.legend()
plt.show()
_images/ba25a7dd579d86538ca7f778368df81259c0ea98801618cbac69e72b4603f0f6.png

Más de tres dimensiones#

Cuando tenemos más dimensiones no podemos visualizar los datos.

Datos iris

Usamos la scatter_matrix

Hide code cell source
# iris
iris = datasets.load_iris()
iris_data = iris.data
iris_data.shape
irisDf = pd.DataFrame(iris_data,columns=iris.feature_names)
irisDf['target'] = iris.target
irisDf.head()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
df = pd.DataFrame(iris.data, columns=iris.feature_names)
colors=np.array(50*['r']+50*['g']+50*['b'])
pd.plotting.scatter_matrix(df, 
                           alpha=0.6, 
                           figsize=(10,10), 
                           #color=colors,
                           hist_kwds={'bins':30})
plt.show()
_images/cf19dd62bd56ae6b3be43c0505f538a8cbd4297abadcd289700b5d14865f6f56.png
Hide code cell source
df = pd.DataFrame(iris.data, columns=iris.feature_names)
colors=np.array(50*['r']+50*['g']+50*['b'])
pd.plotting.scatter_matrix(df, 
                           alpha=0.6, 
                           figsize=(10,10), 
                           color=colors,
                           hist_kwds={'bins':30})
plt.show()
_images/146ae00df067bd878f62d4c28b4d2883bdc4360cb081e7c18dad9e17c631e152.png
Hide code cell source
import seaborn as sns
sns.set_theme(style="ticks")

df = sns.load_dataset("penguins")
df.head(10)
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female
5 Adelie Torgersen 39.3 20.6 190.0 3650.0 Male
6 Adelie Torgersen 38.9 17.8 181.0 3625.0 Female
7 Adelie Torgersen 39.2 19.6 195.0 4675.0 Male
8 Adelie Torgersen 34.1 18.1 193.0 3475.0 NaN
9 Adelie Torgersen 42.0 20.2 190.0 4250.0 NaN

Método pairplot del paquete seaborn

https://seaborn.pydata.org/generated/seaborn.pairplot.html

sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x13ee40c4410>
_images/815b0960690551cdf828014e0930f997531624a4a13016b5b7dadd9b1d5b492d.png
sns.pairplot(df, hue="species")
<seaborn.axisgrid.PairGrid at 0x13ee2244410>
_images/df1f57bf8255b5b6e414fa78f3ef5f0cb64e7b3b93a42002f9696b8424dc31ea.png

Puede haber datos sin etiquetas

Hide code cell source
import numpy as np
import pandas as pd

np.random.seed(134)                     
N = 1000                              
 
x1 = np.random.normal(0, 1, N)                        
x2 = x1 + np.random.normal(0, 3, N)              
x3 = 2 * x1 - x2 +  np.random.normal(0, 2, N)
x4 = x3 * x1 -4*x2 + np.random.normal(0, 1, N)
x5 = x2-x4*x1 + np.random.normal(0, 2, N)

df = pd.DataFrame({'x1':x1,
                   'x2':x2,
                   'x3':x3,
                   'x4':x4,
                   'x5':x5
                  })

df.head()
x1 x2 x3 x4 x5
0 -0.224315 -8.840152 10.145993 33.286302 -1.376902
1 1.337257 2.383882 -1.854636 -11.590022 18.471552
2 0.882366 3.544989 -1.117054 -14.303068 14.009670
3 0.295153 -3.844863 3.634823 15.538617 -4.391063
4 0.780587 -0.465342 2.121288 2.874332 1.209348
Hide code cell source
import matplotlib.pyplot as plt
pd.plotting.scatter_matrix(df,figsize=(10,10))
plt.show()
_images/5fa8944bf1f94b216c3fc07fe7be1f178b54c4c84a6ea866d4bc234f14214376.png

Proyecciones#

Las gráficas anteriores son proyecciones ortogonales de los datos sobre los diferentes planos formados eligiendo coordenadas de los diferentes atributos, por parejas

Ejemplo ad hoc en tres dimensiones

Hide code cell source
clusters=[1,2,3,4,5,6]
Npuntos = 40*len(clusters)

dictClusters={1:(1,0,0),
              2:(0,1,0),
              3:(0,0,1),
             4:(-1,0,0),
             5:(0,-1,0),
             6:(0,0,-1)}

dictDesvests={1:0.15,
              2:0.15,
              3:0.15,
             4:0.15,
             5:0.15,
             6:0.15}


label = []
df = pd.DataFrame(columns = ['x1','x2','x3'])
for _ in range(Npuntos):
    cluN = np.random.choice(clusters)
    clu=dictClusters[cluN]
    clx,cly,clz = clu
    desvest = dictDesvests[cluN]
    clx += np.random.normal(0,desvest)
    cly += np.random.normal(0,desvest)
    clz += np.random.normal(0,desvest)
    label.append(cluN)
    df.loc[df.shape[0]] = [clx,cly,clz]
df['label'] = label
Hide code cell source
df.head()
x1 x2 x3 label
0 -1.017333 0.098265 0.038147 4
1 -1.266728 -0.147091 0.011565 4
2 -1.010595 -0.059136 -0.128969 4
3 0.018001 -1.076903 0.076536 5
4 -0.088302 -0.908741 -0.319811 5
Hide code cell source
colorsDict = {6:'orange',
              1:'red', 
              2:'green', 
              3:'blue',
             4:'black',
             5:'purple'}

#maxLabel = np.max(df.label.unique())

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d')


u = np.linspace(0, 2 * np.pi, 39)
v = np.linspace(0,np.pi, 21)

x = np.outer(np.cos(u), np.sin(v))
y = np.outer(np.sin(u), np.sin(v))
z = np.outer(np.ones(np.size(u)), np.cos(v))

## Use 3x the stride, no scipy zoom
#ax = fig.gca(projection='3d')
##ax.plot_surface(x, y, z, rstride=3, cstride=3, color='black', shade=0)
#ax.plot_wireframe(x, y, z,rstride=2,cstride=2)

#plt.show()

# Normalize to [0,1]
norm = plt.Normalize(z.min(), z.max())
colors = cm.viridis(norm(z))
rcount, ccount, _ = colors.shape

#fig = plt.figure()
ax.plot_wireframe(x, y, z,rstride=2,cstride=2)

#surf = ax.plot_surface(x, y, z, rcount=rcount, ccount=ccount,
#                       facecolors=colors, shade=False)



for ii in df.index:
    ax.scatter(
        df.loc[ii,'x1'],
        df.loc[ii,'x2'],
        df.loc[ii,'x3'],
        marker='.',
        s=100,
        color=colorsDict[df.loc[ii,'label']]
    )

ax.set_xlim((-1.5,1.5))
ax.set_ylim((-1.5,1.5))
ax.set_zlim((-1.5,1.5))

plt.show()
_images/7d2d43612a7422d1e1fed6852c4ef24063ac7ffe53ab3f59d536928d0da3da8d.png
Hide code cell source
sns.pairplot(df, hue="label")
<seaborn.axisgrid.PairGrid at 0x13ee5a7cfd0>
_images/2c1150dd9c4406a8310d33393ddbf766891eb2464c20c968e3e5d188a419d8eb.png
Hide code cell source
colorsDict = {6:'orange',
              1:'red', 
              2:'green', 
              3:'blue',
             4:'black',
             5:'purple'}

#maxLabel = np.max(df.label.unique())

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d')


u = np.linspace(0, 2 * np.pi, 39)
v = np.linspace(0,np.pi, 21)

x = np.outer(np.cos(u), np.sin(v))
y = np.outer(np.sin(u), np.sin(v))
z = np.outer(np.ones(np.size(u)), np.cos(v))

## Use 3x the stride, no scipy zoom
#ax = fig.gca(projection='3d')
##ax.plot_surface(x, y, z, rstride=3, cstride=3, color='black', shade=0)
#ax.plot_wireframe(x, y, z,rstride=2,cstride=2)

#plt.show()

# Normalize to [0,1]
norm = plt.Normalize(z.min(), z.max())
colors = cm.viridis(norm(z))
rcount, ccount, _ = colors.shape

#fig = plt.figure()
ax.plot_wireframe(x, y, z,rstride=2,cstride=2)

#surf = ax.plot_surface(x, y, z, rcount=rcount, ccount=ccount,
#                       facecolors=colors, shade=False)



for ii in df.index:
    ax.scatter(
        df.loc[ii,'x1'],
        df.loc[ii,'x2'],
        df.loc[ii,'x3'],
        marker='.',
        s=100,
        color=colorsDict[df.loc[ii,'label']]
    )
    
# proyeccion 1
for ii in df.index:
    ax.scatter(
        -3,
        df.loc[ii,'x2'],
        df.loc[ii,'x3'],
        marker='.',
        s=100,
        color=colorsDict[df.loc[ii,'label']]
    )

# proyeccion 2
for ii in df.index:
    ax.scatter(
        df.loc[ii,'x1'],
        3,
        df.loc[ii,'x3'],
        marker='.',
        s=100,
        color=colorsDict[df.loc[ii,'label']]
    )

# proyeccion 2
for ii in df.index:
    ax.scatter(
        df.loc[ii,'x1'],
        df.loc[ii,'x2'],
        -3,
        marker='.',
        s=100,
        color=colorsDict[df.loc[ii,'label']]
    )

plt.show()
_images/10bc4d79adb1c63e556c201eb345c1031378fc5e43fc955832acd8fac9ddb5f4.png

Ninguna proyección es completamente satisfactoria: problema con la distancia.

Nota: estas no son las únicas proyecciones posibles.Existen más proyecciones posibles

Ejemplo ad hoc en el plano

Hide code cell source
c1 = np.array((1,1))
c2 = np.array((2,1))
c3 = np.array((1,2))
c4 = np.array((2,2))
centers = [c1,c2,c3,c4]
Npuntos = 20
df = pd.DataFrame(columns=['x1','x2'])
labels=[]
for lb,c in enumerate(centers):
    for _ in range(Npuntos):
        df.loc[df.shape[0]] = c+np.random.uniform(-0.25,0.25,2)
        labels.append(lb)
df['label'] = labels
df.head()
x1 x2 label
0 1.056981 0.935569 0
1 0.781720 0.935267 0
2 0.810779 0.867002 0
3 0.906029 1.030259 0
4 1.103518 1.027526 0
Hide code cell source
sns.pairplot(df, hue="label")
<seaborn.axisgrid.PairGrid at 0x13ee1c661d0>
_images/d12a4662f4a660f6dea8f0c585a55b0c5d745ad9be8f22c119c3aeced9ed2861.png
Hide code cell source
# proyecciones
p1  = []
vR1 = np.array((1,1))
iR1 = np.inner(vR1,vR1)

p2 = []
vR2 = np.array((1,-1))
iR2 = np.inner(vR2,vR2)

for ii in df.index:
    v = np.array((df.loc[ii,'x1'],df.loc[ii,'x2']))
    p1.append(np.inner(v,vR1)/iR1)
    p2.append(np.inner(v,vR2)/iR2)

df['p1']  = p1
df['p2']  = p2
df.head()
Hide code cell output
x1 x2 label p1 p2
0 1.056981 0.935569 0 0.996275 0.060706
1 0.781720 0.935267 0 0.858494 -0.076773
2 0.810779 0.867002 0 0.838890 -0.028112
3 0.906029 1.030259 0 0.968144 -0.062115
4 1.103518 1.027526 0 1.065522 0.037996
Hide code cell source
colorsDict = {0:'orange',
              1:'red', 
              2:'green', 
              3:'blue'}
colores = [colorsDict[df.label.loc[i]] for i in df.index]
f,ax=plt.subplots(nrows=1,ncols=2,figsize=(20,10))
ax[0].scatter(df.x1,df.x2,c=colores,alpha=0.5)
ax[0].scatter(np.zeros(df.shape[0]),df.x2,marker='<',c=colores,alpha=0.5)
ax[0].scatter(df.x1,np.zeros(df.shape[0]),marker='v',c=colores,alpha=0.5)
ax[0].set_xlim((-0.5,3))
ax[0].set_ylim((-0.5,3))
ax[0].grid()

ax[1].scatter(df.x1,df.x2,c=colores,alpha=0.5)
ax[1].plot(np.linspace(-0,3,100),np.linspace(-0,3,100))
ax[1].plot(np.linspace(-3,3,100),-np.linspace(-3,3,100))
ax[1].scatter(df.p1,df.p1,marker = '+',c=colores,alpha=0.5)
ax[1].scatter(df.p2,-df.p2,marker = '+',c=colores,alpha=0.5)
ax[1].set_xlim((-1,3))
ax[1].set_ylim((-1,3))
ax[1].grid()
_images/7ba533299450dd120a7b2d7d477e5c9add1cbabf0788f5cd76b0c19c1e212c76.png

Nota: existen direcciones para las cuales las proyecciones son más adecuadas

Mapa#

fishy

Fig. 26 Mapa de Madrid#

Un mapa es una proyección cartográfica en la que puntos geográficos cercanos están cerca en el mapa.