Korelace dvou proměnných

body ze sady 2 proměnných definují eliptickou oblast

budeme se detailněji zabývat analýzou této elipsy

In [1]:
%matplotlib inline
from matplotlib.pyplot import plot,hist

import numpy as np
from scipy import stats

def measure(n,frac=0.5,norm=True):
    "Measurement model, return two coupled measurements."
    l1 = np.random.normal(size=n)
    l2 = np.random.normal(scale=0.5, size=n)
    rep=l1+frac*l2, frac*l1-l2
    if norm: 
        rep=[rep[0]/rep[0].std(),rep[1]/rep[1].std()]
    return rep
In [2]:
# renormalizovane mereni
m1, m2 = measure(2000,frac=0.7)
plot(m1,m2,'.')
m1.std(),m2.std()
Out[2]:
(1.0, 1.0)
In [3]:
# korelacni koeficient je zde roven dot(m1,m2)
r=0.3
dd=measure(1000,r)
#skut. korel. koeficient vs. hodnota se zadanymi E(x), V(x)
np.corrcoef(*dd)[0,1],np.array(dd).prod(0).sum()/999.
Out[3]:
(0.36967964094277628, 0.37000789031636233)
In [10]:
m1, m2 = measure(2000,frac=0.5)
# mereni vzdalenosti v elipticke metrice 
rep=hist(np.sqrt(m1**2+m2**2-m1*m2*0.5*2),30)
from matplotlib import pyplot as pl
pl.xlabel("vzdalenost od stredu")
Out[10]:
<matplotlib.text.Text at 0x7f36a9a80ac8>
In [9]:
plot(rep[1][1:],np.cumsum(rep[0])/2000,'d')
pl.grid()
pl.xlabel("kumul.dist.")
pl.title("kovariance")
Out[9]:
<matplotlib.text.Text at 0x7f36a9a08438>
In [26]:
np.corrcoef(m1,m2)[0][1]
Out[26]:
0.5093339744186216

průmět do jedné osy

In [11]:
rep1=hist(abs(m1),30)
pl.xlabel("vzdal. od pocatku ve smeru X")
Out[11]:
<matplotlib.text.Text at 0x7f36a99e8278>
In [14]:
# porovnani kupulativnich distribuci
plot(rep[1][1:],np.cumsum(rep1[0])/2000,'+')
plot(rep[1][1:],np.cumsum(rep[0])/2000,':')
grid()
pl.xlabel("kumul. vzdalenost")
pl.legend(["1-rozmerne","2-rozmerne"],loc=4)
Out[14]:
<matplotlib.legend.Legend at 0x7f36a9caaa20>

analýza z průměrování

rozdělení do tříd podle proměnné $m_1$ i $m_2$ vycházejí identicky (standardizované NP na průměr 0 a rozptyl 1)

In [16]:
#pouzivame balik NDImage pro rozdeleni dat do skupin
from scipy import ndimage as ni
ysum=ni.sum(m2,(m1*5.).astype(int),range(-12,13))
ymean=ni.mean(m2,(m1*5.).astype(int),range(-12,13))
ystd=ni.standard_deviation(m2,(m1*5.).astype(int),range(-12,13))/np.sqrt(ysum/ymean)
ysum2=ni.sum(m1,(m2*5.).astype(int),range(-12,13))
ymean2=ni.mean(m1,(m2*5.).astype(int),range(-12,13))
ystd2=ni.standard_deviation(m1,(m2*5.).astype(int),range(-12,13))/np.sqrt(ysum2/ymean2)
from matplotlib.pyplot import errorbar
errorbar(np.arange(-12,13)/5.,ymean,ystd)
errorbar(np.arange(-12,13)/5.,ymean2,ystd2)
pl.grid()
pl.legend(["m1 vs. m2","m2 vs. m1"],loc=2)
Out[16]:
<matplotlib.legend.Legend at 0x7f36a938ae80>

proložení přímkou dá tento sklon

In [9]:
np.polyfit(np.arange(-12,13)/5.,ymean2,1,w=1/ystd2)
Out[9]:
array([ 0.62158601,  0.0075244 ])

zkusíme totéž zautomatizovat pro různé míry korelace

In [18]:
# analyza z prumetu
def analelli(r,siz=2000,plot=False,nsig=2.2,div=5.):
    m1, m2 = measure(siz,frac=r)
    ipts=range(-int(nsig*div),int(nsig*div)+1)
    ysum=ni.mean(m2,(m1*div).astype(int),ipts)
    ymean=ni.mean(m2,(m1*div).astype(int),ipts)
    ystd=ni.standard_deviation(m2,(m1*div).astype(int),ipts)/np.sqrt(ysum/ymean)
    ysum2=ni.sum(m1,(m2*div).astype(int),ipts)
    ymean2=ni.mean(m1,(m2*div).astype(int),ipts)
    ystd2=ni.standard_deviation(m1,(m2*div).astype(int),ipts)/np.sqrt(ysum2/ymean2)
    pts=np.arange(-nsig*div,nsig*div+1)/5.
    if plot:
        from matplotlib.pyplot import errorbar
        errorbar(pts,ymean,ystd)
        errorbar(pts,ymean2,ystd2)
    f1=np.polyfit(pts,ymean,1,w=1/ystd)
    f2=np.polyfit(pts,ymean2,1,w=1/ystd2)
    return f1,f2
In [21]:
# simulace pro ruzne korelacni koef. 
ook=[analelli(r,5000) for r in np.r_[-0.5:0.9:0.1]]
rs=np.r_[-0.5:0.9:0.1]
plot(rs,np.array(ook)[:,0,0],'+',ms=13)
plot(rs,np.array(ook)[:,1,0],'r+',ms=13)
pl.grid()
pl.xlabel("korel. koeficient")
pl.ylabel(u"sklon průměrovaných dat")
pl.legend(["m1 vs m2","m2 vs m1"],loc=2)
pl.xlim(-.6,1)
Out[21]:
(-0.6, 1)

závislost není lineární = generovaná data nedosahují požadované míry korelace pro velké (absolutní) hodnoty $r$

opravíme výpočtem skutečných korelačních koeficientů

In [22]:
#vypocteme skutečné korelační koeficienty (průměr z 30 opakování)
cook=np.array([[np.array(measure(1000,r)).prod(0).sum()/999. for r in np.r_[-0.5:0.9:0.1]] for i in range(30)])
cavg=cook.mean(0)
plot(cavg,np.array(ook)[:,0,0],'+',ms=10)
plot(cavg,np.array(ook)[:,1,0],'r+',ms=10)
pl.grid()
pl.xlabel("korel. koef. skutecny")
pl.ylabel("sklon průměrovaných dat")
pl.legend(["m1 vs m2","m2 vs m1"],loc=2)
pl.xlim(-.6,.8)
Out[22]:
(-0.6, 0.8)
In [17]:
# nyní vychází sklon skoro přesně
np.polyfit(cavg,np.array(ook)[:,0,0],1)
Out[17]:
array([  1.04720039e+00,   2.49914468e-04])

fitování elipsy

kontury konstantní hustoty jsou elipsy

zpracování konturového grafu je v NumPy trochu náročnější

In [23]:
m1, m2 = measure(100000,0.7)
siz=14.5/4.
bins=np.r_[-siz:siz:30j]
out=np.histogram2d(m1,m2,[bins,bins])
from matplotlib.pyplot import imshow,contour
con1=contour(out[0],origin="lower",extent=[-siz,siz,-siz,siz])
pl.grid()
pl.xlabel("m1")
pl.ylabel("m2")
Out[23]:
<matplotlib.text.Text at 0x7f36a921e0f0>
In [25]:
col1=con1.collections[1].get_paths()[0]
ix,iy=np.array([i[0] for i in col1.iter_segments() if i[1]==2]).transpose()

modelujeme

$$r(\alpha)=r_m+d \cos 2(\alpha - \alpha_0)=r_m+d \cos 2\alpha \cos 2\alpha_0 - d \sin 2\alpha \sin 2\alpha_0 $$

následuje cvičení na aplikaci lineární regrese

In [28]:
#nafitujeme pomocí lineární regrese 
r=np.sqrt(ix**2+iy**2)
ang=np.arctan2(iy,ix)
plot(ang,r,"d")
mat=np.array([np.ones(r.shape),np.cos(2*ang),np.sin(2*ang)])
mcov=np.linalg.inv(mat.dot(mat.T))
coef=mcov.dot(mat.dot(r))
plot(ang,coef.dot(mat))
pl.xlabel("uhel [rad]")
pl.ylabel("vzdal. od středu")
pl.legend(["data","fit"])
coef
Out[28]:
array([ 1.3194205 ,  0.00828236,  0.42996171])
In [137]:
#plot(ix,iy)
rn=coef.dot(mat)
ixn=rn*np.cos(ang)
iyn=rn*np.sin(ang)
plot(ixn,iyn)
Out[137]:
[<matplotlib.lines.Line2D at 0x7f59126100b8>]
In [31]:
# schovame vse do procedury
def contal(con1,itr=2,choice=2,doplot=False):
    col1=con1.collections[itr].get_paths()[0]
    ix2,iy2=np.array([i[0] for i in col1.iter_segments() if i[1]==choice]).transpose()
    #plot(ix2,iy2)
    r=np.sqrt(ix2**2+iy2**2)
    ang=np.arctan2(iy2,ix2)
    if doplot:plot(ang,r,"d")
    mat=np.array([np.ones(r.shape),np.cos(2*ang),np.sin(2*ang)])
    mcov=np.linalg.inv(mat.dot(mat.T))
    coef=mcov.dot(mat.dot(r))
    if doplot: 
        plot(ang,coef.dot(mat))
        pl.grid()
    return coef

contal(con1,2,doplot=True)
Out[31]:
array([ 1.04783741, -0.00703061,  0.33424642])

histogram vs. KDE

In [32]:
m1, m2 = measure(100000,0.3)
siz=14.5/4.
bins=np.r_[-siz:siz:30j]
out=np.histogram2d(m1,m2,[bins,bins])
con2=contour(out[0],origin="lower",extent=[-siz,siz,-siz,siz])
aper=np.array([contal(con2,i) for i in range(1,5)])
print(np.arctan2(aper[:,2],aper[:,1])/np.pi*90)
pl.grid()
pl.title("2D histogram")
aper
[ 45.54871494  45.13988403  45.05666793  46.52707832]
Out[32]:
array([[  1.52864354e+00,  -5.74313118e-03,   2.99806799e-01],
       [  1.24817471e+00,  -1.24021493e-03,   2.53990805e-01],
       [  1.03424245e+00,  -3.68353949e-04,   1.86217311e-01],
       [  8.02243858e-01,  -9.13330866e-03,   1.71177961e-01]])
In [153]:
X, Y = np.mgrid[-siz:siz:100j, -siz:siz:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([m1, m2])
kernel = stats.gaussian_kde([m1,m2])
kde = np.reshape(kernel(positions).T, X.shape)
con3=contour(kde,origin="lower",extent=[-siz,siz,-siz,siz])
aper3=np.array([contal(con3,i) for i in range(1,5)])
print(np.arctan2(aper3[:,2],aper3[:,1])/np.pi*90)
aper3
[ 46.37173095  45.11779919  44.94661845  46.37718607]
Out[153]:
array([[  1.48383496e+00,  -1.46665897e-02,   3.06069995e-01],
       [  1.20787355e+00,  -1.03773108e-03,   2.52367084e-01],
       [  9.70577497e-01,   3.54854166e-04,   1.90436777e-01],
       [  7.34629758e-01,  -7.06828910e-03,   1.46919549e-01]])

sestrojíme model kdif pro 2-D gaussovo rozdělení

In [189]:
r=0.3
modkde=(X**2+Y**2-2*X*Y*r)/(1-r**2)
contour(np.exp(-modkde),origin="lower",extent=[-siz,siz,-siz,siz])
colorbar()
from scipy import optimize as op
hisx=kde
kdif=lambda p:(hisx-np.exp(-((X-p[2])**2+(Y-p[3])**2-2*(X-p[2])*(Y-p[3])*p[0])/(1-p[0]**2))*p[1])
rep=op.fmin(lambda p:sum(kdif(p).ravel()**2),[0.3,0.4,0,0])
rep
Optimization terminated successfully.
         Current function value: 1.661975
         Iterations: 79
         Function evaluations: 146
Out[189]:
array([  2.31004981e-01,   2.22167157e-01,   1.59127184e-03,
         2.17691046e-04])

Přímé fitování na 2-d funkci

In [179]:
from matplotlib.pyplot import colorbar
con3=contour(kde,origin="lower",extent=[-siz,siz,-siz,siz])
contour(kde-kdif(rep),origin="lower",extent=[-siz,siz,-siz,siz])
colorbar()
Out[179]:
<matplotlib.colorbar.Colorbar at 0x7f591345b208>

Simulace různých korelačních faktorů

In [193]:
for r in np.r_[0.1:0.8:0.1]:
    m1, m2 = measure(100000,r)
    out=np.histogram2d(m1,m2,[bins,bins])
    his2=out[0]/len(m1)*(30/2./siz)**2
    hisx=his2
    M=(np.r_[-siz:siz:30j]+siz/30.)[:-1].reshape(29,1)
    hdif=lambda p:(hisx-np.exp(-((M-p[2])**2+(M.T-p[3])**2-2*(M-p[2])*(M.T-p[3])*p[0])/(1-p[0]**2))*p[1])
    rep2=op.fmin(lambda p:sum(hdif(p).ravel()**2),[r,0.2,0,0])
    print(rep2)
Optimization terminated successfully.
         Current function value: 0.164429
         Iterations: 59
         Function evaluations: 104
[  8.25423501e-02   2.28331867e-01  -5.31956603e-05   5.54383477e-04]
Optimization terminated successfully.
         Current function value: 0.164879
         Iterations: 117
         Function evaluations: 198
[ 0.16572554  0.23316233  0.00457645  0.00436815]
Optimization terminated successfully.
         Current function value: 0.166613
         Iterations: 79
         Function evaluations: 145
[ 0.2394679   0.2399989  -0.00037809  0.00043639]
Optimization terminated successfully.
         Current function value: 0.165911
         Iterations: 67
         Function evaluations: 117
[  2.94651121e-01   2.46987182e-01  -3.95839018e-04   3.97633831e-05]
Optimization terminated successfully.
         Current function value: 0.167064
         Iterations: 230
         Function evaluations: 382
[ 0.33013358  0.25294473 -0.01093467 -0.0019698 ]
Optimization terminated successfully.
         Current function value: 0.167124
         Iterations: 118
         Function evaluations: 206
[ 0.37189004  0.25968069  0.00088478  0.00187634]
Optimization terminated successfully.
         Current function value: 0.164007
         Iterations: 52
         Function evaluations: 102
[ 0.38676369  0.2635433   0.00039489  0.00059759]
In [149]:
#orientace elipsy
np.arctan2(aper[:,2],aper[:,1])/np.pi*180
Out[149]:
array([ 88.93536485,  90.60343066,  90.04671184,  90.31800636])