Robustní metody

problém odlehlých hodnot

M-odhady - pomocí vhodné váhové funkce je menší citlivost na vychýlené body než u nejm. čtverců ($d^2$)

minimalizujeme $\sum g(x_i,\theta)$, ve většině případů lze derivovat podle odhadu $\theta$: $\psi(x_i,\theta)=\partial g(x_i,\theta)/\partial \theta$ a váha se vyjádří jako

$$w(r_i)=\frac{\psi(r_i)}{r_i}$$

kde $r_i=x_i-\theta$

zdroj pro M-funkce

M-plot

In [1]:
from matplotlib import pyplot as pl
#mpl.use("pgf")
#mpl.use('GTK')
%matplotlib inline
from numpy import r_,random
In [56]:
import numpy as np

frac=2.6
slp=-.6
x=r_[2:10:50j]
y=x*slp+random.normal(3,0.4,size=x.shape)
x2=np.concatenate([x,random.uniform(2,10,size=int(frac*x.size))])
y2=np.concatenate([y,random.uniform(y.min()-3,y.max()+3,size=int(frac*x.size))])
pl.plot(x2,y2,'.')
Out[56]:
[<matplotlib.lines.Line2D at 0xaad3e48>]
In [57]:
c=2
gfunw=lambda z:c**2/2*(1-np.exp(-z**2/c**2)) #welsh
gfunh=lambda z:(abs(z)<c)*z**2/2+(abs(z)>=c)*(abs(c*z)-c**2/2) #huber
gfun=gfunh
#gfun=lambda z:abs(z)
wei=lambda p:gfun(y2-p[1]-p[0]*x2).sum()
from scipy import optimize as op
eslp=-0.5
op.fmin(wei,[eslp,y2.mean()-eslp*x2.mean()])
Optimization terminated successfully.
         Current function value: 578.317209
         Iterations: 37
         Function evaluations: 72
Out[57]:
array([-0.27995768,  0.79551484])
In [58]:
#klasicke nejmensi ctverce
gfun=lambda z:z**2/2
wei=lambda p:gfun(y2-p[1]-p[0]*x2).sum()
op.fmin(wei,[eslp,y2.mean()-eslp*x2.mean()])
Optimization terminated successfully.
         Current function value: 813.064897
         Iterations: 39
         Function evaluations: 77
Out[58]:
array([-0.16589092,  0.13646321])
In [59]:
xn=r_[-5:5:0.1]
pl.plot(xn,gfunw(xn))
pl.plot(xn,gfunh(xn))
pl.legend(["Welsh","Huber"])
np.polyfit(x2,y2,1)
Out[59]:
array([-0.16588885,  0.13643925])
In [60]:
c=1.
op.fmin(wei,[-0.5,y2.mean()-0.5*x2.mean()])
Optimization terminated successfully.
         Current function value: 813.064897
         Iterations: 50
         Function evaluations: 98
Out[60]:
array([-0.16587971,  0.13638679])
In [61]:
gfunc=lambda z:z**2/2
gfun=gfunc
wei=lambda p:gfun(y3-p[1]-p[0]*x3).sum()
rep=[]
for j in range(len(y),len(y2),10):
    x3,y3=x2[:j],y2[:j]
    pars=op.fmin(wei,[eslp,y3.mean()-eslp*x3.mean()],disp=0)
    rep.append(list(pars)+[wei(pars)])
arep=np.array(rep)
arep
Out[61]:
array([[ -5.79660982e-01,   2.79499691e+00,   4.25557443e+00],
       [ -5.64367058e-01,   2.67360543e+00,   7.01009532e+01],
       [ -5.14902745e-01,   2.25504419e+00,   1.52890745e+02],
       [ -4.89274926e-01,   2.10776279e+00,   2.12729241e+02],
       [ -5.04885612e-01,   2.34820309e+00,   2.70689700e+02],
       [ -3.77571052e-01,   1.46386510e+00,   3.47104246e+02],
       [ -2.92496715e-01,   1.06781665e+00,   3.81761929e+02],
       [ -2.98268007e-01,   9.74127176e-01,   4.36716961e+02],
       [ -3.16892434e-01,   1.11143783e+00,   5.00026036e+02],
       [ -2.71336470e-01,   8.52492850e-01,   5.80767091e+02],
       [ -2.51808115e-01,   8.09639957e-01,   6.15604951e+02],
       [ -2.01680418e-01,   5.44066855e-01,   6.71430287e+02],
       [ -1.65967381e-01,   1.72123321e-01,   7.47152285e+02]])
In [62]:
gfun=gfunh
rep=[]
wei=lambda p:gfun(y3-p[1]-p[0]*x3).sum()
for j in range(len(y),len(y2),10):
    x3,y3=x2[:j],y2[:j]
    pars=op.fmin(wei,[eslp,y3.mean()-eslp*x3.mean()],disp=0)
    rep.append(list(pars)+[wei(pars)])
areph=np.array(rep)
areph
Out[62]:
array([[  -0.58200555,    2.80146728,    4.18550437],
       [  -0.58459507,    2.77609202,   33.51957076],
       [  -0.58226464,    2.73485298,   64.810802  ],
       [  -0.58152568,    2.69877748,   92.02545433],
       [  -0.58212233,    2.74003185,  114.9919824 ],
       [  -0.53465504,    2.39386961,  146.82592097],
       [  -0.4824832 ,    2.160979  ,  166.99765244],
       [  -0.48945704,    2.12806745,  190.98627985],
       [  -0.48996825,    2.12724059,  216.48356392],
       [  -0.45637278,    1.92375171,  249.00911213],
       [  -0.44088945,    1.8681026 ,  264.92603937],
       [  -0.41697365,    1.75108297,  291.15311652],
       [  -0.38842034,    1.50286953,  322.35926395]])
In [63]:
gfun=gfunw
rep=[]
wei=lambda p:gfun(y3-p[1]-p[0]*x3).sum()
for j in range(len(y),len(y2),10):
    x3,y3=x2[:j],y2[:j]
    pars=op.fmin(wei,[eslp,y3.mean()-eslp*x3.mean()],disp=0)
    rep.append(list(pars)+[wei(pars)])
arepw=np.array(rep)
arepw
Out[63]:
array([[ -0.58341252,   2.80066622,   3.19450753],
       [ -0.58047092,   2.77671978,   8.12343898],
       [ -0.5868336 ,   2.82887757,  12.19695222],
       [ -0.58764796,   2.818499  ,  16.86738956],
       [ -0.59331277,   2.84496674,  20.28339007],
       [ -0.6006836 ,   2.87452929,  24.87562497],
       [ -0.59166019,   2.82511163,  28.97967969],
       [ -0.59884439,   2.86158425,  33.03519622],
       [ -0.59426615,   2.82691328,  37.40302269],
       [ -0.582255  ,   2.75490752,  41.89401338],
       [ -0.57900173,   2.7405215 ,  44.92367925],
       [ -0.59007733,   2.8080249 ,  49.00741803],
       [ -0.58607721,   2.77998919,  53.29612758]])
In [64]:
ix=np.arange(len(y),len(y2),10.)/len(y)
ix=1-1/ix
pl.plot(ix,arep[:,0],'s')
pl.plot(ix,arepw[:,0],'d')
pl.plot(ix,areph[:,0],'v')
pl.legend(["quad","Welsh","Huber"],loc=2)
pl.ylabel("slope")
pl.ylim(-0.7,-0.1)
pl.grid()
pl.xlabel("frac. outlier.")
pl.xlim(-0.1,0.8)
Out[64]:
(-0.1, 0.8)
In [65]:
pl.plot(ix,arep[:,1],'s')
pl.plot(ix,arepw[:,1],'d')
pl.plot(ix,areph[:,1],'v')
pl.legend(["quad","Welsh","Huber"],loc=3)
pl.ylabel("absol.")
#pl.ylim(-0.7,-0.1)
pl.grid()
pl.xlabel("frac. outlier.")
pl.xlim(-0.1,0.8)
Out[65]:
(-0.1, 0.8)

odhad nejistot pomocí bootstrapu

vybereme bootfr bodů ze sady dat a nahradíme je nějakými (jinými) z původního vzorku

In [77]:
boofr=20
j=(len(x)+len(x2))/2
gfun=gfunw
rep=[]
for i in range(1000):
    x3,y3=x2[:j],y2[:j].copy()
    ifrom=np.random.choice(j,boofr)
    ito=np.random.choice(j,boofr)
    y3[ito]=y3[ifrom]
    pars=op.fmin(wei,[eslp,y3.mean()-eslp*x3.mean()],disp=0)
    rep.append(list(pars)+[wei(pars)])
brepw=np.array(rep)
brepw.std(0)
Out[77]:
array([ 0.01667763,  0.10661553,  1.12224915])
In [68]:
gfun=gfunc
rep=[]
wei=lambda p:gfun(y3-p[1]-p[0]*x3).sum()
for i in range(1000):
    x3,y3=x2[:j],y2[:j].copy()
    ifrom=np.random.choice(j,boofr)
    ito=np.random.choice(j,boofr)
    y3[ito]=y3[ifrom]
    pars=op.fmin(wei,[eslp,y3.mean()-eslp*x3.mean()],disp=0)
    rep.append(list(pars)+[wei(pars)])
brepc=np.array(rep)
brepc.std(0)
Out[68]:
array([  0.05835593,   0.37256324,  28.52206437])
In [69]:
gfun=gfunh
rep=[]
wei=lambda p:gfun(y3-p[1]-p[0]*x3).sum()
for i in range(1000):
    x3,y3=x2[:j],y2[:j].copy()
    ifrom=np.random.choice(j,boofr)
    ito=np.random.choice(j,boofr)
    y3[ito]=y3[ifrom]
    pars=op.fmin(wei,[eslp,y3.mean()-eslp*x3.mean()],disp=0)
    rep.append(list(pars)+[wei(pars)])
breph=np.array(rep)
breph.std(0)
Out[69]:
array([  0.04374454,   0.2643811 ,  10.36298864])
In [93]:
gfun=gfunh
def bootme(boofr=20,niter=1000,gsel=115):
    rep=[]
    wei=lambda p:gfun(y3-p[1]-p[0]*x3).sum()
    for i in range(niter):
        x3,y3=x2[:gsel],y2[:gsel].copy()
        ifrom=np.random.choice(gsel,boofr)
        mask=np.ones(gsel,'bool')
        mask[ifrom]=False
        ito=np.random.choice(np.r_[:gsel][mask],boofr)
        y3[ito]=y3[ifrom]
        pars=op.fmin(wei,[eslp,y3.mean()-eslp*x3.mean()],disp=0)
        rep.append(list(pars)+[wei(pars),y3.std()])
    return np.array(rep)
bootme(20,500).std(0)
Out[93]:
array([  0.04308943,   0.27264614,  10.67534551,   0.09564452])
In [94]:
gfun=gfunw
bootme(20,500,115).std(0)
Out[94]:
array([ 0.01768252,  0.11320987,  1.19634393,  0.09134715])
In [95]:
bootme(40,500,115).std(0)
Out[95]:
array([ 0.0273262 ,  0.17352205,  1.47052802,  0.12460751])
In [102]:
#kolik ruznych prvku bude mit nahodny vyber s opakovanim
reprod=np.array([len(set(np.random.randint(gsel,size=gsel))) for i in range(200)])
reprod.mean(),reprod.min(),reprod.max(),reprod.std()
Out[102]:
(72.989999999999995, 64, 84, 3.5128193804976653)
In [70]:
brepw.mean(0), breph.mean(0)
Out[70]:
(array([ -0.58800937,   2.80282756,  33.54733321]),
 array([  -0.44562932,    1.89858798,  184.3669658 ]))

testování v simulaci

In [73]:
gfun=gfunw
x1=x.copy()
rep=[]
for i in range(100):
    y1=x1*slp+random.normal(3,0.4,size=x.shape)
    x3=np.concatenate([x1,random.uniform(2,10,size=int(j-x.size))])
    y3=np.concatenate([y1,random.uniform(y.min()-3,y.max()+3,size=int(j-x.size))])
    pars=op.fmin(wei,[eslp,y3.mean()-eslp*x3.mean()],disp=0)
    rep.append(list(pars)+[wei(pars)])
crepw=np.array(rep)
crepw.std(0)
Out[73]:
array([ 0.03184917,  0.20871405,  1.38255017])
In [78]:
j
Out[78]:
115

nejmenší median

použije se místo sumy čtverců

nelze minimalizovat analyticky, v praxi pomocí MC

In [38]:
gfun2=lambda x:abs(x)
wei2=lambda p:np.median(gfun2(y2-p[1]-p[0]*x2))
op.fmin(wei2,[-0.5,y.mean()-0.5*x.mean()])
Optimization terminated successfully.
         Current function value: 1.360260
         Iterations: 63
         Function evaluations: 119
Out[38]:
array([-0.4084894,  1.8100606])

Vychýlenost (bias)

Odstranění rozdělením vzorku (M rozdělíme na $m_1$ a $m_2$ o velikostech N)

$$E(M)=\theta + \frac{1}{2N}\beta + O(1/N^2)$$$$E(m_{1/2})=\theta + \frac{1}{N}\beta + O(1/N^2)$$$$2E(M)-\frac{E(m_1)+E(m_2)}{2} = \theta + O(1/N^2)$$

Zvýší se ovšem rozptyl členem řádu $1/N$

Jackknife (vynechání měření)

optimalizovaná varianta, rozptyl naroste jen člen řádu $1/N^2$ (porovnání s metodou bootstrapu)

$$\hat \alpha^* = N \hat \alpha - \frac{N-1}{N}\sum \alpha_j,$$

kde $\alpha_j=f(x_1,x_2,..x_{j-1},x_{j+1}...)$ je odhad s vynecháním j-tého vzorku.

Další manipulace se vzorky (robustnost)

  • ořez (trimming) - vynechání $n/2$ nejmenších a největších hodnot

  • winsorizace - nahrazení ořezaných hodnot minimem/maximem zbytku (kombinace průměru a midrange)

Příklad: odhad středu (symetrického) rozdělení

(location parameter)

posuzuje se asymptotická efektivita (poměr k MVB pro $N \to \infty$)

pro normálně rozdělená data s rostoucím $n$ efektivita klesá, pro rozdělení s větším vlivem okrajových hodnot (dvojexponenciála, Cauchy) roste

Optimum (minimax) pro $r=(N-n)/2N=0.23$

class=left asym.efektivita

Ořez

testování ořezaného průměru

odhady nejistot pomocí bootstrapingu

In [3]:
size=1000
def gendata(gaus=2,extr=10,exran=5):
    data=random.normal(0,2,size)
    if extr>0: data[::extr]+=random.uniform(-exran,exran,size/extr) #every extr-th point is an outlier
    return data
data=gendata()
moms=[(data**i).mean() for i in [1,2]]
from numpy import sqrt
vari=moms[1]-moms[0]**2
[sum((data-moms[0])**2>i**2*vari) for i in [1,2,3]]
Out[3]:
[297, 52, 2]
In [36]:
def bootest(nite=200,nsig=2,extr=10,exran=10):
    rep=[]
    for j in range(200):
        bootstr=gendata(extr=10,exran=exran)[random.randint(0,size,size)]
        moms=[(bootstr**i).mean() for i in [1,2]]
        vari=moms[1]-moms[0]**2
        rep.append([moms[0],bootstr[(bootstr-moms[0])**2<nsig**2*vari].mean()])
    arep=r_[rep]
    return arep.mean(0),arep.std(0)
In [21]:
zoo=[bootest() for i in range(200)]
In [9]:
#dual exponent - trimming efficiency
import numpy as np
%matplotlib inline
from matplotlib import pyplot as pl
q=np.r_[0.01:0.5:0.01]
D=2-(2-2*np.log(q)+np.log(q)**2)*q
pl.plot(q,1/D)
Out[9]:
[<matplotlib.lines.Line2D at 0x8c58438>]
In [25]:
abs(zoo[:,0,:]).mean(0)
Out[25]:
array([ 0.00651356,  0.00550413])
In [34]:
pl.hist(abs(zoo[:,0,0]),r_[:0.03:20j],alpha=.5)
pl.hist(abs(zoo[:,0,1]),r_[:0.03:20j],alpha=.5)
Out[34]:
(array([ 34.,  26.,  38.,  28.,  23.,  20.,  10.,   8.,   9.,   2.,   1.,
          1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.]),
 array([ 0.        ,  0.00157895,  0.00315789,  0.00473684,  0.00631579,
         0.00789474,  0.00947368,  0.01105263,  0.01263158,  0.01421053,
         0.01578947,  0.01736842,  0.01894737,  0.02052632,  0.02210526,
         0.02368421,  0.02526316,  0.02684211,  0.02842105,  0.03      ]),
 <a list of 19 Patch objects>)
In [37]:
zoo2=r_[[bootest(extr=0) for i in range(200)]]
abs(zoo2).mean(0)
Out[37]:
array([[ 0.00790551,  0.00637971],
       [ 0.11957548,  0.1024984 ]])