In [3]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rc('font', size=13)
Convergence des estimateurs¶
- Population de moyenne $\mu_0$ et d'écart-type $\sigma_0$ (et de variance $\sigma_0^2$)
- Mesure — ou estimation — de la moyenne, sur un échantillon comportant $N$ individus :
la moyenne mesurée est $\tilde{\mu}_N = \frac{1}{N}\sum_{i=1}^{N} y_i$
L'écart-type de la moyenne mesurée converge en $1/\sqrt{N}$:
In [4]:
#<!-- collapse=True -->
# Monte-Carlo illustration: the running mean of i standard-normal draws
# concentrates around 0 with standard deviation 1/sqrt(i).
import random  # NOTE(review): unused — every draw below uses np.random

np.random.seed(0)  # seed the RNG so the figure is reproducible on Restart & Run All

nbrs_courbes = 500        # number of simulated sample-mean trajectories
n_Min, n_Max = 1, 100     # range of sample sizes explored
ylim0 = 3                 # y-axis half-range

plt.figure(figsize=(11, 6))
n_span = range(n_Min, n_Max)
for k in range(nbrs_courbes):
    Xn = np.random.normal(loc=0.0, scale=1.0, size=n_Max)
    # running mean over the first i draws, i = 1 .. n_Max-1
    moy_esti = [Xn[:i].mean() for i in n_span]
    plt.semilogx(n_span, moy_esti, 'k', alpha=0.05, lw=1.0)
# Highlight the last trajectory as an example curve:
plt.semilogx(n_span, moy_esti, 'r', alpha=.8, lw=0.5)
# Theoretical standard deviation of the estimated mean: sigma0/sqrt(N) = 1/sqrt(N)
sigma_moy = 1.0 / np.sqrt(np.array(n_span))
plt.plot(n_span, sigma_moy, 'r', alpha=0.94, lw=1.5)
plt.xlim([n_Min, n_Max]); plt.ylim([-ylim0, ylim0])
plt.xlabel(u'taille de l\'échantillon (log)'), plt.ylabel(u'moyenne estimée')
plt.title(u"moyenne estimée (500 courbes)")
x_lab, labels = [1, 2, 10, 20, 100], ['1', '2', '10', '20', '100']
plt.xticks(x_lab, labels)  # , rotation='vertical'
Question bête: Est-ce que la connaissance de la distribution attendue peut améliorer l'estimation ?
c'est-à-dire faire un fit par la loi attendue...
Estimation par regression de la fonction de répartition¶
In [16]:
from scipy.stats import expon
from scipy.stats import norm
In [66]:
import scipy.optimize
def getFtheo(x, distrib, param):
    """Theoretical CDF of `distrib` at x, with param = [loc, scale]."""
    loc, scale = param[0], param[1]
    return distrib.cdf(x, loc=loc, scale=scale)
def estimateMeans(X, distrib):
    """Estimate the mean of sample X in two ways.

    Returns a tuple:
      - the classical sample mean np.mean(X);
      - the mean of the `distrib` member whose CDF (parameters [loc, scale])
        best fits the empirical CDF of X in the least-squares sense.
    """
    sorted_x = np.sort(X)
    ecdf = np.linspace(0, 1, len(X))

    def sq_residuals(params):
        # Sum of squared gaps between the candidate CDF and the empirical CDF.
        return np.sum((getFtheo(sorted_x, distrib, params) - ecdf) ** 2)

    best = scipy.optimize.fmin(func=sq_residuals, x0=[1., 1.], disp=False)
    fitted_stats = distrib.stats(loc=best[0], scale=best[1])
    # fitted_stats[0] is the mean of the fitted distribution
    return np.mean(X), float(fitted_stats[0])
test et debug¶
In [67]:
# Sanity check of estimateMeans on one sample from the chosen distribution.
distrib = expon # norm
N = 200
X = distrib.rvs(loc=0, scale=1, size=N)
# Both returned values should be close to the true mean of distrib (1 for expon)
estimateMeans( X, distrib )
Out[67]:
In [68]:
# Visual check: empirical CDF of X (dots) vs. theoretical CDF with loc=0, scale=1.
Fexp = np.linspace( 0, 1, len(X) )
xFexp = np.sort( X )
# NOTE(review): the name `Ftheo` is an array here, but is reused below
# (cell In[113]) as a *function* name — confusing shadowing across cells.
Ftheo = getFtheo( xFexp, distrib, [0, 1] )
plt.plot(xFexp, Fexp, ".");
plt.plot(xFexp, Ftheo);
Go¶
In [79]:
# Compare the classical mean estimator with the CDF-fit estimator over
# N_samples small samples drawn from `distrib`.
np.random.seed(0)  # reproducible draws on Restart & Run All

N_samples = 100
distrib = expon# norm #expon
N = 5  # deliberately small sample size: where the estimators may differ most
all_mu = []
all_muFit = []
for i in range(N_samples):
    X = distrib.rvs(loc=0, scale=1, size=N)
    mu, muFit = estimateMeans(X, distrib)
    all_mu.append(mu)
    all_muFit.append(muFit)

# Trace les distributions empiriques (empirical CDFs of the two estimators)
all_mu.sort()
all_muFit.sort()
F = np.linspace(0, 1, len(all_mu))
plt.figure(figsize=(10, 6))
plt.plot(all_mu, F, 'k', label='estimateur classique')
plt.plot(all_muFit, F, 'r', label='estimateur Fit')
plt.legend();
Brouillon¶
In [16]:
N = 60000  # large sample size for a quick sanity check of the exponential mean
In [17]:
Xn = np.random.exponential(scale=1.0, size=N)  # Exp(1) draws; theoretical mean = 1
In [18]:
Xn.mean()  # expected to be close to 1 for N = 60000
Out[18]:
In [20]:
# Same Monte-Carlo illustration as the first figure, but with centered
# Exp(1) draws (mean 0, variance 1): the 1/sqrt(N) envelope is unchanged.
# NOTE(review): near-duplicate of cell In[4] — a shared plotting function
# parameterized by the sampler would avoid the copy-paste.
np.random.seed(0)  # reproducible figure on Restart & Run All

nbrs_courbes = 500
n_Min, n_Max = 1, 100
ylim0 = 3
plt.figure(figsize=(11, 6))
n_span = range(n_Min, n_Max)
for k in range(nbrs_courbes):
    # Exp(1) draws shifted by -1 to have zero mean
    Xn = np.random.exponential(scale=1.0, size=n_Max) - 1
    moy_esti = [Xn[:i].mean() for i in n_span]
    plt.semilogx(n_span, moy_esti, 'k', alpha=0.05, lw=1.0)
# Highlight the last trajectory as an example curve:
plt.semilogx(n_span, moy_esti, 'r', alpha=.8, lw=0.5)
# Theoretical standard deviation of the estimated mean: 1/sqrt(N)
sigma_moy = 1.0 / np.sqrt(np.array(n_span))
plt.plot(n_span, sigma_moy, 'r', alpha=0.94, lw=1.5)
plt.xlim([n_Min, n_Max]); plt.ylim([-ylim0, ylim0])
plt.xlabel(u'taille de l\'échantillon (log)'), plt.ylabel(u'moyenne estimée')
plt.title(u"moyenne estimée (500 courbes)")
x_lab, labels = [1, 2, 10, 20, 100], ['1', '2', '10', '20', '100']
plt.xticks(x_lab, labels)  # , rotation='vertical'
In [90]:
# Small centered-exponential sample and its empirical CDF (used by the cells below).
N = 10
Xn = np.random.exponential(scale=1., size=N)-1
print( Xn.mean() )
Fexp = np.linspace( 0, 1, len(Xn) )
xFexp = np.sort( Xn )
In [91]:
plt.plot( xFexp, Fexp , '.' )
# NOTE(review): `Ftheo` here is the *function* defined only later (cell In[113],
# below) — this cell fails under Restart & Run All; move the definition above.
plt.plot( xFexp, Ftheo( xFexp, 4., -5 ) )
Out[91]:
In [92]:
import scipy.optimize
# Least-squares fit of the shifted-exponential CDF to the empirical CDF.
# NOTE(review): depends on `Ftheo`, defined only later (In[113]) — out-of-order.
cost = lambda p: np.sum( ( Ftheo( xFexp, p[0], p[1] ) - Fexp )**2 )
# disp defaults to True here, so fmin prints its convergence summary
xopt = scipy.optimize.fmin(func=cost, x0=[1, 0])
print( xopt )
In [93]:
print( xopt[1] )        # fitted mu parameter: the fit-based mean estimate
print( np.mean( Xn ) )  # classical sample mean, for comparison
In [113]:
def Ftheo(x, lmda, mu):
    """CDF of a shifted exponential law: scale `lmda`, support starting at mu - 1.

    F(x) = 1 - exp(-(x - (mu - 1)) / lmda) for x >= mu - 1, and 0 below the
    support. The original expression returned *negative* values for
    x < mu - 1, which is not a valid CDF and distorts the least-squares fit;
    the result is now clamped to 0 there. (The distribution's mean is
    (mu - 1) + lmda, i.e. mu only when lmda == 1.)
    """
    z = (x - mu + 1.0) / lmda
    # np.maximum avoids overflow in exp() on the branch that np.where discards
    return np.where(z >= 0.0, 1.0 - np.exp(-np.maximum(z, 0.0)), 0.0)
def estimateMean(X):
    """Return (classical sample mean, fit-based mean estimate) for sample X.

    The fit estimate is the `mu` parameter of the shifted-exponential CDF
    `Ftheo` least-squares-fitted to the empirical CDF of X.

    Bug fix: the original built the empirical CDF from the *global* `Xn`
    instead of the argument `X`, so it only worked because every caller
    happened to pass the global. It now uses `X` throughout.
    """
    Fexp = np.linspace(0, 1, len(X))   # empirical CDF levels
    xFexp = np.sort(X)                 # sorted sample = empirical CDF abscissae
    cost = lambda p: np.sum((Ftheo(xFexp, p[0], p[1]) - Fexp) ** 2)
    xopt = scipy.optimize.fmin(func=cost, x0=[1.2, 1], disp=False)
    return np.mean(X), xopt[1]
In [117]:
# Repeat the estimator comparison with the hand-written shifted-exponential
# fit (estimateMean) on 30 centered Exp(1) samples of size N.
np.random.seed(0)  # reproducible draws on Restart & Run All

N = 20
all_mu = []
all_muFit = []
for i in range(30):
    Xn = np.random.exponential(scale=1., size=N) - 1
    mu, muFit = estimateMean(Xn)
    all_mu.append(mu)
    all_muFit.append(muFit)

# Empirical CDFs of the two estimators over the 30 repetitions
all_mu.sort()
all_muFit.sort()
F = np.linspace(0, 1, len(all_mu))
plt.plot(all_mu, F, 'k', label='estimateur classique')
plt.plot(all_muFit, F, 'r', label='estimateur Fit')
plt.legend();
il existe une fonction pour faire cela: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rv_continuous.fit.html#scipy.stats.rv_continuous.fit
In [ ]:
# NOTE(review): leftover scratch — `t`, `x` and `k` are all undefined in this
# notebook, so this cell raises NameError; presumably a paste from the
# scipy.stats docs linked above. Delete or replace with e.g. distrib.pdf(...).
t.pdf(x, k)