In [3]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rc('font', size=13)
Convergence des estimateurs¶
- Population de moyenne $\mu_0$ et d'écart-type $\sigma_0$ (et de variance $\sigma_0^2$)
- Mesure — ou estimation — de la moyenne, sur un échantillon comportant $N$ individus :
la moyenne mesurée est $\tilde{\mu}_N = \frac{1}{N}\sum_{i=1}^{N} y_i$
L'écart-type de la moyenne mesurée converge en $1/\sqrt{N}$:
In [4]:
#<!-- collapse=True -->
# Monte-Carlo illustration: the running mean of i standard-normal draws
# concentrates around 0 with standard deviation 1/sqrt(i).
import random  # NOTE(review): unused — every draw below uses np.random

np.random.seed(0)  # seed the RNG so the figure is reproducible on Restart & Run All

nbrs_courbes = 500        # number of simulated sample-mean trajectories
n_Min, n_Max = 1, 100     # range of sample sizes explored
ylim0 = 3                 # y-axis half-range

plt.figure(figsize=(11, 6))
n_span = range(n_Min, n_Max)
for k in range(nbrs_courbes):
    Xn = np.random.normal(loc=0.0, scale=1.0, size=n_Max)
    # running mean over the first i draws, i = 1 .. n_Max-1
    moy_esti = [Xn[:i].mean() for i in n_span]
    plt.semilogx(n_span, moy_esti, 'k', alpha=0.05, lw=1.0)
# Highlight the last trajectory as an example curve:
plt.semilogx(n_span, moy_esti, 'r', alpha=.8, lw=0.5)
# Theoretical standard deviation of the estimated mean: sigma0/sqrt(N) = 1/sqrt(N)
sigma_moy = 1.0 / np.sqrt(np.array(n_span))
plt.plot(n_span, sigma_moy, 'r', alpha=0.94, lw=1.5)
plt.xlim([n_Min, n_Max]); plt.ylim([-ylim0, ylim0])
plt.xlabel(u'taille de l\'échantillon (log)'), plt.ylabel(u'moyenne estimée')
plt.title(u"moyenne estimée (500 courbes)")
x_lab, labels = [1, 2, 10, 20, 100], ['1', '2', '10', '20', '100']
plt.xticks(x_lab, labels)  # , rotation='vertical'
Question bête: Est-ce que la connaissance de la distribution attendue peut améliorer l'estimation ?
c'est-à-dire faire un fit par la loi attendue...
Estimation par regression de la fonction de répartition¶
In [16]:
from scipy.stats import expon
from scipy.stats import norm
In [66]:
import scipy.optimize
def getFtheo(x, distrib, param):
    """Theoretical CDF of `distrib` at x, with param = [loc, scale]."""
    loc, scale = param[0], param[1]
    return distrib.cdf(x, loc=loc, scale=scale)
def estimateMeans(X, distrib):
    """Estimate the mean of sample X in two ways.

    Returns a tuple:
      - the classical sample mean np.mean(X);
      - the mean of the `distrib` member whose CDF (parameters [loc, scale])
        best fits the empirical CDF of X in the least-squares sense.
    """
    sorted_x = np.sort(X)
    ecdf = np.linspace(0, 1, len(X))

    def sq_residuals(params):
        # Sum of squared gaps between the candidate CDF and the empirical CDF.
        return np.sum((getFtheo(sorted_x, distrib, params) - ecdf) ** 2)

    best = scipy.optimize.fmin(func=sq_residuals, x0=[1., 1.], disp=False)
    fitted_stats = distrib.stats(loc=best[0], scale=best[1])
    # fitted_stats[0] is the mean of the fitted distribution
    return np.mean(X), float(fitted_stats[0])
test et debug¶
In [67]:
# Sanity check of estimateMeans on one sample from the chosen distribution.
distrib = expon # norm
N = 200
X = distrib.rvs(loc=0, scale=1, size=N)
# Both returned values should be close to the true mean of distrib (1 for expon)
estimateMeans( X, distrib )
Out[67]:
In [68]:
# Visual check: empirical CDF of X (dots) vs. theoretical CDF with loc=0, scale=1.
Fexp = np.linspace( 0, 1, len(X) )
xFexp = np.sort( X )
# NOTE(review): the name `Ftheo` is an array here, but is reused below
# (cell In[113]) as a *function* name — confusing shadowing across cells.
Ftheo = getFtheo( xFexp, distrib, [0, 1] )
plt.plot(xFexp, Fexp, ".");
plt.plot(xFexp, Ftheo);
Go¶
In [79]:
# Compare the classical mean estimator with the CDF-fit estimator over
# N_samples small samples drawn from `distrib`.
np.random.seed(0)  # reproducible draws on Restart & Run All

N_samples = 100
distrib = expon# norm #expon
N = 5  # deliberately small sample size: where the estimators may differ most
all_mu = []
all_muFit = []
for i in range(N_samples):
    X = distrib.rvs(loc=0, scale=1, size=N)
    mu, muFit = estimateMeans(X, distrib)
    all_mu.append(mu)
    all_muFit.append(muFit)

# Trace les distributions empiriques (empirical CDFs of the two estimators)
all_mu.sort()
all_muFit.sort()
F = np.linspace(0, 1, len(all_mu))
plt.figure(figsize=(10, 6))
plt.plot(all_mu, F, 'k', label='estimateur classique')
plt.plot(all_muFit, F, 'r', label='estimateur Fit')
plt.legend();
Brouillon¶
In [16]:
N = 60000  # large sample size for a quick sanity check of the exponential mean
In [17]:
Xn = np.random.exponential(scale=1.0, size=N)  # Exp(1) draws; theoretical mean = 1
In [18]:
Xn.mean()  # expected to be close to 1 for N = 60000
Out[18]:
In [20]:
# Same Monte-Carlo illustration as the first figure, but with centered
# Exp(1) draws (mean 0, variance 1): the 1/sqrt(N) envelope is unchanged.
# NOTE(review): near-duplicate of cell In[4] — a shared plotting function
# parameterized by the sampler would avoid the copy-paste.
np.random.seed(0)  # reproducible figure on Restart & Run All

nbrs_courbes = 500
n_Min, n_Max = 1, 100
ylim0 = 3
plt.figure(figsize=(11, 6))
n_span = range(n_Min, n_Max)
for k in range(nbrs_courbes):
    # Exp(1) draws shifted by -1 to have zero mean
    Xn = np.random.exponential(scale=1.0, size=n_Max) - 1
    moy_esti = [Xn[:i].mean() for i in n_span]
    plt.semilogx(n_span, moy_esti, 'k', alpha=0.05, lw=1.0)
# Highlight the last trajectory as an example curve:
plt.semilogx(n_span, moy_esti, 'r', alpha=.8, lw=0.5)
# Theoretical standard deviation of the estimated mean: 1/sqrt(N)
sigma_moy = 1.0 / np.sqrt(np.array(n_span))
plt.plot(n_span, sigma_moy, 'r', alpha=0.94, lw=1.5)
plt.xlim([n_Min, n_Max]); plt.ylim([-ylim0, ylim0])
plt.xlabel(u'taille de l\'échantillon (log)'), plt.ylabel(u'moyenne estimée')
plt.title(u"moyenne estimée (500 courbes)")
x_lab, labels = [1, 2, 10, 20, 100], ['1', '2', '10', '20', '100']
plt.xticks(x_lab, labels)  # , rotation='vertical'
In [90]:
# Small centered-exponential sample and its empirical CDF (used by the cells below).
N = 10
Xn = np.random.exponential(scale=1., size=N)-1
print( Xn.mean() )
Fexp = np.linspace( 0, 1, len(Xn) )
xFexp = np.sort( Xn )
In [91]:
plt.plot( xFexp, Fexp , '.' )
# NOTE(review): `Ftheo` here is the *function* defined only later (cell In[113],
# below) — this cell fails under Restart & Run All; move the definition above.
plt.plot( xFexp, Ftheo( xFexp, 4., -5 ) )
Out[91]:
In [92]:
import scipy.optimize
# Least-squares fit of the shifted-exponential CDF to the empirical CDF.
# NOTE(review): depends on `Ftheo`, defined only later (In[113]) — out-of-order.
cost = lambda p: np.sum( ( Ftheo( xFexp, p[0], p[1] ) - Fexp )**2 )
# disp defaults to True here, so fmin prints its convergence summary
xopt = scipy.optimize.fmin(func=cost, x0=[1, 0])
print( xopt )
In [93]:
print( xopt[1] )        # fitted mu parameter: the fit-based mean estimate
print( np.mean( Xn ) )  # classical sample mean, for comparison
In [113]:
def Ftheo(x, lmda, mu):
    """CDF of a shifted exponential law: scale `lmda`, support starting at mu - 1.

    F(x) = 1 - exp(-(x - (mu - 1)) / lmda) for x >= mu - 1, and 0 below the
    support. The original expression returned *negative* values for
    x < mu - 1, which is not a valid CDF and distorts the least-squares fit;
    the result is now clamped to 0 there. (The distribution's mean is
    (mu - 1) + lmda, i.e. mu only when lmda == 1.)
    """
    z = (x - mu + 1.0) / lmda
    # np.maximum avoids overflow in exp() on the branch that np.where discards
    return np.where(z >= 0.0, 1.0 - np.exp(-np.maximum(z, 0.0)), 0.0)
def estimateMean(X):
    """Return (classical sample mean, fit-based mean estimate) for sample X.

    The fit estimate is the `mu` parameter of the shifted-exponential CDF
    `Ftheo` least-squares-fitted to the empirical CDF of X.

    Bug fix: the original built the empirical CDF from the *global* `Xn`
    instead of the argument `X`, so it only worked because every caller
    happened to pass the global. It now uses `X` throughout.
    """
    Fexp = np.linspace(0, 1, len(X))   # empirical CDF levels
    xFexp = np.sort(X)                 # sorted sample = empirical CDF abscissae
    cost = lambda p: np.sum((Ftheo(xFexp, p[0], p[1]) - Fexp) ** 2)
    xopt = scipy.optimize.fmin(func=cost, x0=[1.2, 1], disp=False)
    return np.mean(X), xopt[1]
In [117]:
# Repeat the estimator comparison with the hand-written shifted-exponential
# fit (estimateMean) on 30 centered Exp(1) samples of size N.
np.random.seed(0)  # reproducible draws on Restart & Run All

N = 20
all_mu = []
all_muFit = []
for i in range(30):
    Xn = np.random.exponential(scale=1., size=N) - 1
    mu, muFit = estimateMean(Xn)
    all_mu.append(mu)
    all_muFit.append(muFit)

# Empirical CDFs of the two estimators over the 30 repetitions
all_mu.sort()
all_muFit.sort()
F = np.linspace(0, 1, len(all_mu))
plt.plot(all_mu, F, 'k', label='estimateur classique')
plt.plot(all_muFit, F, 'r', label='estimateur Fit')
plt.legend();
il existe une fonction pour faire cela: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rv_continuous.fit.html#scipy.stats.rv_continuous.fit
In [ ]:
# NOTE(review): leftover scratch — `t`, `x` and `k` are all undefined in this
# notebook, so this cell raises NameError; presumably a paste from the
# scipy.stats docs linked above. Delete or replace with e.g. distrib.pdf(...).
t.pdf(x, k)