'''given a 1D sample of observation, find a matching distribution * estimate maximum likelihood paramater for each distribution * rank estimated distribution by Kolmogorov-Smirnov and Anderson-Darling test statistics Author: Josef Pktd License: Simplified BSD original December 2008 TODO: * refactor to result class * split estimation by support, add option and choose automatically * ''' import scipy, scipy.stats import numpy import matplotlib.mlab as mlab import matplotlib.pyplot as plt import inspect def plothist(x,distfn, args, loc, scale, right=1): plt.figure() # the histogram of the data n, bins, patches = plt.hist(x, 25, normed=1, facecolor='green', alpha=0.75) maxheight = max([p.get_height() for p in patches]) print maxheight axlim = list(plt.axis()) axlim[-1] = maxheight*1.05 yt = distfn.pdf( bins, loc=loc, scale=scale, *args) yt[yt>maxheight]=maxheight lt = plt.plot(bins, yt, 'r--', linewidth=1) ys = scipy.stats.t.pdf( bins, 10,scale=10,)*right ls = plt.plot(bins, ys, 'b-', linewidth=1) plt.xlabel('Raw Data') plt.ylabel('Probability') plt.title(r'$\mathrm{Testing: %s :}\ \mu=%f,\ \sigma=%f$'%(distfn.name,loc,scale)) plt.grid(True) plt.draw() if __name__ == '__main__': #TODO: calculate correct tail probability for mixture prefix = 'run_conv500_1_' n = 500 dgp_arg = 10 dgp_scale = 10 results = [] real_valued_data = numpy.array([ 3.017,2.822,2.632,2.287,2.207,2.048, 1.963,1.784,1.712,2.972,2.719,2.495, 2.070,1.969,1.768,1.677,1.479,1.387, 2.843,2.485,2.163,1.687,1.408,1.279, 1.016,0.742,0.607 ]) eps = numpy.finfo(float).eps * 2.0 data_min = real_valued_data.min() data_max = real_valued_data.max() data_mean = real_valued_data.mean() data_range = data_max - data_min data_std_dev = numpy.sqrt(real_valued_data.var()) print '='*50 for item in inspect.getmembers(scipy.stats): if not isinstance(item[1], scipy.stats.rv_continuous) or item[0] == 'rv_continuous': continue distname = item[0] # these fit too slowly for my use if distname in ['ncf', 'kstwobign']: continue distfn = item[1] print '-'*30 print 'target = %s' % distname # Try different starting parameters best_ks_pval = 1.0E300 if distname in ['truncnorm','betaprime','reciprocal']: try: par0 = (data_mean-2.0*data_std_dev, data_mean+2.0*data_std_dev) par_est = tuple(distfn.fit(real_valued_data, loc=data_mean, scale=data_std_dev, *par0)) arg_est = par_est[:-2] loc_est = par_est[-2] scale_est = par_est[-1] real_valued_data_normed = (real_valued_data-loc_est)/scale_est ks_stat, ks_pval = scipy.stats.kstest(real_valued_data_normed, distname, arg_est) if ks_pval < best_ks_pval: crit = distfn.ppf(1-0.1, loc=loc_est, scale=scale_est, *par_est) tail_prob = scipy.stats.t.sf(crit,dgp_arg,scale=dgp_scale) best = [distname,ks_stat, ks_pval,arg_est,loc_est,scale_est,crit,tail_prob ] best_ks_pval = ks_pval except: pass try: par_est = tuple(distfn.fit(real_valued_data)) arg_est = par_est[:-2] loc_est = par_est[-2] scale_est = par_est[-1] real_valued_data_normed = (real_valued_data-loc_est)/scale_est ks_stat, ks_pval = scipy.stats.kstest(real_valued_data_normed, distname, arg_est) if ks_pval < best_ks_pval: crit = distfn.ppf(1-0.1, loc=loc_est, scale=scale_est, *par_est) tail_prob = scipy.stats.t.sf(crit,dgp_arg,scale=dgp_scale) best = [distname,ks_stat, ks_pval,arg_est,loc_est,scale_est,crit,tail_prob ] best_ks_pval = ks_pval except: pass try: par_est = tuple(distfn.fit(real_valued_data, loc=0.0, scale=1.0)) arg_est = par_est[:-2] loc_est = par_est[-2] scale_est = par_est[-1] real_valued_data_normed = (real_valued_data-loc_est)/scale_est ks_stat, ks_pval = scipy.stats.kstest(real_valued_data_normed, distname, arg_est) if ks_pval < best_ks_pval: crit = distfn.ppf(1-0.1, loc=loc_est, scale=scale_est, *par_est) tail_prob = scipy.stats.t.sf(crit,dgp_arg,scale=dgp_scale) best = [distname,ks_stat, ks_pval,arg_est,loc_est,scale_est,crit,tail_prob ] best_ks_pval = ks_pval except: pass try: par_est = tuple(distfn.fit(real_valued_data, loc=data_mean, scale=data_std_dev)) arg_est = par_est[:-2] loc_est = par_est[-2] scale_est = par_est[-1] real_valued_data_normed = (real_valued_data-loc_est)/scale_est ks_stat, ks_pval = scipy.stats.kstest(real_valued_data_normed, distname, arg_est) if ks_pval < best_ks_pval: crit = distfn.ppf(1-0.1, loc=loc_est, scale=scale_est, *par_est) tail_prob = scipy.stats.t.sf(crit,dgp_arg,scale=dgp_scale) best = [distname,ks_stat, ks_pval,arg_est,loc_est,scale_est,crit,tail_prob ] best_ks_pval = ks_pval except: pass try: par_est = tuple(distfn.fit(real_valued_data, loc=data_max+eps, scale=data_std_dev)) arg_est = par_est[:-2] loc_est = par_est[-2] scale_est = par_est[-1] real_valued_data_normed = (real_valued_data-loc_est)/scale_est ks_stat, ks_pval = scipy.stats.kstest(real_valued_data_normed, distname, arg_est) if ks_pval < best_ks_pval: crit = distfn.ppf(1-0.1, loc=loc_est, scale=scale_est, *par_est) tail_prob = scipy.stats.t.sf(crit,dgp_arg,scale=dgp_scale) best = [distname,ks_stat, ks_pval,arg_est,loc_est,scale_est,crit,tail_prob ] best_ks_pval = ks_pval except: pass try: par_est = tuple(distfn.fit(real_valued_data, loc=data_min-eps, scale=data_std_dev)) arg_est = par_est[:-2] loc_est = par_est[-2] scale_est = par_est[-1] real_valued_data_normed = (real_valued_data-loc_est)/scale_est ks_stat, ks_pval = scipy.stats.kstest(real_valued_data_normed, distname, arg_est) if ks_pval < best_ks_pval: crit = distfn.ppf(1-0.1, loc=loc_est, scale=scale_est, *par_est) tail_prob = scipy.stats.t.sf(crit,dgp_arg,scale=dgp_scale) best = [distname,ks_stat, ks_pval,arg_est,loc_est,scale_est,crit,tail_prob ] best_ks_pval = ks_pval except: pass try: par_est = tuple(distfn.fit(real_valued_data, loc=data_max+eps, scale=data_range)) arg_est = par_est[:-2] loc_est = par_est[-2] scale_est = par_est[-1] real_valued_data_normed = (real_valued_data-loc_est)/scale_est ks_stat, ks_pval = scipy.stats.kstest(real_valued_data_normed, distname, arg_est) if ks_pval < best_ks_pval: crit = distfn.ppf(1-0.1, loc=loc_est, scale=scale_est, *par_est) tail_prob = scipy.stats.t.sf(crit,dgp_arg,scale=dgp_scale) best = [distname,ks_stat, ks_pval,arg_est,loc_est,scale_est,crit,tail_prob ] best_ks_pval = ks_pval except: pass try: par_est = tuple(distfn.fit(real_valued_data, loc=data_min-eps, scale=data_range)) arg_est = par_est[:-2] loc_est = par_est[-2] scale_est = par_est[-1] real_valued_data_normed = (real_valued_data-loc_est)/scale_est ks_stat, ks_pval = scipy.stats.kstest(real_valued_data_normed, distname, arg_est) if ks_pval < best_ks_pval: crit = distfn.ppf(1-0.1, loc=loc_est, scale=scale_est, *par_est) tail_prob = scipy.stats.t.sf(crit,dgp_arg,scale=dgp_scale) best = [distname,ks_stat, ks_pval,arg_est,loc_est,scale_est,crit,tail_prob ] best_ks_pval = ks_pval except: pass if best_ks_pval < 1.0E300: print 'crit, prob', best[4], best[5] results.append(best) from operator import itemgetter res_sort = sorted(results, key = itemgetter(2)) res_sort.reverse() #kstest statistic: smaller is better, pval larger is better print 'number of distributions', len(res_sort) imagedir = 'matchresults' import os if not os.path.exists(imagedir): os.makedirs(imagedir) for ii,di in enumerate(res_sort): distname,ks_stat, ks_pval,arg_est,loc_est,scale_est,crit,tail_prob = di[:] distfn = getattr(scipy.stats,distname) print '%s ks-stat = %f, ks-pval = %f tail_prob = %f)' % \ (distname, ks_stat, ks_pval, tail_prob) plothist(real_valued_data,distfn,arg_est,loc_est,scale_est) #plt.savefig(os.path.join(imagedir,'%s%s%02d_%s.png'% (prefix, ri,ii, distname))) plt.show() plt.close()