* Added wpylib.math.stats.jackknife1, initial implementation of delete-one

jackknife resampling method. This module also contains a hack for weighted average (warning: the theory is not established yet, at least I have not seen it).
2013-12-11 00:15:24 -05:00
parent f0ba6f4068
commit 4e2b2c66c2
1 changed files with 162 additions and 0 deletions
--- a/math/stats/jackknife1.py
+++ b/math/stats/jackknife1.py
@@ -0,0 +1,162 @@
+"""
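+Delete-one (leave-one-out) jackknife resampling utilities.
+
+REFERENCES:
+
+Jackknife and Bootstrap Resampling Methods in Statistical Analysis to Correct for Bias.
+P. Young
+http://young.physics.ucsc.edu/jackboot.pdf
+
+
+Notes on Bootstrapping
+(TODO: the author and URL of this reference were not recorded.)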
+
+
+
+"""
+
+import numpy
+
+from numpy import pi, cos
+from numpy.random import normal
+
+def test1_generate_data(ndata=1000):
+  """
+
+  """
+  return pi / 3 + normal(size=ndata)
+
+
+def test1():
+  global test1_dset
+  test1_dset = test1_generate_data()
+  dset = test1_dset
+  print "first jackknife routine: jk_generate_datasets -> jk_wstats"
+  dset_jk = jk_generate_datasets(dset)
+  cos_avg1 = jk_wstats(dset_jk, func=numpy.cos)
+  print cos_avg1
+
+  print "second jackknife routine: jk_generate_averages -> jk_stats_aa"
+  aa_jk = jk_generate_averages(dset)
+  cos_avg2 = jk_stats_aa(aa_jk, func=numpy.cos)
+  print cos_avg2
+
+  # the two results above must be identical
+
+
+def test2_generate_data():
+  rootdir = "/home/wirawan/Work/PWQMC-77/expt/qmc/MnO/AFM2/rh.1x1x1/Opium-GFRG/vol10.41/k-0772+3780+2187.run"
+  srcfile = rootdir + "/measurements.h5"
+  from pyqmc.results.pwqmc_meas import meas_hdf5
+
+  global test2_db
+  test2_db = meas_hdf5(srcfile)
+
+
+def jk_select_dataset(a, i):
+  """Selects the i-th dataset for jackknife operation from a
+  given dataset 'a'.
+  The argument i must be: 0 <= 0 < len(a).
+  This is essentially deleting the i-th data point from the
+  original dataset.
+  """
+  a = numpy.asarray(a)
+  N = a.shape[0]
+  assert len(a.shape) == 1
+  assert 0 <= i < N
+  rslt = numpy.empty(shape=(N-1,), dtype=a.dtype)
+  rslt[:i] = a[:i]
+  rslt[i:] = a[i+1:]
+  return rslt
+
+def jk_generate_datasets(a):
+  """Generates ALL the datasets for jackknife operation from
+  the original dataset 'a'.
+  For the i-th dataset, this is essentially deleting the
+  i-th data point from 'a'.
+  """
+  a = numpy.asarray(a)
+  N = a.shape[0]
+  assert len(a.shape) == 1
+  rslt = numpy.empty(shape=(N,N-1,), dtype=a.dtype)
+  for i in xrange(N):
+    rslt[i, :i] = a[:i]
+    rslt[i, i:] = a[i+1:]
+  return rslt
+
+def jk_generate_averages(a, weights=None):
+  """Generates ALL the average samples for jackknife operation
+  from the original dataset 'a'.
+  For the i-th dataset, this is essentially deleting the
+  i-th data point from 'a', then taking the average.
+  
+  This version does not store N*(N-1) data points; only (N).
+  """
+  a = numpy.asarray(a)
+  N = a.shape[0]
+  assert len(a.shape) == 1
+  aa_jk = numpy.empty(shape=(N,), dtype=a.dtype)
+  dset_i = numpy.empty(shape=(N-1,), dtype=a.dtype)
+  if weights != None:
+    weights_i = numpy.empty(shape=(N-1,), dtype=weights.dtype)
+  for i in xrange(N):
+    dset_i[:i] = a[:i]
+    dset_i[i:] = a[i+1:]
+    if weights != None:
+      weights_i[:i] = weights[:i]
+      weights_i[i:] = weights[i+1:]
+      aa_jk[i] = numpy.average(dset_i, weights=weights_i)
+    else:
+      aa_jk[i] = numpy.mean(dset_i)
+
+  return aa_jk
+
+'''
+def jk_stats_old(a_jk, func=None):
+  """a_jk must be in the same format as that produced by
+
+  """
+  # get all the jackknived stats.
+  if func == None:
+    jk_mean = numpy.mean(a_jk, axis=1)
+  else:
+    jk_mean = numpy.mean(func(a_jk), axis=1)
+'''
+
+def jk_wstats_dsets(a_jk, w_jk=None, func=None):
+  """a_jk and w_jk must be in the same format as that produced by
+  jk_generate_datasets.
+
+  """
+  # get all the jackknived stats.
+  N = len(a_jk)
+  # reconstruct full "a" array:
+  a = numpy.empty(shape=(N,), dtype=a_jk.dtype)
+  a[1:] = a_jk[0]
+  a[0] = a_jk[1][0]
+  if func == None:
+    func = lambda x : x
+  aa_jk = numpy.average(a_jk, axis=1, weights=w_jk)
+  #print aa_jk
+  f_jk = func(aa_jk)
+  mean = numpy.mean(f_jk)
+  var = numpy.std(f_jk) * numpy.sqrt(N-1)
+  mean_unbiased = N * func(a.mean()) - (N-1) * mean
+  return (mean, var, mean_unbiased)
+
+
+def jk_stats_aa(aa_jk, func=None, a=None):
+  """Computes the jackknife statistics from the preprocessed
+  jackknife averages (aa_jk).
+  The input array aa_jk is computed by jk_generate_averages().
+  """
+  # get all the jackknived stats.
+  N = len(aa_jk)
+  # reconstruct full "a" array:
+  if func == None:
+    func = lambda x : x
+  f_jk = func(aa_jk)
+  mean = numpy.mean(f_jk)
+  var = numpy.std(f_jk) * numpy.sqrt(N-1)
+  if a != None:
+    mean_unbiased = N * func(a.mean()) - (N-1) * mean
+  else:
+    mean_unbiased = None
+  return (mean, var, mean_unbiased)