[Numpy-svn] r5085 - in trunk/numpy/lib: . tests

numpy-svn@scip... numpy-svn@scip...
Fri Apr 25 12:41:08 CDT 2008


Author: dhuard
Date: 2008-04-25 12:41:04 -0500 (Fri, 25 Apr 2008)
New Revision: 5085

Modified:
   trunk/numpy/lib/function_base.py
   trunk/numpy/lib/tests/test_function_base.py
Log:
Modified histogram according to the discussion on the numpy ML.
This transitions from the old behavior to the new behavior using the new keyword.

Modified: trunk/numpy/lib/function_base.py
===================================================================
--- trunk/numpy/lib/function_base.py	2008-04-25 17:31:19 UTC (rev 5084)
+++ trunk/numpy/lib/function_base.py	2008-04-25 17:41:04 UTC (rev 5085)
@@ -11,6 +11,7 @@
            'add_docstring', 'meshgrid', 'delete', 'insert', 'append',
            'interp'
            ]
+import warnings
 
 import types
 import numpy.core.numeric as _nx
@@ -97,79 +98,174 @@
     except: return 0
     return 1
 
-def histogram(a, bins=10, range=None, normed=False):
+def histogram(a, bins=10, range=None, normed=False, weights=None, new=False):
     """Compute the histogram from a set of data.
 
-    Parameters:
+    Parameters
+    ----------
 
-        a : array
-            The data to histogram. n-D arrays will be flattened.
+    a : array
+      The data to histogram. 
 
-        bins : int or sequence of floats
-            If an int, then the number of equal-width bins in the given range.
-            Otherwise, a sequence of the lower bound of each bin.
+    bins : int or sequence
+      If an int, then the number of equal-width bins in the given range.
+      If new=True, bins can also be the bin edges, allowing for non-constant
+      bin widths.
 
-        range : (float, float)
-            The lower and upper range of the bins. If not provided, then
-            (a.min(), a.max()) is used. Values outside of this range are
-            allocated to the closest bin.
+    range : (float, float)
+      The lower and upper range of the bins. If not provided, then
+      range is simply (a.min(), a.max()). Using new=False, lower than range
+      are ignored, and values higher than range are tallied in the rightmost 
+      bin. Using new=True, both lower and upper outliers are ignored. 
 
-        normed : bool
-            If False, the result array will contain the number of samples in
-            each bin.  If True, the result array is the value of the
-            probability *density* function at the bin normalized such that the
-            *integral* over the range is 1. Note that the sum of all of the
-            histogram values will not usually be 1; it is not a probability
-            *mass* function.
+    normed : bool
+      If False, the result array will contain the number of samples in
+      each bin.  If True, the result array is the value of the
+      probability *density* function at the bin normalized such that the
+      *integral* over the range is 1. Note that the sum of all of the
+      histogram values will not usually be 1; it is not a probability
+      *mass* function.
 
-    Returns:
+    weights : array
+      An array of weights, the same shape as a. If normed is False, the 
+      histogram is computed by summing the weights of the values falling into
+      each bin. If normed is True, the weights are normalized, so that the 
+      integral of the density over the range is 1. This option is only 
+      available with new=True.
+         
+    new : bool
+       Compatibility argument to transition from the old version (v1.1) to 
+       the new version (v1.2). 
+    
+ 
+    Return
+    ------
+    hist : array
+        The values of the histogram. See `normed` and `weights` for a 
+        description of the possible semantics.
 
-        hist : array
-            The values of the histogram. See `normed` for a description of the
-            possible semantics.
+    bin_edges : float array
+        With new=False, return the left bin edges (length(hist)).
+        With new=True, return the bin edges (length(hist)+1). 
 
-        lower_edges : float array
-            The lower edges of each bin.
-
     SeeAlso:
 
         histogramdd
 
     """
-    a = asarray(a).ravel()
+    # Old behavior
+    if new is False:
+        warnings.warn("""
+        The semantics of histogram will be modified in 
+        release 1.2 to improve outlier handling. The new behavior can be 
+        obtained using new=True. Note that the new version accepts/returns 
+        the bin edges instead of the left bin edges. 
+        Please read the docstring for more information.""", FutureWarning)
+        a = asarray(a).ravel()
 
-    if (range is not None):
-        mn, mx = range
-        if (mn > mx):
-            raise AttributeError, 'max must be larger than min in range parameter.'
+        if (range is not None):
+            mn, mx = range
+            if (mn > mx):
+                raise AttributeError, \
+                    'max must be larger than min in range parameter.'
+    
+        if not iterable(bins):
+            if range is None:
+                range = (a.min(), a.max())
+            else:
+                warnings.warn("""Outliers handling will change in version 1.2.  
+                    Please read the docstring for details.""", FutureWarning)
+            mn, mx = [mi+0.0 for mi in range]
+            if mn == mx:
+                mn -= 0.5
+                mx += 0.5
+            bins = linspace(mn, mx, bins, endpoint=False)
+        else:
+            raise ValueError, 'Use new=True to pass bin edges explicitly.'
+            
+        if weights is not None:
+            raise ValueError, 'weights are only available with new=True.'
+            
+        # best block size probably depends on processor cache size
+        block = 65536
+        n = sort(a[:block]).searchsorted(bins)
+        for i in xrange(block, a.size, block):
+            n += sort(a[i:i+block]).searchsorted(bins)
+        n = concatenate([n, [len(a)]])
+        n = n[1:]-n[:-1]
+    
+        if normed:
+            db = bins[1] - bins[0]
+            return 1.0/(a.size*db) * n, bins
+        else:
+            return n, bins
 
-    if not iterable(bins):
-        if range is None:
-            range = (a.min(), a.max())
-        mn, mx = [mi+0.0 for mi in range]
-        if mn == mx:
-            mn -= 0.5
-            mx += 0.5
-        bins = linspace(mn, mx, bins, endpoint=False)
-    else:
-        bins = asarray(bins)
-        if (bins[1:]-bins[:-1] < 0).any():
-            raise AttributeError, 'bins must increase monotonically.'
+    
+    
+    # New behavior
+    elif new is True:
+        a = asarray(a)
+        if weights is not None:
+            weights = asarray(weights)
+            if np.any(weights.shape != a.shape):
+                raise ValueError, 'weights should have the same shape as a.'
+            weights = weights.ravel()
+        a =  a.ravel()
+            
+        if (range is not None):
+            mn, mx = range
+            if (mn > mx):
+                raise AttributeError, \
+                    'max must be larger than min in range parameter.'
+    
+        if not iterable(bins):
+            if range is None:
+                range = (a.min(), a.max())
+            mn, mx = [mi+0.0 for mi in range]
+            if mn == mx:
+                mn -= 0.5
+                mx += 0.5
+            bins = linspace(mn, mx, bins+1, endpoint=True)
+        else:
+            bins = asarray(bins)
+            if (np.diff(bins) < 0).any():
+                raise AttributeError, 'bins must increase monotonically.'
+    
+        # Histogram is an integer or a float array depending on the weights.
+        if weights is None:
+            ntype = int
+        else:
+            ntype = weights.dtype
+        n = np.zeros(bins.shape, ntype)
+        
+        block = 65536
+        if weights is None:
+            for i in xrange(0, a.size, block):
+                sa = sort(a[:block])
+                n += np.r_[sa.searchsorted(bins[:-1], 'left'), \
+                    sa.searchsorted(bins[-1], 'right')]
+        else:
+            zero = array(0, dtype=ntype)
+            for i in xrange(0, a.size, block):
+                tmp_a = a[:block]
+                tmp_w = weights[:block]
+                sorting_index = np.argsort(tmp_a)
+                sa = tmp_a[sorting_index]
+                sw = tmp_w[sorting_index] 
+                cw = np.concatenate(([zero,], sw.cumsum()))
+                bin_index = np.r_[sa.searchsorted(bins[:-1], 'left'), \
+                    sa.searchsorted(bins[-1], 'right')]
+                n += cw[bin_index]
+            
+        n = np.diff(n)
+       
+        if normed is False:
+            return n, bins
+        elif normed is True:
+            db = array(np.diff(bins), float)
+            return n/(n*db).sum(), bins
+        
 
-    # best block size probably depends on processor cache size
-    block = 65536
-    n = sort(a[:block]).searchsorted(bins)
-    for i in xrange(block, a.size, block):
-        n += sort(a[i:i+block]).searchsorted(bins)
-    n = concatenate([n, [len(a)]])
-    n = n[1:]-n[:-1]
-
-    if normed:
-        db = bins[1] - bins[0]
-        return 1.0/(a.size*db) * n, bins
-    else:
-        return n, bins
-
 def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
     """histogramdd(sample, bins=10, range=None, normed=False, weights=None)
 

Modified: trunk/numpy/lib/tests/test_function_base.py
===================================================================
--- trunk/numpy/lib/tests/test_function_base.py	2008-04-25 17:31:19 UTC (rev 5084)
+++ trunk/numpy/lib/tests/test_function_base.py	2008-04-25 17:41:04 UTC (rev 5085)
@@ -5,6 +5,7 @@
 import numpy.lib;reload(numpy.lib)
 from numpy.lib import *
 from numpy.core import *
+
 del sys.path[0]
 
 class TestAny(NumpyTestCase):
@@ -432,12 +433,102 @@
         v=rand(n)
         (a,b)=histogram(v)
         #check if the sum of the bins equals the number of samples
-        assert(sum(a,axis=0)==n)
+        assert_equal(sum(a,axis=0), n)
         #check that the bin counts are evenly spaced when the data is from a
         # linear function
         (a,b)=histogram(linspace(0,10,100))
-        assert(all(a==10))
+        assert_array_equal(a, 10)
 
+    def check_simple_new(self):
+        n=100
+        v=rand(n)
+        (a,b)=histogram(v, new=True)
+        #check if the sum of the bins equals the number of samples
+        assert_equal(sum(a,axis=0), n)
+        #check that the bin counts are evenly spaced when the data is from a
+        # linear function
+        (a,b)=histogram(linspace(0,10,100), new=True)
+        assert_array_equal(a, 10)
+
+    def check_normed_new(self):
+        # Check that the integral of the density equals 1. 
+        n = 100
+        v = rand(n)
+        a,b = histogram(v, normed=True, new=True)
+        area = sum(a*diff(b))
+        assert_almost_equal(area, 1)
+
+        # Check with non constant bin width
+        v = rand(n)*10
+        bins = [0,1,5, 9, 10]
+        a,b = histogram(v, bins, normed=True, new=True)
+        area = sum(a*diff(b))
+        assert_almost_equal(area, 1)
+    
+       
+    def check_outliers_new(self):
+        # Check that outliers are not tallied
+        a = arange(10)+.5
+        
+        # Lower outliers
+        h,b = histogram(a, range=[0,9], new=True)
+        assert_equal(h.sum(),9)
+        
+        # Upper outliers
+        h,b = histogram(a, range=[1,10], new=True)
+        assert_equal(h.sum(),9)
+        
+        # Normalization
+        h,b = histogram(a, range=[1,9], normed=True, new=True)
+        assert_equal((h*diff(b)).sum(),1)
+        
+        # Weights
+        w = arange(10)+.5
+        h,b = histogram(a, range=[1,9], weights=w, normed=True, new=True)
+        assert_equal((h*diff(b)).sum(),1)
+        
+        h,b = histogram(a, bins=8, range=[1,9], weights=w, new=True)
+        assert_equal(h, w[1:-1])
+        
+        
+    def check_type_new(self):
+        # Check the type of the returned histogram
+        a = arange(10)+.5
+        h,b = histogram(a, new=True)
+        assert(issubdtype(h.dtype, int))
+        
+        h,b = histogram(a, normed=True, new=True)
+        assert(issubdtype(h.dtype, float))
+        
+        h,b = histogram(a, weights=ones(10, int), new=True)
+        assert(issubdtype(h.dtype, int))
+        
+        h,b = histogram(a, weights=ones(10, float), new=True)
+        assert(issubdtype(h.dtype, float))
+        
+        
+    def check_weights_new(self):
+        v = rand(100)
+        w = ones(100)*5
+        a,b = histogram(v,new=True)
+        na,nb = histogram(v, normed=True, new=True)
+        wa,wb = histogram(v, weights=w, new=True)
+        nwa,nwb = histogram(v, weights=w, normed=True, new=True)
+        assert_array_almost_equal(a*5, wa)
+        assert_array_almost_equal(na, nwa)
+        
+        # Check weights are properly applied.
+        v = linspace(0,10,10)
+        w = concatenate((zeros(5), ones(5)))
+        wa,wb = histogram(v, bins=arange(11),weights=w, new=True)
+        assert_array_almost_equal(wa, w)
+        
+        # Check with integer weights
+        wa, wb = histogram([1,2,2,4], bins=4, weights=[4,3,2,1], new=True)
+        assert_array_equal(wa, [4,5,0,1])
+        wa, wb = histogram([1,2,2,4], bins=4, weights=[4,3,2,1], normed=True, new=True)
+        assert_array_equal(wa, array([4,5,0,1])/10./3.*4)
+     
 class TestHistogramdd(NumpyTestCase):
     def check_simple(self):
         x = array([[-.5, .5, 1.5], [-.5, 1.5, 2.5], [-.5, 2.5, .5], \



More information about the Numpy-svn mailing list