[Numpy-discussion] Possible modification to bincount()
Stephen Simmons
mail at stevesimmons.com
Thu Jul 20 22:44:57 CDT 2006
While playing a little more with bincount(), one modification would be
handy: allow negative integers in the bin list, but skip them when
counting bins.
My specific use case is calculating subtotals on columns of large
datasets (1m rows x 30 cols), where some rows need to be excluded. The
groupings are expensive to compute, and sometimes will involve ~99% of
the rows (eliminate only outliers/errors), and other times only ~5% of
the rows (focus in on a subset).
I'd like to calculate subtotals like this using bincount(), without
having to copy the large datasets just to eliminate the unwanted rows:
# Assign each row to a group numbered from 0..G, except for -1 for rows
# to exclude
row_groups = expensive_function(data)
# Count number in each group, excluding those with grp==-1
grp_counts = bincount(list=row_groups)
# Use bincount() to form subtotals by column, excluding those with grp==-1
subtotals = column_stack([ bincount(list=row_groups, weights=data[:,i])
for i in range(data.shape[1]) ])
Is there any appetite to make such a change to bincount()?
This would require two simple changes to bincount() in _compiled_base.c
and an update to the docstring. Here is the diff file with enough
context to show the entire arr_bincount() function:
*** orig_compiled_base.c 2006-07-21 13:14:21.250000000 +1000
--- _compiled_base.c 2006-07-21 13:34:41.718750000 +1000
***************
*** 70,143 ****
intp j ;
for ( j = 1 ; j < len; j ++ )
if ( i [j] < min )
{min = i [j] ;
mn = j ;}
return mn;
}
static PyObject *
arr_bincount(PyObject *self, PyObject *args, PyObject *kwds)
{
/* histogram accepts one or two arguments. The first is an array
! * of non-negative integers and the second, if present, is an
* array of weights, which must be promotable to double.
* Call these arguments list and weight. Both must be one-
* dimensional. len (weight) == len(list)
* If weight is not present:
! * histogram (list) [i] is the number of occurrences of i in list.
* If weight is present:
* histogram (list, weight) [i] is the sum of all weight [j]
! * where list [j] == i. */
/* self is not used */
PyArray_Descr *type;
PyObject *list = NULL, *weight=Py_None ;
PyObject *lst=NULL, *ans=NULL, *wts=NULL;
! intp *numbers, *ians, len , mxi, mni, ans_size;
int i;
double *weights , *dans;
static char *kwlist[] = {"list", "weights", NULL};
Py_Try(PyArg_ParseTupleAndKeywords(args, kwds, "O|O", kwlist,
&list, &weight));
Py_Try(lst = PyArray_ContiguousFromAny(list, PyArray_INTP, 1, 1));
len = PyArray_SIZE(lst);
numbers = (intp *) PyArray_DATA(lst);
mxi = mxx (numbers, len) ;
- mni = mnx (numbers, len) ;
- Py_Assert(numbers[mni] >= 0,
- "irst argument of bincount must be non-negative");
ans_size = numbers [mxi] + 1 ;
type = PyArray_DescrFromType(PyArray_INTP);
if (weight == Py_None) {
Py_Try(ans = PyArray_Zeros(1, &ans_size, type, 0));
ians = (intp *)(PyArray_DATA(ans));
for (i = 0 ; i < len ; i++)
! ians [numbers [i]] += 1 ;
Py_DECREF(lst);
}
else {
Py_Try(wts = PyArray_ContiguousFromAny(weight,
PyArray_DOUBLE, 1, 1));
weights = (double *)PyArray_DATA (wts);
Py_Assert(PyArray_SIZE(wts) == len, "bincount: length of weights " \
"does not match that of list");
type = PyArray_DescrFromType(PyArray_DOUBLE);
Py_Try(ans = PyArray_Zeros(1, &ans_size, type, 0));
dans = (double *)PyArray_DATA (ans);
for (i = 0 ; i < len ; i++) {
! dans[numbers[i]] += weights[i];
}
Py_DECREF(lst);
Py_DECREF(wts);
}
return ans;
fail:
Py_XDECREF(lst);
Py_XDECREF(wts);
Py_XDECREF(ans);
return NULL;
}
--- 70,145 ----
intp j ;
for ( j = 1 ; j < len; j ++ )
if ( i [j] < min )
{min = i [j] ;
mn = j ;}
return mn;
}
static PyObject *
arr_bincount(PyObject *self, PyObject *args, PyObject *kwds)
{
/* histogram accepts one or two arguments. The first is an array
! * of integers and the second, if present, is an
* array of weights, which must be promotable to double.
* Call these arguments list and weight. Both must be one-
* dimensional. len (weight) == len(list)
* If weight is not present:
! * histogram (list) [i] is the number of occurrences of i in list
! * for i>=0. Negative i values are ignored.
* If weight is present:
* histogram (list, weight) [i] is the sum of all weight [j]
! * where list [j] == i and i>=0. */
/* self is not used */
PyArray_Descr *type;
PyObject *list = NULL, *weight=Py_None ;
PyObject *lst=NULL, *ans=NULL, *wts=NULL;
! intp *numbers, *ians, len , mxi, ans_size;
int i;
double *weights , *dans;
static char *kwlist[] = {"list", "weights", NULL};
Py_Try(PyArg_ParseTupleAndKeywords(args, kwds, "O|O", kwlist,
&list, &weight));
Py_Try(lst = PyArray_ContiguousFromAny(list, PyArray_INTP, 1, 1));
len = PyArray_SIZE(lst);
numbers = (intp *) PyArray_DATA(lst);
mxi = mxx (numbers, len) ;
ans_size = numbers [mxi] + 1 ;
type = PyArray_DescrFromType(PyArray_INTP);
if (weight == Py_None) {
Py_Try(ans = PyArray_Zeros(1, &ans_size, type, 0));
ians = (intp *)(PyArray_DATA(ans));
for (i = 0 ; i < len ; i++)
! if (numbers[i]>=0) {
! ians[numbers [i]] += 1 ;
! }
Py_DECREF(lst);
}
else {
Py_Try(wts = PyArray_ContiguousFromAny(weight,
PyArray_DOUBLE, 1, 1));
weights = (double *)PyArray_DATA (wts);
Py_Assert(PyArray_SIZE(wts) == len, "bincount: length of weights " \
"does not match that of list");
type = PyArray_DescrFromType(PyArray_DOUBLE);
Py_Try(ans = PyArray_Zeros(1, &ans_size, type, 0));
dans = (double *)PyArray_DATA (ans);
for (i = 0 ; i < len ; i++) {
! if (numbers[i]>=0) {
! dans[numbers[i]] += weights[i];
! }
}
Py_DECREF(lst);
Py_DECREF(wts);
}
return ans;
fail:
Py_XDECREF(lst);
Py_XDECREF(wts);
Py_XDECREF(ans);
return NULL;
}
Cheers
Stephen
More information about the Numpy-discussion
mailing list