# [SciPy-user] indices of consecutive elements

Pierre GM pgmdevlist@gmail....
Tue Dec 2 11:13:16 CST 2008

```Daniel,
I coded a generic class that does what you want. It's not optimized,
but at least should get you started. Let me know if you find it useful
and if you find ways to tweak it...
Cheers,
P.

class Cluster(object):
    """
    Group consecutive data from an array according to a clustering condition.

    A cluster is a run of consecutive values (taken in flattened order) for
    which ``operator(abs(difference), increment)`` holds between each pair of
    neighbours.  With the defaults, that means neighbours differing by at
    most ``increment``.

    Missing values are **not** handled: the input sequence must therefore be
    free of missing values.

    Parameters
    ----------
    darray : ndarray
        Input data array to clusterize.
    increment : {float}, optional
        Increment between two consecutive values to group.
        By default, use a value of 1.
    operator : {function}, optional
        Comparison operator for the definition of clusters.
        By default, use :func:`numpy.less_equal`.

    Attributes
    ----------
    inishape
        Shape of the argument array (stored for resizing).
    inisize
        Size of the argument array.
    uniques : sequence
        First value of each cluster, in order of appearance.
    slices : sequence
        Object array of the slices (into the flattened data) corresponding
        to each cluster.
    starts : ndarray
        Array of the indices at which the clusters start.
    sizes : ndarray
        Number of elements in each cluster.
    clustered : list
        List of clustered data (one 1-D array per cluster).

    Examples
    --------
    >>> A = [0, 0, 1, 2, 2, 2, 3, 4, 3, 4, 4, 4]
    >>> klust = Cluster(A,0)
    >>> [list(_) for _ in klust.clustered]
    [[0, 0], [1], [2, 2, 2], [3], [4], [3], [4, 4, 4]]
    >>> klust.uniques
    array([0, 1, 2, 3, 4, 3, 4])

    >>> x = [ 1.8, 1.3, 2.4, 1.2, 2.5, 3.9, 1. , 3.8, 4.2, 3.3,
    ...       1.2, 0.2, 0.9, 2.7, 2.4, 2.8, 2.7, 4.7, 4.2, 0.4]
    >>> Cluster(x,1).starts
    array([ 0,  2,  3,  4,  5,  6,  7, 10, 13, 17, 19])
    >>> Cluster(x,1.5).starts
    array([ 0,  6,  7, 10, 13, 17, 19])
    >>> Cluster(x,2.5).starts
    array([ 0,  6,  7, 19])
    >>> Cluster(x,2.5,np.greater).starts
    array([ 0,  1,  2,  3,  4,  5,  8,  9, 10, 11, 12, 13, 14, 15, 16,
           17, 18])
    >>> y = [ 0, -1, 0, 0, 0, 1, 1, -1, -1, -1, 1, 1, 0, 0, 0, 0, 1,
    ...       1, 0, 0]
    >>> Cluster(y,0).starts
    array([ 0,  1,  2,  5,  7, 10, 12, 16, 18])

    """

    def __init__(self, darray, increment=1, operator=np.less_equal):
        """
        Initialize the instance; see the class docstring for the parameters.

        Raises
        ------
        numpy.ma.MAError
            If ``darray`` contains masked values.

        """
        if ma.is_masked(darray):
            raise ma.MAError("Masked arrays should be filled prior clustering.")
        darray = np.asanyarray(darray)
        n = darray.size
        self.inishape = darray.shape
        self.inisize = n
        # All indices below refer to the flattened data.
        flat = darray.ravel()
        if n:
            # True between two neighbours that do NOT satisfy the clustering
            # condition, i.e. where a new cluster begins.
            breaks = np.logical_not(
                operator(np.absolute(np.diff(flat)), increment))
            # Cluster boundaries: 0, every break position, and n.
            sid = np.concatenate(([0], np.flatnonzero(breaks) + 1, [n]))
        else:
            # Empty input: a single boundary, hence zero clusters.
            sid = np.zeros(1, dtype=int)
        # Filled element by element so that NumPy never tries to interpret
        # the slice objects as anything other than opaque objects.
        slobj = np.empty(len(sid) - 1, dtype=object)
        for pos, (start, stop) in enumerate(zip(sid[:-1], sid[1:])):
            slobj[pos] = slice(int(start), int(stop))
        #
        self.uniques = flat[sid[:-1]]
        self.clustered = [flat[k] for k in slobj]
        self.sizes = np.asarray(np.diff(sid))
        self.slices = slobj
        self.starts = sid[:-1]

    def markonsize(self, operator, sizethresh):
        """
        Create a **mask** for the clusters that do not meet a size
        requirement.

        Thus, outputs ``False`` if the size requirement is met, ``True``
        otherwise.

        Parameters
        ----------
        operator : function
            Comparison operator, called as ``operator(sizes, sizethresh)``.
        sizethresh : float
            Requirement for the sizes of the clusters.

        Returns
        -------
        ndarray
            Boolean array with the shape of the initial input.

        """
        # Start fully masked, then unmask every cluster meeting the requirement.
        resmask = np.ones(self.inisize, dtype=bool)
        for k in self.slices[operator(self.sizes, sizethresh)]:
            resmask[k] = False
        return resmask.reshape(self.inishape)

    def mark_greaterthan(self, sizemin):
        """
        Shortcut for :meth:`markonsize(greater_equal,sizemin)`.

        Thus, the command outputs ``False`` for clusters of at least
        ``sizemin`` elements, and ``True`` for clusters smaller than
        ``sizemin``.

        Parameters
        ----------
        sizemin : int
            Minimum size of the clusters.

        See Also
        --------
        :meth:`markonsize`
            Creates a **mask** for the clusters that do not meet a size
            requirement.
        """
        return self.markonsize(np.greater_equal, sizemin)

    def grouped_slices(self):
        """
        Return a dictionary with the unique values of ``self.uniques`` as
        keys, and a list of slices for the corresponding values.

        See Also
        --------
        :meth:`~Cluster.grouped_limits`
            that does the same thing with (start, stop) pairs.
        """
        # np.unique replaces np.unique1d, which no longer exists in NumPy.
        output = {key: [] for key in np.unique(self.uniques)}
        for (key, sl) in zip(self.uniques, self.slices):
            output[key].append(sl)
        return output

    def grouped_limits(self):
        """
        Return a dictionary with the unique values of ``self.uniques`` as
        keys, and an array of (starting index, ending index) pairs for the
        corresponding values.  The ending index is exclusive, as in a slice.

        See Also
        --------
        :meth:`~Cluster.grouped_slices`
        """
        output = {key: [] for key in np.unique(self.uniques)}
        for (key, sl) in zip(self.uniques, self.slices):
            output[key].append((sl.start, sl.stop))
        for key in output:
            output[key] = np.array(output[key])
        return output

On Dec 2, 2008, at 11:43 AM, Daniel Ashbrook wrote:

> I'm trying to figure out a way to return the indices of the start and
> end of a run of consecutive elements that match some condition, but
> only
> if there are more than a certain number.
>
> For example, take the array (with indices in comment for clarity):
>
> #0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18 19 20 21 22
> [0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0]
>
> I want to find the start and end indices of all runs of 1s with length
> of 4 or longer; so here the answer would be:
>
> [[2,5], [15,18]]
>
> Is there a reasonable way to do this without looping? I've been
> playing
> around with diff() and where() but without too much progress.
>
> Thanks,
>
>
> dan
```