# [SciPy-user] Finding Gaps in an Array

Pierre GM pgmdevlist@gmail....
Mon Jul 6 12:45:33 CDT 2009

```On Jul 6, 2009, at 1:22 PM, Lorenzo Isella wrote:
>
> Thanks David,
> But I also need something more.
> Let us make it real easy.  Consider the array
>
> import numpy as n
> f=n.array([0,1,2,4,9,22,23,24,32,33,59,60,76])
>
> I want to calculate the length of the subarrays consisting of numbers
> evenly spaced by 1 (like in the example, but now the gap is 1).
> In the case of f, these subarrays are:
>
> [0,1,2], [22,23,24], [32,33] and [59,60].
>
> Hence, I would like to get back an array giving me the counts
> [3,3,2,2].
>
> Does anyone know how to do this efficiently?
> Cheers
>
> Lorenzo

Lorenzo,
scikits.hydroclimpy:
http://hydroclimpy.sourceforge.net/
More specifically:
http://hydroclimpy.sourceforge.net/core.objects.html#cluster-object

Roughly, given an array and an increment parameter, Cluster computes a
sequence of slices that differ by less than increment (or more than
increment, depending on yet another parameter).
Here's the relevant portion of the code.
Note : in an upcoming version, this code will be implemented in
Cython, and renamed to Clumps (as the name Cluster is misleading).

##########
class Cluster(object):
"""
Groups consecutive data from an array according to a clustering
condition.

A cluster is defined as a group of consecutive values differing
by at most the
increment value.

Missing values are **not** handled: the input sequence must
therefore be free
of missing values.

Parameters
----------
darray : ndarray
Input data array to clusterize.
increment : {float}, optional
Increment between two consecutive values to group.
By default, use a value of 1.
operator : {function}, optional
Comparison operator for the definition of clusters.
By default, use :func:`numpy.less_equal`.

Attributes
----------
inishape
Shape of the argument array (stored for resizing).
inisize
Size of the argument array.
uniques : sequence
List of unique cluster values, as they appear in
chronological order.
slices : sequence
List of the slices corresponding to each cluster of data.
starts : ndarray
Array of the indices at which the clusters start.
clustered : list
List of clustered data.

Examples
--------
>>> A = [0, 0, 1, 2, 2, 2, 3, 4, 3, 4, 4, 4]
>>> klust = cluster(A,0)
>>> [list(_) for _ in klust.clustered]
[[0, 0], [1], [2, 2, 2], [3], [4], [3], [4, 4, 4]]
>>> klust.uniques
array([0, 1, 2, 3, 4, 3, 4])

>>> x = [ 1.8, 1.3, 2.4, 1.2, 2.5, 3.9, 1. , 3.8, 4.2, 3.3,
...       1.2, 0.2, 0.9, 2.7, 2.4, 2.8, 2.7, 4.7, 4.2, 0.4]
>>> Cluster(x,1).starts
array([ 0,  2,  3,  4,  5,  6,  7, 10, 11, 13, 17, 19])
>>> Cluster(x,1.5).starts
array([ 0,  6,  7, 10, 13, 17, 19])
>>> Cluster(x,2.5).starts
array([ 0,  6,  7, 19])
>>> Cluster(x,2.5,greater).starts
array([ 0,  1,  2,  3,  4,  5,  8,  9, 10,
...    11, 12, 13, 14, 15, 16, 17, 18])
>>> y = [ 0, -1, 0, 0, 0, 1, 1, -1, -1, -1, 1, 1, 0, 0, 0, 0, 1,
1, 0, 0]
>>> Cluster(y,1).starts
array([ 0,  1,  2,  5,  7, 10, 12, 16, 18])

"""
def __init__(self, darray, increment=1, operator=np.less_equal):
"""
Initializes instance.

Parameters
----------
darray : ndarray
Input data array to clusterize.
increment : {float}, optional
Increment between two consecutive values to group.
By default, use a value of 1.
operator : {function}, optional
Comparison operator for the definition of clusters.
By default, use :func:`numpy.less_equal`

"""
raise ma.MAError("Masked arrays should be filled prior
clustering.")
else:
darray = np.asanyarray(darray)
n = darray.size
self.inishape = darray.shape
self.inisize = darray.size
darray = darray.ravel()
clustercond = 1 - operator(np.absolute(np.diff(darray)),
increment)
sid = np.concatenate(([0, ],
np.arange(1, n).compress(clustercond),
[n, ]))
slobj = np.asarray([slice(i, d)
sid[1:])])
#
self.uniques = darray[sid[:-1]]
self.clustered = [darray[k] for k in slobj]
self.sizes = np.asarray(np.diff(sid))
self.slices = slobj
self.starts = sid[:-1]

def markonsize(self, operator, sizethresh):
"""
Creates a **mask** for the clusters that do not meet a size
requirement.
Thus, outputs ``False`` if the size requirement is met, ``True``
otherwise.

Parameters
----------
operator : function
Comparison operator
sizethresh : float
Requirement for the sizes of the clusters

"""
#        for k in self.slices.compress(operator(self.sizes,sizethresh)):
for k in self.slices[operator(self.sizes, sizethresh)]:

def mark_greaterthan(self, sizemin):
"""
Shortcut for :meth:`markonsize(greater_equal,sizemin)`.
Thus, the command outputs ``False`` for clusters larger than
``sizemin``, and
``True`` for clusters smaller than ``sizemin``.

Parameters
----------
sizemin : int
Minimum size of the clusters.

--------
:meth:`markonsize`
Creates a **mask** for the clusters that do not meet a size
requirement.
"""
return self.markonsize(np.greater_equal, sizemin)

def grouped_slices(self):
"""
Returns a dictionary with the unique values of ``self`` as keys,
and a list
of slices for the corresponding values.

--------
Cluster.grouped_limits
that does the same thing
"""
#
uniques = self.uniques.view(np.ndarray)
output = dict([(k, []) for k in np.unique1d(uniques)])
for (k, v) in zip(self.uniques, self.slices):
output[k].append(v)
return output

def grouped_limits(self):
"""
Returns a dictionary with the unique values of ``self`` as keys,
and a list
of tuples (starting index, ending index) for the corresponding
values.

--------
Cluster.grouped_slices
"""
output = dict([(k, []) for k in np.unique1d(self.uniques)])
for (k, v) in zip(self.uniques, self.slices):
output[k].append((v.start, v.stop))
for k in output:
output[k] = np.array(output[k])
return output

```