[Numpy-discussion] Faster

Keith Goodman kwgoodman@gmail....
Fri May 2 19:51:15 CDT 2008


On Fri, May 2, 2008 at 5:47 PM, Keith Goodman <kwgoodman@gmail.com> wrote:
>
> On Fri, May 2, 2008 at 5:38 PM, Charles R Harris
>  <charlesr.harris@gmail.com> wrote:
>  > On Fri, May 2, 2008 at 6:24 PM, Keith Goodman <kwgoodman@gmail.com> wrote:
>  > > How can I make this function faster? It removes the i-th row and
>  > > column from an array.
>  > >
>  >
>  > Why do you want to do that?
>
>  Single linkage clustering; x is the distance matrix.

Here's the full code if you are interested. I haven't used it yet
other than running the test and test2 so it may be full of bugs.

import time

import numpy as np

class Cluster:
    "Single linkage hierarchical clustering"

    def __init__(self, dist, label=None, debug=False):
        """
        dist   Distance matrix, NxN numpy array
        label  Labels of each row of the distance matrix, list of N items,
               default is range(N)
        """
        assert dist.shape[0] == dist.shape[1], 'dist must be square (nxn)'
        assert (np.abs(dist - dist.T) < 1e-8).all(), 'dist must be symmetric'
        if label is None:
            label = range(dist.shape[0])
        assert dist.shape[0] == len(label), 'dist and label must match
in size'
        self.c = [[[z] for z in label]]
        self.label = label
        self.dist = dist
        self.debug = debug

    def run(self):
        for level in xrange(len(self.label) - 1):
            i, j = self.min_dist()
            self.join(i, j)

    def join(self, i, j):

        assert i != j, 'Cannot combine a cluster with itself'

        # Join labels
        new = list(self.c[-1])
        new[i] = new[i] + new[j]
        new.pop(j)
        self.c.append(new)

        # Join distance matrix
        self.dist[:,i] = self.dist[:,[i,j]].min(1)
        self.dist[i,:] = self.dist[:,i]
        idx = range(self.dist.shape[1])
        idx.remove(j)
        self.dist = self.dist[:,idx]
        self.dist = self.dist[idx,:]

        # Debug output
        if self.debug:
            print
            print len(self.c) - 1
            print 'Clusters'
            print self.c[-1]
            print 'Distance'
            print self.dist

    def min_dist(self):
        dist = self.dist + 1e10 * np.eye(self.dist.shape[0])
        i, j = np.where(dist == dist.min())
        return i[0], j[0]


def test():
    # Example from
    # home.dei.polimi.it/matteucc/Clustering/tutorial_html/hierarchical.html
    label =          ['BA', 'FI', 'MI', 'NA', 'RM', 'TO']
    dist = np.array([[0,    662,  877,  255,  412,  996],
                     [662,  0,    295,  468,  268,  400],
                     [877,  295,  0,    754,  564,  138],
                     [255,  468,  754,  0,    219,  869],
                     [412,  268,  564,  219,  0,    669],
                     [996,  400,  138,  869,  669,  0  ]])
    clust = Cluster(dist, label, debug=True)
    clust.run()

def test2(n):
    x = np.random.rand(n,n)
    x = x + x.T
    c = Cluster(x)
    t1 = time.time()
    c.run()
    t2 = time.time()
    print 'n = %d took %0.2f seconds' % (n, t2-t1)


More information about the Numpy-discussion mailing list