[SciPy-User] Hcluster Negative Distance Error?

disappearedng disappearedng@gmail....
Tue Apr 6 15:04:28 CDT 2010

Dear Everyone,

I have an input file whose values are all floating-point numbers given to 4
decimal places, e.g. 13359    0.0000    0.0000    0.0001    0.0001    0.0002    0.0003
0.0007    ... (the first field is the ID).
My class uses the loadVectorsFromFile method, which multiplies each value by
10000 and then converts it to an integer with int(). On top of that, I also
loop through each vector to ensure that it contains no
negative values. However, when I call _hclustering, I
continually see the error, "Linkage `Z` contains negative values".

I seriously think this is a bug because: 1) I checked my values, 2) the
values are nowhere near small enough or big enough to approach the limits of
floating-point numbers, and 3) the
formula that I used to derive the values in the file uses absolute values (my
input is DEFINITELY right).

Can someone enlighten me as to why I am seeing this weird error? What is
going on that is causing this negative distance error?


    def loadVectorsFromFile(self, limit, loc, assertAllPositive=True,
        """Inflate to prevent "negative" distance, we use 4 decimal points,
so *10000
        vectors = {}
        self.winfo("Each vector is set to have %d limit in length" % limit)
        with open( loc ) as inf:
            for line in filter(None, inf.read().split('\n')):
                l = line.split('\t')
                if limit:
                    scores = map(float, l[1:limit+1])
                    scores = map(float, l[1:])

                if inflate:
                    vectors[ l[0]] = map( lambda x: int(x*10000),
scores)     #int might save space
                    vectors[ l[0]] = scores

        if assertAllPositive:
            #Assert that it has no negative value
            for dirID, l in vectors.iteritems():
                if reduce(operator.or_, map( lambda x: x < 0, l)):
                    self.werror( "Vector %s has negative values!" % dirID)
        return vectors

    def main( self, inputDir, outputDir, limit=0,
        Loads vector from a file and start clustering
            vectors is { featureID: tfidfVector (list), }
        IDFeatureDic = loadIdFeatureGroupDicFromIntermediate(
pjoin(self.configDir, mappingFname))
        if not os.path.exists(outputDir):

        vectors = self.loadVectorsFromFile( limit, pjoin( inputDir,
        for threshold in map( lambda x:float(x)/30, range(20,30)):
            clusters = self._hclustering(threshold, vectors)
            if clusters:
                outputLoc = pjoin(outputDir, "threshold.%s.result" %
                with open(outputLoc, 'w') as outf:
                    for clusterNo, cluster in clusters.iteritems():
                        outf.write('%s\n' % str(clusterNo))
                        for featureID in cluster:
                            feature, group = IDFeatureDic[featureID]
                            outline = "%s\t%s\n" % (feature, group)

    def _hclustering(self, threshold, vectors):
        """function which you should call to vary the threshold
        vectors:    { featureID:    [ tfidf scores, tfidf score, .. ]
        clusters = defaultdict(list)
        if len(vectors) > 1:
                results = hierarchy.fclusterdata( vectors.values(),
threshold, metric='cosine')
            except ValueError, e:
                self.werror("_hclustering: %s" % str(e))
                return False

            for i, featureID in enumerate( vectors.keys()):
                clusters[results[i]].append( featureID)
            return clusters
            return False
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://mail.scipy.org/pipermail/scipy-user/attachments/20100407/625a2b2e/attachment.html 

More information about the SciPy-User mailing list