[SciPy-User] Hcluster Negative Distance Error?
disappearedng
disappearedng@gmail....
Tue Apr 6 15:04:28 CDT 2010
Dear Everyone,
I have an input file whose values are all floating point numbers to 4 decimal places.
i.e. 13359 0.0000 0.0000 0.0001 0.0001 0.0002 0.0003
0.0007 ... (the first is the id).
My class uses the loadVectorsFromFile method which multiplies it by 10000
and then int() these numbers. On top of that, I also loop through each
vector to ensure that there are no
negative values inside. However, when I perform _hclustering, I am
continually seeing the error, "Linkage `Z` contains negative values".
I seriously think this is a bug because: 1) I checked my values, 2) the
values are nowhere near small enough or big enough to approach the limits of
floating point numbers and 3) the
formula that I used to derive the values in the file uses absolute value (my
input is DEFINITELY right).
Can someone enlighten me as to why I am seeing this weird error? What is
going on that is causing this negative distance error?
=====
def loadVectorsFromFile(self, limit, loc, assertAllPositive=True,
                        inflate=True):
    """Load tab-separated score vectors from the file at `loc`.

    Each non-empty line is "<id>\\t<score>\\t<score>...". The first field
    is used as the dictionary key and the remaining fields are parsed as
    floats.

    limit: maximum number of scores kept per vector; a falsy value
        (e.g. 0) keeps all of them.
    loc: path of the file to read.
    assertAllPositive: when true, report every vector that contains a
        negative score through self.werror (the vector is still returned).
    inflate: when true, scale each score by 10000 (the input has 4
        decimal places) and store it as an int.

    Returns { id: [score, ...] }.

    NOTE: multiplying every component by the same constant does not
    change a scale-invariant distance such as the cosine distance, so
    inflating cannot by itself prevent negative distances there.
    """
    vectors = {}
    negativeIDs = []
    self.winfo("Each vector is set to have %d limit in length" % limit)
    with open(loc) as inf:
        for line in inf:
            line = line.rstrip('\n')
            if not line:
                continue
            fields = line.split('\t')
            rawScores = fields[1:limit + 1] if limit else fields[1:]
            scores = [float(x) for x in rawScores]
            # Check the sign on the parsed floats: converting to int first
            # could turn a small negative score into 0 and hide it.
            if any(x < 0 for x in scores) and fields[0] not in negativeIDs:
                negativeIDs.append(fields[0])
            if inflate:
                # round() before int(): 0.0003 * 10000 evaluates to
                # 2.9999999999999996, which a bare int() truncates to 2.
                scores = [int(round(x * 10000)) for x in scores]
            vectors[fields[0]] = scores
    if assertAllPositive:
        for dirID in negativeIDs:
            self.werror("Vector %s has negative values!" % dirID)
    return vectors
def main(self, inputDir, outputDir, limit=0,
         inFname="data.vectors.all",
         mappingFname='all.id.features.group.intermediate'):
    """Cluster the vectors of one input file at several thresholds.

    The id -> (feature, group) mapping is read from `mappingFname` under
    self.configDir and the vectors ({ featureID: tfidfVector (list) })
    from `inFname` under `inputDir`. `outputDir` is created when missing.

    For every threshold 20/30, 21/30, ..., 29/30 the vectors are
    clustered with self._hclustering. When that yields clusters, they are
    written to "threshold.<threshold>.result" in `outputDir`: the cluster
    number on one line, then one "feature<TAB>group" line per member,
    then an empty line. Thresholds with no clusters are skipped.
    """
    mappingLoc = pjoin(self.configDir, mappingFname)
    IDFeatureDic = loadIdFeatureGroupDicFromIntermediate(mappingLoc)

    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    vectors = self.loadVectorsFromFile(limit, pjoin(inputDir, inFname))

    for step in range(20, 30):
        threshold = float(step) / 30
        clusters = self._hclustering(threshold, vectors)
        if not clusters:
            # Clustering failed or produced nothing for this threshold.
            continue

        resultName = "threshold.%s.result" % str(threshold)
        with open(pjoin(outputDir, resultName), 'w') as outf:
            for clusterNo, members in clusters.iteritems():
                outf.write('%s\n' % str(clusterNo))
                for featureID in members:
                    feature, group = IDFeatureDic[featureID]
                    row = "%s\t%s\n" % (feature, group)
                    outf.write(row.encode('utf-8'))
                # Empty line separates consecutive clusters.
                outf.write("\n")
def _hclustering(self, threshold, vectors):
"""function which you should call to vary the threshold
vectors: { featureID: [ tfidf scores, tfidf score, .. ]
"""
clusters = defaultdict(list)
if len(vectors) > 1:
try:
results = hierarchy.fclusterdata( vectors.values(),
threshold, metric='cosine')
except ValueError, e:
self.werror("_hclustering: %s" % str(e))
return False
for i, featureID in enumerate( vectors.keys()):
clusters[results[i]].append( featureID)
return clusters
else:
return False
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://mail.scipy.org/pipermail/scipy-user/attachments/20100407/625a2b2e/attachment.html
More information about the SciPy-User
mailing list