[Numpy-discussion] bug in numarray?

Humufr humufr at yahoo.fr
Mon Aug 29 12:17:10 CDT 2005


      Hi,

I think there are a problem with numarray (not sure).

I'm trying to correlate two differents file to find the same object in 
both. To do this I wrote some ugly software and I'm using the 
readcol2.py to read the file in a numarray, numarray string or list format.

The cross_name.py is doing the cross correlation when I'm using the 
numarray string format. I'm using three parameters at differents columns 
and I compare all of them with something like:

numarray.all(a[i,:] == b[j,:])

I saw that my script is very very slow or to be more precise became to 
be slow. It's seems ok at the beginning but little by little is slow 
down by a huge amount. I let it turn all the week end and it found ~40 
000 objects (both files are ~200000 lines...) in common in two days.
I change the software to use the list in python and in some minutes 
I'have ~20 000 objects found in common. So I think there are a big 
problem probably: 1) in my script, perhaps 2) in numarray or 3) in both.


I hope to have explain the problem clearly ...


N.

ps: I print an output for the script cross_name.py to visually see the 
slow down and that appeard to became slow around the 700 objects in 
common but it's gradully decline.
pps: I join the different file I used. The cross_name.py is the function 
with the problem.


-------------------------------------
#readcol2.py
-------------------------------------
def 
readcol(fname,comments='%',columns=None,delimiter=None,dep=0,arraytype='list'):
     """
     Load ASCII data from fname into an array and return the array.

     The data must be regular, same number of values in every row

     fname can be a filename or a file handle.


     Input:

     - Fname : the name of the file to read

     Optionnal input:

     - comments : a string to indicate the charactor to delimit the 
domments.

                  the default is the matlab character '%'.

     - columns : list or tuple ho contains the columns to use.

     - delimiter : a string to delimit the columns

     - dep : an integer to indicate from which line you want to begin

             to use the file (useful to avoid the descriptions lines)

     - arraytype : a string to indicate which kind of array you want ot

                   have: numeric array (numeric) or character array 
(numstring) or list (list). By default it's the

                   list mode used
		
		

     matfile data is not currently supported, but see
     Nigel Wade's matfile ftp://ion.le.ac.uk/matfile/matfile.tar.gz

     Example usage:

     x,y = transpose(readcol('test.dat'))  # data in two columns

     X = readcol('test.dat')    # a matrix of data

     x = readcol('test.dat')    # a single column of data

     x = readcol('test.dat,'#') # the character use like a comment 
delimiter is '#'

     initial function from pylab, improve by myself for my need

     """
     from numarray import array,transpose


     fh = file(fname)

     X = []
     numCols = None
     nline = 0
     if columns is None:
         for line in fh:
             nline += 1
             if dep is not None and nline <= dep: continue
             line = line[:line.find(comments)].strip()
             if not len(line): continue
             if arraytype=='numeric':
                 row = [float(val) for val in line.split(delimiter)]
             else:
                 row = [val.strip() for val in line.split(delimiter)]
             thisLen = len(row)
             if numCols is not None and thisLen != numCols:
                 raise ValueError('All rows must have the same number of 
columns')
             X.append(row)
     else:
         for line in fh:
             nline +=1
             if dep is not None and nline <= dep: continue
             line = line[:line.find(comments)].strip()
             if not len(line): continue
             row = line.split(delimiter)
             if arraytype=='numeric':
                 row = [float(row[i-1]) for i in columns]
             elif arraytype=='numstring':
                 row = [row[i-1].strip() for i in columns]
             else:
	    	row = [row[i-1].strip() for i in columns]
	    thisLen = len(row)
	
	    	
	
             if numCols is not None and thisLen != numCols:
                 raise ValueError('All rows must have the same number of 
columns')
             X.append(row)

     if arraytype=='numeric':
         X = array(X)
     	r,c = X.shape
     	if r==1 or c==1:
         	X.shape = max([r,c]),
     elif arraytype == 'numstring':
         import numarray.strings               # pb si numeric+pylab
         X = numarray.strings.array(X)
     	r,c = X.shape
     	if r==1 or c==1:
         	X.shape = max([r,c]),
     	
     return X


----------------------------------------------------------------
#cross_name.py

----------------------------------------------------------------

#/usr/bin/env python

'''
	Software to cross correlate two files. To use it you had to file a 
params file
	who contains the information of the file you want to correlate.
	The information must have the format:
	   namefile = list of column ; delimiter
	
	example:
	   file1 = 1,2,3 ;
	   file2 = 20,19,21 ; ,
	
	no delimiter = blanck
'''

# there are a big problem of efficiency. The software is far to long 
with big file like SDSS.
# I had to find where is the problem

import sys
import numarray
import string

#read the params file
params = {}
for line in file(sys.argv[1],'rU'):
     line = line.strip()         # delete the end of line (\n on unix)
     if not len(line): continue  # is line empty do nothing and pass to 
the next line
     if line.startswith('#'): continue # test if the line is a comments 
(# is the character to signal it)
     tup = line.split('=',1)     # split the line, the delimiter is the 
sign =
     columns = [int(i) for i in 
tup[1].strip().split(';')[0].strip().split(',')] # creat a list who 
contains
     										# the columns we want to use
     delimiter = tup[1].strip().split(';')[1].strip()	# check the 
delimiter of the data file (generally space or coma)
     if not len(delimiter): delimiter = None
     params[tup[0].strip()] = { 'columns' : columns, 'delimiter' : 
delimiter}

# Read the data files (only the columns ask in the params file)
debut_data = 1
data = []
for namefile in params.iterkeys():
     import readcol2  #import the function to read the files
 
#data.append(readcol2.readcol(namefile,columns=params[namefile]['columns'],comments='#',delimiter=params[namefile]['delimiter'],dep=1,arraytype='character'))
     params[namefile]['data'] = 
readcol2.readcol(namefile,columns=params[namefile]['columns'],comments='#',delimiter=params[namefile]['delimiter'],dep=debut_data,arraytype='character')


# Read another times the data files to have all the lines!
# Question: like it's a dictionnary are we sure that the file are in the 
same order... Check it!!!!!!!!!
if len(params.keys()) == 2:
     namefile,data,delimiter = [],[],[]
     for keys in params.iterkeys():
         namefile.append(keys)
         data.append(params[keys]['data'])
         delim = params[keys]['delimiter']
         if delim != None:
             delimiter.append(params[keys]['delimiter'])
         else:
             delimiter.append('   ')
     #res_a = []
     #res_b = []

     f1_ini = file(namefile[0]).readlines()[debut_data:]
     f2_ini = file(namefile[1]).readlines()[debut_data:]

     #f1_ini = [line for line in file(namefile[0])][debut_data:]
     #f2_ini = [line for line in file(namefile[1])][debut_data:]

     f1=open('cross'+namefile[0],'w')
     f2=open('cross'+namefile[1],'w')
     f3=open('pastecross'+namefile[0]+namefile[1],'w')

     b_i = 0
     for a_i in range(data[0].shape[0]):
	for b_i in range(b_i,data[1].shape[0]):
             if numarray.all(data[0][a_i,:] == data[1][b_i,:]):
                 f1.write(f1_ini[a_i])
                 f2.write(f2_ini[b_i])
                 f3.write(f1_ini[a_i].strip()+delimiter[0]+' 
'+string.replace(f2_ini[b_i],delimiter[1],delimiter[0]))
                 del f2_ini[b_i]
                 break
                 #res_a.append(a_i)
                 #res_b.append(b_i)
     f1.close()
     f2.close()
     f3.close()
else:
     print "too much file: only two allowed for the moment"



#save the results in 3 files: 2 with the common objects from each file.
# one with a paste of the lines of the 2 initial files.

-----------------------------------------------------------------------

#cross_name2.py

---------------------------------------------------------------------
#/usr/bin/env python

'''
	Software to cross correlate two files. To use it you had to file a 
params file
	who contains the information of the file you want to correlate.
	The information must have the format:
	   namefile = list of column ; delimiter
	
	example:
	   file1 = 1,2,3 ;
	   file2 = 20,19,21 ; ,
	
	no delimiter = blanck
'''

# there are a big problem of efficiency. The software is far to long 
with big file like SDSS.
# I had to find where is the problem

import sys
import numarray
import string

#read the params file
params = {}
for line in file(sys.argv[1],'rU'):
     line = line.strip()         # delete the end of line (\n on unix)
     if not len(line): continue  # is line empty do nothing and pass to 
the next line
     if line.startswith('#'): continue # test if the line is a comments 
(# is the character to signal it)
     tup = line.split('=',1)     # split the line, the delimiter is the 
sign =
     columns = [int(i) for i in 
tup[1].strip().split(';')[0].strip().split(',')] # creat a list who 
contains
     										# the columns we want to use
     delimiter = tup[1].strip().split(';')[1].strip()	# check the 
delimiter of the data file (generally space or coma)
     if not len(delimiter): delimiter = None
     params[tup[0].strip()] = { 'columns' : columns, 'delimiter' : 
delimiter}

# Read the data files (only the columns ask in the params file)
debut_data = 1
data = []
for namefile in params.iterkeys():
     import readcol2  #import the function to read the files
 
#data.append(readcol2.readcol(namefile,columns=params[namefile]['columns'],comments='#',delimiter=params[namefile]['delimiter'],dep=1,arraytype='character'))
     params[namefile]['data'] = 
readcol2.readcol(namefile,columns=params[namefile]['columns'],comments='#',delimiter=params[namefile]['delimiter'],dep=debut_data,arraytype='list')


# Read another times the data files to have all the lines!
# Question: like it's a dictionnary are we sure that the file are in the 
same order... Check it!!!!!!!!!
if len(params.keys()) == 2:
     namefile,data,delimiter = [],[],[]
     for keys in params.iterkeys():
         namefile.append(keys)
         data.append(params[keys]['data'])
         delim = params[keys]['delimiter']
         if delim != None:
             delimiter.append(params[keys]['delimiter'])
         else:
             delimiter.append('   ')
     #res_a = []
     #res_b = []

     f1_ini = file(namefile[0]).readlines()[debut_data:]
     f2_ini = file(namefile[1]).readlines()[debut_data:]

     #f1_ini = [line for line in file(namefile[0])][debut_data:]
     #f2_ini = [line for line in file(namefile[1])][debut_data:]

     f1=open('cross'+namefile[0],'w')
     f2=open('cross'+namefile[1],'w')
     f3=open('pastecross'+namefile[0]+namefile[1],'w')

#     i=0
#     for a_i in range(len(data[0])):
#     	#print data[0][a_i,:]
#     	for b_i in range(len(data[1])):
# 		if data[0][a_i] == data[1][b_i]:
# 			print data[0][a_i],data[1][b_i]
# 			i+=1
# 			print i
# 			break
     b_i=0
     for a_i in range(len(data[0])):
	for b_i in range(b_i,len(data[1])):
             if data[0][a_i] == data[1][b_i]:
                 f1.write(f1_ini[a_i])
                 f2.write(f2_ini[b_i])
                 f3.write(f1_ini[a_i].strip()+delimiter[0]+' 
'+string.replace(f2_ini[b_i],delimiter[1],delimiter[0]))
                 del f2_ini[b_i]
                 break
                 #res_a.append(a_i)
                 #res_b.append(b_i)
     f1.close()
     f2.close()
     f3.close()
else:
     print "too much file: only two allowed for the moment"



#save the results in 3 files: 2 with the common objects from each file.
# one with a paste of the lines of the 2 initial files.




More information about the Numpy-discussion mailing list