[Numpy-discussion] convert csv file into recarray without pre-specifying dtypes and variable names
Timothy Hochberg
tim.hochberg@ieee....
Sun Jul 8 22:25:11 CDT 2007
On 7/8/07, Vincent Nijs <v-nijs@kellogg.northwestern.edu> wrote:
>
> Thanks for looking into this Torgil! I agree that this is a much more
> complicated setup. I'll check if there is anything I can do on the data
> end.
> Otherwise I'll go with Timothy's suggestion and read in numbers as floats
> and convert to int later as needed.
Here is a strategy that should allow auto-detection without much
inefficiency. The basic idea is to convert until you run into a problem,
store the data converted so far, and continue the conversion with a new dtype.
At the end you assemble all the chunks of data you've accumulated into one
large array. It should be reasonably efficient in terms of both memory and
speed.
The implementation is a little rough, but it should get the idea across.
--
. __
. |-\
.
. tim.hochberg@ieee.org
========================================================================
def find_formats(items, last):
    """Choose a (dtype, converter) pair for every field of one split row.

    Parameters
    ----------
    items : sequence of str
        The raw field strings of a single row.
    last : sequence of (dtype, converter) pairs, or None
        Formats chosen for the previous chunk, in the same layout this
        function returns.  None when there is no previous chunk.

    Returns
    -------
    list of (dtype, converter) tuples, one per entry of `items`.
    """
    formats = []
    for i, x in enumerate(items):
        # string_to_dt_cvt is defined outside this block; it is unpacked
        # here as a (dtype, converter) pair for the field.
        dt, cvt = string_to_dt_cvt(x)
        if last is not None:
            # Entries are stored below as (dtype, converter), so they must
            # be unpacked in that same order.
            last_dt, last_cvt = last[i]
            if last_cvt is float and cvt is int:
                # The column was already float; a value that merely looks
                # like an int must not narrow it again.  Carry the previous
                # dtype along so dtype and converter stay in agreement.
                dt, cvt = last_dt, float
        formats.append((dt, cvt))
    return formats
class LoadInfo(object):
    """Mutable bookkeeping shared by load2 and data_iterator."""

    def __init__(self, row0):
        # Raw line the next chunk starts from; data_iterator replaces it
        # with the line whose conversion failed.
        self.row0 = row0
        # Passed to find_formats as the previous chunk's formats.
        self.lastcols = None
        # Set to True by data_iterator once every line has been consumed.
        self.done = False
def data_iterator(lines, converters, delim, info):
    """Yield converted rows until the input ends or a field fails to convert.

    Parameters
    ----------
    lines : iterator of str
        Remaining raw lines; consumed one at a time.
    converters : sequence of callables
        One converter per column, applied to the split fields in order.
    delim : str
        Field separator passed to ``str.split``.
    info : object with ``row0`` and ``done`` attributes
        ``info.row0`` is the first raw line to convert.  On a conversion
        failure it is overwritten with the offending line; when `lines` is
        exhausted ``info.done`` is set to True.

    Yields
    ------
    tuple
        The converted fields of one row.

    Notes
    -----
    Only ``ValueError`` (what ``int()`` and ``float()`` raise for an
    unparseable string) is treated as "this row needs a new format".  Any
    other exception propagates.  An error converting ``info.row0`` itself is
    not guarded and also propagates.
    """
    def convert(row):
        return tuple(f(x) for f, x in zip(converters, row.split(delim)))

    yield convert(info.row0)
    for row in lines:
        # Guard only the conversion: the yield stays outside the try so that
        # closing the generator or an error from `lines` is never mistaken
        # for a bad row.
        try:
            converted = convert(row)
        except ValueError:
            info.row0 = row
            return
        yield converted
    info.done = True
def load2(fname, delim=',', has_varnm=True, prn_report=True):
    """
    Load delimited text from a file into a single structured array.

    Lines are split on `delim` with ``str.split`` (the csv module is not
    used).  Column formats are derived from the first data row; whenever a
    later row fails to convert, the rows converted so far are kept as one
    chunk and conversion restarts from the failing row with freshly derived
    formats.  All chunks are finally copied into one array that uses the
    dtype of the last chunk.

    Parameters
    ----------
    fname : str
        Path of the file to read.
    delim : str
        Field separator.
    has_varnm : bool
        If True the first line holds the column names; otherwise the names
        'col1', 'col2', ... are generated.
    prn_report : bool
        If True print a short summary of the loaded data.

    Returns
    -------
    The array built with ``N.fromiter`` / ``N.zeros`` from the file contents.
    """
    f = open(fname, 'rb')
    try:
        if has_varnm:
            varnames = [i.strip() for i in f.next().split(delim)]
        else:
            varnames = None
        info = LoadInfo(f.next())
        chunks = []
        while not info.done:
            row0 = info.row0.split(delim)
            # NOTE(review): nothing in this function assigns info.lastcols,
            # so find_formats always receives None here -- confirm whether
            # the previous chunk's formats were meant to be recorded.
            formats = find_formats(row0, info.lastcols)
            if varnames is None:
                varnames = ['col%s' % (i + 1) for i in range(len(formats))]
            named_formats = list(zip(varnames, formats))
            descr = [(name, dtype) for name, (dtype, _) in named_formats]
            conversion_functions = [cvt_fn for _, (_, cvt_fn) in named_formats]
            # data_iterator keeps reading from f and records in `info`
            # whether it finished or which row to restart from.
            chunks.append(N.fromiter(
                data_iterator(f, conversion_functions, delim, info), descr))
    finally:
        # Release the handle even when reading or conversion raises.
        f.close()

    if len(chunks) > 1:
        n = sum(len(x) for x in chunks)
        # The last chunk's dtype is used for the assembled result.
        data = N.zeros([n], chunks[-1].dtype)
        offset = 0
        for x in chunks:
            delta = len(x)
            data[offset:offset + delta] = x
            offset += delta
    else:
        [data] = chunks

    # load report
    if prn_report:
        print("##########################################\n")
        print("Loaded file: %s\n" % fname)
        print("Nr obs: %s\n" % data.shape[0])
        print("Variables and datatypes:\n")
        for i in data.dtype.descr:
            print("Varname: %s, Type: %s, Sample: %s" % (
                i[0], i[1], str(data[i[0]][0:3])))
        print("\n##########################################\n")
    return data
