[Numpy-discussion] convert csv file into recarray without pre-specifying dtypes and variable names

Timothy Hochberg tim.hochberg@ieee....
Sun Jul 8 22:25:11 CDT 2007


On 7/8/07, Vincent Nijs <v-nijs@kellogg.northwestern.edu> wrote:
>
> Thanks for looking into this Torgil! I agree that this is a much more
> complicated setup. I'll check if there is anything I can do on the data
> end.
> Otherwise I'll go with Timothy's suggestion and read in numbers as floats
> and convert to int later as needed.


Here is a strategy that should allow auto detection without too much in the
way of inefficiency. The basic idea is to convert till you run into a
problem, store that data away, and continue the conversion with a new dtype.
At the end you assemble all the chunks of data you've accumulated into one
large array. It should be reasonably efficient in terms of both memory and
speed.

The implementation is a little rough, but it should get the idea across.

-- 
.  __
.   |-\
.
.  tim.hochberg@ieee.org

========================================================================

def find_formats(items, last):
    """Pick a (dtype, converter) pair for every column of one row.

    Parameters
    ----------
    items : sequence of str
        The raw string fields of a single row.
    last : sequence of (dtype, converter) pairs, or None
        The formats returned by a previous call, in the same column order,
        or None when there is no history yet.

    Returns
    -------
    list of (dtype, converter) tuples, one per entry in ``items``.

    Notes
    -----
    ``string_to_dt_cvt`` is defined outside this block; it is assumed to
    return a ``(dtype, converter)`` pair for a single string field.
    """
    formats = []
    for i, x in enumerate(items):
        dt, cvt = string_to_dt_cvt(x)
        if last is not None:
            # Entries are stored as (dtype, converter), so unpack them in
            # that same order.
            last_dt, last_cvt = last[i]
            if last_cvt is float and cvt is int:
                # A column that was already float must stay float even when
                # the current sample looks like an int. Carry the dtype over
                # together with the converter so the two stay in agreement.
                dt, cvt = last_dt, last_cvt
        formats.append((dt, cvt))
    return formats

class LoadInfo(object):
    """Mutable progress record shared by the loader and its row iterator."""

    def __init__(self, row0):
        # Raw text of the row that the next chunk starts from.
        self.row0 = row0
        # Format history handed to find_formats(); starts out empty.
        self.lastcols = None
        # Flipped to True by the row iterator once every row was consumed.
        self.done = False

def data_iterator(lines, converters, delim, info):
    """Yield converted rows until one fails, recording where to resume.

    The row held in ``info.row0`` is converted and yielded first, followed by
    each row taken from ``lines``. Every row is split on ``delim`` and the
    fields are passed pairwise through ``converters``.

    Parameters
    ----------
    lines : iterator of str
        Remaining rows; consumed in place so a later call continues after
        the last row read here.
    converters : sequence of callables
        One conversion function per column.
    delim : str
        Field separator.
    info : object with ``row0`` and ``done`` attributes
        On a failed conversion ``info.row0`` is set to the offending row so
        the caller can restart from it; when ``lines`` is exhausted without
        error ``info.done`` is set to True.

    Yields
    ------
    tuple
        The converted fields of one row.
    """
    # The starting row is converted outside the try block, so a failure here
    # propagates to the caller instead of being recorded as a restart point.
    yield tuple(f(x) for f, x in zip(converters, info.row0.split(delim)))
    try:
        for row in lines:
            yield tuple(f(x) for f, x in zip(converters, row.split(delim)))
    except Exception:
        # Only ordinary errors count as "this row needs a different format".
        # GeneratorExit (the consumer closing the generator) and
        # KeyboardInterrupt must not be swallowed or overwrite info.row0.
        info.row0 = row
    else:
        info.done = True

def load2(fname, delim=',', has_varnm=True, prn_report=True):
    """
    Load a delimited text file into one structured array.

    Rows are split on ``delim`` directly (the csv module is not used). The
    column formats are detected from the first data row. Rows are converted
    until one fails; the rows converted so far are kept as a chunk, the
    formats are re-detected from the failing row, and conversion resumes
    from there. All chunks are finally copied into a single array that uses
    the dtype of the last chunk.

    Parameters
    ----------
    fname : str
        Path of the file to read.
    delim : str
        Field separator.
    has_varnm : bool
        When True the first line holds the column names; otherwise the
        columns are named ``col1``, ``col2``, ...
    prn_report : bool
        When True a short summary of the loaded data is printed.

    Returns
    -------
    The assembled array, built with ``N.fromiter`` / ``N.zeros``.

    Raises
    ------
    StopIteration
        If the file has no data row (or no header row when one is expected).
    """
    chunks = []

    # NOTE(review): binary mode is kept from the original listing, which was
    # written for Python 2 file objects - confirm before running on Python 3.
    with open(fname, 'rb') as f:
        if has_varnm:
            varnames = [i.strip() for i in next(f).split(delim)]
        else:
            varnames = None

        info = LoadInfo(next(f))

        while not info.done:
            row0 = info.row0.split(delim)
            formats = find_formats(row0, info.lastcols)
            if varnames is None:
                varnames = ['col%s' % (i + 1) for i in range(len(formats))]

            descr = []
            conversion_functions = []
            for name, (dtype, cvt_fn) in zip(varnames, formats):
                descr.append((name, dtype))
                conversion_functions.append(cvt_fn)

            # data_iterator stops at the first row it cannot convert and
            # leaves that row in info.row0 for the next pass of this loop.
            chunks.append(N.fromiter(
                data_iterator(f, conversion_functions, delim, info), descr))

            # Remember the formats just used so the next detection can take
            # them into account.
            info.lastcols = formats

    if len(chunks) > 1:
        n = sum(len(x) for x in chunks)
        data = N.zeros([n], chunks[-1].dtype)
        offset = 0
        for x in chunks:
            delta = len(x)
            data[offset:offset + delta] = x
            offset += delta
    else:
        [data] = chunks

    # load report
    if prn_report:
        # Single-argument print(...) produces the same output under the
        # Python 2 print statement and the Python 3 print function.
        print("##########################################\n")
        print("Loaded file: %s\n" % fname)
        print("Nr obs: %s\n" % data.shape[0])
        print("Variables and datatypes:\n")
        for i in data.dtype.descr:
            print("Varname: %s, Type: %s, Sample: %s" % (
                i[0], i[1], str(data[i[0]][0:3])))
        # Closing banner is printed once, after all variables are listed.
        print("\n##########################################\n")

    return data
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://projects.scipy.org/pipermail/numpy-discussion/attachments/20070708/53dd8bc4/attachment-0001.html 


More information about the Numpy-discussion mailing list