[Scipy-svn] r4094 - trunk/scipy/io/arff

scipy-svn@scip... scipy-svn@scip...
Mon Apr 7 14:10:16 CDT 2008


Author: cdavid
Date: 2008-04-07 14:10:10 -0500 (Mon, 07 Apr 2008)
New Revision: 4094

Added:
   trunk/scipy/io/arff/arffread.py
Log:
Add arff reader implementation.

Added: trunk/scipy/io/arff/arffread.py
===================================================================
--- trunk/scipy/io/arff/arffread.py	2008-04-07 19:09:50 UTC (rev 4093)
+++ trunk/scipy/io/arff/arffread.py	2008-04-07 19:10:10 UTC (rev 4094)
@@ -0,0 +1,519 @@
+#! /usr/bin/env python
+# Last Change: Mon Aug 20 08:00 PM 2007 J
+import re
+import itertools
+import sys
+
+import numpy as N
+
+from scipy.io.arff.utils import partial
+
+"""A module to read arff files."""
+
+# An Arff file is basically two parts: 
+#   - header
+#   - data
+#
+# A header has each of its components starting by @META where META is one of
+# the keyword (attribute of relation, for now).
+
+# TODO:
+#   - both integer and reals are treated as numeric -> the integer info is lost !
+#   - Replace ValueError by ParseError or something
+
+# We now can handle the following:
+#   - numeric and nominal attributes
+#   - missing values for numeric attributes
+
+# Match any meta line: optional leading whitespace, then '@'.
+# NOTE(review): non-raw pattern string -- '\s' only works because Python
+# passes unknown escapes through; r'^\s*@' would be safer.
+r_meta = re.compile('^\s*@')
+# Match a comment
+r_comment = re.compile(r'^%')
+# Match an empty line
+r_empty = re.compile(r'^\s+$')
+# Match a header line, that is a line which starts by @ + a word
+r_headerline = re.compile(r'^@\S*')
+# Case-insensitive matchers for the @data, @relation and @attribute keywords.
+r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
+r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
+r_attribute = re.compile(r'^@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')
+
+# To get attribute names enclosed with ''
+r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
+# To get attribute names enclosed with '', possibly spread across multiple
+# lines.  NOTE(review): '[..\n]' is a character class matching only '.' and
+# newline -- presumably a dot-matches-newline "(.+)" was intended; confirm
+# before relying on multi-line attributes.
+r_mcomattrval = re.compile(r"'([..\n]+)'\s+(..+$)")
+# To get normal (unquoted) attributes
+r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")
+
+#-------------------------
+# Module defined exception
+#-------------------------
+class ArffError(IOError):
+    """Base error for problems while reading an ARFF file."""
+    pass
+
+class ParseArffError(ArffError):
+    """Raised when the ARFF header or data cannot be parsed."""
+    pass
+
+#------------------
+# Various utilities
+#------------------
+
+# An attribute  is defined as @attribute name value
+def parse_type(attrtype):
+    """Given an arff attribute value (meta data), returns its type.
+    
+    Expect the value to be a name."""
+    uattribute = attrtype.lower().strip()
+    if uattribute[0] == '{':
+        return 'nominal'
+    elif uattribute[:len('real')] == 'real':
+        return 'numeric'
+    elif uattribute[:len('integer')] == 'integer':
+        return 'numeric'
+    elif uattribute[:len('numeric')] == 'numeric':
+        return 'numeric'
+    elif uattribute[:len('string')] == 'string':
+        return 'string'
+    elif uattribute[:len('relational')] == 'relational':
+        return 'relational'
+    else:
+        raise ValueError("unknown attribute %s" % uattribute)
+
+
+def get_nominal(attribute):
+    """If attribute is nominal, returns a list of the values.
+
+    NOTE(review): unlike get_nom_val, this neither strips the braces nor the
+    whitespace around each value; this helper is currently unused in this
+    module -- confirm which behaviour callers expect."""
+    return attribute.split(',')
+        
+def read_data_list(ofile):
+    """Read each line of the iterable and put it in a list.
+
+    ofile must already be positioned on the first data row.  Raises
+    ValueError for sparse ARFF data (rows starting with '{'), which is not
+    supported."""
+    data = [ofile.next()]
+    if data[0].strip()[0] == '{':
+        raise ValueError("This looks like a sparse ARFF: not supported yet")
+    data.extend([i for i in ofile])
+    return data
+
+def get_ndata(ofile):
+    """Read the whole iterable to count the number of data rows.
+
+    Consumes ofile entirely.  Raises ValueError for sparse ARFF data."""
+    data = [ofile.next()]
+    loc = 1
+    if data[0].strip()[0] == '{':
+        raise ValueError("This looks like a sparse ARFF: not supported yet")
+    for i in ofile:
+        loc += 1
+    return loc
+
+def maxnomlen(atrv):
+    """Given a string containing a nominal type definition, returns the
+    string length of the longest component.
+
+    A nominal type is defined as something framed between braces ({}).
+
+    Example: maxnomlen("{floup, bouga, fl, ratata}") returns 6 (the size of
+    ratata, the longest nominal value)."""
+    nomtp = get_nom_val(atrv)
+    return max(len(i) for i in nomtp)
+
+def get_nom_val(atrv):
+    """Given a string containing a nominal type, returns a tuple of the
+    possible values (each stripped of surrounding whitespace).
+
+    A nominal type is defined as something framed between braces ({}).
+
+    Example: get_nom_val("{floup, bouga, fl, ratata}") returns ("floup",
+    "bouga", "fl", "ratata").  Raises ValueError if atrv does not start
+    with a braced list."""
+    # NOTE(review): recompiled on every call; could be hoisted to module
+    # level alongside the other patterns.
+    r_nominal = re.compile('{(..+)}')
+    m = r_nominal.match(atrv)
+    if m:
+        return tuple(i.strip() for i in m.group(1).split(','))
+    else:
+        raise ValueError("This does not look like a nominal string")
+
+def go_data(ofile):
+    """Skip the header of the iterable ofile.
+
+    The first next() call of the returned iterator will be the @data line."""
+    return itertools.dropwhile(lambda x : not r_datameta.match(x), ofile)
+
+#----------------
+# Parsing header
+#----------------
+def tokenize_attribute(iterable, attribute):
+    """Parse a raw string in header (eg starts by @attribute).
+
+    Given a raw string attribute, try to get the name and type of the
+    attribute. Constraints:
+        - The first line must start with @attribute (case insensitive, and
+          space like characters before @attribute are allowed)
+        - Works also if the attribute is spread on multilines.
+        - Works if empty lines or comments are in between
+
+    :Parameters:
+        iterable
+            iterator over the remaining header lines; one extra line is
+            consumed and returned as `next`.
+        attribute : str
+            the attribute string.
+
+    :Returns:
+        name : str
+            name of the attribute
+        value : str
+            value (type) of the attribute
+        next : str
+            next line to be parsed
+
+    Example:
+        - if attribute is a string defined in python as r"floupi real", will
+          return floupi as name, and real as value.
+        - if attribute is r"'floupi 2' real", will return 'floupi 2' as name,
+          and real as value."""
+    sattr = attribute.strip()
+    mattr = r_attribute.match(sattr)
+    if mattr:
+        # atrv is everything after @attribute
+        atrv = mattr.group(1)
+        if r_comattrval.match(atrv):
+            # Quoted name: 'some name' type
+            name, type = tokenize_single_comma(atrv)
+            next = iterable.next()
+        elif r_wcomattrval.match(atrv):
+            # Plain (unquoted) name: name type
+            name, type = tokenize_single_wcomma(atrv)
+            next = iterable.next()
+        else:
+            # Not sure we should support this, as it does not seem supported by
+            # weka.
+            raise ValueError("multi line not supported yet")
+            #name, type, next = tokenize_multilines(iterable, atrv)
+    else:
+        raise ValueError("First line unparsable: %s" % sattr)
+
+    if type == 'relational':
+        raise ValueError("relational attributes not supported yet")
+    return name, type, next
+
+def tokenize_multilines(iterable, val):
+    """Can tokenize an attribute spread over several lines."""
+    # If one line does not match, read all the following lines up to next
+    # line with meta character, and try to parse everything up to there.
+    if not r_mcomattrval.match(val):
+        all = [val]
+        i = iterable.next()
+        while not r_meta.match(i):
+            all.append(i)
+            i = iterable.next()
+        if r_mend.search(i):
+            raise ValueError("relational attribute not supported yet")
+        print "".join(all[:-1])
+        m = r_comattrval.match("".join(all[:-1]))
+        return m.group(1), m.group(2), i
+    else:
+        raise ValueError("Cannot parse attribute names spread over multi "\
+                        "lines yet")
+    
+def tokenize_single_comma(val):
+    """Split a quoted attribute string "'name' type" into (name, type)."""
+    # XXX we match twice the same string (here and at the caller level). It is
+    # stupid, but it is easier for now...
+    m = r_comattrval.match(val)
+    if m:
+        try:
+            name = m.group(1).strip()
+            type = m.group(2).strip()
+        except IndexError:
+            # NOTE(review): groups 1 and 2 always exist in r_comattrval, so
+            # this branch looks unreachable.
+            raise ValueError("Error while tokenizing attribute")
+    else:
+        raise ValueError("Error while tokenizing single %s" % val)
+    return name, type
+
+def tokenize_single_wcomma(val):
+    """Split an unquoted attribute string "name type" into (name, type)."""
+    # XXX we match twice the same string (here and at the caller level). It is
+    # stupid, but it is easier for now...
+    m = r_wcomattrval.match(val)
+    if m:
+        try:
+            name = m.group(1).strip()
+            type = m.group(2).strip()
+        except IndexError:
+            # NOTE(review): groups 1 and 2 always exist in r_wcomattrval, so
+            # this branch looks unreachable.
+            raise ValueError("Error while tokenizing attribute")
+    else:
+        raise ValueError("Error while tokenizing single %s" % val)
+    return name, type
+
+def read_header(ofile):
+    """Read the header of the iterable ofile.
+
+    :Returns:
+        relation : str or None
+            value of the @relation line, if one was seen.
+        attributes : list
+            (name, value) string pairs, one per @attribute line, in order.
+
+    NOTE(review): a file without an @data line exhausts the iterator and
+    lets StopIteration escape -- confirm whether that should be an error."""
+    i = ofile.next()
+
+    # Pass first comments
+    while r_comment.match(i):
+        i = ofile.next()
+
+    # Header is everything up to DATA attribute ?
+    relation = None
+    attributes = []
+    while not r_datameta.match(i):
+        m = r_headerline.match(i)
+        if m:
+            isattr = r_attribute.match(i)
+            if isattr:
+                # tokenize_attribute consumes the following line itself and
+                # hands it back, so no ofile.next() on this branch.
+                name, type, i = tokenize_attribute(ofile, i)
+                attributes.append((name, type))
+            else:
+                isrel = r_relation.match(i)
+                if isrel:
+                    relation = isrel.group(1)
+                else:
+                    raise ValueError("Error parsing line %s" % i)
+                i = ofile.next()
+        else:
+            i = ofile.next()
+
+    return relation, attributes
+
+#--------------------
+# Parsing actual data
+#--------------------
+def safe_float(x):
+    """Given a string x, convert it to a float. If the stripped string is a ?,
+    return a NaN (missing value)."""
+    if x.strip() == '?':
+        return N.nan
+    else:
+        # N.float is (in this numpy vintage) an alias of the builtin float.
+        return N.float(x)
+
+def safe_nominal(value, pvalue):
+    """Return value stripped if it is one of the allowed nominal values in
+    pvalue; '?' (missing value) is passed through unchanged.
+
+    Raises ValueError for any other value."""
+    svalue = value.strip()
+    if svalue in pvalue:
+        return svalue
+    elif svalue == '?':
+        return svalue
+    else:
+        raise ValueError("%s value not in %s" % (str(svalue), str(pvalue)))
+
+def get_delim(line):
+    """Given a string representing a line of data, check whether the
+    delimiter is ',' or space.
+
+    Raises ValueError when the line contains neither delimiter."""
+    l = line.split(',')
+    if len(l) > 1:
+        return ','
+    else:
+        l = line.split(' ')
+        if len(l) > 1:
+            return ' '
+        else:
+            raise ValueError("delimiter not understood: " + line)
+
+class MetaData:
+    """Small container to keep useful informations on a ARFF dataset.
+
+    Also maintains the list of attributes in order, i.e. doing for i in meta,
+    where meta is an instance of MetaData, will return the different attribute
+    names in the order they were defined.
+
+    Indexing (meta[name]) returns a (type, range) tuple where range is the
+    tuple of nominal values, or None for non-nominal attributes."""
+    def __init__(self, rel, attr):
+        # rel: relation (dataset) name.
+        # attr: list of (name, value) pairs as returned by read_header.
+        self.name = rel
+        # We need the dictionary to be ordered
+        # XXX: may be better to implement an ordered dictionary
+        self._attributes = {}
+        self._attrnames = []
+        for name, value in attr:
+            tp = parse_type(value)
+            self._attrnames.append(name)
+            if tp == 'nominal':
+                self._attributes[name] = (tp, get_nom_val(value))
+            else:
+                self._attributes[name] = (tp, None)
+
+    def __repr__(self):
+        # One line per attribute: its type, plus the value set for nominals.
+        msg = ""
+        msg += "Dataset: %s\n" % self.name
+        for i in self._attrnames:
+            msg += "\t%s's type is %s" % (i, self._attributes[i][0])
+            if self._attributes[i][1]:
+                msg += ", range is %s" % str(self._attributes[i][1])
+            msg += '\n'
+        return msg
+
+    def __iter__(self):
+        # Iterate over attribute names in definition order.
+        return iter(self._attrnames)
+
+    def __getitem__(self, key):
+        return self._attributes[key]
+
+def read_arff(filename):
+    """Read an arff file.
+
+    :Parameters:
+        filename : str
+            path of the file to read.
+
+    :Returns:
+        data : record array
+            one record per data row, one field per attribute.
+        meta : MetaData
+            relation name plus per-attribute type information.
+
+    Raises ParseArffError on unparsable header/delimiter, ValueError for
+    date or sparse data, NotImplementedError for string attributes.
+
+    NOTE(review): the file handle is never closed explicitly."""
+    ofile = open(filename)
+
+    # Parse the header file 
+    try:
+        rel, attr = read_header(ofile)
+    except ValueError, e:
+        msg = "Error while parsing header, error was: " + str(e)
+        raise ParseArffError(msg)
+
+    # Check whether we have a string attribute (not supported yet)
+    hasstr = False
+    for name, value in attr:
+        type = parse_type(value)
+        if type == 'string':
+            hasstr = True
+
+    meta = MetaData(rel, attr)
+
+    # XXX The following code is not great
+    # Build the type descriptor descr and the list of convertors to convert
+    # each attribute to the suitable type (which should match the one in
+    # descr).
+
+    # This can be used once we want to support integer as integer values and
+    # not as numeric anymore (using masked arrays ?).
+    acls2dtype = {'real' : N.float, 'integer' : N.float, 'numeric' : N.float}
+    acls2conv = {'real' : safe_float, 'integer' : safe_float, 'numeric' : safe_float}
+    descr = []
+    convertors = []
+    if not hasstr:
+        for name, value in attr:
+            type = parse_type(value)
+            if type == 'date':
+                raise ValueError("date type not supported yet, sorry")
+            elif type == 'nominal':
+                # Nominal values are stored as fixed-width strings sized to
+                # the longest possible value.
+                n = maxnomlen(value)
+                descr.append((name, 'S%d' % n))
+                pvalue = get_nom_val(value)
+                convertors.append(partial(safe_nominal, pvalue = pvalue))
+            else:
+                descr.append((name, acls2dtype[type]))
+                convertors.append(safe_float)
+                #dc.append(acls2conv[type])
+                #sdescr.append((name, acls2sdtype[type]))
+    else:
+        # How to support string efficiently ? Ideally, we should know the max
+        # size of the string before allocating the numpy array.
+        raise NotImplementedError("String attributes not supported yet, sorry")
+
+    ni = len(convertors)
+
+    # Get the delimiter from the first line of data:
+    def next_data_line(row_iter):
+        """Assumes we are already in the data part (eg after @data)."""
+        raw = row_iter.next()
+        while r_empty.match(raw):
+            raw = row_iter.next()
+        while r_comment.match(raw):
+            raw = row_iter.next()
+        return raw
+
+    # The finally block always rewinds the file and repositions just after
+    # the @data line, so the generator below starts at the first data row.
+    try:
+        dtline = next_data_line(ofile)
+        delim = get_delim(dtline)
+    except ValueError, e:
+        raise ParseArffError("Error while parsing delimiter: " + str(e))
+    finally:
+        ofile.seek(0, 0)
+        ofile = go_data(ofile)
+        # skip the @data line
+        ofile.next()
+
+    def generator(row_iter, delim = ','):
+        # TODO: this is where we are spending times (~80%). I think things
+        # could be made more efficiently: 
+        #   - We could for example "compile" the function, because some values
+        #   do not change here. 
+        #   - The function to convert a line to dtyped values could also be
+        #   generated on the fly from a string and be executed instead of
+        #   looping.
+        #   - The regex are overkill: for comments, checking that a line starts
+        #   by % should be enough and faster, and for empty lines, same thing
+        #   --> this does not seem to change anything.
+
+        # We do not abstract skipping comments and empty lines for performances
+        # reason.
+        raw = row_iter.next()
+        while r_empty.match(raw):
+            raw = row_iter.next()
+        while r_comment.match(raw):
+            raw = row_iter.next()
+
+        row = raw.split(delim)
+        yield tuple([convertors[i](row[i]) for i in range(ni)])
+        for raw in row_iter:
+            while r_comment.match(raw):
+                raw = row_iter.next()
+            while r_empty.match(raw):
+                raw = row_iter.next()
+            row = raw.split(delim)
+            yield tuple([convertors[i](row[i]) for i in range(ni)])
+
+    a = generator(ofile, delim = delim)
+    # No error should happen here: it is a bug otherwise
+    data = N.fromiter(a, descr)
+    return data, meta
+
+#-----
+# Misc
+#-----
+def basic_stats(data):
+    """Return (min, max, mean, std) of data; min/max ignore NaNs.
+
+    NOTE(review): nbfac = n/(n-1) is applied to the standard deviation,
+    not the variance, so this is not the usual Bessel correction (which
+    would use sqrt(n/(n-1))) -- confirm the intent.  Also divides by zero
+    when data.size == 1."""
+    nbfac = data.size * 1. / (data.size - 1)
+    return N.nanmin(data), N.nanmax(data), N.mean(data), N.std(data) * nbfac
+
+def print_attribute(name, tp, data):
+    """Print a one-line summary of one attribute.
+
+    tp is the (type, range) tuple stored by MetaData: numeric attributes
+    are printed with min/max/mean/std statistics, others with their braced
+    value set."""
+    type = tp[0]
+    if type == 'numeric' or type == 'real' or type == 'integer':
+        min, max, mean, std = basic_stats(data)
+        print "%s,%s,%f,%f,%f,%f" % (name, type, min, max, mean, std)
+    else:
+        msg = name + ",{"
+        for i in range(len(tp[1])-1):
+            msg += tp[1][i] + ","
+        msg += tp[1][-1]
+        msg += "}"
+        print msg
+
+def test_weka(filename):
+    """Debug helper: read filename and print a summary of every attribute."""
+    data, meta = read_arff(filename)
+    print len(data.dtype)
+    print data.size
+    for i in meta:
+        print_attribute(i,meta[i],data[i])
+
+def floupi(filename):
+    data, meta = read_arff(filename)
+    from attrselect import print_dataset_info
+    print_dataset_info(data)
+    print "relation %s, has %d instances" % (meta.name, data.size)
+    itp = iter(types)
+    for i in data.dtype.names:
+        print_attribute(i,itp.next(),data[i])
+        #tp = itp.next()
+        #if tp == 'numeric' or tp == 'real' or tp == 'integer':
+        #    min, max, mean, std = basic_stats(data[i])
+        #    print "\tinstance %s: min %f, max %f, mean %f, std %f" % \
+        #            (i, min, max, mean, std)
+        #else:
+        #    print "\tinstance %s is non numeric" % i
+
+if __name__ == '__main__':
+    # Manual test driver: pass an arff file path on the command line.
+    #import glob
+    #for i in glob.glob('arff.bak/data/*'):
+    #    relation, attributes = read_header(open(i))
+    #    print "Parsing header of %s: relation %s, %d attributes" % (i,
+    #            relation, len(attributes))
+
+    # NOTE(review): sys is already imported at module level; this re-import
+    # is harmless but redundant.
+    import sys
+    filename = sys.argv[1]
+    #filename = 'arff.bak/data/pharynx.arff'
+    #floupi(filename)
+    test_weka(filename)
+
+    #gf = []
+    #wf = []
+    #for i in glob.glob('arff.bak/data/*'):
+    #    try:
+    #        print "=============== reading %s ======================" % i
+    #        floupi(i)
+    #        gf.append(i)
+    #    except ValueError, e:
+    #        print "!!!! Error parsing the file !!!!!"
+    #        print e
+    #        wf.append(i)
+    #    except IndexError, e:
+    #        print "!!!! Error parsing the file !!!!!"
+    #        print e
+    #        wf.append(i)
+    #    except ArffError, e:
+    #        print "!!!! Error parsing the file !!!!!"
+    #        print e
+    #        wf.append(i)
+
+    #print "%d good files" % len(gf)
+    #print "%d bad files" % len(wf)



More information about the Scipy-svn mailing list