[Numpy-discussion] performance comparison of C++ vs Numeric (MA) operations.
Joe Van Andel
vanandel at atd.ucar.edu
Tue Jun 12 19:20:10 CDT 2001
I was curious about the relative performance of C++ vs Numeric Python,
for operations on arrays of roughly 400,000 array elements. I built a
simple array single precision multiplication function in C++, that
performs an element by element multiply, checking whether each element
is "valid" or "missing data".
Then, for comparison, I wrote a similar multiplication routine, using
the Masked Array (MA) package of Numeric Python.
I compiled Numeric Python (20.1.0b2) with '-O3', by modifying setup.py
to contain lines like
OPTIMIZE=['-O3']
ext_modules =
.
.
Extension('multiarray', ['Src/multiarraymodule.c'],
extra_compile_args=OPTIMIZE
),
---------------------------------------
On an 800 MHz dual processor Dell Linux box, using gcc 2.95.3,
Software Performance
------------------------------------------------
Numeric Python 5.0e6 multiplies/second
Numeric Python -O3 6.1e6 multiplies/second
C++ 10.3e6 multiplies/second
C++ -O3 10.3e6 multiplies/second
(I tried using "plain" Numeric arrays, rather than Masked arrays, and it
didn't seem to make much difference.)
Has anyone else benchmarked the relative performance of C/C++ vs Numeric
Python?
Does anyone know of other optimizations to Numeric Python that could be
implemented?
I know a more realistic benchmark would include I/O, which might tend to
reduce the apparent difference in performance.
I've attached the benchmark modules, in case someone would like to
examine them.
--
Joe VanAndel
National Center for Atmospheric Research
http://www.atd.ucar.edu/~vanandel/
Internet: vanandel at ucar.edu
-------------- next part --------------
import sys
# test harness for Masked array performance
from MA import *
#from Numeric import *
from Perp.util.TimerUtility import TimerUtility
def mult_test(a1, a2):
    """Multiply two operands with ``*`` and return the product.

    The benchmark loop assigns the return value of this function, so the
    product is handed back instead of being computed and thrown away.
    """
    return a1 * a2
if __name__ == '__main__':
    # Problem size: a (beams x gates) grid, multiplied `repeat` times.
    beams = 370
    gates = 1000
    repeat = 100
    # An optional first command-line argument overrides the repeat count.
    if len(sys.argv) > 1:
        repeat = int(sys.argv[1])

    # Both operands are masked views of the same all-ones array, with
    # -327.68 as the value to mask.
    t1 = ones((beams, gates), Float)
    a1 = masked_values(t1, -327.68)
    a2 = masked_values(t1, -327.68)

    cntMultiply = repeat*gates*beams

    # The empty tuple is falsy, so the timer's own printing is switched off.
    tu = TimerUtility(())
    i = 0
    while i < repeat:
        i += 1
        res = mult_test(a1, a2)
    elapsed = tu.elapsed()

    print('completed %d in %f seconds' % (repeat, elapsed))
    print('%8.3g checked multiplies/second' % (cntMultiply/elapsed))
-------------- next part --------------
#include <iostream>
#include <math.h>
#include <stdlib.h>
#include "PerfTimer.h"
// One row of floats; the 2-d arrays are arrays of these row pointers.
typedef float* FLOAT_PTR;

// Array dimensions used by both main() and mult_test().
const int beams = 370;
const int gates = 1000;

// Element-by-element multiply with a missing-data check; defined after main().
void mult_test(FLOAT_PTR *a1, FLOAT_PTR *a2, FLOAT_PTR *resp, float missingValue);
int main(int argc, char *argv[])
{
int repeat =100;
const float missingValue = -327.68;
if (argc > 1) repeat = atoi(argv[1]);
FLOAT_PTR *a1 = new FLOAT_PTR[beams];
FLOAT_PTR *a2 = new FLOAT_PTR[beams];
FLOAT_PTR *res = new FLOAT_PTR[beams];
// allocate storage for 2d variables
for (int b = 0; b < beams; ++b) {
a1[b] = new float[gates];
a2[b] = new float[gates];
res[b] = new float[gates];
}
PerfTimer pt;
for (int r = 0; r < repeat; ++r) {
mult_test(a1, a2, res,missingValue);
}
double elapsed = pt.Elapsed();
double cntMultiply = repeat*gates*beams;
cout << repeat << " repetitions completed" << endl;
cout << cntMultiply << "checked multiplies" << endl;
cout << cntMultiply/elapsed << "checked multiplies/second" << endl;
}
// Element-by-element multiply of two beams x gates arrays with a
// missing-data check.
//
// a1, a2       : input arrays, indexed [beam][gate]
// resp         : output array, indexed [beam][gate]
// missingValue : marker for missing data
//
// An element counts as missing when it lies within
// atol + rtol * |missingValue| of missingValue. If either input element is
// missing the output element is set to missingValue; otherwise it is the
// product of the two inputs.
void mult_test(FLOAT_PTR *a1, FLOAT_PTR *a2, FLOAT_PTR *resp, float missingValue)
{
    const float atol = 1.e-8;
    const float rtol = 1.0e-5;
    // The tolerance depends only on missingValue, so compute it once
    // instead of up to twice per element.
    const float tolerance = atol + rtol * fabs(missingValue);

    for (int b = 0; b < beams; ++b) {
        const float *lhs = a1[b];
        const float *rhs = a2[b];
        float *out = resp[b];
        for (int g = 0; g < gates; ++g) {
            const bool missing = fabs(lhs[g] - missingValue) < tolerance
                              || fabs(rhs[g] - missingValue) < tolerance;
            out[g] = missing ? missingValue : lhs[g] * rhs[g];
        } // for gates
    } // for beams
}
-------------- next part --------------
#include <time.h>
// Stopwatch for performance measurements: timing begins at construction
// and can be restarted with Start().
class PerfTimer {
    // reference time recorded by Start()
    timespec startTime_;

public:
    // Creates the timer and starts timing.
    PerfTimer();

    // Resets the starting time.
    void Start();

    // Returns the time elapsed since construction or the last Start().
    double Elapsed();
};
-------------- next part --------------
#include "PerfTimer.h"
// Construct the timer and immediately record the starting time.
PerfTimer::PerfTimer() {
    Start();
}

// Record the current time as the reference point for Elapsed().
// CLOCK_MONOTONIC is used rather than CLOCK_REALTIME so that an adjustment
// of the system's wall-clock time during a run cannot distort the interval.
void
PerfTimer::Start() {
    clock_gettime(CLOCK_MONOTONIC, &startTime_);
}

// Return the number of seconds since construction or the last Start().
double
PerfTimer::Elapsed() {
    struct timespec stopTime;
    clock_gettime(CLOCK_MONOTONIC, &stopTime);
    // Subtract the integer fields before converting to double: forming
    // (sec + nsec/1e9) for each timestamp first and then subtracting two
    // large values throws away the low-order digits of the interval.
    const double wholeSeconds =
        static_cast<double>(stopTime.tv_sec - startTime_.tv_sec);
    const double nanoseconds =
        static_cast<double>(stopTime.tv_nsec - startTime_.tv_nsec);
    return wholeSeconds + nanoseconds / 1.0e9;
}
-------------- next part --------------
# Swap which CCFLAGS line is commented out to build with optimization.
#CCFLAGS=-O3
CCFLAGS=
CXXFLAGS=${CCFLAGS}

.PHONY: all
all: arrayperf

# The recipe calls g++ directly, so the flags must be passed explicitly;
# otherwise changing CCFLAGS above has no effect on the build.
# Every source file and the timer header are listed as prerequisites so a
# change to any of them triggers a rebuild.
arrayperf: arrayperf.cc PerfTimer.cc PerfTimer.h
	g++ ${CXXFLAGS} -o arrayperf arrayperf.cc PerfTimer.cc -lm -lrt
-------------- next part --------------
#
import time
class TimerUtility:
    """ Timer/Utility for performance measurement

    Keeps two reference times: the creation time (used by **total()**) and
    the time of the most recent **elapsed()** / **rate()** call (used by
    both of those methods).
    """
    def __init__(self, verbose=1,useElapsed=0):
        """ ctor: can suppress printing by settings **verbose** to 0

        **useElapsed** selects the time source: true uses time.time(),
        false uses time.clock().
        """
        # if we're timing elapsed events, including subprocesses,
        # then use time.time()
        if useElapsed:
            self.__timeFunc = time.time
        else:
            # time.clock() no longer exists on Python 3.8+; use
            # time.process_time() there so construction still works.
            self.__timeFunc = getattr(time, 'clock', None) or time.process_time
        self.__startTime = self.__timeFunc()
        self.__lastTime = self.__startTime
        self.__verbose = verbose

    def elapsed(self, msg = 'Elapsed '):
        """ print elapsed time since instance creation or last **elapsed()** call

        Returns the elapsed time and makes "now" the new reference point.
        """
        current = self.__timeFunc()
        delta = (current - self.__lastTime)
        if (self.__verbose):
            print('%s : %5.2f' % (msg, float(delta)))
        self.__lastTime = current
        return delta

    def rate(self, count, msg = 'rate'):
        """ print elapsed time and rate since instance creation or last **elapsed()** call

        The rate is **count** divided by the elapsed time.  Returns the
        elapsed time and makes "now" the new reference point.
        """
        current = self.__timeFunc()
        delta = (current - self.__lastTime)
        if (self.__verbose):
            # The timer may not have advanced at all; report an infinite
            # rate instead of raising ZeroDivisionError from a diagnostic.
            if delta:
                perUnit = float(count)/float(delta)
            else:
                perUnit = float('inf')
            print('%s : %5.2f : %6.2f' % (msg, float(delta), perUnit))
        self.__lastTime = current
        return delta

    def total(self, msg = 'Total '):
        """ print total time since TimerUtility was created

        Returns that total; the reference point used by **elapsed()** and
        **rate()** is left untouched.
        """
        current = self.__timeFunc()
        diff = (current - self.__startTime)
        if (self.__verbose):
            print('%s : %5.2f' % (msg, diff))
        return diff
More information about the Numpy-discussion
mailing list