2009年5月18日月曜日

Performance consideration of numpy.array and tuples

While I was writing a python module in C that calculates the euclidean distance between two spatial points, I came across a question: numpy.array and tuples, which is faster in this particular case? That wouldn't look obvious because numpy.array() is slower in creating a new instance while it allows to access the raw pointer to the underlying data, which should enable faster element retrieval.

So I wrote different versions of a function that calculates dot product in C, and did the benchmark between them.

Here is the code I used in the comparison:

mytestmodule.c:

#include <Python.h>
#include <numpy/arrayobject.h>

PyObject* dotproductSeq(PyObject* self, PyObject* args)
{
PyObject *arg1;
PyObject *arg2;
int i;
double result = 0;

if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
return NULL;
}

if (!PySequence_Check(arg1)) {
PyErr_SetString(PyExc_TypeError, "argument 1 must be a sequence");
return NULL;
}

if (!PySequence_Check(arg2)) {
PyErr_SetString(PyExc_TypeError, "argument 2 must be a sequence");
return NULL;
}

Py_ssize_t sz = PySequence_Size(arg1);

if (sz != PySequence_Size(arg2)) {
PyErr_SetString(PyExc_ValueError, "argument 1 and 2 must be sequences of the same length");
return NULL;
}

for (i = 0; i < sz; ++i) {
result +=
PyFloat_AsDouble(PySequence_GetItem(arg1, i)) *
PyFloat_AsDouble(PySequence_GetItem(arg2, i));
}

return PyFloat_FromDouble(result);
}

PyObject* dotproductSeqUnsafe(PyObject* self, PyObject* args)
{
PyObject *arg1;
PyObject *arg2;
int i;
double result = 0;

if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
return NULL;
}

if (!PySequence_Check(arg1)) {
PyErr_SetString(PyExc_TypeError, "argument 1 must be a sequence");
return NULL;
}

if (!PySequence_Check(arg2)) {
PyErr_SetString(PyExc_TypeError, "argument 2 must be a sequence");
return NULL;
}

Py_ssize_t sz = PySequence_Fast_GET_SIZE(arg1);

if (sz != PySequence_Fast_GET_SIZE(arg2)) {
PyErr_SetString(PyExc_ValueError, "argument 1 and 2 must be sequences of the same length");
return NULL;
}

for (i = 0; i < sz; ++i) {
result +=
PyFloat_AS_DOUBLE(PySequence_Fast_GET_ITEM(arg1, i)) *
PyFloat_AS_DOUBLE(PySequence_Fast_GET_ITEM(arg2, i));
}

return PyFloat_FromDouble(result);
}

PyObject* dotproductTuple(PyObject* self, PyObject* args)
{
PyObject *arg1;
PyObject *arg2;
int i;
double result = 0;

if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
return NULL;
}

if (!PyTuple_Check(arg1)) {
PyErr_SetString(PyExc_TypeError, "argument 1 must be a tuple");
return NULL;
}

if (!PyTuple_Check(arg2)) {
PyErr_SetString(PyExc_TypeError, "argument 2 must be a tuple");
return NULL;
}

Py_ssize_t sz = PyTuple_GET_SIZE(arg1);

if (sz != PyTuple_GET_SIZE(arg2)) {
PyErr_SetString(PyExc_ValueError, "argument 1 and 2 must be tuple of the same length");
return NULL;
}

for (i = 0; i < sz; ++i) {
result +=
PyFloat_AsDouble(PyTuple_GET_ITEM(arg1, i)) *
PyFloat_AsDouble(PyTuple_GET_ITEM(arg2, i));
}

return PyFloat_FromDouble(result);
}

PyObject* dotproductTupleUnsafe(PyObject* self, PyObject* args)
{
PyObject *arg1;
PyObject *arg2;
int i;
double result = 0;

if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
return NULL;
}

if (!PyTuple_Check(arg1)) {
PyErr_SetString(PyExc_TypeError, "argument 1 must be a tuple");
return NULL;
}

if (!PyTuple_Check(arg2)) {
PyErr_SetString(PyExc_TypeError, "argument 2 must be a tuple");
return NULL;
}

Py_ssize_t sz = PyTuple_GET_SIZE(arg1);

if (sz != PyTuple_GET_SIZE(arg2)) {
PyErr_SetString(PyExc_ValueError, "argument 1 and 2 must be tuple of the same length");
return NULL;
}

for (i = 0; i < sz; ++i) {
result +=
PyFloat_AS_DOUBLE(PyTuple_GET_ITEM(arg1, i)) *
PyFloat_AS_DOUBLE(PyTuple_GET_ITEM(arg2, i));
}

return PyFloat_FromDouble(result);
}


PyObject* dotproductNDArray(PyObject* self, PyObject* args)
{
PyObject *arg1;
PyObject *arg2;
npy_byte const* data1;
npy_byte const* data2;
npy_intp arg1_stride;
npy_intp arg2_stride;
npy_double result = 0;
PyArray_Descr* arg1_descr;
PyArray_Descr* arg2_descr;
int i;

if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
return NULL;
}

if (!PyArray_Check(arg1) || PyArray_NDIM(arg1) != 1) {
PyErr_SetString(PyExc_TypeError, "argument 1 must be a 1-dimensional ndarray");
return NULL;
}

if (!PyArray_Check(arg2) || PyArray_NDIM(arg2) != 1) {
PyErr_SetString(PyExc_TypeError, "argument 2 must be a 1-dimensional ndarray");
return NULL;
}

npy_intp sz = PyArray_DIMS(arg1)[0];

if (sz != PyArray_DIMS(arg2)[0]) {
PyErr_SetString(PyExc_ValueError, "argument 1 and 2 must be ndarrays of the same length");
return NULL;
}

arg1_descr = PyArray_DESCR(arg1);
arg2_descr = PyArray_DESCR(arg2);

if (NPY_DOUBLE != arg1_descr->type_num) {
PyErr_SetString(PyExc_ValueError, "elements of argument 1 must be float");
return NULL;
}

if (NPY_DOUBLE != arg2_descr->type_num) {
PyErr_SetString(PyExc_ValueError, "elements of argument 2 must be float");
return NULL;
}

data1 = (npy_byte*)PyArray_DATA(arg1);
data2 = (npy_byte*)PyArray_DATA(arg2);
arg1_stride = PyArray_STRIDES(arg1)[0];
arg2_stride = PyArray_STRIDES(arg2)[0];
result = 0;

for (i = 0; i < sz; ++i) {
result +=
*(npy_double const*)(data1 + arg1_stride * i) *
*(npy_double const*)(data2 + arg2_stride * i);
}

return PyFloat_FromDouble(result);
}

static struct PyMethodDef methods[] = {
{ "dotproductSeq", (PyCFunction)dotproductSeq, METH_VARARGS, "" },
{ "dotproductSeqUnsafe", (PyCFunction)dotproductSeqUnsafe, METH_VARARGS, "" },
{ "dotproductTuple", (PyCFunction)dotproductTuple, METH_VARARGS, "" },
{ "dotproductTupleUnsafe", (PyCFunction)dotproductTupleUnsafe, METH_VARARGS, "" },
{ "dotproductNDArray", (PyCFunction)dotproductNDArray, METH_VARARGS, "" }
};

void initmytest()
{
import_array();
Py_InitModule3("mytest", methods, "");
}


bench.py:

import numpy
from mytest import *
import cProfile as profile
import pstats
import os

def case1():
d = 0.
while d < 10000.:
dotproductNDArray(numpy.array((d, d, d)), numpy.array((d, d, d)))
d += .1

def case2():
d = 0.
while d < 10000.:
dotproductSeq((d, d, d), (d, d, d))
d += .1

def case3():
d = 0.
while d < 10000.:
dotproductSeq(numpy.array((d, d, d)), numpy.array((d, d, d)))
d += .1

def case4():
d = 0.
while d < 10000.:
dotproductSeqUnsafe((d, d, d), (d, d, d))
d += .1

def case5():
d = 0.
while d < 10000.:
dotproductTuple((d, d, d), (d, d, d))
d += .1

def case6():
d = 0.
while d < 10000.:
dotproductTupleUnsafe((d, d, d), (d, d, d))
d += .1

def case7():
d = 0.
while d < 10000.:
numpy.dot(numpy.array((d, d, d)), numpy.array((d, d, d)))
d += .1

def case8():
d = 0.
while d < 10000.:
numpy.dot((d, d, d), (d, d, d))
d += .1

def runProf(fun):
tmp = os.tmpnam() # XXX:racy
profile.run('fun()', tmp)
return tmp

stats = pstats.Stats(*[runProf(fun) for fun in (case1, case2, case3, case4, case5, case6, case7, case8)])
stats.sort_stats('cumulative')
stats.print_stats(__file__)


And the results:


Ordered by: cumulative time
List reduced from 17 to 8 due to restriction <'bench.py'>

ncalls tottime percall cumtime percall filename:lineno(function)
1 0.164 0.164 1.437 1.437 bench.py:43(case7)
1 0.151 0.151 1.377 1.377 bench.py:19(case3)
1 0.145 0.145 1.268 1.268 bench.py:7(case1)
1 0.090 0.090 1.232 1.232 bench.py:49(case8)
1 0.049 0.049 0.089 0.089 bench.py:13(case2)
1 0.048 0.048 0.084 0.084 bench.py:25(case4)
1 0.048 0.048 0.065 0.065 bench.py:31(case5)
1 0.048 0.048 0.063 0.063 bench.py:37(case6)


Summary

  • If you are sure that a given argument is a tuple, use PyTuple_XXX() and that is the fastest.

  • Construction of a ndarray object is roughly 15 times as slow as a tuple.

  • numpy.dot() is remarkably slow.

  • PyFloat_AsDouble() is not that slow comparing to PyFloat_AS_DOUBLE().

Blogger Syntax Highliter

フォロワー