mozo is a mozo: 2009

While I was writing a python module in C that calculates the euclidean distance between two spatial points, I came across a question: numpy.array and tuples, which is faster in this particular case? That wouldn't look obvious because numpy.array() is slower in creating a new instance while it allows to access the raw pointer to the underlying data, which should enable faster element retrieval.

So I wrote different versions of a function that calculates dot product in C, and did the benchmark between them.

Here is the code I used in the comparison:

mytestmodule.c:


#include <Python.h>
#include <numpy/arrayobject.h>

PyObject* dotproductSeq(PyObject* self, PyObject* args)
{
 PyObject *arg1;
 PyObject *arg2;
 int i;
 double result = 0;

 if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
     return NULL;
 }

 if (!PySequence_Check(arg1)) {
     PyErr_SetString(PyExc_TypeError, "argument 1 must be a sequence");
     return NULL;
 }

 if (!PySequence_Check(arg2)) {
     PyErr_SetString(PyExc_TypeError, "argument 2 must be a sequence");
     return NULL;
 }

 Py_ssize_t sz = PySequence_Size(arg1);

 if (sz != PySequence_Size(arg2)) {
     PyErr_SetString(PyExc_ValueError, "argument 1 and 2 must be sequences of the same length");
     return NULL;
 }

 for (i = 0; i < sz; ++i) {
     result +=
         PyFloat_AsDouble(PySequence_GetItem(arg1, i)) *
         PyFloat_AsDouble(PySequence_GetItem(arg2, i));
 }

 return PyFloat_FromDouble(result);
}

PyObject* dotproductSeqUnsafe(PyObject* self, PyObject* args)
{
 PyObject *arg1;
 PyObject *arg2;
 int i;
 double result = 0;

 if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
     return NULL;
 }

 if (!PySequence_Check(arg1)) {
     PyErr_SetString(PyExc_TypeError, "argument 1 must be a sequence");
     return NULL;
 }

 if (!PySequence_Check(arg2)) {
     PyErr_SetString(PyExc_TypeError, "argument 2 must be a sequence");
     return NULL;
 }

 Py_ssize_t sz = PySequence_Fast_GET_SIZE(arg1);

 if (sz != PySequence_Fast_GET_SIZE(arg2)) {
     PyErr_SetString(PyExc_ValueError, "argument 1 and 2 must be sequences of the same length");
     return NULL;
 }

 for (i = 0; i < sz; ++i) {
     result +=
         PyFloat_AS_DOUBLE(PySequence_Fast_GET_ITEM(arg1, i)) *
         PyFloat_AS_DOUBLE(PySequence_Fast_GET_ITEM(arg2, i));
 }

 return PyFloat_FromDouble(result);
}

PyObject* dotproductTuple(PyObject* self, PyObject* args)
{
 PyObject *arg1;
 PyObject *arg2;
 int i;
 double result = 0;

 if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
     return NULL;
 }

 if (!PyTuple_Check(arg1)) {
     PyErr_SetString(PyExc_TypeError, "argument 1 must be a tuple");
     return NULL;
 }

 if (!PyTuple_Check(arg2)) {
     PyErr_SetString(PyExc_TypeError, "argument 2 must be a tuple");
     return NULL;
 }

 Py_ssize_t sz = PyTuple_GET_SIZE(arg1);

 if (sz != PyTuple_GET_SIZE(arg2)) {
     PyErr_SetString(PyExc_ValueError, "argument 1 and 2 must be tuple of the same length");
     return NULL;
 }

 for (i = 0; i < sz; ++i) {
     result +=
         PyFloat_AsDouble(PyTuple_GET_ITEM(arg1, i)) *
         PyFloat_AsDouble(PyTuple_GET_ITEM(arg2, i));
 }

 return PyFloat_FromDouble(result);
}

PyObject* dotproductTupleUnsafe(PyObject* self, PyObject* args)
{
 PyObject *arg1;
 PyObject *arg2;
 int i;
 double result = 0;

 if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
     return NULL;
 }

 if (!PyTuple_Check(arg1)) {
     PyErr_SetString(PyExc_TypeError, "argument 1 must be a tuple");
     return NULL;
 }

 if (!PyTuple_Check(arg2)) {
     PyErr_SetString(PyExc_TypeError, "argument 2 must be a tuple");
     return NULL;
 }

 Py_ssize_t sz = PyTuple_GET_SIZE(arg1);

 if (sz != PyTuple_GET_SIZE(arg2)) {
     PyErr_SetString(PyExc_ValueError, "argument 1 and 2 must be tuple of the same length");
     return NULL;
 }

 for (i = 0; i < sz; ++i) {
     result +=
         PyFloat_AS_DOUBLE(PyTuple_GET_ITEM(arg1, i)) *
         PyFloat_AS_DOUBLE(PyTuple_GET_ITEM(arg2, i));
 }

 return PyFloat_FromDouble(result);
}


PyObject* dotproductNDArray(PyObject* self, PyObject* args)
{
 PyObject *arg1;
 PyObject *arg2;
 npy_byte const* data1;
 npy_byte const* data2;
 npy_intp arg1_stride;
 npy_intp arg2_stride;
 npy_double result = 0;
 PyArray_Descr* arg1_descr;
 PyArray_Descr* arg2_descr;
 int i;

 if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
     return NULL;
 }

 if (!PyArray_Check(arg1) || PyArray_NDIM(arg1) != 1) {
     PyErr_SetString(PyExc_TypeError, "argument 1 must be a 1-dimensional ndarray");
     return NULL;
 }

 if (!PyArray_Check(arg2) || PyArray_NDIM(arg2) != 1) {
     PyErr_SetString(PyExc_TypeError, "argument 2 must be a 1-dimensional ndarray");
     return NULL;
 }

 npy_intp sz = PyArray_DIMS(arg1)[0];

 if (sz != PyArray_DIMS(arg2)[0]) {
     PyErr_SetString(PyExc_ValueError, "argument 1 and 2 must be ndarrays of the same length");
     return NULL;
 }

 arg1_descr = PyArray_DESCR(arg1);
 arg2_descr = PyArray_DESCR(arg2);

 if (NPY_DOUBLE != arg1_descr->type_num) {
     PyErr_SetString(PyExc_ValueError, "elements of argument 1 must be float");
     return NULL;
 }

 if (NPY_DOUBLE != arg2_descr->type_num) {
     PyErr_SetString(PyExc_ValueError, "elements of argument 2 must be float");
     return NULL;
 }

 data1 = (npy_byte*)PyArray_DATA(arg1);
 data2 = (npy_byte*)PyArray_DATA(arg2);
 arg1_stride = PyArray_STRIDES(arg1)[0];
 arg2_stride = PyArray_STRIDES(arg2)[0];
 result = 0;

 for (i = 0; i < sz; ++i) {
     result +=
         *(npy_double const*)(data1 + arg1_stride * i) *
         *(npy_double const*)(data2 + arg2_stride * i);
 }

 return PyFloat_FromDouble(result);
}

static struct PyMethodDef methods[] = {
 { "dotproductSeq", (PyCFunction)dotproductSeq, METH_VARARGS, "" },
 { "dotproductSeqUnsafe", (PyCFunction)dotproductSeqUnsafe, METH_VARARGS, "" },
 { "dotproductTuple", (PyCFunction)dotproductTuple, METH_VARARGS, "" },
 { "dotproductTupleUnsafe", (PyCFunction)dotproductTupleUnsafe, METH_VARARGS, "" },
 { "dotproductNDArray", (PyCFunction)dotproductNDArray, METH_VARARGS, "" }
};

void initmytest()
{
 import_array();
 Py_InitModule3("mytest", methods, "");
}

bench.py:


import numpy
from mytest import *
import cProfile as profile
import pstats
import os

def case1():
    d = 0.
    while d < 10000.:
        dotproductNDArray(numpy.array((d, d, d)), numpy.array((d, d, d)))
        d += .1

def case2():
    d = 0.
    while d < 10000.:
        dotproductSeq((d, d, d), (d, d, d))
        d += .1

def case3():
    d = 0.
    while d < 10000.:
        dotproductSeq(numpy.array((d, d, d)), numpy.array((d, d, d)))
        d += .1

def case4():
    d = 0.
    while d < 10000.:
        dotproductSeqUnsafe((d, d, d), (d, d, d))
        d += .1

def case5():
    d = 0.
    while d < 10000.:
        dotproductTuple((d, d, d), (d, d, d))
        d += .1

def case6():
    d = 0.
    while d < 10000.:
        dotproductTupleUnsafe((d, d, d), (d, d, d))
        d += .1

def case7():
    d = 0.
    while d < 10000.:
        numpy.dot(numpy.array((d, d, d)), numpy.array((d, d, d)))
        d += .1

def case8():
    d = 0.
    while d < 10000.:
        numpy.dot((d, d, d), (d, d, d))
        d += .1

def runProf(fun):
    tmp = os.tmpnam() # XXX:racy
    profile.run('fun()', tmp)
    return tmp

stats = pstats.Stats(*[runProf(fun) for fun in (case1, case2, case3, case4, case5, case6, case7, case8)])
stats.sort_stats('cumulative')
stats.print_stats(__file__)

And the results:


 Ordered by: cumulative time
 List reduced from 17 to 8 due to restriction <'bench.py'>

 ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      1    0.164    0.164    1.437    1.437 bench.py:43(case7)
      1    0.151    0.151    1.377    1.377 bench.py:19(case3)
      1    0.145    0.145    1.268    1.268 bench.py:7(case1)
      1    0.090    0.090    1.232    1.232 bench.py:49(case8)
      1    0.049    0.049    0.089    0.089 bench.py:13(case2)
      1    0.048    0.048    0.084    0.084 bench.py:25(case4)
      1    0.048    0.048    0.065    0.065 bench.py:31(case5)
      1    0.048    0.048    0.063    0.063 bench.py:37(case6)

Summary

If you are sure that a given argument is a tuple, use PyTuple_XXX() and that is the fastest.

Construction of a ndarray object is roughly 15 times as slow as a tuple.

numpy.dot() is remarkably slow.

PyFloat_AsDouble() is not that slow comparing to PyFloat_AS_DOUBLE().

mozo is a mozo

2009年5月18日月曜日

Performance consideration of numpy.array and tuples

Summary

Blogger Syntax Highliter

フォロワー

ブログアーカイブ

自己紹介

mozo is a mozo

2009年5月18日月曜日

Performance consideration of numpy.array and tuples

Summary

Blogger Syntax Highliter

フォロワー

ブログ アーカイブ

自己紹介

ブログアーカイブ