So I wrote different versions of a function that calculates dot product in C, and did the benchmark between them.
Here is the code I used in the comparison:
mytestmodule.c:
#include <Python.h>
#include <numpy/arrayobject.h>
PyObject* dotproductSeq(PyObject* self, PyObject* args)
{
PyObject *arg1;
PyObject *arg2;
int i;
double result = 0;
if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
return NULL;
}
if (!PySequence_Check(arg1)) {
PyErr_SetString(PyExc_TypeError, "argument 1 must be a sequence");
return NULL;
}
if (!PySequence_Check(arg2)) {
PyErr_SetString(PyExc_TypeError, "argument 2 must be a sequence");
return NULL;
}
Py_ssize_t sz = PySequence_Size(arg1);
if (sz != PySequence_Size(arg2)) {
PyErr_SetString(PyExc_ValueError, "argument 1 and 2 must be sequences of the same length");
return NULL;
}
for (i = 0; i < sz; ++i) {
result +=
PyFloat_AsDouble(PySequence_GetItem(arg1, i)) *
PyFloat_AsDouble(PySequence_GetItem(arg2, i));
}
return PyFloat_FromDouble(result);
}
PyObject* dotproductSeqUnsafe(PyObject* self, PyObject* args)
{
PyObject *arg1;
PyObject *arg2;
int i;
double result = 0;
if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
return NULL;
}
if (!PySequence_Check(arg1)) {
PyErr_SetString(PyExc_TypeError, "argument 1 must be a sequence");
return NULL;
}
if (!PySequence_Check(arg2)) {
PyErr_SetString(PyExc_TypeError, "argument 2 must be a sequence");
return NULL;
}
Py_ssize_t sz = PySequence_Fast_GET_SIZE(arg1);
if (sz != PySequence_Fast_GET_SIZE(arg2)) {
PyErr_SetString(PyExc_ValueError, "argument 1 and 2 must be sequences of the same length");
return NULL;
}
for (i = 0; i < sz; ++i) {
result +=
PyFloat_AS_DOUBLE(PySequence_Fast_GET_ITEM(arg1, i)) *
PyFloat_AS_DOUBLE(PySequence_Fast_GET_ITEM(arg2, i));
}
return PyFloat_FromDouble(result);
}
PyObject* dotproductTuple(PyObject* self, PyObject* args)
{
PyObject *arg1;
PyObject *arg2;
int i;
double result = 0;
if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
return NULL;
}
if (!PyTuple_Check(arg1)) {
PyErr_SetString(PyExc_TypeError, "argument 1 must be a tuple");
return NULL;
}
if (!PyTuple_Check(arg2)) {
PyErr_SetString(PyExc_TypeError, "argument 2 must be a tuple");
return NULL;
}
Py_ssize_t sz = PyTuple_GET_SIZE(arg1);
if (sz != PyTuple_GET_SIZE(arg2)) {
PyErr_SetString(PyExc_ValueError, "argument 1 and 2 must be tuple of the same length");
return NULL;
}
for (i = 0; i < sz; ++i) {
result +=
PyFloat_AsDouble(PyTuple_GET_ITEM(arg1, i)) *
PyFloat_AsDouble(PyTuple_GET_ITEM(arg2, i));
}
return PyFloat_FromDouble(result);
}
PyObject* dotproductTupleUnsafe(PyObject* self, PyObject* args)
{
PyObject *arg1;
PyObject *arg2;
int i;
double result = 0;
if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
return NULL;
}
if (!PyTuple_Check(arg1)) {
PyErr_SetString(PyExc_TypeError, "argument 1 must be a tuple");
return NULL;
}
if (!PyTuple_Check(arg2)) {
PyErr_SetString(PyExc_TypeError, "argument 2 must be a tuple");
return NULL;
}
Py_ssize_t sz = PyTuple_GET_SIZE(arg1);
if (sz != PyTuple_GET_SIZE(arg2)) {
PyErr_SetString(PyExc_ValueError, "argument 1 and 2 must be tuple of the same length");
return NULL;
}
for (i = 0; i < sz; ++i) {
result +=
PyFloat_AS_DOUBLE(PyTuple_GET_ITEM(arg1, i)) *
PyFloat_AS_DOUBLE(PyTuple_GET_ITEM(arg2, i));
}
return PyFloat_FromDouble(result);
}
PyObject* dotproductNDArray(PyObject* self, PyObject* args)
{
PyObject *arg1;
PyObject *arg2;
npy_byte const* data1;
npy_byte const* data2;
npy_intp arg1_stride;
npy_intp arg2_stride;
npy_double result = 0;
PyArray_Descr* arg1_descr;
PyArray_Descr* arg2_descr;
int i;
if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) {
return NULL;
}
if (!PyArray_Check(arg1) || PyArray_NDIM(arg1) != 1) {
PyErr_SetString(PyExc_TypeError, "argument 1 must be a 1-dimensional ndarray");
return NULL;
}
if (!PyArray_Check(arg2) || PyArray_NDIM(arg2) != 1) {
PyErr_SetString(PyExc_TypeError, "argument 2 must be a 1-dimensional ndarray");
return NULL;
}
npy_intp sz = PyArray_DIMS(arg1)[0];
if (sz != PyArray_DIMS(arg2)[0]) {
PyErr_SetString(PyExc_ValueError, "argument 1 and 2 must be ndarrays of the same length");
return NULL;
}
arg1_descr = PyArray_DESCR(arg1);
arg2_descr = PyArray_DESCR(arg2);
if (NPY_DOUBLE != arg1_descr->type_num) {
PyErr_SetString(PyExc_ValueError, "elements of argument 1 must be float");
return NULL;
}
if (NPY_DOUBLE != arg2_descr->type_num) {
PyErr_SetString(PyExc_ValueError, "elements of argument 2 must be float");
return NULL;
}
data1 = (npy_byte*)PyArray_DATA(arg1);
data2 = (npy_byte*)PyArray_DATA(arg2);
arg1_stride = PyArray_STRIDES(arg1)[0];
arg2_stride = PyArray_STRIDES(arg2)[0];
result = 0;
for (i = 0; i < sz; ++i) {
result +=
*(npy_double const*)(data1 + arg1_stride * i) *
*(npy_double const*)(data2 + arg2_stride * i);
}
return PyFloat_FromDouble(result);
}
static struct PyMethodDef methods[] = {
{ "dotproductSeq", (PyCFunction)dotproductSeq, METH_VARARGS, "" },
{ "dotproductSeqUnsafe", (PyCFunction)dotproductSeqUnsafe, METH_VARARGS, "" },
{ "dotproductTuple", (PyCFunction)dotproductTuple, METH_VARARGS, "" },
{ "dotproductTupleUnsafe", (PyCFunction)dotproductTupleUnsafe, METH_VARARGS, "" },
{ "dotproductNDArray", (PyCFunction)dotproductNDArray, METH_VARARGS, "" }
};
void initmytest()
{
import_array();
Py_InitModule3("mytest", methods, "");
}
bench.py:
import numpy
from mytest import *
import cProfile as profile
import pstats
import os
def case1():
d = 0.
while d < 10000.:
dotproductNDArray(numpy.array((d, d, d)), numpy.array((d, d, d)))
d += .1
def case2():
d = 0.
while d < 10000.:
dotproductSeq((d, d, d), (d, d, d))
d += .1
def case3():
d = 0.
while d < 10000.:
dotproductSeq(numpy.array((d, d, d)), numpy.array((d, d, d)))
d += .1
def case4():
d = 0.
while d < 10000.:
dotproductSeqUnsafe((d, d, d), (d, d, d))
d += .1
def case5():
d = 0.
while d < 10000.:
dotproductTuple((d, d, d), (d, d, d))
d += .1
def case6():
d = 0.
while d < 10000.:
dotproductTupleUnsafe((d, d, d), (d, d, d))
d += .1
def case7():
d = 0.
while d < 10000.:
numpy.dot(numpy.array((d, d, d)), numpy.array((d, d, d)))
d += .1
def case8():
d = 0.
while d < 10000.:
numpy.dot((d, d, d), (d, d, d))
d += .1
def runProf(fun):
tmp = os.tmpnam() # XXX:racy
profile.run('fun()', tmp)
return tmp
stats = pstats.Stats(*[runProf(fun) for fun in (case1, case2, case3, case4, case5, case6, case7, case8)])
stats.sort_stats('cumulative')
stats.print_stats(__file__)
And the results:
Ordered by: cumulative time
List reduced from 17 to 8 due to restriction <'bench.py'>
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.164 0.164 1.437 1.437 bench.py:43(case7)
1 0.151 0.151 1.377 1.377 bench.py:19(case3)
1 0.145 0.145 1.268 1.268 bench.py:7(case1)
1 0.090 0.090 1.232 1.232 bench.py:49(case8)
1 0.049 0.049 0.089 0.089 bench.py:13(case2)
1 0.048 0.048 0.084 0.084 bench.py:25(case4)
1 0.048 0.048 0.065 0.065 bench.py:31(case5)
1 0.048 0.048 0.063 0.063 bench.py:37(case6)
Summary
- If you are sure that a given argument is a tuple, use PyTuple_XXX() and that is the fastest.
- Construction of a ndarray object is roughly 15 times as slow as a tuple.
- numpy.dot() is remarkably slow.
- PyFloat_AsDouble() is not that slow comparing to PyFloat_AS_DOUBLE().