Log-likelihood benchmark¶

This is a simple benchmark, which I use for basic test of vector-based computing engines.

The problem is to find log-likelihood of normal distribution: $\sum_i \log \left[ \dfrac{1}{\sqrt{2 \pi \sigma}} \exp \left( - \dfrac{(x_i - x)^2}{2 \sigma^2} \right) \right]$

There are elementwise subtraction, division, power, exponent, logarithm and summation of array, so this is broader test.

tested:

1. numpy + scipy's pdf
2. numpy
3. cython
4. numexpr
5. theano
6. parakeet
7. C++
8. FORTRAN

computing log-likelihood for normal distribution

Notes

1. Not optimizing computations here (but in theory theano and parakeet may remove unnecessary computations)
2. This test includes exp, log, division and summation of array
3. Everything is running on CPU in one thread, this limitation is for 'fairness' of tests
In [1]:
import numpy
import scipy
from scipy.stats import norm
import theano


Scipy implementation¶

In [2]:
def llh_scipy(data, mean, sigma):
lh = norm(mean, sigma).pdf(data)
return numpy.log(lh).sum()


Numpy implementation¶

In [3]:
def llh_numpy(data, mean, sigma):
s = (data - mean) ** 2 / (2 * (sigma ** 2))
pdfs = numpy.exp(- s)
pdfs /= numpy.sqrt(2 * numpy.pi) * sigma
return numpy.log(pdfs).sum()


Cython implementation¶

In [4]:
%load_ext Cython

In [5]:
%%cython
cdef extern from "math.h":
double sqrt(double m)
double exp(double m)
double log(double m)

import cython
import numpy as np

cimport numpy as np
from numpy cimport ndarray
pi = np.pi

@cython.boundscheck(False)
@cython.wraparound(False)
def llh_cython(ndarray[np.float64_t, ndim=1] data, double mean, double sigma):
cdef int l = len(data)
cdef double llh = 0
cdef double s = 0
for i in xrange(l):
s = (data[i] - mean) ** 2 / (2 * (sigma ** 2))
llh += log(exp(-s) / (sqrt(2 * pi) * sigma))
return llh


Numexpr implementation¶

In [6]:
import numexpr

def llh_numexpr(data, mean, sigma):
expression = 'sum(log(exp(- (data-mean) **2  / (2 * sigma ** 2)) / (sqrt(2 * pi) * sigma)))'
return numexpr.evaluate(expression, local_dict=dict(data=data, mean=mean, sigma=sigma, pi=numpy.pi))


Parakeet implementation¶

In [7]:
import parakeet
parakeet.config.backend = 'c'

@parakeet.jit
def llh_parakeet(data, mean, sigma):
s = (data - mean) ** 2 / (2 * (sigma ** 2))
pdfs = numpy.exp(- s)
pdfs /= numpy.sqrt(2 * numpy.pi) * sigma
return numpy.log(pdfs).sum()


Theano implementation¶

In [8]:
import theano
import theano.tensor as T

print theano.config.device

cpu

In [9]:
theano.config.openmp

Out[9]:
False
In [10]:
def llh_theano(data, mean, sigma):
s = (data - mean) ** 2 / (2 * (sigma ** 2))
pdfs = T.exp(- s)
pdfs /= T.sqrt(2 * numpy.pi) * sigma
return T.log(pdfs).sum()

mean, sigma = T.scalars('m', 's')
d = T.vector('data')

llh_theano = theano.function([d, mean, sigma], llh_theano(d, mean, sigma))


FORTRAN implementation¶

In [11]:
%load_ext fortranmagic

In [12]:
%%fortran
subroutine llh_fortran(data, mean, sigma, result)
real*8, dimension(:), intent(in) :: data
real*8, intent(in) :: mean, sigma
real*8, intent(out) :: result

real*8, dimension(size(data, 1)) :: s
real*8, parameter :: PI = 3.14159265358979323846

s = (data - mean) ** 2 / (2 * sigma ** 2)
s = exp(- s) / (sqrt(2 * PI) * sigma)
result = sum(log(s))

end subroutine llh_fortran


C++ implementation for comparison of speed¶

we are neither passing, nor returning anything in c++. Just doing same operations in C++ for some array to compare speed.

Mind the overhead for creating new process - it is essential for small sizes.

In [13]:
%%writefile test_speed.cpp
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// using namespace std;

int main(int argc, char** argv) {
if (argc < 4){
std::cout << "run with n_samples, mean, sigma!";
return 1;
}
int size = atoi(argv[1]);
double mean = atof(argv[2]);
double sigma = atof(argv[3]);

double * data = new double[size];
double factor = 1. / size;
for (int i=0; i<size; ++i){
data[i] = i * factor;
}
double result = 0.;
double s = 0.;
double x = 0.;

for (int i=0; i<size; ++i){
x = (data[i] - mean);
s =  x * x / (2 * (sigma * sigma));
result += log(exp(-s) / (sqrt(2 * M_PI) * sigma));
}

std::cout << std::endl << result << std::endl;
return 0;
}

Overwriting test_speed.cpp

In [14]:
!g++ test_speed.cpp -o test_speed -O3

In [15]:
def llh_cpp(data, mean, sigma):
size = len(data)
out = !./test_speed {len(data)} {mean} {sigma}


Data generation, checking that all functions output the same value¶

In [16]:
from collections import OrderedDict
functions = OrderedDict()
functions['scipy'] = llh_scipy
functions['numpy'] = llh_numpy
functions['cython'] = llh_cython
functions['numexpr'] = llh_numexpr
functions['parakeet'] = llh_parakeet
functions['theano'] = llh_theano
functions['fortran'] = llh_fortran
functions['c++'] = llh_cpp

In [17]:
data = numpy.random.normal(size=1000000).astype('float64')
for name, function in functions.items():
print name, function(data, 0.1, 1.1)

scipy -1431841.8928
numpy -1431841.8928
cython -1431841.8928
numexpr -1431841.8928
parakeet -1431841.8928
theano -1431841.8928
fortran -1431841.90671
c++ None

In [18]:
import timeit
sizes = [10 ** 5, 10 ** 6, 10 ** 7, 10 ** 8]
import pandas
scores = pandas.DataFrame(data=0, columns=functions.keys(), index=sizes)
for size in sizes:
for name, function in functions.items():
data = numpy.random.normal(size=size).astype('float64')
result = %timeit -o function(data, 0.1, 1.1)
scores.loc[size, name] = result.best

100 loops, best of 3: 11.2 ms per loop
100 loops, best of 3: 6 ms per loop
100 loops, best of 3: 10.8 ms per loop
100 loops, best of 3: 7.14 ms per loop
100 loops, best of 3: 6.54 ms per loop
100 loops, best of 3: 8.07 ms per loop
100 loops, best of 3: 6.03 ms per loop
100 loops, best of 3: 16.6 ms per loop
10 loops, best of 3: 134 ms per loop
10 loops, best of 3: 62.2 ms per loop
10 loops, best of 3: 108 ms per loop
10 loops, best of 3: 71 ms per loop
10 loops, best of 3: 63.4 ms per loop
10 loops, best of 3: 81.7 ms per loop
10 loops, best of 3: 57.1 ms per loop
10 loops, best of 3: 83 ms per loop
1 loops, best of 3: 1.82 s per loop
1 loops, best of 3: 857 ms per loop
1 loops, best of 3: 1.08 s per loop
1 loops, best of 3: 713 ms per loop
1 loops, best of 3: 660 ms per loop
1 loops, best of 3: 877 ms per loop
1 loops, best of 3: 639 ms per loop
1 loops, best of 3: 694 ms per loop
1 loops, best of 3: 19.8 s per loop
1 loops, best of 3: 9.01 s per loop
1 loops, best of 3: 11.1 s per loop
1 loops, best of 3: 7.12 s per loop
1 loops, best of 3: 6.44 s per loop
1 loops, best of 3: 8.54 s per loop
1 loops, best of 3: 6.39 s per loop
1 loops, best of 3: 6.88 s per loop


Results (time in seconds, less is better)¶

In [19]:
scores

Out[19]:
scipy numpy cython numexpr parakeet theano fortran c++
100000 0.011209 0.006000 0.010787 0.007137 0.006539 0.008067 0.006031 0.016596
1000000 0.134079 0.062242 0.108327 0.071049 0.063404 0.081651 0.057135 0.083036
10000000 1.823849 0.857088 1.078784 0.713240 0.659757 0.876672 0.638656 0.694401
100000000 19.826547 9.006766 11.110369 7.124668 6.436616 8.542589 6.389464 6.884255

Comparison to numpy time (less is better)¶

In [20]:
normalized_scores = scores.copy()
for column in normalized_scores.columns:
normalized_scores[column] /= scores['numpy']

In [21]:
normalized_scores

Out[21]:
scipy numpy cython numexpr parakeet theano fortran c++
100000 1.868104 1 1.797846 1.189456 1.089899 1.344443 1.005108 2.765994
1000000 2.154161 1 1.740413 1.141503 1.018674 1.311835 0.917955 1.334087
10000000 2.127960 1 1.258662 0.832166 0.769766 1.022849 0.745146 0.810186
100000000 2.201295 1 1.233558 0.791035 0.714642 0.948464 0.709407 0.764343

Conclusion¶

Many libraries claim that they can speed up number crunching in python. Results of this test are floating (+- 0.1), but what we can see

1. numpy turned out to be fastest at moderate sizes of arrays
2. numpy implementation at least not more complex than others
3. parakeet was the only to get sensible speed up

Technical info¶

In [22]:
import multiprocessing
print multiprocessing.cpu_count(), 'xeon cores'

16 xeon cores

In [23]:
numpy.__version__

Out[23]:
'1.10.1'
In [24]:
scipy.__version__

Out[24]:
'0.14.0'
In [25]:
import cython
cython.__version__

Out[25]:
'0.21.1'
In [26]:
numexpr.__version__

Out[26]:
'2.4'
In [27]:
parakeet.__version__

Out[27]:
'0.23.2'
In [28]:
theano.__version__

Out[28]:
'0.7.0'
In [29]:
!g++ -v

Using built-in specs.
COLLECT_GCC=g++
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.6/lto-wrapper
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu/Linaro 4.6.3-1ubuntu5' --with-bugurl=file:///usr/share/doc/gcc-4.6/README.Bugs --enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-4.6 --enable-shared --enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.6 --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --enable-gnu-unique-object --enable-plugin --enable-objc-gc --disable-werror --with-arch-32=i686 --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
gcc version 4.6.3 (Ubuntu/Linaro 4.6.3-1ubuntu5)

In [30]:
!gfortran -v

Using built-in specs.
COLLECT_GCC=gfortran
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.6/lto-wrapper
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu/Linaro 4.6.3-1ubuntu5' --with-bugurl=file:///usr/share/doc/gcc-4.6/README.Bugs --enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-4.6 --enable-shared --enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.6 --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --enable-gnu-unique-object --enable-plugin --enable-objc-gc --disable-werror --with-arch-32=i686 --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu