Log-likelihood benchmark

This is a simple benchmark, which I use for basic test of vector-based computing engines.

The problem is to find log-likelihood of normal distribution: $\sum_i \log \left[ \dfrac{1}{\sqrt{2 \pi \sigma}} \exp \left( - \dfrac{(x_i - x)^2}{2 \sigma^2} \right) \right] $

There are elementwise subtraction, division, power, exponent, logarithm and summation of array, so this is broader test.

tested:

  1. numpy + scipy's pdf
  2. numpy
  3. cython
  4. numexpr
  5. theano
  6. parakeet
  7. C++
  8. FORTRAN

computing log-likelihood for normal distribution

Notes

  1. Not optimizing computations here (but in theory theano and parakeet may remove unnecessary computations)
  2. This test includes exp, log, division and summation of array
  3. Everything is running on CPU in one thread, this limitation is for 'fairness' of tests
In [1]:
import numpy
import scipy
from scipy.stats import norm
import theano

Scipy implementation

In [2]:
def llh_scipy(data, mean, sigma):
    lh = norm(mean, sigma).pdf(data)
    return numpy.log(lh).sum()

Numpy implementation

In [3]:
def llh_numpy(data, mean, sigma):
    s = (data - mean) ** 2 / (2 * (sigma ** 2))
    pdfs = numpy.exp(- s)
    pdfs /= numpy.sqrt(2 * numpy.pi) * sigma
    return numpy.log(pdfs).sum()

Cython implementation

In [4]:
%load_ext Cython
In [5]:
%%cython
cdef extern from "math.h":
    double sqrt(double m)
    double exp(double m)
    double log(double m)

import cython
import numpy as np

cimport numpy as np
from numpy cimport ndarray
pi = np.pi

@cython.boundscheck(False)
@cython.wraparound(False)
def llh_cython(ndarray[np.float64_t, ndim=1] data, double mean, double sigma):
    cdef int l = len(data)
    cdef double llh = 0
    cdef double s = 0
    for i in xrange(l):
        s = (data[i] - mean) ** 2 / (2 * (sigma ** 2))
        llh += log(exp(-s) / (sqrt(2 * pi) * sigma))
    return llh

Numexpr implementation

In [6]:
import numexpr
numexpr.set_num_threads(1)

def llh_numexpr(data, mean, sigma):
    expression = 'sum(log(exp(- (data-mean) **2  / (2 * sigma ** 2)) / (sqrt(2 * pi) * sigma)))'
    return numexpr.evaluate(expression, local_dict=dict(data=data, mean=mean, sigma=sigma, pi=numpy.pi))

Parakeet implementation

In [7]:
import parakeet
parakeet.config.backend = 'c'

@parakeet.jit 
def llh_parakeet(data, mean, sigma):
    s = (data - mean) ** 2 / (2 * (sigma ** 2))
    pdfs = numpy.exp(- s)
    pdfs /= numpy.sqrt(2 * numpy.pi) * sigma
    return numpy.log(pdfs).sum()

Theano implementation

In [8]:
import theano
import theano.tensor as T

print theano.config.device
cpu
In [9]:
theano.config.openmp
Out[9]:
False
In [10]:
def llh_theano(data, mean, sigma):
    s = (data - mean) ** 2 / (2 * (sigma ** 2))
    pdfs = T.exp(- s)
    pdfs /= T.sqrt(2 * numpy.pi) * sigma
    return T.log(pdfs).sum()

mean, sigma = T.scalars('m', 's')
d = T.vector('data')

llh_theano = theano.function([d, mean, sigma], llh_theano(d, mean, sigma))

FORTRAN implementation

In [11]:
%load_ext fortranmagic
In [12]:
%%fortran 
subroutine llh_fortran(data, mean, sigma, result)
    real*8, dimension(:), intent(in) :: data
    real*8, intent(in) :: mean, sigma
    real*8, intent(out) :: result
            
    real*8, dimension(size(data, 1)) :: s
    real*8, parameter :: PI = 3.14159265358979323846

    s = (data - mean) ** 2 / (2 * sigma ** 2)
    s = exp(- s) / (sqrt(2 * PI) * sigma)
    result = sum(log(s))
    
end subroutine llh_fortran

C++ implementation for comparison of speed

we are neither passing, nor returning anything in c++. Just doing same operations in C++ for some array to compare speed.

Mind the overhead for creating new process - it is essential for small sizes.

In [13]:
%%writefile test_speed.cpp
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// using namespace std;


int main(int argc, char** argv) {
    if (argc < 4){
        std::cout << "run with n_samples, mean, sigma!";
        return 1;
    }
    int size = atoi(argv[1]);
    double mean = atof(argv[2]);
    double sigma = atof(argv[3]);
    
    double * data = new double[size];
    double factor = 1. / size;
    for (int i=0; i<size; ++i){
        data[i] = i * factor;
    }
    double result = 0.;
    double s = 0.;
    double x = 0.;
    
    for (int i=0; i<size; ++i){
        x = (data[i] - mean);
        s =  x * x / (2 * (sigma * sigma));
        result += log(exp(-s) / (sqrt(2 * M_PI) * sigma));
    }
    
    std::cout << std::endl << result << std::endl;
    return 0;
}
Overwriting test_speed.cpp
In [14]:
!g++ test_speed.cpp -o test_speed -O3
In [15]:
def llh_cpp(data, mean, sigma):
    size = len(data)
    out = !./test_speed {len(data)} {mean} {sigma}

Data generation, checking that all functions output the same value

In [16]:
from collections import OrderedDict
functions = OrderedDict()
functions['scipy'] = llh_scipy
functions['numpy'] = llh_numpy
functions['cython'] = llh_cython
functions['numexpr'] = llh_numexpr
functions['parakeet'] = llh_parakeet
functions['theano'] = llh_theano
functions['fortran'] = llh_fortran
functions['c++'] = llh_cpp
In [17]:
data = numpy.random.normal(size=1000000).astype('float64')
for name, function in functions.items():
    print name, function(data, 0.1, 1.1)
scipy -1431841.8928
numpy -1431841.8928
cython -1431841.8928
numexpr -1431841.8928
parakeet -1431841.8928
theano -1431841.8928
fortran -1431841.90671
c++ None
In [18]:
import timeit 
sizes = [10 ** 5, 10 ** 6, 10 ** 7, 10 ** 8]
import pandas
scores = pandas.DataFrame(data=0, columns=functions.keys(), index=sizes)
for size in sizes:
    for name, function in functions.items():
        data = numpy.random.normal(size=size).astype('float64')
        result = %timeit -o function(data, 0.1, 1.1)
        scores.loc[size, name] = result.best
100 loops, best of 3: 11.2 ms per loop
100 loops, best of 3: 6 ms per loop
100 loops, best of 3: 10.8 ms per loop
100 loops, best of 3: 7.14 ms per loop
100 loops, best of 3: 6.54 ms per loop
100 loops, best of 3: 8.07 ms per loop
100 loops, best of 3: 6.03 ms per loop
100 loops, best of 3: 16.6 ms per loop
10 loops, best of 3: 134 ms per loop
10 loops, best of 3: 62.2 ms per loop
10 loops, best of 3: 108 ms per loop
10 loops, best of 3: 71 ms per loop
10 loops, best of 3: 63.4 ms per loop
10 loops, best of 3: 81.7 ms per loop
10 loops, best of 3: 57.1 ms per loop
10 loops, best of 3: 83 ms per loop
1 loops, best of 3: 1.82 s per loop
1 loops, best of 3: 857 ms per loop
1 loops, best of 3: 1.08 s per loop
1 loops, best of 3: 713 ms per loop
1 loops, best of 3: 660 ms per loop
1 loops, best of 3: 877 ms per loop
1 loops, best of 3: 639 ms per loop
1 loops, best of 3: 694 ms per loop
1 loops, best of 3: 19.8 s per loop
1 loops, best of 3: 9.01 s per loop
1 loops, best of 3: 11.1 s per loop
1 loops, best of 3: 7.12 s per loop
1 loops, best of 3: 6.44 s per loop
1 loops, best of 3: 8.54 s per loop
1 loops, best of 3: 6.39 s per loop
1 loops, best of 3: 6.88 s per loop

Results (time in seconds, less is better)

In [19]:
scores
Out[19]:
scipy numpy cython numexpr parakeet theano fortran c++
100000 0.011209 0.006000 0.010787 0.007137 0.006539 0.008067 0.006031 0.016596
1000000 0.134079 0.062242 0.108327 0.071049 0.063404 0.081651 0.057135 0.083036
10000000 1.823849 0.857088 1.078784 0.713240 0.659757 0.876672 0.638656 0.694401
100000000 19.826547 9.006766 11.110369 7.124668 6.436616 8.542589 6.389464 6.884255

Comparison to numpy time (less is better)

In [20]:
normalized_scores = scores.copy()
for column in normalized_scores.columns:
    normalized_scores[column] /= scores['numpy']    
In [21]:
normalized_scores
Out[21]:
scipy numpy cython numexpr parakeet theano fortran c++
100000 1.868104 1 1.797846 1.189456 1.089899 1.344443 1.005108 2.765994
1000000 2.154161 1 1.740413 1.141503 1.018674 1.311835 0.917955 1.334087
10000000 2.127960 1 1.258662 0.832166 0.769766 1.022849 0.745146 0.810186
100000000 2.201295 1 1.233558 0.791035 0.714642 0.948464 0.709407 0.764343

Conclusion

Many libraries claim that they can speed up number crunching in python. Results of this test are floating (+- 0.1), but what we can see

  1. numpy turned out to be fastest at moderate sizes of arrays
  2. numpy implementation at least not more complex than others
  3. parakeet was the only to get sensible speed up

Technical info

In [22]:
import multiprocessing
print multiprocessing.cpu_count(), 'xeon cores'
16 xeon cores
In [23]:
numpy.__version__
Out[23]:
'1.10.1'
In [24]:
scipy.__version__
Out[24]:
'0.14.0'
In [25]:
import cython
cython.__version__
Out[25]:
'0.21.1'
In [26]:
numexpr.__version__
Out[26]:
'2.4'
In [27]:
parakeet.__version__
Out[27]:
'0.23.2'
In [28]:
theano.__version__
Out[28]:
'0.7.0'
In [29]:
!g++ -v
Using built-in specs.
COLLECT_GCC=g++
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.6/lto-wrapper
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu/Linaro 4.6.3-1ubuntu5' --with-bugurl=file:///usr/share/doc/gcc-4.6/README.Bugs --enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-4.6 --enable-shared --enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.6 --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --enable-gnu-unique-object --enable-plugin --enable-objc-gc --disable-werror --with-arch-32=i686 --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 4.6.3 (Ubuntu/Linaro 4.6.3-1ubuntu5) 
In [30]:
!gfortran -v
Using built-in specs.
COLLECT_GCC=gfortran
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.6/lto-wrapper
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu/Linaro 4.6.3-1ubuntu5' --with-bugurl=file:///usr/share/doc/gcc-4.6/README.Bugs --enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-4.6 --enable-shared --enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.6 --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --enable-gnu-unique-object --enable-plugin --enable-objc-gc --disable-werror --with-arch-32=i686 --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 4.6.3 (Ubuntu/Linaro 4.6.3-1ubuntu5) 

This post was written in IPython. You can download the notebook from repository.