Log-likelihood benchmark¶

This is a simple benchmark, which I use for basic test of vector-based computing engines.

The problem is to find log-likelihood of normal distribution: $\sum_i \log \left[ \dfrac{1}{\sqrt{2 \pi \sigma}} \exp \left( - \dfrac{(x_i - x)^2}{2 \sigma^2} \right) \right] $

There are elementwise subtraction, division, power, exponent, logarithm and summation of array, so this is broader test.

tested:

numpy + scipy's pdf
numpy
cython
numexpr
theano
parakeet
C++
FORTRAN

computing log-likelihood for normal distribution

Notes

Not optimizing computations here (but in theory theano and parakeet may remove unnecessary computations)
This test includes exp, log, division and summation of array
Everything is running on CPU in one thread, this limitation is for 'fairness' of tests

In [1]:

import numpy
import scipy
from scipy.stats import norm
import theano

Scipy implementation¶

In [2]:

def llh_scipy(data, mean, sigma):
    lh = norm(mean, sigma).pdf(data)
    return numpy.log(lh).sum()

Numpy implementation¶

In [3]:

def llh_numpy(data, mean, sigma):
    s = (data - mean) ** 2 / (2 * (sigma ** 2))
    pdfs = numpy.exp(- s)
    pdfs /= numpy.sqrt(2 * numpy.pi) * sigma
    return numpy.log(pdfs).sum()

Cython implementation¶

In [4]:

%load_ext Cython

In [5]:

%%cython
cdef extern from "math.h":
    double sqrt(double m)
    double exp(double m)
    double log(double m)

import cython
import numpy as np

cimport numpy as np
from numpy cimport ndarray
pi = np.pi

@cython.boundscheck(False)
@cython.wraparound(False)
def llh_cython(ndarray[np.float64_t, ndim=1] data, double mean, double sigma):
    cdef int l = len(data)
    cdef double llh = 0
    cdef double s = 0
    for i in xrange(l):
        s = (data[i] - mean) ** 2 / (2 * (sigma ** 2))
        llh += log(exp(-s) / (sqrt(2 * pi) * sigma))
    return llh

Numexpr implementation¶

In [6]:

import numexpr
numexpr.set_num_threads(1)

def llh_numexpr(data, mean, sigma):
    expression = 'sum(log(exp(- (data-mean) **2  / (2 * sigma ** 2)) / (sqrt(2 * pi) * sigma)))'
    return numexpr.evaluate(expression, local_dict=dict(data=data, mean=mean, sigma=sigma, pi=numpy.pi))

Parakeet implementation¶

In [7]:

import parakeet
parakeet.config.backend = 'c'

@parakeet.jit 
def llh_parakeet(data, mean, sigma):
    s = (data - mean) ** 2 / (2 * (sigma ** 2))
    pdfs = numpy.exp(- s)
    pdfs /= numpy.sqrt(2 * numpy.pi) * sigma
    return numpy.log(pdfs).sum()

Theano implementation¶

In [8]:

import theano
import theano.tensor as T

print theano.config.device

cpu

In [9]:

theano.config.openmp

Out[9]:

False

In [10]:

def llh_theano(data, mean, sigma):
    s = (data - mean) ** 2 / (2 * (sigma ** 2))
    pdfs = T.exp(- s)
    pdfs /= T.sqrt(2 * numpy.pi) * sigma
    return T.log(pdfs).sum()

mean, sigma = T.scalars('m', 's')
d = T.vector('data')

llh_theano = theano.function([d, mean, sigma], llh_theano(d, mean, sigma))

FORTRAN implementation¶

In [11]:

%load_ext fortranmagic

In [12]:

%%fortran 
subroutine llh_fortran(data, mean, sigma, result)
    real*8, dimension(:), intent(in) :: data
    real*8, intent(in) :: mean, sigma
    real*8, intent(out) :: result
            
    real*8, dimension(size(data, 1)) :: s
    real*8, parameter :: PI = 3.14159265358979323846

    s = (data - mean) ** 2 / (2 * sigma ** 2)
    s = exp(- s) / (sqrt(2 * PI) * sigma)
    result = sum(log(s))
    
end subroutine llh_fortran

C++ implementation for comparison of speed¶

we are neither passing, nor returning anything in c++. Just doing same operations in C++ for some array to compare speed.

Mind the overhead for creating new process - it is essential for small sizes.

In [13]:

%%writefile test_speed.cpp
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// using namespace std;


int main(int argc, char** argv) {
    if (argc < 4){
        std::cout << "run with n_samples, mean, sigma!";
        return 1;
    }
    int size = atoi(argv[1]);
    double mean = atof(argv[2]);
    double sigma = atof(argv[3]);
    
    double * data = new double[size];
    double factor = 1. / size;
    for (int i=0; i<size; ++i){
        data[i] = i * factor;
    }
    double result = 0.;
    double s = 0.;
    double x = 0.;
    
    for (int i=0; i<size; ++i){
        x = (data[i] - mean);
        s =  x * x / (2 * (sigma * sigma));
        result += log(exp(-s) / (sqrt(2 * M_PI) * sigma));
    }
    
    std::cout << std::endl << result << std::endl;
    return 0;
}

Overwriting test_speed.cpp

In [14]:

!g++ test_speed.cpp -o test_speed -O3

In [15]:

def llh_cpp(data, mean, sigma):
    size = len(data)
    out = !./test_speed {len(data)} {mean} {sigma}

Data generation, checking that all functions output the same value¶

In [16]:

from collections import OrderedDict
functions = OrderedDict()
functions['scipy'] = llh_scipy
functions['numpy'] = llh_numpy
functions['cython'] = llh_cython
functions['numexpr'] = llh_numexpr
functions['parakeet'] = llh_parakeet
functions['theano'] = llh_theano
functions['fortran'] = llh_fortran
functions['c++'] = llh_cpp

In [17]:

data = numpy.random.normal(size=1000000).astype('float64')
for name, function in functions.items():
    print name, function(data, 0.1, 1.1)

scipy -1431841.8928
numpy -1431841.8928
cython -1431841.8928
numexpr -1431841.8928
parakeet -1431841.8928
theano -1431841.8928
fortran -1431841.90671
c++ None

In [18]:

import timeit 
sizes = [10 ** 5, 10 ** 6, 10 ** 7, 10 ** 8]
import pandas
scores = pandas.DataFrame(data=0, columns=functions.keys(), index=sizes)
for size in sizes:
    for name, function in functions.items():
        data = numpy.random.normal(size=size).astype('float64')
        result = %timeit -o function(data, 0.1, 1.1)
        scores.loc[size, name] = result.best

100 loops, best of 3: 11.2 ms per loop
100 loops, best of 3: 6 ms per loop
100 loops, best of 3: 10.8 ms per loop
100 loops, best of 3: 7.14 ms per loop
100 loops, best of 3: 6.54 ms per loop
100 loops, best of 3: 8.07 ms per loop
100 loops, best of 3: 6.03 ms per loop
100 loops, best of 3: 16.6 ms per loop
10 loops, best of 3: 134 ms per loop
10 loops, best of 3: 62.2 ms per loop
10 loops, best of 3: 108 ms per loop
10 loops, best of 3: 71 ms per loop
10 loops, best of 3: 63.4 ms per loop
10 loops, best of 3: 81.7 ms per loop
10 loops, best of 3: 57.1 ms per loop
10 loops, best of 3: 83 ms per loop
1 loops, best of 3: 1.82 s per loop
1 loops, best of 3: 857 ms per loop
1 loops, best of 3: 1.08 s per loop
1 loops, best of 3: 713 ms per loop
1 loops, best of 3: 660 ms per loop
1 loops, best of 3: 877 ms per loop
1 loops, best of 3: 639 ms per loop
1 loops, best of 3: 694 ms per loop
1 loops, best of 3: 19.8 s per loop
1 loops, best of 3: 9.01 s per loop
1 loops, best of 3: 11.1 s per loop
1 loops, best of 3: 7.12 s per loop
1 loops, best of 3: 6.44 s per loop
1 loops, best of 3: 8.54 s per loop
1 loops, best of 3: 6.39 s per loop
1 loops, best of 3: 6.88 s per loop

Results (time in seconds, less is better)¶

In [19]:

scores

Out[19]:

	scipy	numpy	cython	numexpr	parakeet	theano	fortran	c++
100000	0.011209	0.006000	0.010787	0.007137	0.006539	0.008067	0.006031	0.016596
1000000	0.134079	0.062242	0.108327	0.071049	0.063404	0.081651	0.057135	0.083036
10000000	1.823849	0.857088	1.078784	0.713240	0.659757	0.876672	0.638656	0.694401
100000000	19.826547	9.006766	11.110369	7.124668	6.436616	8.542589	6.389464	6.884255

Comparison to numpy time (less is better)¶

In [20]:

normalized_scores = scores.copy()
for column in normalized_scores.columns:
    normalized_scores[column] /= scores['numpy']

In [21]:

normalized_scores

Out[21]:

	scipy	numpy	cython	numexpr	parakeet	theano	fortran	c++
100000	1.868104	1	1.797846	1.189456	1.089899	1.344443	1.005108	2.765994
1000000	2.154161	1	1.740413	1.141503	1.018674	1.311835	0.917955	1.334087
10000000	2.127960	1	1.258662	0.832166	0.769766	1.022849	0.745146	0.810186
100000000	2.201295	1	1.233558	0.791035	0.714642	0.948464	0.709407	0.764343

Conclusion¶

Many libraries claim that they can speed up number crunching in python. Results of this test are floating (+- 0.1), but what we can see

numpy turned out to be fastest at moderate sizes of arrays
numpy implementation at least not more complex than others
parakeet was the only to get sensible speed up

Technical info¶

In [22]:

import multiprocessing
print multiprocessing.cpu_count(), 'xeon cores'

16 xeon cores

In [23]:

numpy.__version__

Out[23]:

'1.10.1'

In [24]:

scipy.__version__

Out[24]:

'0.14.0'

In [25]:

import cython
cython.__version__

Out[25]:

'0.21.1'

In [26]:

numexpr.__version__

Out[26]:

'2.4'

In [27]:

parakeet.__version__

Out[27]:

'0.23.2'

In [28]:

theano.__version__

Out[28]:

'0.7.0'

In [29]:

!g++ -v

Using built-in specs.
COLLECT_GCC=g++
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.6/lto-wrapper
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu/Linaro 4.6.3-1ubuntu5' --with-bugurl=file:///usr/share/doc/gcc-4.6/README.Bugs --enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-4.6 --enable-shared --enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.6 --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --enable-gnu-unique-object --enable-plugin --enable-objc-gc --disable-werror --with-arch-32=i686 --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 4.6.3 (Ubuntu/Linaro 4.6.3-1ubuntu5)

In [30]:

!gfortran -v

Using built-in specs.
COLLECT_GCC=gfortran
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.6/lto-wrapper
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu/Linaro 4.6.3-1ubuntu5' --with-bugurl=file:///usr/share/doc/gcc-4.6/README.Bugs --enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-4.6 --enable-shared --enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.6 --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --enable-gnu-unique-object --enable-plugin --enable-objc-gc --disable-werror --with-arch-32=i686 --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 4.6.3 (Ubuntu/Linaro 4.6.3-1ubuntu5)