System

Test 1: Residual

For
this test case computes
In the examples directory of FLENS you can find the following simple source file:
  #include <flens/flens.h>
  #include <timer.h>
  
  using namespace flens;
  using namespace std;
  
  typedef SparseGeMatrix<CRS<double> > Mat;
  typedef DenseVector<Array<double> >  Vec;
  
  void
  matrixFill(Mat &A)
  {
      int n = A.numRows();
      for (int i=1; i<=n; ++i) {
          if (i>1) {
              A(i,i-1) += -1;
          }
          A(i,i) += 4;
          if (i<n) {
              A(i,i+1) += -1;
          }
      }
      A.finalize();
  }
  
  int
  main(int argc, char **argv)
  {
      int n = 10000000;
      if (argc>1) {
          n = atoi(argv[1]);
      }
      cout << "n = " << n << endl;
  
      Mat A(n, n, 3);
      Vec b(n), x(n), r(n);
      b = 2;
      b(1) = b(n) = 3;
      x = 0.99;
  
      {
          timer t;
  
          t.tic();
          matrixFill(A);
          cout << "fill: " << t.toc() << endl;
      }
  
      {
          timer t;
  
          t.tic();
          for (int k=1; k<=20; ++k) {
              r = b - A*x;
          }
          cout << "r = b - A*x: " << t.toc()
               << " (" << nrm2(r) << ")" << endl;
      }
  
      {
          timer t;
  
          t.tic();
          for (int k=1; k<=20; ++k) {
              r = A*x - b;
          }
          cout << "r = A*x - b: " << t.toc()
               << " (" << nrm2(r) << ")" << endl;
      }
  
      {
          timer t;
  
          t.tic();
          for (int k=1; k<=20; ++k) {
              r = -A*x;
          }
          cout << "r = -A*x: " << t.toc()
               << " (" << nrm2(r) << ")" << endl;
      }
  
  }

Note that function matrixFill does initialize the sparse matrix row-wise:

    for (int i=1; i<=n; ++i) {
        if (i>1) {
            A(i,i-1) += -1;
        }
        A(i,i) += 4;
        if (i<n) {
            A(i,i+1) += -1;
        }
    }
    A.finalize();

So far FLENS is (kind of) optimized for a random order initialization which typically occurs in the assembling phase of the Finite Element Method. So runtime results show that there is potential to further optimize the row-wise initialization:

  lehn@node31:~/flens_new/FLENS-lite/examples> make residual_sparse
  g++ -DNDEBUG -Wall -g -O3 -fPIC -I /home/lehn/flens_new/FLENS-lite/examples/.. -I.
   -I /cluster/include -I /cluster/ATLAS3.6.0/include -I . -o residual_sparse
   residual_sparse.cc -L.. -lflens -L /cluster/lib -L /cluster/ATLAS3.6.0/lib/Linux_HAMMER64SSE2_2/
   -llapack -lcblas -lf77blas -latlas -lg2c
  lehn@node31:~/flens_new/FLENS-lite/examples>
  lehn@node31:~/flens_new/FLENS-lite/examples>
  lehn@node31:~/flens_new/FLENS-lite/examples>
  lehn@node31:~/flens_new/FLENS-lite/examples> ./residual_sparse
  n = 10000000
  fill: 2.82
  r = b - A*x: 4.71 (63.2456)
  r = A*x - b: 5.3 (63.2456)
  r = -A*x: 3.85 (6261.31)
  lehn@node31:~/flens_new/FLENS-lite/examples>

So times are

This test is based on a uBLAS example ex_residual2.cpp:

    // -*- c++ -*- //
    /** \file main.cpp  \brief main routines */
    /***************************************************************************
     -   begin                : 2003-09-05
     -   copyright            : (C) 2003 by Gunter Winkler
     -   email                : guwi17@gmx.de
     ***************************************************************************/
  
    /***************************************************************************
     *                                                                         *
     *   This program is free software; you can redistribute it and/or modify  *
     *   it under the terms of the GNU General Public License as published by  *
     *   the Free Software Foundation; either version 2 of the License, or     *
     *   (at your option) any later version.                                   *
     *                                                                         *
     ***************************************************************************/
  
    #include <iostream>
  
    #include <boost/timer.hpp>
  
    #include <boost/numeric/ublas/matrix.hpp>
    #include <boost/numeric/ublas/matrix_sparse.hpp>
    #include <boost/numeric/ublas/vector.hpp>
    #include <boost/numeric/ublas/io.hpp>
    #include <boost/numeric/ublas/operation.hpp>
  
  
    #ifndef NDEBUG
    const size_t default_size = 100;
    #else
    const size_t default_size = 10000000;
    #endif
  
    const size_t subsize = 3;
  
    using namespace boost::numeric::ublas ;
  
    using std::cout;
    using std::endl;
  
    namespace boost { namespace numeric { namespace ublas {
  
        /** \brief compute residual r = b - Ax
         *
         */
        template<class V, class T1, class IA1, class TA1, class E2>
        void.
        residual ( const compressed_matrix<T1, row_major, 0, IA1, TA1> & A,
                   const vector_expression<E2> &x,
                   const vector_expression<E2> &b,
                   V &r, row_major_tag) {
  
            BOOST_UBLAS_CHECK( x().size() >= A.size2(), external_logic() );
            BOOST_UBLAS_CHECK( b().size() >= A.size1(), external_logic() );
            BOOST_UBLAS_CHECK( r.size() >= A.size1(), external_logic() );
  
            typedef typename V::size_type size_type;
            typedef typename V::value_type value_type;
  
            for (size_type i = 0; i < A.size1 (); ++ i) {
                size_type begin = A.index1_data () [i];
                size_type end = A.index1_data () [i + 1];
                value_type t (b() (i));
                for (size_type j = begin; j < end; ++ j)
                    t -= A.value_data () [j] * x () (A.index2_data () [j]);
                r (i) = t;
            }
            return;
        }
  
    }}}
  
    template <class MAT>
    void matrix_fill(MAT& A)
    {
      typedef typename MAT::size_type  size_type;
  
      for (size_type i=0; i<A.size1(); ++i)|{
        if (i>=1) { A.push_back(i,i-1,-1.0); }
        A.push_back(i,i,4);
        if (i+1<A.size2()) { A.push_back(i,i+1,-1.0); }
      }
    }
  
    int main(int argc, char *argv[])
    {
  
      typedef size_t  size_type;
  
      size_type size = default_size;
  
      if (argc > 1)
        size = ::atoi (argv [1]);
  
      cout << "size " << size << endl;
  
      compressed_matrix<double, row_major> A(size,size);
      {
        boost::timer t;
        t.restart();
        matrix_fill(A);
        cout << "fill " << t.elapsed() << endl;
      }
  
      vector<double>                       b( scalar_vector<double>(size, 2.0) );
      b(0)=3;.
      b(size-1)=3;
  
      vector<double>                       x( scalar_vector<double>(size, 0.99) );
  
      {
        boost::timer t;
        t.restart();
        vector<double>                       r(size);
        for (int k=1; k<=20; ++k) {
            r = prod(A, x) - b;
        }
        cout << "r=Ax-b : " << t.elapsed() << " (" << norm_2(r) << ")" << endl;
      }
  
      {
        boost::timer t;
        t.restart();
        vector<double>                       r(size);
        for (int k=1; k<=20; ++k) {
            r = b - prod(A, x);
        }
        cout << "r=b - Ax : " << t.elapsed() << " (" << norm_2(r) << ")" << endl;
      }
  
      {
        boost::timer t;
        t.restart();
        vector<double>                       r(size);
        for (int k=1; k<=20; ++k) {
            noalias(r) = b - prod(A, x);
        }
        cout << "noalias(r)=b-Ax : " << t.elapsed() << " (" << norm_2(r) << ")" << endl;
      }
  
      {
        boost::timer t;
        t.restart();
        for (int k=1; k<=20; ++k) {
            vector<double>                       r(b - prod(A, x));
        }
        cout << "vector<double> r(b - prod(A, x)) : " << t.elapsed() << endl;
      }
  
      {
        boost::timer t;
        t.restart();
        vector<double>                       r(b);
        for (int k=1; k<=20; ++k) {
            axpy_prod(A,-x,r,false);
        }
        cout << "axpy_prod(A,-x,r,false) : " << t.elapsed() << " (" << norm_2(r) << ")" << endl;
      }
  
      {
        boost::timer t;
        t.restart();
        vector<double>                       r(size);
        for (int k=1; k<=20; ++k) {
            residual(A,x,b,r,row_major_tag());
        }
        cout << "residual(A,x,b,r,row_major_tag()) : " << t.elapsed() << " (" << norm_2(r) << ")" << endl;
      }
  
      return EXIT_SUCCESS;
    }

In this code different variants are used for computing the residual. I get the following runtime results (using Boost 1.34.1):

    lehn@node31:~/ublas2_test> make
    g++ -DNDEBUG -ftemplate-depth-50 -funroll-loops -O3 -I. -Wall -o ex_residual ex_residual.cpp
    lehn@node31:~/ublas2_test>
    lehn@node31:~/ublas2_test>
    lehn@node31:~/ublas2_test>
    lehn@node31:~/ublas2_test>
    lehn@node31:~/ublas2_test> ./ex_residual
    size 10000000
    fill 1.68
    r=Ax-b : 11.97 (63.2456)
    r=b - Ax : 12.32 (63.2456)
    noalias(r)=b-Ax : 12.85 (63.2456)
    vector<double> r(b - prod(A, x)) : 14.16
    axpy_prod(A,-x,r,false) : 7.78 (118902)
    residual(A,x,b,r,row_major_tag()) : 8.32 (63.2456)
    lehn@node31:~/ublas2_test> ./ex_residual
    size 10000000
    fill 1.7
    r=Ax-b : 11.98 (63.2456)
    r=b - Ax : 12.02 (63.2456)
    noalias(r)=b-Ax : 10.93 (63.2456)
    vector<double> r(b - prod(A, x)) : 13.56
    axpy_prod(A,-x,r,false) : 7.84 (118902)
    residual(A,x,b,r,row_major_tag()) : 8.32 (63.2456)
    lehn@node31:~/ublas2_test>

So uBLAS obviously handles the row-wise initialization better. However, the fastest variant for computing the residual (8.32s) is slightly slower than in the FLENS example (4.71s).