

#include <flens/flens.h>
#include <timer.h>
using namespace flens;
using namespace std;
typedef SparseGeMatrix<CRS<double> > Mat;
typedef DenseVector<Array<double> > Vec;
void
matrixFill(Mat &A)
{
int n = A.numRows();
for (int i=1; i<=n; ++i) {
if (i>1) {
A(i,i-1) += -1;
}
A(i,i) += 4;
if (i<n) {
A(i,i+1) += -1;
}
}
A.finalize();
}
int
main(int argc, char **argv)
{
int n = 10000000;
if (argc>1) {
n = atoi(argv[1]);
}
cout << "n = " << n << endl;
Mat A(n, n, 3);
Vec b(n), x(n), r(n);
b = 2;
b(1) = b(n) = 3;
x = 0.99;
{
timer t;
t.tic();
matrixFill(A);
cout << "fill: " << t.toc() << endl;
}
{
timer t;
t.tic();
for (int k=1; k<=20; ++k) {
r = b - A*x;
}
cout << "r = b - A*x: " << t.toc()
<< " (" << nrm2(r) << ")" << endl;
}
{
timer t;
t.tic();
for (int k=1; k<=20; ++k) {
r = A*x - b;
}
cout << "r = A*x - b: " << t.toc()
<< " (" << nrm2(r) << ")" << endl;
}
{
timer t;
t.tic();
for (int k=1; k<=20; ++k) {
r = -A*x;
}
cout << "r = -A*x: " << t.toc()
<< " (" << nrm2(r) << ")" << endl;
}
}
Note that function matrixFill does initialize the sparse matrix row-wise:
for (int i=1; i<=n; ++i) {
if (i>1) {
A(i,i-1) += -1;
}
A(i,i) += 4;
if (i<n) {
A(i,i+1) += -1;
}
}
A.finalize();
So far FLENS is (kind of) optimized for a random order initialization which typically occurs in the assembling phase of the Finite Element Method. So runtime results show that there is potential to further optimize the row-wise initialization:
lehn@node31:~/flens_new/FLENS-lite/examples> make residual_sparse g++ -DNDEBUG -Wall -g -O3 -fPIC -I /home/lehn/flens_new/FLENS-lite/examples/.. -I. -I /cluster/include -I /cluster/ATLAS3.6.0/include -I . -o residual_sparse residual_sparse.cc -L.. -lflens -L /cluster/lib -L /cluster/ATLAS3.6.0/lib/Linux_HAMMER64SSE2_2/ -llapack -lcblas -lf77blas -latlas -lg2c lehn@node31:~/flens_new/FLENS-lite/examples> lehn@node31:~/flens_new/FLENS-lite/examples> lehn@node31:~/flens_new/FLENS-lite/examples> lehn@node31:~/flens_new/FLENS-lite/examples> ./residual_sparse n = 10000000 fill: 2.82 r = b - A*x: 4.71 (63.2456) r = A*x - b: 5.3 (63.2456) r = -A*x: 3.85 (6261.31) lehn@node31:~/flens_new/FLENS-lite/examples>
So times are
This test is based on a uBLAS example ex_residual2.cpp:
// -*- c++ -*- //
/** \file main.cpp \brief main routines */
/***************************************************************************
- begin : 2003-09-05
- copyright : (C) 2003 by Gunter Winkler
- email : guwi17@gmx.de
***************************************************************************/
/***************************************************************************
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
***************************************************************************/
#include <iostream>
#include <boost/timer.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/operation.hpp>
#ifndef NDEBUG
const size_t default_size = 100;
#else
const size_t default_size = 10000000;
#endif
const size_t subsize = 3;
using namespace boost::numeric::ublas ;
using std::cout;
using std::endl;
namespace boost { namespace numeric { namespace ublas {
/** \brief compute residual r = b - Ax
*
*/
template<class V, class T1, class IA1, class TA1, class E2>
void.
residual ( const compressed_matrix<T1, row_major, 0, IA1, TA1> & A,
const vector_expression<E2> &x,
const vector_expression<E2> &b,
V &r, row_major_tag) {
BOOST_UBLAS_CHECK( x().size() >= A.size2(), external_logic() );
BOOST_UBLAS_CHECK( b().size() >= A.size1(), external_logic() );
BOOST_UBLAS_CHECK( r.size() >= A.size1(), external_logic() );
typedef typename V::size_type size_type;
typedef typename V::value_type value_type;
for (size_type i = 0; i < A.size1 (); ++ i) {
size_type begin = A.index1_data () [i];
size_type end = A.index1_data () [i + 1];
value_type t (b() (i));
for (size_type j = begin; j < end; ++ j)
t -= A.value_data () [j] * x () (A.index2_data () [j]);
r (i) = t;
}
return;
}
}}}
template <class MAT>
void matrix_fill(MAT& A)
{
typedef typename MAT::size_type size_type;
for (size_type i=0; i<A.size1(); ++i)|{
if (i>=1) { A.push_back(i,i-1,-1.0); }
A.push_back(i,i,4);
if (i+1<A.size2()) { A.push_back(i,i+1,-1.0); }
}
}
int main(int argc, char *argv[])
{
typedef size_t size_type;
size_type size = default_size;
if (argc > 1)
size = ::atoi (argv [1]);
cout << "size " << size << endl;
compressed_matrix<double, row_major> A(size,size);
{
boost::timer t;
t.restart();
matrix_fill(A);
cout << "fill " << t.elapsed() << endl;
}
vector<double> b( scalar_vector<double>(size, 2.0) );
b(0)=3;.
b(size-1)=3;
vector<double> x( scalar_vector<double>(size, 0.99) );
{
boost::timer t;
t.restart();
vector<double> r(size);
for (int k=1; k<=20; ++k) {
r = prod(A, x) - b;
}
cout << "r=Ax-b : " << t.elapsed() << " (" << norm_2(r) << ")" << endl;
}
{
boost::timer t;
t.restart();
vector<double> r(size);
for (int k=1; k<=20; ++k) {
r = b - prod(A, x);
}
cout << "r=b - Ax : " << t.elapsed() << " (" << norm_2(r) << ")" << endl;
}
{
boost::timer t;
t.restart();
vector<double> r(size);
for (int k=1; k<=20; ++k) {
noalias(r) = b - prod(A, x);
}
cout << "noalias(r)=b-Ax : " << t.elapsed() << " (" << norm_2(r) << ")" << endl;
}
{
boost::timer t;
t.restart();
for (int k=1; k<=20; ++k) {
vector<double> r(b - prod(A, x));
}
cout << "vector<double> r(b - prod(A, x)) : " << t.elapsed() << endl;
}
{
boost::timer t;
t.restart();
vector<double> r(b);
for (int k=1; k<=20; ++k) {
axpy_prod(A,-x,r,false);
}
cout << "axpy_prod(A,-x,r,false) : " << t.elapsed() << " (" << norm_2(r) << ")" << endl;
}
{
boost::timer t;
t.restart();
vector<double> r(size);
for (int k=1; k<=20; ++k) {
residual(A,x,b,r,row_major_tag());
}
cout << "residual(A,x,b,r,row_major_tag()) : " << t.elapsed() << " (" << norm_2(r) << ")" << endl;
}
return EXIT_SUCCESS;
}
In this code different variants are used for computing the residual. I get the following runtime results (using Boost 1.34.1):
lehn@node31:~/ublas2_test> make
g++ -DNDEBUG -ftemplate-depth-50 -funroll-loops -O3 -I. -Wall -o ex_residual ex_residual.cpp
lehn@node31:~/ublas2_test>
lehn@node31:~/ublas2_test>
lehn@node31:~/ublas2_test>
lehn@node31:~/ublas2_test>
lehn@node31:~/ublas2_test> ./ex_residual
size 10000000
fill 1.68
r=Ax-b : 11.97 (63.2456)
r=b - Ax : 12.32 (63.2456)
noalias(r)=b-Ax : 12.85 (63.2456)
vector<double> r(b - prod(A, x)) : 14.16
axpy_prod(A,-x,r,false) : 7.78 (118902)
residual(A,x,b,r,row_major_tag()) : 8.32 (63.2456)
lehn@node31:~/ublas2_test> ./ex_residual
size 10000000
fill 1.7
r=Ax-b : 11.98 (63.2456)
r=b - Ax : 12.02 (63.2456)
noalias(r)=b-Ax : 10.93 (63.2456)
vector<double> r(b - prod(A, x)) : 13.56
axpy_prod(A,-x,r,false) : 7.84 (118902)
residual(A,x,b,r,row_major_tag()) : 8.32 (63.2456)
lehn@node31:~/ublas2_test>
So uBLAS obviously handles the row-wise initialization better. However, the fastest variant for computing the residual (8.32s) is slightly slower than in the FLENS example (4.71s).