1 files changed, 1153 insertions, 0 deletions
diff --git a/tests/linpack.c b/tests/linpack.c
new file mode 100644
index 00000000..363415c3
--- /dev/null
+++ b/tests/linpack.c
@@ -0,0 +1,1153 @@
+/*      gcc linpack.c cpuidc64.o cpuida64.o -m64 -lrt -lc -lm -o linpack
+ *
+ *          Linpack 100x100 Benchmark In C/C++ For PCs
+ *     
+ *  Different compilers can produce different floating point numeric
+ *  results, probably due to compiling instructions in a different
+ *  sequence. As the program checks these, they may need to be changed.
+ *  The log file indicates non-standard results and these values can
+ *  be copied and pasted into this program. See // Values near the
+ *  end of main().
+ *
+ *  Different compilers do not optimise the code in the same way.
+ *  This can lead to wide variations in benchmark speeds. See results
+ *  with MS6 compiler ID and compare with those from same CPUs from
+ *  the Watcom compiler generated code.
+ *
+ ***************************************************************************
+*/
+
+#define _CRT_SECURE_NO_WARNINGS 1
+#ifdef WIN32
+#include <Windows.h>
+#else
+#include <sys/time.h>
+#endif
+
+#define UNROLL
+#define DP
+
+#ifdef SP
+#define REAL float
+#define ZERO 0.0
+#define ONE 1.0
+#define PREC "Single"
+#endif
+
+#ifdef DP
+#define REAL double
+#define ZERO 0.0e0
+#define ONE 1.0e0
+#define PREC "Double"
+#endif
+
+#ifdef ROLL
+#define ROLLING "Rolled"
+#endif
+#ifdef UNROLL
+#define ROLLING "Unrolled"
+#endif
+
+ // VERSION
+               
+ #ifdef CNNT
+    #define options   "Non-optimised"
+    #define opt "0"
+ #else
+//    #define options   "Optimised"
+    #define options   "Opt 3 64 Bit"
+    #define opt "1"
+ #endif
+
+#define NTIMES 10
+
+#include <stdio.h>
+#include <math.h>
+#include <stdlib.h>
+#include <time.h> 
+
+
+/* this is truly rank, but it's minimally invasive, and lifted in part from the STREAM scores */
+
+static double secs;
+
+#ifndef WIN32
+
+double mysecond()
+{
+        struct timeval tp;
+        struct timezone tzp;
+        int i;
+
+        i = gettimeofday(&tp,&tzp);
+        return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
+}
+#else
+
+double mysecond()
+{
+	static LARGE_INTEGER freq = {0};
+	LARGE_INTEGER count = {0};
+	if(freq.QuadPart == 0LL) {
+		QueryPerformanceFrequency(&freq);
+	}
+	QueryPerformanceCounter(&count);
+	return (double)count.QuadPart / (double)freq.QuadPart;
+}
+
+#endif
+
+void start_time()
+{
+	secs = mysecond();
+}
+
+void end_time()
+{
+	secs = mysecond() - secs;
+}
+
+void print_time (int row);
+void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma);
+void dgefa (REAL a[], int lda, int n, int ipvt[], int *info);
+void dgesl (REAL a[],int lda,int n,int ipvt[],REAL b[],int job);
+void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]);
+void daxpy (int n, REAL da, REAL dx[], int incx, REAL dy[], int incy);
+REAL epslon (REAL x);
+int idamax (int n, REAL dx[], int incx);
+void dscal (int n, REAL da, REAL dx[], int incx);
+REAL ddot (int n, REAL dx[], int incx, REAL dy[], int incy);
+
+static REAL atime[9][15];
+double runSecs = 1;
+
+
+int main (int argc, char *argv[])
+{
+        static REAL aa[200*200],a[200*201],b[200],x[200];       
+        REAL cray,ops,total,norma,normx;
+        REAL resid,residn,eps,tm2,epsn,x1,x2;
+        REAL mflops;
+        static int ipvt[200],n,i,j,ntimes,info,lda,ldaa;
+        int endit, pass, loop;
+        REAL overhead1, overhead2, time2;
+        REAL max1, max2;
+        char was[5][20];
+        char expect[5][20];
+        char title[5][20];
+        int errors;
+        
+ 
+        printf("\n");
+         
+        printf("##########################################\n"); 
+
+
+    
+        lda = 201;
+        ldaa = 200;
+        cray = .056; 
+        n = 100;
+
+        fprintf(stdout, "%s ", ROLLING);
+        fprintf(stdout, "%s ", PREC);
+        fprintf(stdout,"Precision Linpack Benchmark - PC Version in 'C/C++'\n\n");
+
+        fprintf(stdout,"Optimisation %s\n\n",options);
+
+        ops = (2.0e0*(n*n*n))/3.0 + 2.0*(n*n);
+
+        matgen(a,lda,n,b,&norma);
+        start_time();
+        dgefa(a,lda,n,ipvt,&info);
+        end_time();
+        atime[0][0] = secs;
+        start_time();
+        dgesl(a,lda,n,ipvt,b,0);
+        end_time();
+        atime[1][0] = secs;
+        total = atime[0][0] + atime[1][0];
+
+/*     compute a residual to verify results.  */ 
+
+        for (i = 0; i < n; i++) {
+                x[i] = b[i];
+        }
+        matgen(a,lda,n,b,&norma);
+        for (i = 0; i < n; i++) {
+                b[i] = -b[i];
+        }
+        dmxpy(n,b,n,lda,x,a);
+        resid = 0.0;
+        normx = 0.0;
+        for (i = 0; i < n; i++) {
+                resid = (resid > fabs((double)b[i])) 
+                        ? resid : fabs((double)b[i]);
+                normx = (normx > fabs((double)x[i])) 
+                        ? normx : fabs((double)x[i]);
+        }
+        eps = epslon(ONE);
+        residn = resid/( n*norma*normx*eps );
+        epsn = eps;
+        x1 = x[0] - 1;
+        x2 = x[n-1] - 1;
+        
+        printf("norm resid      resid           machep");
+        printf("         x[0]-1          x[n-1]-1\n");
+        printf("%6.1f %17.8e%17.8e%17.8e%17.8e\n\n",
+               (double)residn, (double)resid, (double)epsn, 
+               (double)x1, (double)x2);
+
+        printf("Times are reported for matrices of order        %5d\n",n);
+        printf("1 pass times for array with leading dimension of%5d\n\n",lda);
+        printf("      dgefa      dgesl      total     Mflops       unit");
+        printf("      ratio\n");
+
+        atime[2][0] = total;
+        if (total > 0.0)
+        {
+            atime[3][0] = ops/(1.0e6*total);
+            atime[4][0] = 2.0/atime[3][0];
+        }
+        else
+        {
+            atime[3][0] = 0.0;
+            atime[4][0] = 0.0;
+        }
+        atime[5][0] = total/cray;
+       
+        print_time(0);
+
+/************************************************************************
+ *       Calculate overhead of executing matgen procedure              *
+ ************************************************************************/
+       
+        printf("\nCalculating matgen overhead\n");
+        pass = -20;
+        loop = NTIMES;
+        do
+        {
+            start_time();
+            pass = pass + 1;        
+            for ( i = 0 ; i < loop ; i++)
+            {
+                 matgen(a,lda,n,b,&norma);
+            }
+            end_time();
+            overhead1 = secs;
+            printf("%10d times %6.2f seconds\n", loop, overhead1);
+            if (overhead1 > runSecs)
+            {
+                pass = 0;
+            }
+            if (pass < 0)
+            {
+                if (overhead1 < 0.1)
+                {
+                    loop = loop * 10;
+                }
+                else
+                {
+                    loop = loop * 2;
+                }
+            }
+        }
+        while (pass < 0);
+        
+        overhead1 = overhead1 / (double)loop;
+
+        printf("Overhead for 1 matgen %12.5f seconds\n\n", overhead1);
+
+/************************************************************************
+ *           Calculate matgen/dgefa passes for runSecs seconds                *
+ ************************************************************************/
+       
+        printf("Calculating matgen/dgefa passes for %d seconds\n", (int)runSecs);
+        pass = -20;
+        ntimes = NTIMES;
+        do
+        {
+            start_time();
+            pass = pass + 1;        
+            for ( i = 0 ; i < ntimes ; i++)
+            {
+                matgen(a,lda,n,b,&norma);
+                dgefa(a,lda,n,ipvt,&info );
+            }
+            end_time();
+            time2 = secs;
+            printf("%10d times %6.2f seconds\n", ntimes, time2);
+            if (time2 > runSecs)
+            {
+                pass = 0;
+            }
+            if (pass < 0)
+            {
+                if (time2 < 0.1)
+                {
+                    ntimes = ntimes * 10;
+                }
+                else
+                {
+                    ntimes = ntimes * 2;
+                }
+            }
+        }
+        while (pass < 0);
+        
+        ntimes =  (int)(runSecs * (double)ntimes / time2);
+        if (ntimes == 0) ntimes = 1;
+
+        printf("Passes used %10d \n\n", ntimes);
+        printf("Times for array with leading dimension of%4d\n\n",lda);
+        printf("      dgefa      dgesl      total     Mflops       unit");
+        printf("      ratio\n");        
+
+/************************************************************************
+ *                              Execute 5 passes                        *
+ ************************************************************************/
+      
+        tm2 = ntimes * overhead1;
+        atime[3][6] = 0;
+
+        for (j=1 ; j<6 ; j++)
+        {
+            start_time();
+            for (i = 0; i < ntimes; i++)
+            {
+                matgen(a,lda,n,b,&norma);
+                dgefa(a,lda,n,ipvt,&info );
+            }
+            end_time();
+            atime[0][j] = (secs - tm2)/ntimes;
+
+            start_time();              
+            for (i = 0; i < ntimes; i++)
+            {
+                dgesl(a,lda,n,ipvt,b,0);
+            }
+            end_time();
+
+            atime[1][j] = secs/ntimes;
+            total       = atime[0][j] + atime[1][j];
+            atime[2][j] = total;
+            atime[3][j] = ops/(1.0e6*total);
+            atime[4][j] = 2.0/atime[3][j];
+            atime[5][j] = total/cray;
+            atime[3][6] = atime[3][6] + atime[3][j];
+            
+            print_time(j);
+        }
+        atime[3][6] = atime[3][6] / 5.0;
+        printf("Average                          %11.2f\n",
+                                               (double)atime[3][6]);        
+        
+        printf("\nCalculating matgen2 overhead\n");
+
+/************************************************************************
+ *             Calculate overhead of executing matgen procedure         *
+ ************************************************************************/
+
+        start_time();        
+        for ( i = 0 ; i < loop ; i++)
+        {
+            matgen(aa,ldaa,n,b,&norma);    
+        }
+        end_time();
+        overhead2 = secs;
+        overhead2 = overhead2 / (double)loop;
+        
+        printf("Overhead for 1 matgen %12.5f seconds\n\n", overhead2);
+        printf("Times for array with leading dimension of%4d\n\n",ldaa);
+        printf("      dgefa      dgesl      total     Mflops       unit");
+        printf("      ratio\n");
+
+/************************************************************************
+ *                              Execute 5 passes                        *
+ ************************************************************************/
+              
+        tm2 = ntimes * overhead2;
+        atime[3][12] = 0;
+
+        for (j=7 ; j<12 ; j++)
+        {
+            start_time();
+            for (i = 0; i < ntimes; i++)
+            {
+                matgen(aa,ldaa,n,b,&norma);
+                dgefa(aa,ldaa,n,ipvt,&info  );
+            }
+            end_time();
+            atime[0][j] = (secs - tm2)/ntimes;
+            
+            start_time();      
+            for (i = 0; i < ntimes; i++)
+            {
+                dgesl(aa,ldaa,n,ipvt,b,0);
+            }
+            end_time();
+            atime[1][j] = secs/ntimes;
+            total       = atime[0][j] + atime[1][j];
+            atime[2][j] = total;
+            atime[3][j] = ops/(1.0e6*total);
+            atime[4][j] = 2.0/atime[3][j];
+            atime[5][j] = total/cray;
+            atime[3][12] = atime[3][12] + atime[3][j];
+
+            print_time(j);
+        }
+        atime[3][12] = atime[3][12] / 5.0; 
+        printf("Average                          %11.2f\n",
+                                              (double)atime[3][12]);  
+
+/************************************************************************
+ *           Use minimum average as overall Mflops rating               *
+ ************************************************************************/
+      
+        mflops = atime[3][6];
+        if (atime[3][12] < mflops) mflops = atime[3][12];
+       
+        printf("\n");
+        printf( "%s ", ROLLING);
+        printf( "%s ", PREC);
+        printf(" Precision %11.2f Mflops \n\n",mflops);
+
+
+    max1 = 0;
+    for (i=1 ; i<6 ; i++)
+    {
+        if (atime[3][i] > max1) max1 = atime[3][i];                 
+    }
+
+    max2 = 0;
+    for (i=7 ; i<12 ; i++)
+    {                 
+        if (atime[3][i] > max2) max2 = atime[3][i];                 
+    }
+    if (max1 < max2) max2 = max1;
+   
+    sprintf(was[0], "%16.1f",(double)residn);
+    sprintf(was[1], "%16.8e",(double)resid);
+    sprintf(was[2], "%16.8e",(double)epsn);
+    sprintf(was[3], "%16.8e",(double)x1);
+    sprintf(was[4], "%16.8e",(double)x2);
+
+/*
+    //  Values for Watcom
+
+    sprintf(expect[0], "             0.4");
+    sprintf(expect[1], " 7.41628980e-014");
+    sprintf(expect[2], " 1.00000000e-015");
+    sprintf(expect[3], "-1.49880108e-014");
+    sprintf(expect[4], "-1.89848137e-014");
+    // Values for Visual C++
+
+    sprintf(expect[0], "             1.7");
+    sprintf(expect[1], " 7.41628980e-014");
+    sprintf(expect[2], " 2.22044605e-016");
+    sprintf(expect[3], "-1.49880108e-014");
+    sprintf(expect[4], "-1.89848137e-014");
+
+    // Values for Ubuntu GCC 32 Bit
+
+    sprintf(expect[0], "             1.9");
+    sprintf(expect[1], "  8.39915160e-14");
+    sprintf(expect[2], "  2.22044605e-16");
+    sprintf(expect[3], " -6.22835117e-14");
+    sprintf(expect[4], " -4.16333634e-14");
+*/
+
+     // Values for Ubuntu GCC 32 Bit
+
+    sprintf(expect[0], "             1.7");
+    sprintf(expect[1], "  7.41628980e-14");
+    sprintf(expect[2], "  2.22044605e-16");
+    sprintf(expect[3], " -1.49880108e-14");
+    sprintf(expect[4], " -1.89848137e-14");
+
+    sprintf(title[0], "norm. resid");
+    sprintf(title[1], "resid      ");
+    sprintf(title[2], "machep     ");
+    sprintf(title[3], "x[0]-1     ");
+    sprintf(title[4], "x[n-1]-1   ");
+
+    if (strtol(opt, NULL, 10) == 0)
+    {
+        sprintf(expect[2], " 8.88178420e-016");
+    }
+    errors = 0;
+
+    printf ("\n");
+}
+     
+/*----------------------*/ 
+void print_time (int row)
+
+{
+printf("%11.5f%11.5f%11.5f%11.2f%11.4f%11.4f\n",   (double)atime[0][row],
+       (double)atime[1][row], (double)atime[2][row], (double)atime[3][row], 
+       (double)atime[4][row], (double)atime[5][row]);
+       return;
+}
+      
+/*----------------------*/ 
+
+void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma)
+
+
+/* We would like to declare a[][lda], but c does not allow it.  In this
+function, references to a[i][j] are written a[lda*i+j].  */
+
+{
+        int init, i, j;
+
+        init = 1325;
+        *norma = 0.0;
+        for (j = 0; j < n; j++) {
+                for (i = 0; i < n; i++) {
+                        init = 3125*init % 65536;
+                        a[lda*j+i] = (init - 32768.0)/16384.0;                        
+                        *norma = (a[lda*j+i] > *norma) ? a[lda*j+i] : *norma;
+                        
+                        /* alternative for some compilers
+                        if (fabs(a[lda*j+i]) > *norma) *norma = fabs(a[lda*j+i]);
+                        */
+                }
+        }
+        for (i = 0; i < n; i++) {
+          b[i] = 0.0;
+        }
+        for (j = 0; j < n; j++) {
+                for (i = 0; i < n; i++) {
+                        b[i] = b[i] + a[lda*j+i];
+                }
+        }
+        return;
+}
+
+/*----------------------*/ 
+void dgefa(REAL a[], int lda, int n, int ipvt[], int *info)
+
+
+/* We would like to declare a[][lda], but c does not allow it.  In this
+function, references to a[i][j] are written a[lda*i+j].  */
+/*
+     dgefa factors a double precision matrix by gaussian elimination.
+
+     dgefa is usually called by dgeco, but it can be called
+     directly with a saving in time if  rcond  is not needed.
+     (time for dgeco) = (1 + 9/n)*(time for dgefa) .
+
+     on entry
+
+        a       REAL precision[n][lda]
+                the matrix to be factored.
+
+        lda     integer
+                the leading dimension of the array  a .
+
+        n       integer
+                the order of the matrix  a .
+
+     on return
+
+        a       an upper triangular matrix and the multipliers
+                which were used to obtain it.
+                the factorization can be written  a = l*u  where
+                l  is a product of permutation and unit lower
+                triangular matrices and  u  is upper triangular.
+
+        ipvt    integer[n]
+                an integer vector of pivot indices.
+
+        info    integer
+                = 0  normal value.
+                = k  if  u[k][k] .eq. 0.0 .  this is not an error
+                     condition for this subroutine, but it does
+                     indicate that dgesl or dgedi will divide by zero
+                     if called.  use  rcond  in dgeco for a reliable
+                     indication of singularity.
+
+     linpack. this version dated 08/14/78 .
+     cleve moler, university of new mexico, argonne national lab.
+
+     functions
+
+     blas daxpy,dscal,idamax
+*/
+
+{
+/*     internal variables       */
+
+REAL t;
+int j,k,kp1,l,nm1;
+
+
+/*     gaussian elimination with partial pivoting       */
+
+        *info = 0;
+        nm1 = n - 1;
+        if (nm1 >=  0) {
+                for (k = 0; k < nm1; k++) {
+                        kp1 = k + 1;
+
+                        /* find l = pivot index */
+
+                        l = idamax(n-k,&a[lda*k+k],1) + k;
+                        ipvt[k] = l;
+
+                        /* zero pivot implies this column already 
+                           triangularized */
+
+                        if (a[lda*k+l] != ZERO) {
+
+                                /* interchange if necessary */
+
+                                if (l != k) {
+                                        t = a[lda*k+l];
+                                        a[lda*k+l] = a[lda*k+k];
+                                        a[lda*k+k] = t; 
+                                }
+
+                                /* compute multipliers */
+
+                                t = -ONE/a[lda*k+k];
+                                dscal(n-(k+1),t,&a[lda*k+k+1],1);
+
+                                /* row elimination with column indexing */
+
+                                for (j = kp1; j < n; j++) {
+                                        t = a[lda*j+l];
+                                        if (l != k) {
+                                                a[lda*j+l] = a[lda*j+k];
+                                                a[lda*j+k] = t;
+                                        }
+                                        daxpy(n-(k+1),t,&a[lda*k+k+1],1,
+                                              &a[lda*j+k+1],1);
+                                } 
+                        }
+                        else { 
+                                *info = k;
+                        }
+                } 
+        }
+        ipvt[n-1] = n-1;
+        if (a[lda*(n-1)+(n-1)] == ZERO) *info = n-1;
+        return;
+}
+
+/*----------------------*/ 
+
+void dgesl(REAL a[],int lda,int n,int ipvt[],REAL b[],int job )
+
+
+/* We would like to declare a[][lda], but c does not allow it.  In this
+function, references to a[i][j] are written a[lda*i+j].  */
+
+/*
+     dgesl solves the double precision system
+     a * x = b  or  trans(a) * x = b
+     using the factors computed by dgeco or dgefa.
+
+     on entry
+
+        a       double precision[n][lda]
+                the output from dgeco or dgefa.
+
+        lda     integer
+                the leading dimension of the array  a .
+
+        n       integer
+                the order of the matrix  a .
+
+        ipvt    integer[n]
+                the pivot vector from dgeco or dgefa.
+
+        b       double precision[n]
+                the right hand side vector.
+
+        job     integer
+                = 0         to solve  a*x = b ,
+                = nonzero   to solve  trans(a)*x = b  where
+                            trans(a)  is the transpose.
+
+    on return
+
+        b       the solution vector  x .
+
+     error condition
+
+        a division by zero will occur if the input factor contains a
+        zero on the diagonal.  technically this indicates singularity
+        but it is often caused by improper arguments or improper
+        setting of lda .  it will not occur if the subroutines are
+        called correctly and if dgeco has set rcond .gt. 0.0
+        or dgefa has set info .eq. 0 .
+
+     to compute  inverse(a) * c  where  c  is a matrix
+     with  p  columns
+           dgeco(a,lda,n,ipvt,rcond,z)
+           if (!rcond is too small){
+                for (j=0,j<p,j++)
+                        dgesl(a,lda,n,ipvt,c[j][0],0);
+           }
+
+     linpack. this version dated 08/14/78 .
+     cleve moler, university of new mexico, argonne national lab.
+
+     functions
+
+     blas daxpy,ddot
+*/
+{
+/*     internal variables       */
+
+        REAL t;
+        int k,kb,l,nm1;
+
+        nm1 = n - 1;
+        if (job == 0) {
+
+                /* job = 0 , solve  a * x = b
+                   first solve  l*y = b         */
+
+                if (nm1 >= 1) {
+                        for (k = 0; k < nm1; k++) {
+                                l = ipvt[k];
+                                t = b[l];
+                                if (l != k){ 
+                                        b[l] = b[k];
+                                        b[k] = t;
+                                }       
+                                daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1 );
+                        }
+                } 
+
+                /* now solve  u*x = y */
+
+                for (kb = 0; kb < n; kb++) {
+                    k = n - (kb + 1);
+                    b[k] = b[k]/a[lda*k+k];
+                    t = -b[k];
+                    daxpy(k,t,&a[lda*k+0],1,&b[0],1 );
+                }
+        }
+        else { 
+
+                /* job = nonzero, solve  trans(a) * x = b
+                   first solve  trans(u)*y = b                  */
+
+                for (k = 0; k < n; k++) {
+                        t = ddot(k,&a[lda*k+0],1,&b[0],1);
+                        b[k] = (b[k] - t)/a[lda*k+k];
+                }
+
+                /* now solve trans(l)*x = y     */
+
+                if (nm1 >= 1) {
+                        for (kb = 1; kb < nm1; kb++) {
+                                k = n - (kb+1);
+                                b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1);
+                                l = ipvt[k];
+                                if (l != k) {
+                                        t = b[l];
+                                        b[l] = b[k];
+                                        b[k] = t;
+                                }
+                        }
+                }
+        }
+        return;
+}
+
+/*----------------------*/ 
+
+void daxpy(int n, REAL da, REAL dx[], int incx, REAL dy[], int incy)
+/*
+     constant times a vector plus a vector.
+     jack dongarra, linpack, 3/11/78.
+*/
+
+{
+        int i,ix,iy,m,mp1;
+
+        mp1 = 0;
+        m = 0;
+
+        if(n <= 0) return;
+        if (da == ZERO) return;
+
+        if(incx != 1 || incy != 1) {
+
+                /* code for unequal increments or equal increments
+                   not equal to 1                                       */
+
+                ix = 0;
+                iy = 0;
+                if(incx < 0) ix = (-n+1)*incx;
+                if(incy < 0)iy = (-n+1)*incy;
+                for (i = 0;i < n; i++) {
+                        dy[iy] = dy[iy] + da*dx[ix];
+                        ix = ix + incx;
+                        iy = iy + incy;
+                     
+                }
+                return;
+        }
+        
+        /* code for both increments equal to 1 */
+        
+
+#ifdef ROLL
+
+        for (i = 0;i < n; i++) {
+                dy[i] = dy[i] + da*dx[i];
+        }
+
+
+#endif
+
+#ifdef UNROLL
+
+        m = n % 4;
+        if ( m != 0) {
+                for (i = 0; i < m; i++) 
+                        dy[i] = dy[i] + da*dx[i];
+                        
+                if (n < 4) return;
+        }
+        for (i = m; i < n; i = i + 4) {
+                dy[i] = dy[i] + da*dx[i];
+                dy[i+1] = dy[i+1] + da*dx[i+1];
+                dy[i+2] = dy[i+2] + da*dx[i+2];
+                dy[i+3] = dy[i+3] + da*dx[i+3];
+                
+        }
+
+#endif
+return;
+}
+   
+/*----------------------*/ 
+
+REAL ddot(int n, REAL dx[], int incx, REAL dy[], int incy)
+/*
+     forms the dot product of two vectors.
+     jack dongarra, linpack, 3/11/78.
+*/
+
+{
+        REAL dtemp;
+        int i,ix,iy,m,mp1;
+
+        mp1 = 0;
+        m = 0;
+
+        dtemp = ZERO;
+
+        if(n <= 0) return(ZERO);
+
+        if(incx != 1 || incy != 1) {
+
+                /* code for unequal increments or equal increments
+                   not equal to 1                                       */
+
+                ix = 0;
+                iy = 0;
+                if (incx < 0) ix = (-n+1)*incx;
+                if (incy < 0) iy = (-n+1)*incy;
+                for (i = 0;i < n; i++) {
+                        dtemp = dtemp + dx[ix]*dy[iy];
+                        ix = ix + incx;
+                        iy = iy + incy;
+                       
+                }
+                return(dtemp);
+        }
+
+        /* code for both increments equal to 1 */
+
+
+#ifdef ROLL
+
+        for (i=0;i < n; i++)
+                dtemp = dtemp + dx[i]*dy[i];
+               
+        return(dtemp);
+
+#endif
+
+#ifdef UNROLL
+
+
+        m = n % 5;
+        if (m != 0) {
+                for (i = 0; i < m; i++)
+                        dtemp = dtemp + dx[i]*dy[i];
+                if (n < 5) return(dtemp);
+        }
+        for (i = m; i < n; i = i + 5) {
+                dtemp = dtemp + dx[i]*dy[i] +
+                dx[i+1]*dy[i+1] + dx[i+2]*dy[i+2] +
+                dx[i+3]*dy[i+3] + dx[i+4]*dy[i+4];
+        }
+        return(dtemp);
+
+#endif
+
+}
+
+/*----------------------*/ 
+void dscal(int n, REAL da, REAL dx[], int incx)
+
+/*     scales a vector by a constant.
+      jack dongarra, linpack, 3/11/78.
+*/
+
+{
+        int i,m,mp1,nincx;
+
+        mp1 = 0;
+        m = 0;
+
+        if(n <= 0)return;
+        if(incx != 1) {
+
+                /* code for increment not equal to 1 */
+
+                nincx = n*incx;
+                for (i = 0; i < nincx; i = i + incx)
+                        dx[i] = da*dx[i];
+                        
+                return;
+        }
+
+        /* code for increment equal to 1 */
+
+
+#ifdef ROLL
+
+        for (i = 0; i < n; i++)
+                dx[i] = da*dx[i];
+                
+
+#endif
+
+#ifdef UNROLL
+
+
+        m = n % 5;
+        if (m != 0) {
+                for (i = 0; i < m; i++)
+                        dx[i] = da*dx[i];
+                if (n < 5) return;
+        }
+        for (i = m; i < n; i = i + 5){
+                dx[i] = da*dx[i];
+                dx[i+1] = da*dx[i+1];
+                dx[i+2] = da*dx[i+2];
+                dx[i+3] = da*dx[i+3];
+                dx[i+4] = da*dx[i+4];
+        }
+
+#endif
+
+}
+
+/*----------------------*/ 
+int idamax(int n, REAL dx[], int incx)
+
+/*
+     finds the index of element having max. absolute value.
+     jack dongarra, linpack, 3/11/78.
+*/
+
+
+{
+        REAL dmax;
+        int i, ix, itemp;
+
+        if( n < 1 ) return(-1);
+        if(n ==1 ) return(0);
+        if(incx != 1) {
+
+                /* code for increment not equal to 1 */
+
+                ix = 1;
+                dmax = fabs((double)dx[0]);
+                ix = ix + incx;
+                for (i = 1; i < n; i++) {
+                        if(fabs((double)dx[ix]) > dmax)  {
+                                itemp = i;
+                                dmax = fabs((double)dx[ix]);
+                        }
+                        ix = ix + incx;
+                }
+        }
+        else {
+
+                /* code for increment equal to 1 */
+
+                itemp = 0;
+                dmax = fabs((double)dx[0]);
+                for (i = 1; i < n; i++) {
+                        if(fabs((double)dx[i]) > dmax) {
+                                itemp = i;
+                                dmax = fabs((double)dx[i]);
+                        }
+                }
+        }
+        return (itemp);
+}
+
+/*----------------------*/ 
+REAL epslon (REAL x)
+
+/*
+     estimate unit roundoff in quantities of size x.
+*/
+
+{
+        REAL a,b,c,eps;
+/*
+     this program should function properly on all systems
+     satisfying the following two assumptions,
+        1.  the base used in representing dfloating point
+            numbers is not a power of three.
+        2.  the quantity  a  in statement 10 is represented to 
+            the accuracy used in dfloating point variables
+            that are stored in memory.
+     the statement number 10 and the go to 10 are intended to
+     force optimizing compilers to generate code satisfying 
+     assumption 2.
+     under these assumptions, it should be true that,
+            a  is not exactly equal to four-thirds,
+            b  has a zero for its last bit or digit,
+            c  is not exactly equal to one,
+            eps  measures the separation of 1.0 from
+                 the next larger dfloating point number.
+     the developers of eispack would appreciate being informed
+     about any systems where these assumptions do not hold.
+
+     *****************************************************************
+     this routine is one of the auxiliary routines used by eispack iii
+     to avoid machine dependencies.
+     *****************************************************************
+
+     this version dated 4/6/83.
+*/
+
+        a = 4.0e0/3.0e0;
+        eps = ZERO;
+        while (eps == ZERO) {
+                b = a - ONE;
+                c = b + b + b;
+                eps = fabs((double)(c-ONE));
+        }
+        return(eps*fabs((double)x));
+}
+ 
+/*----------------------*/ 
+void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[])
+
+
+/* We would like to declare m[][ldm], but c does not allow it.  In this
+function, references to m[i][j] are written m[ldm*i+j].  */
+
+/*
+   purpose:
+     multiply matrix m times vector x and add the result to vector y.
+
+   parameters:
+
+     n1 integer, number of elements in vector y, and number of rows in
+         matrix m
+
+     y double [n1], vector of length n1 to which is added 
+         the product m*x
+
+     n2 integer, number of elements in vector x, and number of columns
+         in matrix m
+
+     ldm integer, leading dimension of array m
+
+     x double [n2], vector of length n2
+
+     m double [ldm][n2], matrix of n1 rows and n2 columns
+
+ ----------------------------------------------------------------------
+*/
+{
+        int j,i,jmin;
+        /* cleanup odd vector */
+
+        j = n2 % 2;
+        if (j >= 1) {
+                j = j - 1;
+                for (i = 0; i < n1; i++) 
+                        y[i] = (y[i]) + x[j]*m[ldm*j+i];
+        } 
+
+        /* cleanup odd group of two vectors */
+
+        j = n2 % 4;
+        if (j >= 2) {
+                j = j - 1;
+                for (i = 0; i < n1; i++)
+                        y[i] = ( (y[i])
+                               + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
+        } 
+
+        /* cleanup odd group of four vectors */
+
+        j = n2 % 8;
+        if (j >= 4) {
+                j = j - 1;
+                for (i = 0; i < n1; i++)
+                        y[i] = ((( (y[i])
+                               + x[j-3]*m[ldm*(j-3)+i]) 
+                               + x[j-2]*m[ldm*(j-2)+i])
+                               + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
+        } 
+
+        /* cleanup odd group of eight vectors */
+
+        j = n2 % 16;
+        if (j >= 8) {
+                j = j - 1;
+                for (i = 0; i < n1; i++)
+                        y[i] = ((((((( (y[i])
+                               + x[j-7]*m[ldm*(j-7)+i]) + x[j-6]*m[ldm*(j-6)+i])
+                               + x[j-5]*m[ldm*(j-5)+i]) + x[j-4]*m[ldm*(j-4)+i])
+                               + x[j-3]*m[ldm*(j-3)+i]) + x[j-2]*m[ldm*(j-2)+i])
+                               + x[j-1]*m[ldm*(j-1)+i]) + x[j]  *m[ldm*j+i];
+        } 
+        
+        /* main loop - groups of sixteen vectors */
+
+        jmin = (n2%16)+16;
+        for (j = jmin-1; j < n2; j = j + 16) {
+                for (i = 0; i < n1; i++) 
+                        y[i] = ((((((((((((((( (y[i])
+                                + x[j-15]*m[ldm*(j-15)+i]) 
+                                + x[j-14]*m[ldm*(j-14)+i])
+                                + x[j-13]*m[ldm*(j-13)+i]) 
+                                + x[j-12]*m[ldm*(j-12)+i])
+                                + x[j-11]*m[ldm*(j-11)+i]) 
+                                + x[j-10]*m[ldm*(j-10)+i])
+                                + x[j- 9]*m[ldm*(j- 9)+i]) 
+                                + x[j- 8]*m[ldm*(j- 8)+i])
+                                + x[j- 7]*m[ldm*(j- 7)+i]) 
+                                + x[j- 6]*m[ldm*(j- 6)+i])
+                                + x[j- 5]*m[ldm*(j- 5)+i]) 
+                                + x[j- 4]*m[ldm*(j- 4)+i