8087 linear algebra routines (to be called from C)

Sun Feb 10 03:12:09 AEST 1985

This shell script contains 8087 assmebler language functions for some
linear algebra applications. The files in here are
1. 8087.doc -- a small explanation.
2. test.c   -- a C program to test for speed etc.
3. qsol.c   -- a C function to solve linear systems by QR decomposition
	       includes the decomposition routines. Illustrates the use
               of the assmebler functions.
4. ssdaxpy.asm,  ssxdot.asm,  dnrm2.asm   -- the assembler functions.
------------   cut here ---------
: Run this shell script with "sh" not "csh"
PATH=:/bin:/usr/bin:/usr/ucb
export PATH
all=FALSE
if [ $1x = -ax ]; then
	all=TRUE
fi
/bin/echo 'Extracting 8087.doc'
sed 's/^X//' <<'//go.sysin dd *' >8087.doc

     These routines are assembler versions of  some  of  the
basic  linear  algebra  subroutines  (BLAS) that are used in
packages like LINPACK [1].  The routines are designed to  be
called  by  the  CI-C86  C compiler and are roughly twice as
fast as routines coded in C. The routines and what  they  do
are  summarized in the comments at the beginning of the each
source file.  As an example of their use I'm including a  QR
factorization  routine  (written  in  C) and a function that
calls it to solve a system  of  linear  equations.   The  QR
function  and  the  supporting  routines  are taken from the
algorithms in [2] which are based on [3]. To use these  rou-
tines in nonroutine ways you should look at the references.

     Ref. [1] is particularly useful for those who  want  to
roll their own linear algebra routines.

                             References

1. LINPACK Users Guide, Dongarra, Bunch, Moler, and Stewart,
   SIAM, 1979.

2. Numerical Methods for Unconstrained Optimization and Non-
   linear Equations, Dennis and Schnabel, Prentice Hall, 1983.

3. Introduction to  Matrix  Computation,  Stewart,  Academic
   Press, 1973.

        Another good general reference on these topics is

4. Matrix Computations, Golub and Van Loan,  Johns  Hopkins,
   1984

     The 8087 routines should be  assembled  using  the  IBM
assembler  version 2.0 or later and the .h files from the CI
compiler disks should be on the same disk. I would  appreci-
ate hearing about any bugs that you find in these routines.

     The CI documentation (for v. 2.1) says  that  the  pro-
cedure  for  returning  doubles and floats may change. Func-
tions that need to return doubles do this  through  pointers
to  avoid  this.  Currently  doubles  are  returned  in  the
ax,bx,cx,and dx registers.  Other compilers  return  doubles
on the top of the 8087 stack. I play it safe.

     These functions  work  in  the  **small  model**  only.
Conversion  to the large model should be straightforward but
very boring. If anyone does this please  send  me  the  code
before I have to do it myself (in about 6 mos.).

    C.T. Kelley
    Dept. of Math.
    N.C. State U. Box 8205
    Raleigh, N.C. 27695-8205
    919-737-3300
    decvax!mcnc!ecsvax!ctk
//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 644 8087.doc
	/bin/echo -n '	'; /bin/ls -ld 8087.doc
fi
/bin/echo 'Extracting test.c'
sed 's/^X//' <<'//go.sysin dd *' >test.c
#include "stdio.h"

#define fabs(A) (((A)>0.0) ? (A) : -(A))
#define sign(x) ((x == 0.0) ? 1.0 : x/fabs(x))

#define MAXDIM 81
X/* MAXDIM = 81 is a little dangerous. If you are storing much else use 41.*/

extern double cos();

X/* This program is a test for the program qsol. The program creates an
   orthogonal matrix, a, and a right hand side b. The solution to Ax=b
   is known exactly and subtracted from the x given by qsol.
   The function fabs() is given by a macro to speed things up.
*/
main()
{
	double a[MAXDIM][MAXDIM],b[MAXDIM],x[MAXDIM],z,y,y1;
	double u[MAXDIM],norm;
	double *g[MAXDIM],b1[MAXDIM];
	int i,j,n1,j1,n;

	/* n = dimension of problem <= MAXDIM.	*/

	n=50;  /* With n=50 this code runs in 11.20 sec on an AT. */

	n1=n-1;
	norm=0.0;

	/* Create the matrix and the right hand side. */

	for(i=0;i<n;i++)
	{
		u[i]=cos(3.0+i);
		b[i]=-u[i];
		norm=norm+(u[i]*u[i]);
	}
	for(i=0;i<n;i++)
	{
		for(j=0;j<i;j++)
		{
			a[i][j]=-2.0*u[i]*u[j]/norm;
			a[j][i]=a[i][j];
		}
		a[i][i]=1.0-2.0*u[i]*u[j]/norm;
	}

	/* The qsol function requires an array of pointers to the rows of A. */

	for(i=0;i<n;i++)
	{
		b1[i]=b[i];
		g[i]=a[i];
	}

	/* Solve the system. */

	qsol(g,b1,x,n);

	/* Compute and print the exact residual. */

	z=0.0;
	for(i=0;i<n;i++)
	{	z+=fabs(u[i]-x[i]);
	}
	printf("%30.15e\n",z);
}
//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 644 test.c
	/bin/echo -n '	'; /bin/ls -ld test.c
fi
/bin/echo 'Extracting qsol.c'
sed 's/^X//' <<'//go.sysin dd *' >qsol.c
#include "stdio.h"

#define fabs(A) (((A)>0.0) ? (A) : -(A))
#define sign(x) ((x == 0.0) ? 1.0 : x/fabs(x))

#define MAXDIM 81
X/* MAXDIM = 81 is a little dangerous. If you are storing much else use 41.*/

X/* qsol.c -- Solve linear systems by QR factorization

   Functions in this file:
		qsol()
		qrdecm()
		qrsolv()
		rsolv()

   m is an array of pointers to rows of matrix, b is right hand side,
   x is solution, n is dimension of the problem. Indices vary from 0 to
   n-1, calling program must adjust to this.

   The algorithm is the one given in G.W. Stewart's book, Introduction to
   Matrix Computations. The code is based on the algorithms given in
   Dennis and Schnabel's book, Numerical Methods for Unconstrained Optimization
   and Nonlinear Equations. If you are unfamiliar with these ideas you should
   consult one or both of these books.

*/

qsol(m,b1,x,n)

double *m[],b1[],x[];
int n;

X/* This is the driver function for the QR decomposition routines and
   the solvers. In using these you should be sure to pass an array of
   pointers to the rows of the matrix rather than the matrix itself.
   That is the setup should include something like

	for(i=0;i<n;i++) m[i]=a[i];

   where a is the matrix and m is an array of pointers allocated for this
   purpose. If you don't do this many compilers will be upset.

   n is the dimension of the problem.
   b1 is the right hand side.
   The solution is returned in x.
   MAXDIM should be defined in an include file. The value 41 is safe for
   small model code. Note that the assembler routines assume small model
   and would have to be reworked to use the large model.
*/

{
	double m1[MAXDIM],m2[MAXDIM],c[MAXDIM];
	int i,*isng,sgt;

	/* The vector b1 is copied to c because qrsolv overwrites the right
	   hand side with the solution.
	*/

	for(i=0;i<n;i++) c[i]=b1[i];

	isng= &sgt;

	qrdecm(m,m1,m2,n,isng);

	if((*isng) == 1) {
		printf("singularity detected in QR");
		exit(1);
	}

	qrsolv(m,m1,m2,c,n);

	for(i=0;i<n;i++) x[i]=c[i];
}

qrdecm(m,m1,m2,n,isng)
double *m[],m1[],m2[];
int n,*isng;

X/* Form the QR decomposition of an nxn matrix. m is an array of pointers to
   the rows of the matrix. m1 and m2 are work arrays. The decomposition is
   stored in m, m1, and m2 as in Stewart's book. isng is a pointer to an
   int and is set to 1 if the matrix is singular. This should be tested on
   return to avoid a possible zero divide in qsolv.
*/

{
	double eta,sigma,tau,sum,*work1,*work2,mtmp;
	int i,k,j,stride;

	/* Stride is the distance in bytes between consecutive rows. Note
	   the implicit assumption of small model code here. When calling
	   those assembler routines that need stride be sure to multiply
	   the pointer difference by 8. I know that I am using the fact
	   that in the small model sizeof(int)=sizeof(*int) but I don't
	   care.
	*/
	stride=m[1]-m[0];
	stride*=8;

	*isng=0;

	for(k=0;k<n-1;k++)
	{
	       work1=&m[k][k];
	       dnrm2(work1,n-k,&sum,stride);

	       /* Test for singularity. */

	       if(sum < 1.0e-15) {
			*isng=1;
			m1[k]=0.0;
			m2[k]=0.0;
		}

		else
		{
			sigma=sign(m[k][k])*sum;
			m[k][k]+=sigma;
			m1[k]=sigma*m[k][k];
			mtmp=m1[k];
			m2[k]= -sigma;

			/* This is the critical loop. */
			for(j=k+1;j<n;j++)
			{
				work2=&m[k][j];
				ssxdot(work1,work2,n-k,&tau,stride,stride);
				ssdaxpy(work1,work2,n-k,-tau/mtmp,stride,stride);
			}

		}
	}
	m2[n-1]=m[n-1][n-1];
}

qrsolv(m,m1,m2,b,n)
double *m[],m1[],m2[],b[];
int n;

X/* The qrsolv() funciton uses the results of qrdecm to solve Ax=b. The vector
   b is over written by the solution x.
*/

{
	double tau;
	int i,j,stride;
	/* Compute the stride. */
	stride=(m[1]-m[0])*8;

	/* Multiply b by the transpose of Q. */

	for(j=0;j<n-1;j++)
	{
		ssxdot(&m[j][j],&b[j],n-j,&tau,stride,8);
		ssdaxpy(&m[j][j],&b[j],n-j,-tau/m1[j],stride,8);
	}

	/* Solve the resulting triangular system. */

	rsolv(n,m,m2,b);

}

X/* Solve a triangular system. */

rsolv(n,m,m2,b)
double *m[],m2[],b[];
int n;

{
	double sum;
	int i,j,n1;
	n1=n-1;
	b[n1]/=m2[n1];

	/* We assume that the vector b is stored in consecutive locations
	   in memory so stride is sizeof(double)=8.
	*/

	for(i=n1-1;i>=0;i--)
	{
		ssxdot(m[i]+i+1,b+i+1,n-i-1,&sum,8,8);
		b[i]-=sum;
		b[i]/=m2[i];
	}
}
//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 644 qsol.c
	/bin/echo -n '	'; /bin/ls -ld qsol.c
fi
/bin/echo 'Extracting ssxdot.asm'
sed 's/^X//' <<'//go.sysin dd *' >ssxdot.asm
;
; See the comment on header files in ssdaxpy.asm.
;
include model.h
INCLUDE PROLOGUE.H
	public ssxdot
; 8087 dot product
; ssxdot ()
;
X.8087
ssxdot	  proc	  near
	push	bp
	mov	bp,sp
; Current syntax ssxdot(&f,&g,n,&h,stride1,stride2) -- > dot product stored in h
; &f in di,  &g in bx , n in cx. This function stores the dot product rather
; than returning it because different C compilers return doubles in different
; ways.
;
;This is equivalent to the following C function.
;
;	ssxdot(v1,v2,n,dptr,stride1,stride2)
;	double *v1,*v2,*dptr;
;	int n,stride1,stride2;
;	{
;		int i;
;		*dptr=0.0;
;		for(i=0;i<n;i++){
;			*dptr+=(*v1)*(*v2);
;			v1+=stride1;
;			v2+=stride2;
;		}
;	}
;
;      The strides are the distance between consecutive elements of v1, and v2
;      in bytes.
;
	push di
	push si
	push bx
	push ax
	mov di,4[bp]   ;vector 1
	mov si,6[bp]   ;vector 2
	mov cx,8[bp]   ;vector length
	mov ax,12[bp]  ;stride for vector 1
	mov bx,14[bp]  ;stride for vector 2
	finit
	fldz
done:
	fld qword ptr [si]	;get an element of vector 2
	add si,bx		;increment the pointer
	fmul qword ptr [di]	;multiply by vector 1
	add di,ax		;increment its pointer
	fadd			;accumulate the dot product
	loop done
	fwait			;hang around
	mov di,10[bp]		;address of dot product
	fstp qword ptr [di]	;store the dot product
	pop ax
	pop bx
	pop si
	pop di
	pop bp
	ret
ssxdot	  endp
INCLUDE EPILOGUE.H
end
//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 644 ssxdot.asm
	/bin/echo -n '	'; /bin/ls -ld ssxdot.asm
fi
/bin/echo 'Extracting ssdaxpy.asm'
sed 's/^X//' <<'//go.sysin dd *' >ssdaxpy.asm
;
; The header files model.h, prologue.h, epilogue.h are specific to the
; Computer Innovations CI-C86 compiler. Your compiler is probably different.
;
include model.h
INCLUDE PROLOGUE.H
	public ssdaxpy
; 8087 main loop for lsol
; ssdaxpy ()
;
X.8087
ssdaxpy   proc	  near
	push	bp
	mov	bp,sp
; Current syntax ssdaxpy(x,y,n,a,stride1,stride2) y=y+ax
;
;	This routine is equivalent to the following C funciton.
;
;	ssdaxpy(x,y,n,a,stride1,stride2)
;	double *x,*y,a;
;	int n,stride1,stride2;
;	{
;		int i;
;		for(i=0;i<n;i++){
;			*y=*y+a*(*x);
;			 x+=stride1;
;			 y+=stride2;
;		}
;	}
;
;	This code is only good for the small model.
;
	push di
	push si
	push cx
	push ax
	mov di,4[bp]   ;vector 1 (x)
	mov si,6[bp]   ;vector 2 (y)
	mov cx,8[bp]   ;vector length
	finit
	fld qword ptr 10[bp]	;get a
	mov ax,18[bp]		;stride for vector 1
	mov bx,20[bp]		;stride for vector 2
done:
	fld qword ptr [di]   ;get *x
	fmul st,st(1)	     ;mult by a
	add di,ax
	fadd qword ptr [si] ;add ax to y
	fwait		      ;let the 8087 finish
	fstp qword ptr [si]   ;put it back in y
	add si,bx
	loop done
	finit
	pop cx
	pop cx
	pop si
	pop di
	pop bp
	ret
ssdaxpy   endp
INCLUDE EPILOGUE.H
end
//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 644 ssdaxpy.asm
	/bin/echo -n '	'; /bin/ls -ld ssdaxpy.asm
fi
/bin/echo 'Extracting dnrm2.asm'
sed 's/^X//' <<'//go.sysin dd *' >dnrm2.asm
;
; See the comments on header files in ssdaxpy.asm.
;
include model.h
INCLUDE PROLOGUE.H
	public dnrm2
; 8087 dot product
; dnrm2 ()
;
X.8087
X.286C
dnrm2	 proc	 near
	push	bp
	mov	bp,sp
; Current syntax dnrm2(&x,n,&nrm,stride) -- > l2 norm of x stored in h
; &x in s , n in cx , &nrm in di
;
;	This is equivalent to the following C function.
;	dnrm2(xptr,n,normptr,stride)
;	double *xptr,*normptr;
;	int n,stride;
;	{
;		int i;
;		*xptr=0.0;
;		for(i=0;i<n;i++)
;			  *normptr+=(*xptr)*(*xptr);
;			  xptr+=stride;
;		}
;	}
;
;	The stride is the distance between consecutive elements of x in bytes.
;
;	For most computers you must be carefull to avoid overflow and underflow
;	in this calculation. As the 8087 registers have extra width in the
;	exponent field you can relax if you do it this way.
;
	push di
	push si
	push cx
	push ax
	mov si,4[bp]   ;vector
	mov cx,6[bp]   ;vector length
	mov ax,10[bp]  ;stride
	finit
	fldz	       ;begin with zero
done:
	fld qword ptr [si]    ;put two copies of an element of x on the stack
	fld st(0)
	fmul		      ;multiply, don't pop the stack
	add si,ax	      ;increment the pointer to the vector
	fadd		      ;accumulate the norm
	loop done
	fsqrt
	fwait		      ;wait for the square root to finish
	mov di,8[bp]	      ;get the address of the norm
	fstp qword ptr [di]   ;and store
	pop ax
	pop cx
	pop si
	pop di
	pop bp
	ret
dnrm2	 endp
INCLUDE EPILOGUE.H
end
//go.sysin dd *
made=TRUE
if [ $made = TRUE ]; then
	/bin/chmod 644 dnrm2.asm
	/bin/echo -n '	'; /bin/ls -ld dnrm2.asm
fi
-- 
C.T. Kelley  decvax!mcnc!ecsvax!ctk
Dept. of Math.    N.C. State U. Box 8205
Raleigh, N.C. 27695-8205,  919-737-3300