Paragon Program Listing

      program filtered_bak
C****************************************************************
C PROGRAM FOR CARRYING OUT TOMOGRAPHIC RECONSTRUCTION
C FOR 2d PROJECTION DATA FOR PARALLEL BEAM PROJECTIONS
C ON THE INTEL PARAGON.
C
C PROGRAMMER: RAMAN RAO
C DATE OF LAST REVISION: 1/2/95
C
C VIRGINIA POLYTECHNIC INSTITUTE AND STATE UNIVERSITY
C BLACKSBURG, VA 24061
C***************************************************************
C
C  u = # of rows per processor(=# of angles/nproc )
C  nproc = np = number of processors used
C np1 = np-1
C nnodes=number ofnodes (generated by the system)
C iam = my processor number
C ptype = process type
C szreal = number of bytes in type real in fortran
C power2 = length of the final image in the power of 2
C MAX = actual length of the final image
C rays = # of rays used in the forward projection
C rows=number of anglesa
C cols = number of detectors(rays)
C odim = object dimension
C rdim = reconstruction dimension (=xdim=ydim in this object only)
C xdim, ydim = size of the final image
C etime = elapsed time
C etime1-6 = elapsed times for intermediate stages of the program
C step = step size for generating the checker board
C blank = density allocated inside the object
C den = same as the above
C object = object to be projected and then backprojected
C r=xcos(theta)+ysin(theta) normalized to lie in the range (-1,1)
C pwr2=power of 2 (real), ipower2=int(pwr2)
C mat= initial projection matrix 
C pmat = projection matrix on each processor. This is the same
C        as the matrix 'mat' except that the data contained in 
C	  'pmat' is different on each processor (contains data 
C	   pertinant to that processor)
C nbytes_pmat=number of bytes in the 'pmat' matrix
C H = row matrix of the filter/response function
C nbytes_h = number of bytes in the H matrix 
C P = temporary row matrix for storing the rows of 'pmat' 
C lb = low bin; hb = high bin; mb = mid bin
C i,j,k,iall,jall,ii1,l1,ii,ii2,ik: all are temporary variables
C for running the 'do loops'
C tmp_xy= 2D array for storing the partial summations in the 
C          back-projection  stage
C sum_xy = 2D array for storing the summation for all (x,y) 
C msg_xy = 2D array which is the same as tmp_xy but is a 
C          message sent from some processor to the root 
C          or zeroth processor
C f = the final matrix containing the reconstructed densities

      implicit none
      include 'fnx.h'
      integer max,angles,rays,xdim,ydim,u,nproc,power2,iix,ij
      integer odim,step,iproc
      parameter(MAX=64,angles=128,rays=16,xdim=64,ydim=64)
      parameter(odim=16,step=4)

	  parameter(u=128,nproc=1)
	  parameter(power2=6)
      complex xx,H,pmat,mat,P,tmp2
      real tau,theta,r,beta,frac,x_max,y_max,mb,mp,tw
      real delq,tmp_xy,T,sum,f,msg,pwr2,pi,x_cen,y_cen
      real tot_angle_sum,proc_angle_sum,sum_s,msg_s,const
      real proc_zero_ang_sum,sum_xy,msg_xy
      double precision stime,etime,etime1,etime2,etime3,etime4
      double precision etime5,etime6
      integer rows,cols,n1,k1,ptype,nnodes,iam,x,y,int_val
      integer iall,jall,i,j,k,nbytes_pmat,ii1,l1,ii,ii2,ik
      integer ipwr2,szreal,npp,np,nbytes_p,nbytes_h,lb,hb
      integer blank,den,i1,j1,np1
      real x_cen_o,y_cen_o,object
      dimension xx(16),H(MAX),mat(angles,rays),pmat(u,MAX)
      dimension P(MAX),T(30),f(xdim,ydim),sum(ydim),msg(ydim)
      dimension object(odim,odim)
      dimension proc_zero_ang_sum(MAX),proc_angle_sum(MAX)
      dimension tot_angle_sum(MAX),tmp_xy(xdim,ydim)
      dimension sum_xy(xdim,ydim),msg_xy(xdim,ydim)

C begin initializations
         etime=0.0
	  etime1=0.0
	  etime2=0.0
	  etime3=0.0
	  etime4=0.0
	  etime5=0.0
	  etime6=0.0
	  stime=0.0
	  PI = acos(-1.0)
c  stime = dclock()
c     write(*,*) 'PI=',PI
      np=nproc
      np1 = nproc-1
      write(*,*) 'np1=',np1
	  szreal = 4
      rows = angles 
      cols = rays 
      tau = 1.0
      npp = rows/np
      nbytes_pmat = MAX*u*szreal*2
      nbytes_h = MAX*szreal*2
      x_max = real(xdim)
      y_max = real(ydim)
      x_cen = x_max/2.
      y_cen = y_max/2.
      x_cen_o=real(odim)/2.0
      y_cen_o=real(odim)/2.0

      mp = real(rays)/2.
      tw = 1./(1.15*sqrt(2.))
      const = PI/real(rows)
      write(*,*) 'const=',const
	  ipwr2 = power2

C initialize system variables on each node
      iam = mynode()
      nnodes = numnodes()
      ptype = myptype() 

C find the start time
	  stime = dclock()

C init pmat in all processors to zero: this step takes care 
C of the additional padding with zero's which is required later
       do iall=1,u
         do jall = 1,MAX 
	    pmat(iall,jall)=(0.,0.)
         enddo
       enddo
          etime1 = dclock()-stime
	   write(*,*) 'etime1=',etime1

	do iall=1,odim
	  do jall=1,odim
	    object(iall,jall)=0.
         enddo
        enddo
 
      if (iam. eq. 0) then
          write(*,*) 'Number of processors is',np
          write(*,*) 'angles=',rows,'rays=',cols  

C generating the checker board (object to be reconstructed)
          blank =0
	  do i=1,odim,step
	    if (blank .eq. 0) then
	      blank = 255
           else
              blank = 0
           endif

	    do j=1,odim,step
	      if (blank .eq. 0) then
		blank = 255
             else
                blank = 0
             endif
            
	       do i1=i,i+step
                 do j1=j,j+step
                   object(i1,j1)=real(blank)
c  	  write(*,*) object(i1,j1),'i1=',i1,'j1=',j1
                 enddo
               enddo

            enddo
           enddo

           etime2 = dclock()-stime
	    write(*,*) 'etime2=',etime2

      write(*,*) 'finished generating the object'

C uncomment the following lines if you need to write the object to a file
c     open (unit = 19,file ='object.fil',status='old',form='formatted')
c     do iix=1,odim
c       do ii=1,odim
c         write(19,*) object(iix,ii)
c       enddo
c     enddo
c     close(19)
c     write(*,*) 'finished writing the object.fil'

C Evaluate the response function, find its fft
C and send this data to the other nodes (p. 72 of kak)
           call evalh2(H,1.0,cols)
           write(*,*) 'H evaluated'

C uncomment the following line if you need to write H to a file
c          open (unit = 15,file ='h.fil',status='old',form='formatted')
c          do i=1,MAX
c	     write(15,*) 'i=',i,'H=',H(i)
c          enddo
c          close(15)

C broadcast the object and H to all the processors
           call csend(20,object,odim*odim*szreal,-1,ptype)
           call csend(20,H,nbytes_h,-1,ptype)

       else

C Give the corresponding receives to synchronize send's and receive's
           call crecv(20,object,odim*odim*szreal,0,ptype)
           call crecv(20,H,nbytes_h,0,ptype)  

       end if

         etime3 = dclock()-stime
	  write(*,*) 'etime3=',etime3

C forward project the object.
C This step directly generates the 'pmat' matrices
C for all the processors
C finding the forward projections in parallel on all the nodes

      do i=1,odim
          do j=1,odim
		 do k=1,u

		   theta = 2.0*real((u*iam)+k)*pi/real(angles)
		   r = cos(theta)*((real(i)-x_cen_o)/x_cen_o)
     $                +sin(theta)*((real(j)-y_cen_o)/y_cen_o)
                 mb=(r*mp*tw)+mp
		   lb=int(mb)
c	   write(*,*) 'lb=',lb
		   hb=lb+1
		   frac=mb-lb
                 pmat(k,lb)=pmat(k,lb)+cmplx((1.0-frac)*object(i,j))
		   pmat(k,hb)=pmat(k,hb)+cmplx(frac*object(i,j))
                 
                enddo
            enddo
       enddo
	 write(*,*) 'finished the forward projections'


          etime4 = dclock()-stime
	   write(*,*) 'etime4=',etime4

C Now find the FFT of the input matrix row-wise on all the nodes
C this is done in parallel but row-wise serial on all the nodes
C The results of the FFT are again stored in the pmat matrix
       do i=1,u

C write a row of PMAT to P
         do j=1,MAX
           P(j) = PMAT(i,j)
         enddo

C find the FFT
         call diffft(P,ipwr2)

C write back P into PMAT
	      do l1=1,MAX
               PMAT(i,l1) = P(l1)
             enddo
       enddo

C uncomment if you need to verify this step 
c     open (unit = 18,file ='pmat1.fil',status='old'
c    $,form='formatted')
c     do i =1,np
c      if (iam .eq. i-1) then
c       do ii=1,u
c         write(18,*) 'iam=',iam,'row=',ii
c         write(18,*) (pmat(ii,ij), ij=1,MAX)
c       enddo
c      endif
c     enddo
c     close(18)

C multiply the two funs H and and pmat. Multiply here
C means scalar multiplication of equivalent elements in the 
C same position (becomes convolution in the time domain).
C This part is executed in parallel on all the nodes
C ref( p. 75 kak). Multiplication with a Hamming function
C is also done right here

        do i=1,u
	     do j=1,MAX
		 PMAT(i,j) = PMAT(i,j) * H(j) *
     $ (0.54-0.46*cos((2.0*(real(j-1))*pi)/real(MAX)))
c       write(*,*) 'i=',i,'j=',j,'pmat=',PMAT(i,j)
            enddo
        enddo

C uncomment if you need to verify this step         
c       open (unit = 17,file ='pmat2.fil',status='old',form='formatted')
c       do iix=1,nproc
c       call gsync()
c       if (iam .eq. iix-1) then
c       do ii=1,u
c       write(17,*) 'iam=',iam,'row=',ii
c       write(17,*) (pmat(ii,ij), ij=1,MAX)
c       enddo
c       endif
c       enddo
c       write(*,*) 'I finished the point to point multiply','iam=',iam
c       write(*,*) 'iam=',iam
c       close(17)

C find the row-wise IFFT of the new matrix
C IFFT is found by: cmplx conj. of the data row by row, finding the
C forward transform, again cmplx conj. and then multiply by the 
C scale factor. This step proceeds in parallel on all the
C nodes(ref. any d.s.p. book or d.s.p. by Oppenheim and Schafer.

C for all rows on the processor do:
       do i=1,u

C move a row from PMAT to P
	   do j=1,MAX
	      P(j) = PMAT(i,j)
          enddo

C find the complex conj. of each element
          do ij=1,MAX
          tmp2 = P(ij)   
           P(ij) = conjg(tmp2)
          enddo

C find the FFT
	   call diffft(P,ipwr2)
        
C find the complex conj. of each element
           do ij=1,MAX
             tmp2 = P(ij)   
           P(ij) = conjg(tmp2)
           enddo

C normalize by the scale factor
            do  ik =1,MAX
		tmp2= P(ik)
		P(ik) = (tmp2)/real(MAX)
            enddo

C put it bak in the original matrix
            do j=1,MAX
	      PMAT(i,j) = P(j)
            enddo

         enddo

        write(*,*) 'Finished calculating the filtered vals'

C synchronize all processors: This is done to time the filtered 
C projections
            call gsync()
              etime5 = dclock() - stime
		write(*,*) 'etime5=',etime5
        etime = etime1 + (etime5-etime4)

            if (iam .eq. 0) then
        write(*,*) 'time for calculating filtered projections is',etime
	 write(*,*) 'iam=',iam
            endif

C uncomment if you need to verify results at this stage
c       open (unit = 13,file ='pmat3.fil',status='old',form='formatted')
c       do iix=1,nproc
c         call gsync()
c         if (iam .eq. iix-1) then
c            do ii=1,u
c            write(13,*) 'iam=',iam,'row=',ii
c            write(13,*) (PMAT(ii,ij), ij=1,MAX)
c      enddo
c         endif
c       enddo
c       close(13)

c         do i=1,u
c           theta = real((u*iam)+i)*PI/real(angles)
c           write(*,*) 'theta=',theta,'iam=',iam
c         enddo

C Backprojection begins here
C 
C initialize 
      do x=1,xdim
	 do y=1,ydim
	   tmp_xy(x,y)=0.0
	   msg_xy(x,y)=0.0
	   sum_xy(x,y)=0.0
        enddo
      enddo

C now carrying out the summation in the backprojection stage
C (ref. p.67 kak. The summation on p.67 is split into smaller
C summations on each processor. Then these smaller summations
C are transferred to the zeroth processor for finding the 
C total sum.
C
C finds the tmp_xy's for each x and y

C for all x and y do:
      do x = 1,xdim
         do y= 1,ydim

C fo all rows on each processor do:
            do i= 1,u
C theta evaluated: Note 2.0*PI  may not be required
C
C Just PI will suffice as the multiplier
               theta = 2.0*PI*real((u*iam)+i)/real(angles)

C find the value of 't'(ref. p49 kak)
               r = cos(theta)*((x-x_cen)/x_cen)
     $               + sin(theta)*((y-y_cen)/y_cen)
C
C find the mid bin, low bin and high bin
               mb= (r*mp*tw) + mp 
               lb = int(mb)
               hb = int(mb)+1
C
C find the fractional value of the mid bin
               frac = mb-lb
               tmp_xy(x,y)=tmp_xy(x,y)+
     $   	                real(PMAT(i,lb))*(1.0-frac) + 
     $                         frac*real(pmat(i,hb))
c	 write(*,*) 'mb=',mb,'lb=',lb,'hb=',hb,'frac=',frac
	     enddo
          enddo
       enddo

C transfer all the tmp_xy's to the zeroth processor
C and add them all up.

          if (iam .ne. 0) then
               call csend(20,tmp_xy,xdim*ydim*szreal,0,ptype)
          else

C if iam the zeoro'th processor:
C initialize the sum to zero
	       do x=1,xdim
                do y=1,ydim
                  sum_xy(x,y) = 0.0
                enddo
              enddo

C for all the processors on the system other than zero do:
              do ii1 = 1,np1
C
C receive the tmp_xy sent earlier
                call crecv(20,msg_xy,xdim*ydim*szreal,ii1,ptype)
C
C add them all up as they are received
		  do x=1,xdim
		    do y=1,ydim
                    sum_xy(x,y)=sum_xy(x,y)+msg_xy(x,y)
                  enddo
                enddo

              enddo

C find the density matrix
	      do x=1,xdim
               do y=1,ydim
                 f(x,y)=const*(sum_xy(x,y)+tmp_xy(x,y))
               enddo
              enddo

C endif for the zero'th processor
           endif

C synchronize all the processors for getting the timing information
       call gsync()

       if (iam .eq. 0) then
         etime6 = dclock() - stime
	  write(*,*) 'etime6=',etime6
      etime = etime1+(etime5-etime4)+(etime6-etime5)
         write(*,*) 'total elapsed time=',etime,'iam=',iam
C
C write the densities to a file
      open (unit = 11,file ='f.fil',status='old',
     $	  form='formatted')
      write(11,*), ((f(i,j),j=1,ydim),i=1,xdim)
      close(11)
       endif

C end of main program
	 stop
	 end

C finds the FFT. The method used is 'decimation in time'
          SUBROUTINE DIFFFT(X,NU) 
          COMPLEX X(1024),U,W,T 
          N=2**NU 
          PI=3.14159265358979


          DO 20 L=1,NU 
          LE=2**(NU+1-L) 
          LE1=LE/2 
		  U=(1.,0.)
	   W=CMPLX(COS(PI/FLOAT(LE1)),-SIN(PI/FLOAT(LE1)))
		  DO 20 J=1,LE1
		      DO 10 I=J,N,LE
			    IP=I+LE1
			    T=X(I)+X(IP)
			    X(IP)=(X(I)-X(IP))*U
   10               X(I)=T
   20            U=U*W

	   NV2=N/2
	   NM1=N-1
	   J=1
	   DO 30 I=1,NM1
		  IF (I .GE. J) GO TO 25
			 T=X(J)
			 X(J)=X(I)
			 X(I)=T
   25            K=NV2
   26            IF (K .GE. J) GO TO 30
			 J=J-K
			 K=K/2
                 GO TO 26
   30     J=J+K

	    RETURN
	    END

C evaluates the filter (or response function) in the 
C frequency domain.
      subroutine evalh2(H,tau,N)
      real tau, PI
      integer N,j,i,k,ii,MAX,HMAX
      parameter(MAX=64,HMAX=32)
      complex H(MAX),a1(HMAX)
      PI = acos(-1.0)
      do j=1,MAX 
        H(j) = (0.,0.)
      enddo

      do i=1,HMAX
        a1(i) = real(i)
      enddo

      do i =1,MAX
        if (i .le. HMAX) then
          H(i) = real(i)
        else
          H(i) = real(MAX)+1.0 -real(i) 
        endif
      enddo

C Optional: can multiply with a Hamming window function here. 
C Not required really, but can be used to get better(?) images. 
C Note that the filter function C changes from a ramp on using 
C this Hamming multiply.
c     do i = 1,MAX
c       H(i) = H(i)*(.54 + .46*cos(2.0*pi*real(i)/(real(MAX))))
c     enddo

      return
      end