From 33a869db80df867d67374a22d858e9bdd7bfd994 Mon Sep 17 00:00:00 2001
From: Paul-Gerhard Reinhard <mpt218@homework.(none)>
Date: Fri, 6 May 2016 20:04:09 +0200
Subject: [PATCH] eliminate guru mode in FFTW3

---
 Code/Changes.txt       |  6 ++++
 Code/Makefile          |  5 +--
 Code/Makefile.gfortran | 78 ++++++++++++++++++++++++++++++++++++++++++
 Code/fourier.f90       | 38 ++++++++++----------
 Code/levels.f90        | 34 +++++++++++++-----
 5 files changed, 131 insertions(+), 30 deletions(-)
 create mode 100644 Code/Makefile.gfortran

diff --git a/Code/Changes.txt b/Code/Changes.txt
index 4262b8c..859c3bf 100644
--- a/Code/Changes.txt
+++ b/Code/Changes.txt
@@ -29,6 +29,12 @@ Step 2.0:
 - static.f90: Accelerated iteration by flexible step size.
               'npsi' always set to even for consistent pairing.
 
+- New makefiles 'Makefile.gfortran' and 'Makefile.gfortran.openmp'
+  which invoke more optimizations.
+
+- Eliminate guru mode in FFTW3 to make the code compatible with
+  Intels MKL implementation of FFTW3.
+
 
 
 
diff --git a/Code/Makefile b/Code/Makefile
index 20bf828..3c1eb50 100644
--- a/Code/Makefile
+++ b/Code/Makefile
@@ -1,6 +1,7 @@
 COMPILER = gfortran
-COMPILERFLAGS  =  -O3
-LINKERFLAGS    =  -O3
+COMPILERFLAGS  =  -O3 -msse4.2 -mfpmath=sse -ffast-math -finline-functions -funroll-loops
+#COMPILERFLAGS  = -g -fbacktrace
+LINKERFLAGS    =  $(COMPILERFLAGS)
 
 SOURCE_TRAILER = .f90
 EXEC           = sky3d.seq
diff --git a/Code/Makefile.gfortran b/Code/Makefile.gfortran
new file mode 100644
index 0000000..15a3b97
--- /dev/null
+++ b/Code/Makefile.gfortran
@@ -0,0 +1,78 @@
+COMPILER = gfortran
+COMPILERFLAGS  =  -O3 -msse4.2 -mfpmath=sse -ffast-math -finline-functions -funroll-loops
+#COMPILERFLAGS  =  -g -fbacktrace
+LINKERFLAGS    =  $(COMPILERFLAGS)
+
+SOURCE_TRAILER = .f90
+EXEC           = sky3d.omp
+OBJS           = params.o grids.o levels.o fourier.o forces.o sequential.o \
+	inout.o coulomb.o trivial.o densities.o fragments.o twobody.o \
+	energies.o static.o meanfield.o dynamic.o pairs.o moment.o \
+	main3d.o user.o external.o
+
+SHELL = /bin/sh
+
+#---------------------------------------------------------	
+all:$(EXEC) 
+$(EXEC):$(OBJS) 	
+	$(COMPILER) $(LINKERFLAGS) -o $@  $(OBJS) -lfftw3 -llapack
+#
+
+user.o : user.f90 params.o grids.o levels.o
+
+external.o : external.f90 params.o sequential.o grids.o levels.o meanfield.o
+
+coulomb.o : coulomb.f90 params.o grids.o densities.o
+
+inout.o : inout.f90 params.o sequential.o grids.o moment.o densities.o \
+	levels.o  coulomb.o forces.o meanfield.o forces.o
+
+trivial.o : trivial.f90 params.o grids.o
+
+forces.o : forces.f90 forces.data params.o 
+
+fourier.o : fourier.f90 params.o grids.o
+
+params.o : params.f90
+
+moment.o : moment.f90 params.o grids.o densities.o
+
+sequential.o: sequential.f90 levels.o params.o grids.o densities.o
+
+levels.o : levels.f90 params.o grids.o forces.o fourier.o trivial.o
+
+grids.o : grids.f90 params.o forces.o
+
+densities.o : densities.f90 params.o grids.o levels.o trivial.o 
+
+fragments.o : fragments.f90 params.o grids.o forces.o levels.o twobody.o \
+	sequential.o 
+
+twobody.o : twobody.f90 params.o grids.o densities.o moment.o forces.o
+
+energies.o :   energies.f90 params.o forces.o densities.o levels.o grids.o \
+	trivial.o pairs.o
+
+meanfield.o : meanfield.f90 params.o densities.o forces.o grids.o coulomb.o \
+	trivial.o levels.o
+
+pairs.o : pairs.f90 forces.o grids.o levels.o densities.o
+
+static.o : static.f90 params.o densities.o grids.o \
+	levels.o moment.o energies.o inout.o pairs.o meanfield.o
+
+dynamic.o : dynamic.f90 params.o densities.o grids.o trivial.o \
+	levels.o moment.o energies.o inout.o meanfield.o sequential.o \
+	twobody.o external.o
+
+main3d.o : main3d.f90 params.o fourier.o forces.o densities.o meanfield.o levels.o grids.o fragments.o sequential.o dynamic.o static.o coulomb.o user.o
+
+.SUFFIXES:  .f90 .f .o 
+
+%.o : %.mod
+
+.f90.o:	
+	$(COMPILER) $(COMPILERFLAGS)  -c $<
+
+.f.o:
+	$(COMPILER) $(COMPILERFLAGS)  -c $<
diff --git a/Code/fourier.f90 b/Code/fourier.f90
index d56e3fe..75bb3c8 100644
--- a/Code/fourier.f90
+++ b/Code/fourier.f90
@@ -9,33 +9,31 @@ CONTAINS
   SUBROUTINE init_fft
     INCLUDE 'fftw3.f'
     COMPLEX(db),ALLOCATABLE :: p(:,:,:,:,:)
+    INTEGER,SAVE :: FFTW_planflag             ! set option here
+    FFTW_planflag=FFTW_EXHAUSTIVE
     ALLOCATE(p(nx,ny,nz,2,2))
     CALL dfftw_plan_dft_3d(pforward,nx,ny,nz,p(:,:,:,1,1),p(:,:,:,1,1), &
-         FFTW_FORWARD, FFTW_MEASURE+FFTW_UNALIGNED)
+         FFTW_FORWARD, FFTW_planflag)
     CALL dfftw_plan_dft_3d(pbackward,nx,ny,nz,p(:,:,:,1,1),p(:,:,:,1,1), &
-         FFTW_BACKWARD, FFTW_MEASURE+FFTW_UNALIGNED)
+         FFTW_BACKWARD, FFTW_planflag)
     CALL dfftw_plan_many_dft(xforward,1,(/nx/),2*ny*nz, &
          p(:,:,:,:,1),0,1,nx,p(:,:,:,:,2),0,1,nx, &
-         FFTW_FORWARD, FFTW_MEASURE+FFTW_UNALIGNED)
+         FFTW_FORWARD, FFTW_planflag)
     CALL dfftw_plan_many_dft(xbackward,1,(/nx/),2*ny*nz, &
          p(:,:,:,:,1),0,1,nx,p(:,:,:,:,1),0,1,nx, &
-         FFTW_BACKWARD, FFTW_MEASURE+FFTW_UNALIGNED)
-    CALL dfftw_plan_guru_dft(yforward,1,ny,nx,nx, &
-         3,(/nx,nz,2/),(/1,nx*ny,nx*ny*nz/),(/1,nx*ny,nx*ny*nz/), &
-         p(:,:,:,:,1),p(:,:,:,:,2), &
-         FFTW_FORWARD, FFTW_MEASURE+FFTW_UNALIGNED)
-    CALL dfftw_plan_guru_dft(ybackward,1,ny,nx,nx, &
-         3,(/nx,nz,2/),(/1,nx*ny,nx*ny*nz/),(/1,nx*ny,nx*ny*nz/), &
-         p(:,:,:,:,1),p(:,:,:,:,1), &
-         FFTW_BACKWARD, FFTW_MEASURE+FFTW_UNALIGNED)
-    CALL dfftw_plan_guru_dft(zforward,1,nz,nx*ny,nx*ny, &
-         3,(/nx,ny,2/),(/1,nx,nx*ny*nz/),(/1,nx,nx*ny*nz/), &
-         p(:,:,:,:,1),p(:,:,:,:,2), &
-         FFTW_FORWARD, FFTW_MEASURE+FFTW_UNALIGNED)
-    CALL dfftw_plan_guru_dft(zbackward,1,nz,nx*ny,nx*ny, &
-         3,(/nx,ny,2/),(/1,nx,nx*ny*nz/),(/1,nx,nx*ny*nz/), &
-         p(:,:,:,:,1),p(:,:,:,:,1), &
-         FFTW_BACKWARD, FFTW_MEASURE+FFTW_UNALIGNED)
+         FFTW_BACKWARD, FFTW_planflag)
+  CALL dfftw_plan_many_dft(yforward,1,(/ny/),nx, &
+       p(:,:,:,:,1),0,nx,1,p(:,:,:,:,2),0,nx,1, &
+       FFTW_FORWARD, FFTW_planflag)
+  CALL dfftw_plan_many_dft(ybackward,1,(/ny/),nx, &
+       p(:,:,:,:,1),0,nx,1,p(:,:,:,:,1),0,nx,1, &
+       FFTW_BACKWARD, FFTW_planflag)
+  CALL dfftw_plan_many_dft(zforward,1,(/nz/),nx*ny, &
+       p(:,:,:,:,1),0,nx*ny,1,p(:,:,:,:,2),0,nx*ny,1, &
+       FFTW_FORWARD, FFTW_planflag)
+  CALL dfftw_plan_many_dft(zbackward,1,(/nz/),nx*ny, &
+       p(:,:,:,:,1),0,nx*ny,1,p(:,:,:,:,1),0,nx*ny,1, &
+       FFTW_BACKWARD, FFTW_planflag)
     IF(wflag) WRITE(*,*) '***** FFTW3 plans established *****'
     DEALLOCATE(p)
   END SUBROUTINE init_fft
diff --git a/Code/levels.f90 b/Code/levels.f90
index eb63a65..6d34dcb 100644
--- a/Code/levels.f90
+++ b/Code/levels.f90
@@ -56,15 +56,23 @@ CONTAINS
     COMPLEX(db), INTENT(OUT):: d1psout(:,:,:,:)
     COMPLEX(db), INTENT(OUT), OPTIONAL :: d2psout(:,:,:,:)  
     REAL(db) :: kfac
-    INTEGER :: iy
+    INTEGER :: iy,is,k
     kfac=(PI+PI)/(dy*ny)
-    CALL dfftw_execute_dft(yforward,psin,d1psout)
+    DO is=1,2
+       DO k=1,nz
+          CALL dfftw_execute_dft(yforward,psin(:,:,k,is),d1psout(:,:,k,is))
+       END DO
+    END DO
     IF(PRESENT(d2psout)) THEN
        DO iy=1,ny/2
           d2psout(:,iy,:,:)=-((iy-1)*kfac)**2*d1psout(:,iy,:,:)/REAL(ny)
           d2psout(:,ny-iy+1,:,:)=-(iy*kfac)**2*d1psout(:,ny-iy+1,:,:)/REAL(ny)
        ENDDO
-       CALL dfftw_execute_dft(ybackward,d2psout,d2psout)
+       DO is=1,2
+          DO k=1,nz
+             CALL dfftw_execute_dft(ybackward,d2psout(:,:,k,is),d2psout(:,:,k,is))
+          END DO
+       END DO
     ENDIF
     d1psout(:,1,:,:)=(0.D0,0.D0)
     DO iy=2,ny/2
@@ -75,7 +83,11 @@ CONTAINS
             /REAL(ny)
     ENDDO
     d1psout(:,ny/2+1,:,:)=(0.D0,0.D0)
-    CALL dfftw_execute_dft(ybackward,d1psout,d1psout)
+    DO is=1,2
+       DO k=1,nz
+          CALL dfftw_execute_dft(ybackward,d1psout(:,:,k,is),d1psout(:,:,k,is))
+       END DO
+    END DO
   END SUBROUTINE cdervy
   !************************************************************
   SUBROUTINE cdervz(psin,d1psout,d2psout)  
@@ -83,15 +95,19 @@ CONTAINS
     COMPLEX(db), INTENT(OUT):: d1psout(:,:,:,:)
     COMPLEX(db), INTENT(OUT), OPTIONAL :: d2psout(:,:,:,:)  
     REAL(db) :: kfac
-    INTEGER :: iz
+    INTEGER :: iz,is
     kfac=(PI+PI)/(dz*nz)
-    CALL dfftw_execute_dft(zforward,psin,d1psout)
+    DO is=1,2
+       CALL dfftw_execute_dft(zforward,psin(:,:,:,is),d1psout(:,:,:,is))
+    END DO
     IF(PRESENT(d2psout)) THEN
        DO iz=1,nz/2
           d2psout(:,:,iz,:)=-((iz-1)*kfac)**2*d1psout(:,:,iz,:)/REAL(nz)
           d2psout(:,:,nz-iz+1,:)=-(iz*kfac)**2*d1psout(:,:,nz-iz+1,:)/REAL(nz)
        ENDDO
-       CALL dfftw_execute_dft(zbackward,d2psout,d2psout)
+       DO is=1,2
+          CALL dfftw_execute_dft(zbackward,d2psout(:,:,:,is),d2psout(:,:,:,is))
+       END DO
     ENDIF
     d1psout(:,:,1,:)=(0.D0,0.D0)
     DO iz=2,nz/2
@@ -102,7 +118,9 @@ CONTAINS
             /REAL(nz)
     ENDDO
     d1psout(:,:,nz/2+1,:)=(0.D0,0.D0)
-    CALL dfftw_execute_dft(zbackward,d1psout,d1psout)
+    DO is=1,2
+       CALL dfftw_execute_dft(zbackward,d1psout(:,:,:,is),d1psout(:,:,:,is))
+    END DO
   END SUBROUTINE cdervz
   !************************************************************
   SUBROUTINE laplace(psin,psout,e0inv)  
-- 
GitLab