      SUBROUTINE tce_mo2e_zones_4a_disk_ga_act(rtdb,d_v2,
     1                                kax_v2_alpha_offset,
     1                                size_2e)
C     $Id: tce_mo2e_zones_4a_disk_ga_act.F 27406 2015-08-24 14:44:13Z jhammond $
C     This is a Fortran77 program generated by Tensor Contraction Engine v.1.0
C     Copyright (c) Battelle & Pacific Northwest National Laboratory (2002)
C     t ( p1 p2 h3 h4 )_t
      IMPLICIT NONE
#include "rtdb.fh"
#include "global.fh"
#include "mafdecls.fh"
#include "stdio.fh"
#include "util.fh"
#include "bas.fh"
#include "schwarz.fh"
#include "sym.fh"
#include "sf.fh"
#include "errquit.fh"
#include "tce.fh"
#include "tce_main.fh"
c
c written by K. Kowalski
c
c
c     max. number of p2 groups =200
c
c
      integer rtdb                 ! Run-time database
      integer d_v2                 ! MO integrals
      integer kax_v2_alpha_offset  ! MO integrals offset
      integer size_2e              ! 2e file size
c
      INTEGER size_2g2a,l_2g2a,k_2g2a
      INTEGER azone1,azone2,azone3,azone4
      INTEGER g1b,g2b,g3b,g4b
      INTEGER igi1,igi2,igi3,igi4
      INTEGER ii,i,j,k,l,N,ipos1,ipos2,ipos3,ipos4
      INTEGER del1,del2,p1rel,p2rel
      INTEGER size_4a,l_4a,k_4a
c
      integer mu,nu,rho,sigma
      integer mu_lo,mu_hi
      integer nu_lo,nu_hi
      integer rho_lo,rho_hi
      integer sigma_lo,sigma_hi
      integer mu_range
      integer nu_range
      integer rho_range
      integer sigma_range
      integer mu1,nu1,rho1,sigma1
      integer shift_mu,shift_nu
      integer shift_rho,shift_sigma
      integer work1,work2          ! Work array sizes
      integer l_work1,k_work1      ! Work array 1
      integer l_work2,k_work2      ! Work array 2
      integer imu1,inu1,irho1,isigma1
c
      integer l_movecs_orb,k_movecs_orb
      integer l_gpair,k_gpair
      integer len_pair,g12_shift
c ATTENTION,ACHTUNG,UWAGA 2000 - max # of CPU
c
      integer size_2g2z,l_2g2z,k_2g2z
      integer tot_azone1_sh,tot_azone2_sh
      integer tot_azone3_sh,tot_azone4_sh
      integer ixi,jxi,point_pair
      integer size_stripe,l_p34,k_p34
      integer addr,xoffset_34
      integer size_g3g4,xoffset_p34
      integer size_g4321,k_g4321,l_g4321
c
      integer l_integral,l_coeff
      integer k_integral,k_coeff
      integer size_ic,size_icc,size_integral,size_coeff,max_na
      integer l_aux,k_aux,size_aux,iend
c
      integer iha,ihb !number of corr. alpha, beta holes
      integer ipa,ipb !number of corr. alpha, beta particles
      integer INDEX_PAIR,icol,irow
c compression
      integer xoffset_g12(20000),xoffset_size(20000)
      integer xoffset_size_p1p2(20000)
      integer key_start(20000),key_end(20000)
      integer size_temp,size_temp_4g,xoffset_temp_4g,size_p1p2
      integer max_size_temp,xoffset_temp,iclose,iopen,offset_2g2z
      integer imaxp12,istart,ibuba,sumx
      double precision wall,cpu,wall1,cpu1,wall2,cpu2,wall3,cpu3
c *** debug ***
       double precision xtot1,xtot2,xtot3
c *************
      double precision tot_zone(20000)
      integer l_g12piece,k_g12piece,size_g12p
c - exascale ---
      LOGICAL is_active_4_o
c --------------
      integer l_4af_offset,k_4af_offset,d_4af
      integer sf_chunk,request
      integer key_4af,offset_4af,size_4af
      character*255 filename
c 
      logical parallel
c
      INTEGER length
      INTEGER next
      INTEGER nprocs
      INTEGER count
      integer nxtask
      external nxtask
      logical nodezero
      logical idiskl
c
c
c
c
cccx      parallel = (ga_nnodes().gt.1)
      parallel = .true.
c
      if(idisk.eq.0) then
       idiskl=.false.
      else
       idiskl=.true.
      end if
c
c
cccx      max_size_temp=100000000
      max_size_temp=imaxsize**4
c
c
c
      do ii=1,20000
       tot_zone(ii)=0.0d0
      enddo
      if(atpart.gt.20000) 
     &  call errquit('tce_zones: atpart too big',1,MA_ERR)
      sumx=0
      do ii=1,atpart
       tot_zone(ii)=sumx
       sumx=sumx+nalength(ii)
      enddo
c
      nodezero=(ga_nodeid().eq.0)
c
c
c this module is called only if intorb = .true.
c N is the number of correlated orbitals
        N = nmo(1) - nfc(1) - nfv(1)
        iha = nocc(1)-nfc(1)
        ihb = nocc(ipol)-nfc(ipol)
        ipa = nmo(1)-nocc(1)-nfv(1)
        ipb = nmo(ipol)-nocc(ipol)-nfv(ipol)
c
c     Offset for 4a file
c
cc      sf_chunk=(imaxsize+10)**4
      sf_chunk=(imaxsize)**4
      if(.not.idiskl) then
c GA is assumed
       call tce_4a_offset(l_4af_offset,k_4af_offset,size_4af)
      else
cccx  call tce_filename('4aint',filename)
       call tce_4a_offset(l_4af_offset,k_4af_offset,size_4af)
cccx       call util_file_name('4aintx',.true.,.false.,filename)
       call util_file_name('4aintx',.false.,.false.,filename)
      end if 
      if(idiskl) then
       if(.not.parallel) 
     1      call errquit('sf only for parallel runs',1,DISK_ERR)
       if(parallel) call ga_sync()
        if (sf_create(filename,dfloat(bytes)*dfloat(size_4af),
     1    dfloat(bytes)*dfloat(size_4af),sf_chunk,d_4af).ne.0)
     2    call errquit('4-index: sf problem',0,DISK_ERR)
ccx        if (parallel) then
ccx          if (sf_close(d_4af).ne.0)
ccx     1      call errquit('createfile: sf problem',1,DISK_ERR)
ccx          call ga_sync()
ccx        end if      
      else
       call createfile(filename,d_4af,size_4af)
       call reconcilefile(d_4af,size_4af)
       call ga_zero(d_4af)
      end if 
c
c
c *** debug ***
c        write(6,*)'step-1'
c        call util_flush(6)
c *************
c
c     Pair's structure of the integral file
      call tce_mo2e_pairs_act(l_gpair,k_gpair,len_pair)
      len_pair = int_mb(k_gpair)
c
c *** debug ***
c        write(6,*)'step-0'
c        call util_flush(6)
c *************
c
c alpha orbitals only
c
      if (.not.ma_push_get(mt_dbl,nbf*(iha+ipa)
     1  ,"sorted MO coeffs",
     2  l_movecs_orb,k_movecs_orb))
     3  call errquit('tce_mo2e_zone: MA problem 1',0,
     2    BASIS_ERR)
      call dfill(nbf*(iha+ipa),0.0d0, dbl_mb(k_movecs_orb), 1)
      do i=1,iha
      do isigma1=1,nbf
       dbl_mb(k_movecs_orb+(i-1)*nbf+isigma1-1)=
     & dbl_mb(k_movecs_sorted+(i-1)*nbf+isigma1-1)
      enddo
      enddo
      do i=iha+1,iha+ipa
      do isigma1=1,nbf
       dbl_mb(k_movecs_orb+(i-1)*nbf+isigma1-1)=
     & dbl_mb(k_movecs_sorted+(i+ihb-1)*nbf+isigma1-1)
      enddo
      enddo
c
c
      call int_mem_2e4c(work1,work2)
      if (.not.ma_push_get(mt_dbl,work1,'work1',l_work1,k_work1))
     1  call errquit('tce_ao2e: MA problem work1',0,MA_ERR)
      if (.not.ma_push_get(mt_dbl,work2,'work2',l_work2,k_work2))
     1  call errquit('tce_ao2e: MA problem work2',1,MA_ERR)
c
c
c *** debug ***
c        write(6,*)'step1'
c        call util_flush(6)
c *************
c 
c 4af file formed here
c
c
             cpu1 = - util_cpusec()
             wall1 = - util_wallsec()
c
      nprocs = GA_NNODES()
      count = 0
      next = NXTASK(nprocs, 1)
      DO azone1 = 1,atpart      !nu
      DO azone2 = azone1,atpart !mu
      DO azone3 = 1,atpart      !sigma
      DO azone4 = azone3,atpart !rho
      IF (next.eq.count) THEN
c ---------------------------
        size_4a = nalength(azone1)*nalength(azone2)*
     1            nalength(azone3)*nalength(azone4)
        if(.not.ma_push_get(mt_dbl,size_4a,'4a',l_4a,k_4a))
     1     call errquit('tce_4af_zones1: MA problem',0,MA_ERR)
        call dfill(size_4a, 0.0d0, dbl_mb(k_4a), 1)
         shift_mu = 0
         do mu    = a2length(azone2)+1,a2length(azone2+1)
            if (.not.bas_cn2bfr(ao_bas_han,mu,mu_lo,mu_hi))
     1      call errquit('tce_ao2e: basis fn range problem 1',0,
     2      BASIS_ERR)
            mu_range = mu_hi - mu_lo + 1
         shift_nu = 0
         do nu    = a2length(azone1)+1,a2length(azone1+1)
            if (.not.bas_cn2bfr(ao_bas_han,nu,nu_lo,nu_hi))
     1      call errquit('tce_ao2e: basis fn range problem 1',0,
     2      BASIS_ERR)
            nu_range = nu_hi - nu_lo + 1
         shift_rho = 0
         do rho   = a2length(azone4)+1,a2length(azone4+1)
            if (.not.bas_cn2bfr(ao_bas_han,rho,rho_lo,rho_hi))
     1      call errquit('tce_ao2e: basis fn range problem 1',0,
     2      BASIS_ERR)
            rho_range = rho_hi - rho_lo + 1
         shift_sigma = 0
         do sigma = a2length(azone3)+1,a2length(azone3+1)
            if (.not.bas_cn2bfr(ao_bas_han,sigma,sigma_lo,sigma_hi))
     1      call errquit('tce_ao2e: basis fn range problem 1',0,
     2      BASIS_ERR)
            sigma_range = sigma_hi - sigma_lo + 1
            if (schwarz_shell(rho,sigma)*schwarz_shell(mu,nu)
     1          .ge. tol2e) then
cccx            call dfill(work1, 0.0d0, dbl_mb(k_work1), 1)
cccx            call dfill(work2, 0.0d0, dbl_mb(k_work2), 1)
            call int_2e4c(ao_bas_han,mu,nu,ao_bas_han,rho,sigma,
     1           work2,dbl_mb(k_work2),work1,dbl_mb(k_work1))
c
            i=0
             do mu1     = 1,mu_range
             do nu1     = 1,nu_range
             do rho1    = 1,rho_range
             do sigma1  = 1,sigma_range
            i=i+1
            inu1=nu1+shift_nu
            isigma1=sigma1+shift_sigma
            imu1=mu1+shift_mu
            irho1=rho1+shift_rho
c (isigma1,irho1|inu1, imu1)
            ipos1=(((imu1-1)*nalength(azone1)+inu1-1)*
     1            nalength(azone4)+irho1-1)*nalength(azone3)
     2            +isigma1
            dbl_mb(k_4a+ipos1-1)=dbl_mb(k_work1+i-1)
            enddo
            enddo
            enddo
            enddo
            end if !schwarz  screening
         shift_sigma = shift_sigma + sigma_range
         enddo !sigma
         shift_rho   = shift_rho + rho_range
         enddo !rho
         shift_nu    = shift_nu + nu_range
         enddo !nu
         shift_mu    = shift_mu + mu_range
         enddo !mu
c
c fixing offsets and sf_writing
         key_4af=azone4 - 1 + atpart * (azone3 - 1 +
     &          atpart * (azone2 - 1 + atpart * (azone1 - 1)))
         call tce_hash(int_mb(k_4af_offset),key_4af,offset_4af)
      if(idiskl) then
        if (sf_write(d_4af,dfloat(bytes)*dfloat(offset_4af),
     1    dfloat(bytes)*dfloat(size_4a),dbl_mb(k_4a),request).ne.0)
     2    call errquit('zones put: sf problem2',1,DISK_ERR)
        if (sf_wait(request).ne.0)
     1    call errquit('zones put: sf problem3',2,DISK_ERR)
      else
        call ga_put(d_4af,offset_4af+1,offset_4af+size_4a,1,1,
     1    dbl_mb(k_4a),1)
      end if 
c closing l_4a file
        if (.not.ma_pop_stack(l_4a))
     1   call errquit('tcc_mo2e_4af2: l_4a',15,MA_ERR)
c ---------------------------
      next = NXTASK(nprocs, 1)
      END IF
      count = count + 1
      ENDDO !azone4
      ENDDO !azone3
      ENDDO !azone2
      ENDDO !azone1
c
c
c
       call ga_sync()
c
c
c
         call ga_sync()
c
c
c *** debug ***
c        write(6,*)'step2'
c        call util_flush(6)
c *************
c
c
      next = nxtask(-nprocs,1)
c
c
      if(.not.idiskl) then
      call reconcilefile(d_4af,size_4af)
      end if
c
c
c
      max_na=0
      do ixi=1,atpart
       if(nalength(ixi).gt.max_na) max_na=nalength(ixi)
      enddo
        size_ic=2*((max_na)**4)+1
        size_icc=tile_dim*max_na
c
c
c
       if (.not.ma_push_get(mt_dbl,size_ic,'l_int',
     1  l_integral,k_integral))
     1  call errquit('tce_4s: MA problem l_int',0,MA_ERR)
c
       if (.not.ma_push_get(mt_dbl,size_icc,'l_coeff',
     1  l_coeff,k_coeff))
     1  call errquit('tce_4s: MA problem l_coeff',0,MA_ERR)
c
c
c
c
      nprocs = GA_NNODES()
      count = 0
      next = NXTASK(nprocs, 1)
      DO g3b = 1,noa+nva   !k
      DO g4b = g3b,noa+nva !l
      DO azone1 = 1,atpart !nu
      DO azone2 = azone1,atpart !mu
      IF (next.eq.count) THEN
c
             cpu1 = - util_cpusec()
             wall1 = - util_wallsec()
c
      size_2g2a=int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)
     1          *nalength(azone1)*nalength(azone2)
      if (.not.ma_push_get(mt_dbl,size_2g2a,'2g2a',l_2g2a,k_2g2a))
     1    call errquit('tce_r2_divide1: MA problem',0,MA_ERR)
      call dfill(size_2g2a, 0.0d0, dbl_mb(k_2g2a), 1)
c *** debug ***
c        write(6,*)'step3'
c        call util_flush(6)
c *************
c
      DO azone3 = 1,atpart      !sigma
      DO azone4 = azone3,atpart !rho
        size_4a = nalength(azone1)*nalength(azone2)*
     1            nalength(azone3)*nalength(azone4)
        if(.not.ma_push_get(mt_dbl,size_4a,'4a',l_4a,k_4a))
     1     call errquit('tce_r2_divide2: MA problem',0,MA_ERR)
cccx        call dfill(size_4a, 0.0d0, dbl_mb(k_4a), 1)
c
c
c
c
         key_4af=azone4 - 1 + atpart * (azone3 - 1 +
     &          atpart * (azone2 - 1 + atpart * (azone1 - 1)))
         call tce_hash(int_mb(k_4af_offset),key_4af,offset_4af)
       if(idiskl) then 
        if (sf_read(d_4af,dfloat(bytes)*dfloat(offset_4af),
     1    dfloat(bytes)*dfloat(size_4a),dbl_mb(k_4a),request).ne.0)
     1     call errquit('zones get2: sf problem',1,DISK_ERR)
        if (sf_wait(request).ne.0)
     1    call errquit('zones get3: sf problem',2,DISK_ERR)
       else
        call ga_get(d_4af,offset_4af+1,offset_4af+size_4a,1,1,
     1    dbl_mb(k_4a),1)
       end if 
c
c
       tot_azone3_sh=tot_zone(azone3)
       tot_azone4_sh=tot_zone(azone4)
c
      if(azone3.ne.azone4) then !!============azone3.ne.azone4
c --- C_[g3][sigma]
       i = 0 
       do isigma1 =  1,nalength(azone3)
       do igi3    =  1,int_mb(k_range_alpha+g3b-1)
       i = i+1
       ipos1=(int_mb(k_offset_alpha+g3b-1)+igi3-1)*nbf+tot_azone3_sh
     &       +isigma1
       dbl_mb(k_coeff+i-1)=dbl_mb(k_movecs_orb+ipos1-1)
       enddo
       enddo
c --- C_[g3][sigma]([sigma]<[rho]|[nu] [mu])
c            zone3   zone3  zone4
      call  dgemm('N','N',
     1 int_mb(k_range_alpha+g3b-1),  !m
     1 nalength(azone1)*nalength(azone2)*nalength(azone4), !n
     1 nalength(azone3), !k
     1 1.0d0,
     1 dbl_mb(k_coeff),                    !A
     1 int_mb(k_range_alpha+g3b-1), !lda
     1 dbl_mb(k_4a),nalength(azone3),          ! B,ldb
     1 0.0d0,dbl_mb(k_integral),
     1 int_mb(k_range_alpha+g3b-1))
c transposition ([g3][rho]|[nu][mu]) => ([rho][g3]|[nu][mu]) 
c
       size_aux=int_mb(k_range_alpha+g3b-1)*nalength(azone4)*
     1          nalength(azone1)*nalength(azone2)
       if(.not.ma_push_get(mt_dbl,size_aux,'aux',l_aux,k_aux))
     1     call errquit('tce_aux1: MA problem',0,MA_ERR)
      CALL TCE_SORT_4KG_(dbl_mb(k_integral),dbl_mb(k_aux),
     & int_mb(k_range_alpha+g3b-1),nalength(azone4),
     & nalength(azone1),nalength(azone2),
     &2,1,3,4,1.0d0)
      do i=1,size_aux
       dbl_mb(k_integral+i-1)=dbl_mb(k_aux+i-1)
      enddo
      if (.not.ma_pop_stack(l_aux))
     1     call errquit('l_aux1',15,MA_ERR)
c C([g4][azone4=rho])
       i = 0
       do irho1   =  1,nalength(azone4)
       do igi4    =  1,int_mb(k_range_alpha+g4b-1)
       i = i+1
       ipos1=(int_mb(k_offset_alpha+g4b-1)+igi4-1)*nbf+tot_azone4_sh
     &       +irho1
       dbl_mb(k_coeff+i-1)=dbl_mb(k_movecs_orb+ipos1-1)
       enddo
       enddo
c C([g4][azone4=rho])([rho][g3]|[azone1][azone2])
      call  dgemm('N','N',
     1 int_mb(k_range_alpha+g4b-1),  !m
     1 nalength(azone1)*nalength(azone2)*int_mb(k_range_alpha+g3b-1), !n
     1 nalength(azone4), !k
     1 1.0d0,
     1 dbl_mb(k_coeff),                    !A
     1 int_mb(k_range_alpha+g4b-1), !lda
     1 dbl_mb(k_integral),nalength(azone4),          ! B,ldb
     1 1.0d0,dbl_mb(k_2g2a),
     1 int_mb(k_range_alpha+g4b-1))
c  done with first  part of azone3.ne.azone4 case ----------------
c  transposition of atomic integrals
c  ([sigma=azone3][rho=azone4]|[nu][mu]) ==>
c  ([rho][sigma]|[nu][mu])
       size_aux=size_4a
       if(.not.ma_push_get(mt_dbl,size_aux,'aux',l_aux,k_aux))
     1     call errquit('tce_aux2: MA problem',0,MA_ERR)
      CALL TCE_SORT_4KG_(dbl_mb(k_4a),dbl_mb(k_aux),
     & nalength(azone3),nalength(azone4),
     & nalength(azone1),nalength(azone2),
     &2,1,3,4,1.0d0)
ccx      if (.not.ma_pop_stack(l_aux))
ccx     1     call errquit('l_aux2',15,MA_ERR) ! see after dgemm
c C([g3][azone4=rho])
       i = 0
       do irho1   =  1,nalength(azone4)
       do igi3    =  1,int_mb(k_range_alpha+g3b-1)
       i = i+1
       ipos1=(int_mb(k_offset_alpha+g3b-1)+igi3-1)*nbf+tot_azone4_sh
     &       +irho1
       dbl_mb(k_coeff+i-1)=dbl_mb(k_movecs_orb+ipos1-1)
       enddo
       enddo
c C([g3][azone4=rho])([rho=azone4][sigma=azone3]|[nu][mu])
      call  dgemm('N','N',
     1 int_mb(k_range_alpha+g3b-1),  !m
     1 nalength(azone1)*nalength(azone2)*nalength(azone3), !n
     1 nalength(azone4), !k
     1 1.0d0,
     1 dbl_mb(k_coeff),                    !A
     1 int_mb(k_range_alpha+g3b-1), !lda
     1 dbl_mb(k_aux),nalength(azone4),          ! B,ldb
     1 0.0d0,dbl_mb(k_integral),
     1 int_mb(k_range_alpha+g3b-1))
c
      if (.not.ma_pop_stack(l_aux))
     1     call errquit('l_aux2',15,MA_ERR)
c transposition ([g3][sigma]|[nu][mu]) => ([sigma][g3]|[nu][mu])
c
       size_aux=int_mb(k_range_alpha+g3b-1)*nalength(azone3)*
     1          nalength(azone1)*nalength(azone2)
       if(.not.ma_push_get(mt_dbl,size_aux,'aux',l_aux,k_aux))
     1     call errquit('tce_aux1: MA problem',0,MA_ERR)
      CALL TCE_SORT_4KG_(dbl_mb(k_integral),dbl_mb(k_aux),
     & int_mb(k_range_alpha+g3b-1),nalength(azone3),
     & nalength(azone1),nalength(azone2),
     &2,1,3,4,1.0d0)
      do i=1,size_aux
       dbl_mb(k_integral+i-1)=dbl_mb(k_aux+i-1)
      enddo
      if (.not.ma_pop_stack(l_aux))
     1     call errquit('l_aux1',15,MA_ERR)
c C([g4][azone3=sigma])
       i = 0
       do isigma1   =  1,nalength(azone3)
       do igi4    =  1,int_mb(k_range_alpha+g4b-1)
       i = i+1
       ipos1=(int_mb(k_offset_alpha+g4b-1)+igi4-1)*nbf+tot_azone3_sh
     &       +isigma1
       dbl_mb(k_coeff+i-1)=dbl_mb(k_movecs_orb+ipos1-1)
       enddo
       enddo
c C([g4][azone3=sigma])([sigma][g3]|[azone1][azone2])
      call  dgemm('N','N',
     1 int_mb(k_range_alpha+g4b-1),  !m
     1 nalength(azone1)*nalength(azone2)*int_mb(k_range_alpha+g3b-1), !n
     1 nalength(azone3), !k
     1 1.0d0,
     1 dbl_mb(k_coeff),                    !A
     1 int_mb(k_range_alpha+g4b-1), !lda
     1 dbl_mb(k_integral),nalength(azone3),          ! B,ldb
     1 1.0d0,dbl_mb(k_2g2a),
     1 int_mb(k_range_alpha+g4b-1))
      end if !azone3.ne.azone4
c
c
c
      if(azone3.eq.azone4) then
c --- C_[g3][sigma]
       i = 0 
       do isigma1 =  1,nalength(azone3)
       do igi3    =  1,int_mb(k_range_alpha+g3b-1)
       i = i+1
       ipos1=(int_mb(k_offset_alpha+g3b-1)+igi3-1)*nbf+tot_azone3_sh
     &       +isigma1
       dbl_mb(k_coeff+i-1)=dbl_mb(k_movecs_orb+ipos1-1)
       enddo
       enddo
c --- C_[g3][sigma]([sigma]=[rho]|[nu] [mu])
c            zone3   zone3  zone4
      call  dgemm('N','N',
     1 int_mb(k_range_alpha+g3b-1),  !m
     1 nalength(azone1)*nalength(azone2)*nalength(azone4), !n
     1 nalength(azone3), !k
     1 1.0d0,
     1 dbl_mb(k_coeff),                    !A
     1 int_mb(k_range_alpha+g3b-1), !lda
     1 dbl_mb(k_4a),nalength(azone3),          ! B,ldb
     1 0.0d0,dbl_mb(k_integral),
     1 int_mb(k_range_alpha+g3b-1))
c transposition ([g3][rho]|[nu][mu]) => ([rho][g3]|[nu][mu]) 
c
       size_aux=int_mb(k_range_alpha+g3b-1)*nalength(azone4)*
     1          nalength(azone1)*nalength(azone2)
       if(.not.ma_push_get(mt_dbl,size_aux,'aux',l_aux,k_aux))
     1     call errquit('tce_aux1: MA problem',0,MA_ERR)
      CALL TCE_SORT_4KG_(dbl_mb(k_integral),dbl_mb(k_aux),
     & int_mb(k_range_alpha+g3b-1),nalength(azone4),
     & nalength(azone1),nalength(azone2),
     &2,1,3,4,1.0d0)
      do i=1,size_aux
       dbl_mb(k_integral+i-1)=dbl_mb(k_aux+i-1)
      enddo
      if (.not.ma_pop_stack(l_aux))
     1     call errquit('l_aux1',15,MA_ERR)
c C([g4][azone4=rho])
       i = 0
       do irho1   =  1,nalength(azone4)
       do igi4    =  1,int_mb(k_range_alpha+g4b-1)
       i = i+1
       ipos1=(int_mb(k_offset_alpha+g4b-1)+igi4-1)*nbf+tot_azone4_sh
     &       +irho1
       dbl_mb(k_coeff+i-1)=dbl_mb(k_movecs_orb+ipos1-1)
       enddo
       enddo
c C([g4][azone4=rho])([rho][g3]|[azone1][azone2])
      call  dgemm('N','N',
     1 int_mb(k_range_alpha+g4b-1),  !m
     1 nalength(azone1)*nalength(azone2)*int_mb(k_range_alpha+g3b-1), !n
     1 nalength(azone4), !k
     1 1.0d0,
     1 dbl_mb(k_coeff),                    !A
     1 int_mb(k_range_alpha+g4b-1), !lda
     1 dbl_mb(k_integral),nalength(azone4),          ! B,ldb
     1 1.0d0,dbl_mb(k_2g2a),
     1 int_mb(k_range_alpha+g4b-1))
      end if !azone3.eq.azone4
c
c
c
        if (.not.ma_pop_stack(l_4a))
     1   call errquit('tcc_mo2e_trans_zones: l_4a',15,MA_ERR)
      ENDDO !azone4
c one step ahead
      ENDDO !azone3
c
c
c
c
c *** debug ***
c        write(6,*)'step4'
c        call util_flush(6)
c *************
c
c
c g2 g1 do loops
c open (g4 g3 | all symmetry allowed g2 g1) - equally good we can split it into 
c                                              pieces
c (k,l|i,j) pieces
c
c   calculate point_pair index here
         ixi=noa+nva-g3b+1
         jxi=noa+nva-g4b+1
         point_pair=((noa+nva)*(noa+nva+1))/2-((ixi-1)*ixi)/2-jxi+1
         size_stripe=int_mb(k_gpair+point_pair)
         xoffset_p34 = int_mb(k_gpair+len_pair+point_pair)
         addr=int_mb(k_gpair+2*len_pair+point_pair)
c
c - exascale ---
       if((size_stripe.eq.0).and.(xoffset_p34.eq.0)) go to 300
c --- 
c
c
c offset for blocking the (nu mu | g2 g1 ) = C*C matrix
c
      do i=1,20000
       key_start(i) = 0
       key_end(i)   = 0
       xoffset_g12(i)  = 0
       xoffset_size(i) = 0
       xoffset_size_p1p2(i) = 0
      enddo 
      imaxp12=0
      size_temp=0
      size_temp_4g=0
      xoffset_temp=0 
      xoffset_temp_4g=0
      size_p1p2=0
      i=1
      DO g1b = 1,noa+nva   !l
      DO g2b = g1b,noa+nva !k
c - exascale ---
      IF(.not.((g3b.gt.noa).and.(g4b.gt.noa).and.(g1b.gt.noa).and.
     &   (g2b.gt.noa).and.(.not.is_active_4_o(g3b,g4b,g1b,g2b)))) THEN
c ---
       IF (int_mb(k_spin_alpha+g3b-1)+int_mb(k_spin_alpha+g4b-1).eq.
     & int_mb(k_spin_alpha+g1b-1)+int_mb(k_spin_alpha+g2b-1)) THEN
       IF (ieor(int_mb(k_sym_alpha+g3b-1),ieor(int_mb(k_sym_alpha+g4b-1)
     & ,ieor(int_mb(k_sym_alpha+g1b-1),int_mb(k_sym_alpha+g2b-1)))) .eq.
     & irrep_v) THEN
       IROW=INDEX_PAIR(g4b,g3b)
       ICOL=INDEX_PAIR(g2b,g1b)
       IF(IROW.GE.ICOL) THEN
         if(size_temp_4g.eq.0) then
          key_start(i)=icol
         end if
         size_temp=size_temp+int_mb(k_range_alpha+g2b-1)
     1          *int_mb(k_range_alpha+g1b-1)
     1          *nalength(azone1)*nalength(azone2)
         size_temp_4g=size_temp_4g+int_mb(k_range_alpha+g2b-1)
     1          *int_mb(k_range_alpha+g1b-1)
     2          *int_mb(k_range_alpha+g3b-1) 
     3          *int_mb(k_range_alpha+g4b-1)
         size_p1p2=size_p1p2+int_mb(k_range_alpha+g2b-1)
     1          *int_mb(k_range_alpha+g1b-1)
         ibuba=icol
c
c         if(size_temp.gt.max_size_temp) then
c has to be fully consistent with later part
c
         if(size_temp_4g.gt.max_size_temp) then
           xoffset_g12(i)=xoffset_temp_4g
           xoffset_size(i)=size_temp
           xoffset_size_p1p2(i)=size_p1p2
           xoffset_temp=xoffset_temp+size_temp
           xoffset_temp_4g=xoffset_temp_4g+size_temp_4g
           key_end(i)=icol
           size_temp=0
           size_temp_4g=0
           size_p1p2=0
           imaxp12=i
           i=i+1
          end if
       END IF
       END IF
       END IF
c --- exascale -
       END IF
c --------------
      ENDDO
      ENDDO
c
      if(size_temp_4g.ne.0) then
           xoffset_g12(i)=xoffset_temp_4g
           xoffset_size(i)=size_temp
           xoffset_size_p1p2(i)=size_p1p2
           key_end(i)=ibuba
           imaxp12=i
      end if
c
      if(i.gt.20000) 
     1  call errquit('tce_zone: xoffset-size-problem',0,MA_ERR)
c
c *** debug ***
c        write(6,*)'step5'
c        call util_flush(6)
c *************
c
      do i =1, imaxp12
        size_g12p=int_mb(k_range_alpha+g4b-1)*
     1                int_mb(k_range_alpha+g3b-1)*
     1                xoffset_size_p1p2(i)
        if (.not.ma_push_get(mt_dbl,size_g12p,'g12piece',
     1      l_g12piece,k_g12piece))
     1      call errquit('tce_zone: MA g12-piece ',0,MA_ERR)
c TRY IF THIS CAN BE BY-PASSED
        call dfill(size_g12p, 0.0d0, dbl_mb(k_g12piece), 1)
c
c
c
c
      offset_2g2z=0 !
c
c
c
      iend=0
      istart=0
      DO g1b = 1,noa+nva     !l
      DO g2b = g1b,noa+nva   !k 
       ICOL=INDEX_PAIR(g2b,g1b)
       IF(iend.eq.1) go to 4444
       if(istart.ne.1) then
        IF(ICOL.eq.key_start(i)) istart=1
       end if
       if(icol.eq.key_end(i)) iend=1
       IF(istart.eq.1) THEN
c - exascale ---
      IF(.not.((g3b.gt.noa).and.(g4b.gt.noa).and.(g1b.gt.noa).and.
     &   (g2b.gt.noa).and.(.not.is_active_4_o(g3b,g4b,g1b,g2b)))) THEN
c ---
       IF (int_mb(k_spin_alpha+g3b-1)+int_mb(k_spin_alpha+g4b-1).eq.
     & int_mb(k_spin_alpha+g1b-1)+int_mb(k_spin_alpha+g2b-1)) THEN
       IF (ieor(int_mb(k_sym_alpha+g3b-1),ieor(int_mb(k_sym_alpha+g4b-1)
     & ,ieor(int_mb(k_sym_alpha+g1b-1),int_mb(k_sym_alpha+g2b-1)))) .eq.
     & irrep_v) THEN
       IROW=INDEX_PAIR(g4b,g3b)
       ICOL=INDEX_PAIR(g2b,g1b)
       IF(IROW.GE.ICOL) THEN
        tot_azone1_sh=tot_zone(azone1)
        tot_azone2_sh=tot_zone(azone2)
       if(azone1.ne.azone2) then !-----------------
c C_([mu-azone2][g2])
        ii=0
        do igi2    =  1,int_mb(k_range_alpha+g2b-1)
        do imu1    =  1,nalength(azone2)
         ii = ii+1
         ipos1=(int_mb(k_offset_alpha+g2b-1)+igi2-1)*nbf+tot_azone2_sh
     &           +imu1
         dbl_mb(k_coeff+ii-1)=dbl_mb(k_movecs_orb+ipos1-1)
        enddo
        enddo 
c ([g4][g3]|[nu-azone1][mu-azone2])*C_([mu-azone2][g2])
         call dgemm('N','N',
     1   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     1   nalength(azone1),                                        !m
     1   int_mb(k_range_alpha+g2b-1),                             !n
     3   nalength(azone2),                                        !k
     4   1.0d0,dbl_mb(k_2g2a),
     5   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     5   nalength(azone1),
     6   dbl_mb(k_coeff),nalength(azone2),
     7   0.0d0,dbl_mb(k_integral),
     8   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     8   nalength(azone1))
c transposition ([g4][g3]|[nu-azone1][g2])=>([g4][g3]|[g2][nu-azone1])
       size_aux=int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)
     1 *int_mb(k_range_alpha+g2b-1)*nalength(azone1)
       if(.not.ma_push_get(mt_dbl,size_aux,'aux',l_aux,k_aux))
     1     call errquit('tce_aux2: MA problem',0,MA_ERR)
      CALL TCE_SORT_4KG_(dbl_mb(k_integral),dbl_mb(k_aux),
     &int_mb(k_range_alpha+g4b-1),int_mb(k_range_alpha+g3b-1), 
     &nalength(azone1),int_mb(k_range_alpha+g2b-1), 
     &1,2,4,3,1.0d0)
      do ii=1,size_aux
       dbl_mb(k_integral+ii-1)=dbl_mb(k_aux+ii-1)
      enddo
      if (.not.ma_pop_stack(l_aux))
     1     call errquit('l_aux2',15,MA_ERR)
c C_([nu-azone1][g1])
        ii=0
        do igi1    =  1,int_mb(k_range_alpha+g1b-1)
        do inu1    =  1,nalength(azone1)
         ii = ii+1
         ipos1=(int_mb(k_offset_alpha+g1b-1)+igi1-1)*nbf+tot_azone1_sh
     &           +inu1
         dbl_mb(k_coeff+ii-1)=dbl_mb(k_movecs_orb+ipos1-1)
        enddo
        enddo
c ([g4][g3]|[g2][nu-azone1])*C_([nu-azone1][g1])
         call dgemm('N','N',
     1   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     1   int_mb(k_range_alpha+g2b-1),                             !m
     1   int_mb(k_range_alpha+g1b-1),                             !n
     3   nalength(azone1),                                        !k
     4   1.0d0,dbl_mb(k_integral),
     5   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     5   int_mb(k_range_alpha+g2b-1),
     6   dbl_mb(k_coeff),nalength(azone1),
     7   1.0d0,dbl_mb(k_g12piece+offset_2g2z),
     8   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     8   int_mb(k_range_alpha+g2b-1))
c done with first  part of azone3.ne.azone4 case ----------------
c  transposition of atomic integrals
c  ([g4][g3]|[azone1-nu][azone2-mu]) ==>
c  ([g4][g3]|[azone2-mu][azone1-nu])
       size_aux=int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)
     1 *nalength(azone1)*nalength(azone2) 
       if(.not.ma_push_get(mt_dbl,size_aux,'aux',l_aux,k_aux))
     1     call errquit('tce_aux2: MA problem',0,MA_ERR)
      CALL TCE_SORT_4KG_(dbl_mb(k_2g2a),dbl_mb(k_aux),
     & int_mb(k_range_alpha+g4b-1),int_mb(k_range_alpha+g3b-1),
     & nalength(azone1),nalength(azone2), 
     & 1,2,4,3,1.0d0)
ccx      if (.not.ma_pop_stack(l_aux))
ccx     1     call errquit('l_aux2',15,MA_ERR) ! see after dgemm
c C([azone1-nu][g2])
        ii=0
        do igi2    =  1,int_mb(k_range_alpha+g2b-1)
        do inu1    =  1,nalength(azone1)
         ii = ii+1
         ipos1=(int_mb(k_offset_alpha+g2b-1)+igi2-1)*nbf+tot_azone1_sh
     &           +inu1
         dbl_mb(k_coeff+ii-1)=dbl_mb(k_movecs_orb+ipos1-1)
        enddo
        enddo
c ([g4][g3]|[azone2-mu][azone1-nu])*C([azone1-nu][g2])
         call dgemm('N','N',
     1   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     1   nalength(azone2),                                        !m
     1   int_mb(k_range_alpha+g2b-1),                             !n
     3   nalength(azone1),                                        !k
     4   1.0d0,dbl_mb(k_aux),
     5   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     5   nalength(azone2),
     6   dbl_mb(k_coeff),nalength(azone1),
     7   0.0d0,dbl_mb(k_integral),
     8   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     8   nalength(azone2))
c
      if (.not.ma_pop_stack(l_aux))
     1     call errquit('l_aux2',15,MA_ERR)
c transposition ([g4][g3]|[azone2-mu][g2])=>([g4][g3]|[g2][azone2-mu])
       size_aux=int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)
     1 *int_mb(k_range_alpha+g2b-1)*nalength(azone2)
       if(.not.ma_push_get(mt_dbl,size_aux,'aux',l_aux,k_aux))
     1     call errquit('tce_aux2: MA problem',0,MA_ERR)
      CALL TCE_SORT_4KG_(dbl_mb(k_integral),dbl_mb(k_aux),
     &int_mb(k_range_alpha+g4b-1),int_mb(k_range_alpha+g3b-1),
     &nalength(azone2),int_mb(k_range_alpha+g2b-1),
     &1,2,4,3,1.0d0)
      do ii=1,size_aux
       dbl_mb(k_integral+ii-1)=dbl_mb(k_aux+ii-1)
      enddo
      if (.not.ma_pop_stack(l_aux))
     1     call errquit('l_aux2',15,MA_ERR)
c C([azone2-mu][g1])
        ii=0
        do igi1    =  1,int_mb(k_range_alpha+g1b-1)
        do imu1    =  1,nalength(azone2)
         ii = ii+1
         ipos1=(int_mb(k_offset_alpha+g1b-1)+igi1-1)*nbf+tot_azone2_sh
     &           +imu1
         dbl_mb(k_coeff+ii-1)=dbl_mb(k_movecs_orb+ipos1-1)
        enddo
        enddo
c ([g4][g3]|[g2][azone2-mu])*C([azone2-mu][g1])
         call dgemm('N','N',
     1   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     1   int_mb(k_range_alpha+g2b-1),                             !m
     1   int_mb(k_range_alpha+g1b-1),                             !n
     3   nalength(azone2),                                        !k
     4   1.0d0,dbl_mb(k_integral),
     5   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     5   int_mb(k_range_alpha+g2b-1),
     6   dbl_mb(k_coeff),nalength(azone2),
     7   1.0d0,dbl_mb(k_g12piece+offset_2g2z),
     8   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     8   int_mb(k_range_alpha+g2b-1))
c
        offset_2g2z=offset_2g2z+int_mb(k_range_alpha+g4b-1)*
     1    int_mb(k_range_alpha+g3b-1)*int_mb(k_range_alpha+g2b-1)*
     1    int_mb(k_range_alpha+g1b-1)
       end if ! azone1.ne.azone2 !-----------------
c
c second part 
c
        if(azone1.eq.azone2) then
c C_([mu-azone2][g2])
        ii=0
        do igi2    =  1,int_mb(k_range_alpha+g2b-1)
        do imu1    =  1,nalength(azone2)
         ii = ii+1
         ipos1=(int_mb(k_offset_alpha+g2b-1)+igi2-1)*nbf+tot_azone2_sh
     &           +imu1
         dbl_mb(k_coeff+ii-1)=dbl_mb(k_movecs_orb+ipos1-1)
        enddo
        enddo 
c ([g4][g3]|[nu-azone1][mu-azone2])*C_([mu-azone2][g2])
         call dgemm('N','N',
     1   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     1   nalength(azone1),                                        !m
     1   int_mb(k_range_alpha+g2b-1),                             !n
     3   nalength(azone2),                                        !k
     4   1.0d0,dbl_mb(k_2g2a),
     5   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     5   nalength(azone1),
     6   dbl_mb(k_coeff),nalength(azone2),
     7   0.0d0,dbl_mb(k_integral),
     8   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     8   nalength(azone1))
c transposition ([g4][g3]|[nu-azone1][g2])=>([g4][g3]|[g2][nu-azone1])
       size_aux=int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)
     1 *int_mb(k_range_alpha+g2b-1)*nalength(azone1)
       if(.not.ma_push_get(mt_dbl,size_aux,'aux',l_aux,k_aux))
     1     call errquit('tce_aux2: MA problem',0,MA_ERR)
      CALL TCE_SORT_4KG_(dbl_mb(k_integral),dbl_mb(k_aux),
     &int_mb(k_range_alpha+g4b-1),int_mb(k_range_alpha+g3b-1), 
     &nalength(azone1),int_mb(k_range_alpha+g2b-1), 
     &1,2,4,3,1.0d0)
      do ii=1,size_aux
       dbl_mb(k_integral+ii-1)=dbl_mb(k_aux+ii-1)
      enddo
      if (.not.ma_pop_stack(l_aux))
     1     call errquit('l_aux2',15,MA_ERR)
c C_([nu-azone1][g1])
        ii=0
        do igi1    =  1,int_mb(k_range_alpha+g1b-1)
        do inu1    =  1,nalength(azone1)
         ii = ii+1
         ipos1=(int_mb(k_offset_alpha+g1b-1)+igi1-1)*nbf+tot_azone1_sh
     &           +inu1
         dbl_mb(k_coeff+ii-1)=dbl_mb(k_movecs_orb+ipos1-1)
        enddo
        enddo
c ([g4][g3]|[g2][nu-azone1])*C_([nu-azone1][g1])
         call dgemm('N','N',
     1   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     1   int_mb(k_range_alpha+g2b-1),                             !m
     1   int_mb(k_range_alpha+g1b-1),                             !n
     3   nalength(azone1),                                        !k
     4   1.0d0,dbl_mb(k_integral),
     5   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     5   int_mb(k_range_alpha+g2b-1),
     6   dbl_mb(k_coeff),nalength(azone1),
     7   1.0d0,dbl_mb(k_g12piece+offset_2g2z),
     8   int_mb(k_range_alpha+g4b-1)*int_mb(k_range_alpha+g3b-1)*
     8   int_mb(k_range_alpha+g2b-1))
c
        offset_2g2z=offset_2g2z+int_mb(k_range_alpha+g4b-1)*
     1    int_mb(k_range_alpha+g3b-1)*int_mb(k_range_alpha+g2b-1)*
     1    int_mb(k_range_alpha+g1b-1)
c
        end if !azone1.eq.azone2
c !---------------------------------------------------------------------
       END IF ! irow.gt.icol
       END IF ! symmetry
       END IF ! spin
c --- exascale -
       END IF
c --------------
       END IF ! istart
      ENDDO  ! g2b-loop ends up here
      ENDDO  ! g1b-loop ends up here
c 
 4444 continue
c
c now put here add_block
c *** debug ***
c        write(6,*)'step5a'
c        call util_flush(6)
c *************
      call add_block(d_v2,dbl_mb(k_g12piece),size_g12p,
     &               xoffset_p34+xoffset_g12(i))
c *** debug ***
c        write(6,*)'step5b'
c        call util_flush(6)
c *************
      istart=0
c
c
cccx         if (.not.ma_pop_stack(l_2g2z))
cccx     1   call errquit('tcc_mo2e_trans_zones: l_2g2z',15,MA_ERR)
         if (.not.ma_pop_stack(l_g12piece))
     1   call errquit('tcc_mo2e_trans_zones: l_g12piece',15,MA_ERR)
      ENDDO  ! i = 1,imaxp12 
c
c *** debug ***
c        write(6,*)'step100'
c        call util_flush(6)
c *************
c
c - exascale ---
  300  continue
c ---
c

      if (.not.ma_pop_stack(l_2g2a))
     1  call errquit('tcc_mo2e_trans_zones: MA problem',15,MA_ERR)
c
c
      next = NXTASK(nprocs, 1)
      END IF
      count = count + 1
      ENDDO !azone2
      ENDDO !azone1
      ENDDO !g4b
      ENDDO !g3b
c
      next = nxtask(-nprocs,1)
c
       call reconcilefile(d_v2,size_2e)
c
c
      if (.not.ma_pop_stack(l_coeff))
     1  call errquit('tcc_off_4a: MA problem',15,MA_ERR)
c
      if (.not.ma_pop_stack(l_integral))
     1  call errquit('tcc_off_4a: MA problem',15,MA_ERR)
c
c
c *** debug ***
c
c      if(nodezero) then
c      write(6,*)'DONE --- DONE ---- DONE ---- DONE'
c      end if
c
c *************
      if (.not.ma_pop_stack(l_work2))
     1  call errquit('tcc_ao2e: MA problem',14,MA_ERR)
      if (.not.ma_pop_stack(l_work1))
     1  call errquit('tcc_ao2e: MA problem',15,MA_ERR)
c
      if (.not.ma_pop_stack(l_movecs_orb))
     1  call errquit('tcc_ao2e: MA problem',15,MA_ERR)
c
      if (.not.ma_pop_stack(l_gpair))
     1  call errquit('tcc_ao2e: MA problem',15,MA_ERR)
c
c
c
      if(idiskl) then
cccx        if (parallel) then
cccx          call ga_sync()
cccx          if (sf_open(d_4af).ne.0)
cccx     1      call errquit('deletefile: sf problem',0,DISK_ERR)
cccx        endif
ccx      if (.not.sf_destroy(d_4af))
ccx     1  call errquit('tcc_sf_destroy2: sf problem',15,MA_ERR)
        if (parallel) call ga_sync()
      else
       call deletefile(d_4af)
      end if
c
c
c
      if (.not.ma_pop_stack(l_4af_offset))
     1  call errquit('tcc_ao2e: MA problem',15,MA_ERR)
c
      call ga_sync()
c *** debug ***
c 800  format('DGEMM1 MAX',i5,2x,3f15.5)
c 801  format('DGEMM2 ',i5,2x,3f15.5)
 9000 format('PART1',i4,1x,'Cpu  wall ',2(f17.12,1x),3x,'g4b g3b',2i5)
c 9001 format('PART2',i4,1x,'Cpu  wall ',2(f17.12,1x),3x,'g4b g3b',2i5)
c 9003 format('PART1-4a',i4,1x,'Cpu  wall ',2(f17.12,1x))
c 9004 format('PART1-2g2z',i4,1x,'Cpu  wall ',2(f17.12,1x))
c 9005 format('PART1-dgemm',i4,1x,'Cpu  wall ',2(f17.12,1x))
c 9010 format('  P1-mnrs',i3,1x,2i5,1x,2i5,1x,'Cpu  wall ',2(f17.12,1x))
  555  format('atom loop ',2x,i5,3x,2i5,3x,2i5,i12)
  556  format('atom time',2x,i5,3x,2i5,3x,2i5,'Cpu wall ',2(f12.7,1x))
  777  format('main do loop ',2x,i5,3x,2i5,3x,2i5)
  775  format('main loop step1 ',2x,i5,3x,2i5,3x,2i5)
  776  format('main loop step2 ',2x,i5,3x,2i5,3x,2i5)
  778  format('PART1',2x,i5,3x,2i5,3x,2i5,2x,'Cpu  wall ',2(f17.12,1x))
  779  format('PART2',2x,i5,3x,2i5,3x,2i5,2x,'Cpu  wall ',2(f17.12,1x))
  780  format('ADD BLOCK',2x,i5,3x,2i5,3x,2i5,2x,'Cpu  wall ',
     &        2(f17.12,1x))
c *************
c
      RETURN
      END
c
c
c
c
