CP2K 2.4 (Revision 12889)

task_list_methods.f90

Go to the documentation of this file.
00001 !-----------------------------------------------------------------------------!
00002 !   CP2K: A general program to perform molecular dynamics simulations         !
00003 !   Copyright (C) 2000 - 2013  CP2K developers group                          !
00004 !-----------------------------------------------------------------------------!
00005 
00006 ! *****************************************************************************
00012 MODULE task_list_methods
00013   USE array_types,                     ONLY: array_data
00014   USE atomic_kind_types,               ONLY: atomic_kind_type,&
00015                                              get_atomic_kind
00016   USE basis_set_types,                 ONLY: get_gto_basis_set,&
00017                                              gto_basis_set_p_type,&
00018                                              gto_basis_set_type
00019   USE cell_types,                      ONLY: cell_type,&
00020                                              pbc
00021   USE cp_control_types,                ONLY: dft_control_type
00022   USE cp_dbcsr_interface,              ONLY: cp_dbcsr_col_block_sizes,&
00023                                              cp_dbcsr_finalize,&
00024                                              cp_dbcsr_get_block_p,&
00025                                              cp_dbcsr_get_info,&
00026                                              cp_dbcsr_put_block,&
00027                                              cp_dbcsr_row_block_sizes,&
00028                                              cp_dbcsr_work_create
00029   USE cp_dbcsr_types,                  ONLY: cp_dbcsr_type
00030   USE cube_utils,                      ONLY: compute_cube_center,&
00031                                              cube_info_type,&
00032                                              return_cube,&
00033                                              return_cube_nonortho
00034   USE dbcsr_util,                      ONLY: convert_sizes_to_offsets
00035   USE f77_blas
00036   USE gaussian_gridlevels,             ONLY: gaussian_gridlevel,&
00037                                              gridlevel_info_type
00038   USE input_constants,                 ONLY: use_aux_fit_basis_set,&
00039                                              use_orb_basis_set
00040   USE kinds,                           ONLY: dp,&
00041                                              dp_size,&
00042                                              int_8,&
00043                                              int_8_size,&
00044                                              int_size
00045   USE memory_utilities,                ONLY: reallocate
00046   USE message_passing,                 ONLY: mp_allgather,&
00047                                              mp_alltoall,&
00048                                              mp_gather,&
00049                                              mp_scatter
00050   USE particle_types,                  ONLY: particle_type
00051   USE pw_env_types,                    ONLY: pw_env_get,&
00052                                              pw_env_type
00053   USE qs_environment_types,            ONLY: get_qs_env,&
00054                                              qs_environment_type
00055   USE qs_interactions,                 ONLY: exp_radius_very_extended
00056   USE qs_neighbor_list_types,          ONLY: get_iterator_info,&
00057                                              neighbor_list_iterate,&
00058                                              neighbor_list_iterator_create,&
00059                                              neighbor_list_iterator_p_type,&
00060                                              neighbor_list_iterator_release,&
00061                                              neighbor_list_set_p_type
00062   USE realspace_grid_types,            ONLY: realspace_grid_desc_p_type,&
00063                                              realspace_grid_desc_type,&
00064                                              realspace_grid_p_type,&
00065                                              rs_grid_create,&
00066                                              rs_grid_locate_rank,&
00067                                              rs_grid_release,&
00068                                              rs_grid_reorder_ranks
00069   USE task_list_types,                 ONLY: task_list_type
00070   USE termination,                     ONLY: stop_memory,&
00071                                              stop_program
00072   USE timings,                         ONLY: timeset,&
00073                                              timestop
00074   USE util,                            ONLY: sort
00075 #include "cp_common_uses.h"
00076 
00077   IMPLICIT NONE
00078 
00079   PRIVATE
00080 
00081   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'task_list_methods'
00082 
00083   PUBLIC :: generate_qs_task_list,&
00084             task_list_inner_loop
00085   PUBLIC :: distribute_tasks,&
00086             pair2int,&
00087             int2pair,&
00088             rs_distribute_matrix
00089 
00090 CONTAINS
00091 
00092 ! *****************************************************************************
00105   SUBROUTINE generate_qs_task_list(qs_env, task_list,&
00106                                    reorder_rs_grid_ranks, skip_load_balance_distributed, &
00107                                    soft_valid, basis_set_id, pw_env_external, sab_orb_external, error)
00108 
00109 
00110     TYPE(qs_environment_type), POINTER       :: qs_env
00111     TYPE(task_list_type), POINTER            :: task_list
00112     LOGICAL, INTENT(IN)                      :: reorder_rs_grid_ranks, 
00113                                                 skip_load_balance_distributed
00114     LOGICAL, INTENT(IN), OPTIONAL            :: soft_valid
00115     INTEGER, INTENT(IN), OPTIONAL            :: basis_set_id
00116     TYPE(pw_env_type), OPTIONAL, POINTER     :: pw_env_external
00117     TYPE(neighbor_list_set_p_type), 
00118       DIMENSION(:), OPTIONAL, POINTER        :: sab_orb_external
00119     TYPE(cp_error_type), INTENT(inout)       :: error
00120 
00121     CHARACTER(LEN=*), PARAMETER :: routineN = 'generate_qs_task_list', 
00122       routineP = moduleN//':'//routineN
00123     INTEGER, PARAMETER                       :: add_tasks = 1000, 
00124                                                 max_tasks = 2000
00125     REAL(kind=dp), PARAMETER                 :: mult_tasks = 2.0_dp
00126 
00127     INTEGER :: curr_tasks, handle, i, iatom, iatom_old, igrid_level, 
00128       igrid_level_old, ikind, inode, ipair, ipgf, iset, jatom, jatom_old, 
00129       jkind, jpgf, jset, maxpgf, maxset, my_basis_set_id, natoms, nkind, 
00130       nseta, nsetb, stat
00131     INTEGER, DIMENSION(:), POINTER           :: la_max, la_min, lb_max, 
00132                                                 lb_min, npgfa, npgfb
00133     LOGICAL                                  :: failure, my_soft
00134     REAL(KIND=dp)                            :: kind_radius_a, kind_radius_b
00135     REAL(KIND=dp), DIMENSION(3)              :: ra, rab
00136     REAL(KIND=dp), DIMENSION(:), POINTER     :: set_radius_a, set_radius_b
00137     REAL(KIND=dp), DIMENSION(:, :), POINTER  :: rpgfa, rpgfb, zeta, zetb
00138     TYPE(atomic_kind_type), DIMENSION(:), 
00139       POINTER                                :: atomic_kind_set
00140     TYPE(atomic_kind_type), POINTER          :: atomic_kind
00141     TYPE(cell_type), POINTER                 :: cell
00142     TYPE(cube_info_type), DIMENSION(:), 
00143       POINTER                                :: cube_info
00144     TYPE(dft_control_type), POINTER          :: dft_control
00145     TYPE(gridlevel_info_type), POINTER       :: gridlevel_info
00146     TYPE(gto_basis_set_p_type), 
00147       DIMENSION(:), POINTER                  :: basis_set_list
00148     TYPE(gto_basis_set_type), POINTER        :: basis_set_a, basis_set_b, 
00149                                                 orb_basis_set
00150     TYPE(neighbor_list_iterator_p_type), 
00151       DIMENSION(:), POINTER                  :: nl_iterator
00152     TYPE(neighbor_list_set_p_type), 
00153       DIMENSION(:), POINTER                  :: sab_orb
00154     TYPE(particle_type), DIMENSION(:), 
00155       POINTER                                :: particle_set
00156     TYPE(pw_env_type), POINTER               :: pw_env
00157     TYPE(realspace_grid_desc_p_type), 
00158       DIMENSION(:), POINTER                  :: rs_descs
00159     TYPE(realspace_grid_p_type), 
00160       DIMENSION(:), POINTER                  :: rs_grids
00161 
00162     CALL timeset(routineN,handle)
00163 
00164     failure=.FALSE.
00165 
00166     ! by default, the full density is calculated
00167     my_soft=.FALSE.
00168     IF (PRESENT(soft_valid)) my_soft = soft_valid
00169 
00170     IF( PRESENT(basis_set_id)) THEN
00171       my_basis_set_id = basis_set_id
00172     ELSE
00173       my_basis_set_id = use_orb_basis_set
00174     END IF
00175 
00176     SELECT CASE (my_basis_set_id)
00177     CASE (use_orb_basis_set)
00178       CALL get_qs_env(qs_env=qs_env,&
00179            atomic_kind_set=atomic_kind_set,&
00180            cell=cell,&
00181            particle_set=particle_set,&
00182            dft_control=dft_control,&
00183            sab_orb=sab_orb,&
00184            pw_env=pw_env,error=error)
00185     CASE (use_aux_fit_basis_set)
00186       CALL get_qs_env(qs_env=qs_env,&
00187            atomic_kind_set=atomic_kind_set,&
00188            cell=cell,&
00189            particle_set=particle_set,&
00190            dft_control=dft_control,&
00191            sab_aux_fit=sab_orb,&
00192            pw_env=pw_env,error=error)
00193     END SELECT
00194 
00195     ! maybe just don't pas qs_env instead ??
00196     IF (PRESENT(pw_env_external)) pw_env=>pw_env_external
00197     IF (PRESENT(sab_orb_external)) sab_orb=>sab_orb_external
00198 
00199     CALL pw_env_get(pw_env, rs_descs=rs_descs, rs_grids=rs_grids, error=error)
00200 
00201     ! *** assign from pw_env
00202     gridlevel_info=>pw_env%gridlevel_info
00203     cube_info=>pw_env%cube_info
00204 
00205     ! find maximum numbers
00206     nkind = SIZE(atomic_kind_set)
00207     natoms = SIZE( particle_set )
00208     maxset=0
00209     maxpgf=0
00210     DO ikind=1,nkind
00211        atomic_kind => atomic_kind_set(ikind)
00212        SELECT CASE (my_basis_set_id)
00213        CASE (use_orb_basis_set)
00214          CALL get_atomic_kind(atomic_kind=atomic_kind,&
00215               softb = my_soft, &
00216               orb_basis_set=orb_basis_set)
00217        CASE (use_aux_fit_basis_set)
00218           CALL get_atomic_kind(atomic_kind=atomic_kind,&
00219               softb = my_soft, &
00220               aux_fit_basis_set=orb_basis_set,&
00221               basis_set_id=my_basis_set_id)
00222        END SELECT
00223 
00224        IF (.NOT.ASSOCIATED(orb_basis_set)) CYCLE
00225        CALL get_gto_basis_set(gto_basis_set=orb_basis_set,&
00226             npgf=npgfa, nset=nseta )
00227 
00228        maxset=MAX(nseta,maxset)
00229        maxpgf=MAX(MAXVAL(npgfa),maxpgf)
00230     END DO
00231 
00232     ! free the atom_pair lists if allocated
00233     IF (ASSOCIATED(task_list%atom_pair_send)) DEALLOCATE (task_list%atom_pair_send)
00234     IF (ASSOCIATED(task_list%atom_pair_recv)) DEALLOCATE (task_list%atom_pair_recv)
00235 
00236     ! construct a list of all tasks
00237     IF (.NOT.ASSOCIATED(task_list%tasks)) THEN
00238        CALL reallocate(task_list%tasks,1,6,1,max_tasks)
00239        CALL reallocate(task_list%dist_ab,1,3,1,max_tasks)
00240     ENDIF
00241     task_list%ntasks = 0
00242     curr_tasks = SIZE(task_list%tasks,2)
00243 
00244     ALLOCATE (basis_set_list(nkind))
00245     DO ikind=1,nkind
00246       atomic_kind => atomic_kind_set(ikind)
00247       SELECT CASE (my_basis_set_id)
00248       CASE (use_orb_basis_set)
00249         CALL get_atomic_kind(atomic_kind=atomic_kind,softb=my_soft,orb_basis_set=basis_set_a)
00250       CASE (use_aux_fit_basis_set)
00251         CALL get_atomic_kind(atomic_kind=atomic_kind,softb=my_soft,&
00252              aux_fit_basis_set=basis_set_a,basis_set_id=my_basis_set_id)
00253       END SELECT
00254       IF (ASSOCIATED(basis_set_a)) THEN
00255         basis_set_list(ikind)%gto_basis_set => basis_set_a
00256       ELSE
00257         NULLIFY(basis_set_list(ikind)%gto_basis_set)
00258       END IF
00259     END DO
00260     CALL neighbor_list_iterator_create(nl_iterator,sab_orb)
00261     DO WHILE (neighbor_list_iterate(nl_iterator)==0)
00262        CALL get_iterator_info(nl_iterator,ikind=ikind,jkind=jkind,inode=inode,&
00263                               iatom=iatom,jatom=jatom,r=rab)
00264        basis_set_a => basis_set_list(ikind)%gto_basis_set
00265        IF (.NOT.ASSOCIATED(basis_set_a)) CYCLE
00266        basis_set_b => basis_set_list(jkind)%gto_basis_set
00267        IF (.NOT.ASSOCIATED(basis_set_b)) CYCLE
00268        ra(:) = pbc(particle_set(iatom)%r,cell)
00269        ! basis ikind
00270        la_max       =>  basis_set_a%lmax
00271        la_min       =>  basis_set_a%lmin
00272        npgfa        =>  basis_set_a%npgf
00273        nseta        =   basis_set_a%nset
00274        rpgfa        =>  basis_set_a%pgf_radius
00275        set_radius_a =>  basis_set_a%set_radius
00276        kind_radius_a=   basis_set_a%kind_radius
00277        zeta         =>  basis_set_a%zet
00278        ! basis jkind
00279        lb_max       =>  basis_set_b%lmax
00280        lb_min       =>  basis_set_b%lmin
00281        npgfb        =>  basis_set_b%npgf
00282        nsetb        =   basis_set_b%nset
00283        rpgfb        =>  basis_set_b%pgf_radius
00284        set_radius_b =>  basis_set_b%set_radius
00285        kind_radius_b=   basis_set_b%kind_radius
00286        zetb         =>  basis_set_b%zet
00287 
00288        CALL task_list_inner_loop(task_list%tasks, task_list%dist_ab, task_list%ntasks, curr_tasks, &
00289                 rs_descs,dft_control,cube_info,gridlevel_info,cell,&
00290                 iatom,jatom,rpgfa,rpgfb,zeta,zetb,kind_radius_b,set_radius_a,set_radius_b,ra,rab,&
00291                 la_max,la_min,lb_max,lb_min,npgfa,npgfb,maxpgf,maxset,natoms,nseta,nsetb,error)
00292 
00293     END DO
00294     CALL neighbor_list_iterator_release(nl_iterator)
00295 
00296     DEALLOCATE (basis_set_list)
00297 
00298     ! redistribute the task list so that all tasks map on the local rs grids
00299     CALL distribute_tasks (rs_descs,task_list%ntasks, natoms,&
00300             maxset,maxpgf, task_list%tasks, rval=task_list%dist_ab, atom_pair_send=task_list%atom_pair_send,&
00301             atom_pair_recv=task_list%atom_pair_recv, symmetric=.TRUE., &
00302             reorder_rs_grid_ranks=reorder_rs_grid_ranks, skip_load_balance_distributed=skip_load_balance_distributed,&
00303             error=error)
00304 
00305     ! If the rank order has changed, reallocate any of the distributed rs_grids
00306 
00307     IF (reorder_rs_grid_ranks) THEN
00308       DO i=1,gridlevel_info%ngrid_levels
00309         IF (rs_descs(i)%rs_desc%distributed) THEN
00310           CALL rs_grid_release(rs_grids(i)%rs_grid, error)
00311           NULLIFY(rs_grids(i)%rs_grid)
00312           CALL rs_grid_create(rs_grids(i)%rs_grid, rs_descs(i)%rs_desc, error)
00313         END IF
00314       END DO
00315     END IF
00316 
00317     ! Now we have the final list of tasks, setup the task_list with the
00318     ! data needed for the loops in integrate_v/calculate_rho
00319 
00320     IF (ASSOCIATED(task_list%taskstart)) DEALLOCATE (task_list%taskstart)
00321     IF (ASSOCIATED(task_list%taskstop)) DEALLOCATE (task_list%taskstop)
00322     IF (ASSOCIATED(task_list%npairs)) DEALLOCATE (task_list%npairs)
00323 
00324     ! First, count the number of unique atom pairs per grid level
00325 
00326     ALLOCATE (task_list%npairs(SIZE(rs_descs)),STAT=stat)
00327     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
00328                                     "task_list%npairs",int_size*SIZE(rs_descs))
00329 
00330     iatom_old = -1 ; jatom_old = -1 ; igrid_level_old = -1
00331     ipair = 0
00332     task_list%npairs = 0
00333 
00334     DO i=1,task_list%ntasks
00335       CALL int2pair(task_list%tasks(3,i),igrid_level,iatom,jatom,iset,jset,ipgf,jpgf,natoms,maxset,maxpgf)
00336       IF (igrid_level .NE. igrid_level_old) THEN
00337         IF (igrid_level_old .NE. -1) THEN
00338           task_list%npairs(igrid_level_old) = ipair
00339         END IF
00340         ipair = 1
00341 
00342         igrid_level_old = igrid_level
00343         iatom_old = iatom
00344         jatom_old = jatom
00345       ELSE IF (iatom .NE. iatom_old .OR. jatom .NE. jatom_old) THEN
00346         ipair = ipair + 1
00347         iatom_old = iatom
00348         jatom_old = jatom
00349       END IF
00350     END DO
00351     ! Take care of the last iteration
00352     IF (task_list%ntasks /= 0) THEN
00353       task_list%npairs(igrid_level) = ipair
00354     END IF
00355 
00356     ! Second, for each atom pair, find the indices in the task list
00357     ! of the first and last task
00358 
00359     ! Array sized for worst case
00360     ALLOCATE (task_list%taskstart(MAXVAL(task_list%npairs),SIZE(rs_descs)), STAT=stat)
00361     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
00362                                     "taskstart",int_size*MAXVAL(task_list%npairs)*SIZE(rs_descs))
00363     ALLOCATE (task_list%taskstop(MAXVAL(task_list%npairs),SIZE(rs_descs)), STAT=stat)
00364     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
00365                                     "taskstop",int_size*MAXVAL(task_list%npairs)*SIZE(rs_descs))
00366 
00367     iatom_old = -1 ; jatom_old = -1 ; igrid_level_old = -1
00368     ipair = 0
00369     task_list%taskstart = 0
00370     task_list%taskstop = 0
00371 
00372     DO i=1,task_list%ntasks
00373       CALL int2pair(task_list%tasks(3,i),igrid_level,iatom,jatom,iset,jset,ipgf,jpgf,natoms,maxset,maxpgf)
00374       IF (igrid_level .NE. igrid_level_old) THEN
00375         IF (igrid_level_old .NE. -1) THEN
00376           task_list%taskstop(ipair,igrid_level_old) = i - 1
00377         END IF
00378         ipair = 1
00379         task_list%taskstart(ipair,igrid_level) = i
00380 
00381         igrid_level_old = igrid_level
00382         iatom_old = iatom
00383         jatom_old = jatom
00384       ELSE IF (iatom .NE. iatom_old .OR. jatom .NE. jatom_old) THEN
00385         ipair = ipair + 1
00386         task_list%taskstart(ipair,igrid_level) = i
00387         task_list%taskstop(ipair-1,igrid_level) = i - 1
00388 
00389         iatom_old = iatom
00390         jatom_old = jatom
00391       END IF
00392     END DO
00393     ! Take care of the last iteration
00394     IF (task_list%ntasks /= 0) THEN
00395       task_list%taskstop(ipair,igrid_level) = task_list%ntasks
00396     END IF
00397 
00398     CALL timestop(handle)
00399 
00400   END SUBROUTINE generate_qs_task_list
00401 
00402 ! *****************************************************************************
00406   SUBROUTINE task_list_inner_loop(tasks, dist_ab, ntasks, curr_tasks, rs_descs,dft_control,cube_info,gridlevel_info,cell,&
00407                                   iatom,jatom,rpgfa,rpgfb,zeta,zetb,kind_radius_b,set_radius_a,set_radius_b,ra,rab,&
00408                                   la_max,la_min,lb_max,lb_min,npgfa,npgfb,maxpgf,maxset,natoms,nseta,nsetb,error)
00409 
00410     INTEGER(KIND=int_8), DIMENSION(:, :), 
00411       POINTER                                :: tasks
00412     REAL(KIND=dp), DIMENSION(:, :), POINTER  :: dist_ab
00413     INTEGER                                  :: ntasks, curr_tasks
00414     TYPE(realspace_grid_desc_p_type), 
00415       DIMENSION(:), POINTER                  :: rs_descs
00416     TYPE(dft_control_type), POINTER          :: dft_control
00417     TYPE(cube_info_type), DIMENSION(:), 
00418       POINTER                                :: cube_info
00419     TYPE(gridlevel_info_type), POINTER       :: gridlevel_info
00420     TYPE(cell_type), POINTER                 :: cell
00421     INTEGER                                  :: iatom, jatom
00422     REAL(KIND=dp), DIMENSION(:, :), POINTER  :: rpgfa, rpgfb, zeta, zetb
00423     REAL(KIND=dp)                            :: kind_radius_b
00424     REAL(KIND=dp), DIMENSION(:), POINTER     :: set_radius_a, set_radius_b
00425     REAL(KIND=dp), DIMENSION(3)              :: ra, rab
00426     INTEGER, DIMENSION(:), POINTER           :: la_max, la_min, lb_max, 
00427                                                 lb_min, npgfa, npgfb
00428     INTEGER                                  :: maxpgf, maxset, natoms, 
00429                                                 nseta, nsetb
00430     TYPE(cp_error_type), INTENT(inout)       :: error
00431 
00432     CHARACTER(LEN=*), PARAMETER :: routineN = 'task_list_inner_loop', 
00433       routineP = moduleN//':'//routineN
00434 
00435     INTEGER                                  :: cube_center(3), igrid_level, 
00436                                                 ipgf, iset, jpgf, jset, 
00437                                                 lb_cube(3), ub_cube(3)
00438     REAL(KIND=dp)                            :: dab, rab2, radius, zetp
00439 
00440     rab2 = rab(1)*rab(1) + rab(2)*rab(2) + rab(3)*rab(3)
00441     dab = SQRT(rab2)
00442 
00443     loop_iset: DO iset=1,nseta
00444 
00445        IF (set_radius_a(iset) + kind_radius_b < dab) CYCLE
00446 
00447        loop_jset: DO jset=1,nsetb
00448 
00449           IF (set_radius_a(iset) + set_radius_b(jset) < dab) CYCLE
00450 
00451           loop_ipgf: DO ipgf=1,npgfa(iset)
00452 
00453              IF (rpgfa(ipgf,iset) + set_radius_b(jset) < dab) CYCLE
00454 
00455              loop_jpgf: DO jpgf=1,npgfb(jset)
00456 
00457                 IF (rpgfa(ipgf,iset) + rpgfb(jpgf,jset) < dab) CYCLE
00458 
00459                 zetp = zeta(ipgf,iset) + zetb(jpgf,jset)
00460                 igrid_level = gaussian_gridlevel(gridlevel_info,zetp)
00461 
00462                 CALL compute_pgf_properties(cube_center,lb_cube,ub_cube,radius,&
00463                            rs_descs(igrid_level)%rs_desc,cube_info(igrid_level),&
00464                            la_max(iset),zeta(ipgf,iset),la_min(iset),&
00465                            lb_max(jset),zetb(jpgf,jset),lb_min(jset),&
00466                            ra,rab,rab2,dft_control%qs_control%eps_rho_rspace)
00467 
00468                 CALL pgf_to_tasks(tasks,dist_ab,ntasks,curr_tasks,&
00469                           rab,iatom,jatom,iset,jset,ipgf,jpgf,natoms,maxset,maxpgf,la_max(iset),lb_max(jset),&
00470                           rs_descs(igrid_level)%rs_desc,igrid_level,gridlevel_info%ngrid_levels,cube_center,&
00471                           lb_cube,ub_cube,error)
00472 
00473              END DO loop_jpgf
00474 
00475           END DO loop_ipgf
00476 
00477        END DO loop_jset
00478 
00479     END DO loop_iset
00480 
00481   END SUBROUTINE task_list_inner_loop
00482 
00483 ! *****************************************************************************
00501   SUBROUTINE compute_pgf_properties(cube_center,lb_cube,ub_cube,radius, &
00502                rs_desc,cube_info,la_max,zeta,la_min,lb_max,zetb,lb_min,ra,rab,rab2,eps)
00503 
00504     INTEGER, DIMENSION(3), INTENT(OUT)       :: cube_center, lb_cube, ub_cube
00505     REAL(KIND=dp), INTENT(OUT)               :: radius
00506     TYPE(realspace_grid_desc_type), POINTER  :: rs_desc
00507     TYPE(cube_info_type), INTENT(IN)         :: cube_info
00508     INTEGER, INTENT(IN)                      :: la_max
00509     REAL(KIND=dp), INTENT(IN)                :: zeta
00510     INTEGER, INTENT(IN)                      :: la_min, lb_max
00511     REAL(KIND=dp), INTENT(IN)                :: zetb
00512     INTEGER, INTENT(IN)                      :: lb_min
00513     REAL(KIND=dp), INTENT(IN)                :: ra(3), rab(3), rab2, eps
00514 
00515     INTEGER, DIMENSION(:), POINTER           :: sphere_bounds
00516     REAL(KIND=dp)                            :: cutoff, extent(3), f, 
00517                                                 prefactor, rb(3), zetp
00518     REAL(KIND=dp), DIMENSION(3)              :: rp
00519 
00520 ! the radius for this task
00521 
00522     zetp = zeta + zetb
00523     rp(:) = ra(:) + zetb/zetp*rab(:)
00524     rb(:) = ra(:)+rab(:)
00525     cutoff    = 1.0_dp
00526     f         = zetb/zetp
00527     prefactor = EXP(-zeta*f*rab2)
00528     radius=exp_radius_very_extended(la_min,la_max,lb_min,lb_max,ra=ra,rb=rb,rp=rp,&
00529                zetp=zetp,eps=eps, prefactor=prefactor,cutoff=cutoff)
00530 
00531     CALL compute_cube_center(cube_center,rs_desc,zeta,zetb,ra,rab)
00532     ! compute cube_center, the center of the gaussian product to map (folded to within the unit cell)
00533     cube_center(:) = MODULO ( cube_center(:), rs_desc%npts(:) )
00534     cube_center(:) = cube_center(:) + rs_desc%lb(:)
00535 
00536     IF (rs_desc%orthorhombic ) THEN
00537        CALL return_cube(cube_info,radius,lb_cube,ub_cube,sphere_bounds)
00538     ELSE
00539        ! untested so far
00540        CALL return_cube_nonortho(cube_info,radius,lb_cube,ub_cube,rp)
00541        extent(:)  = ub_cube(:) - lb_cube(:)
00542        lb_cube(:) = -extent(:) / 2 - 1
00543        ub_cube(:) = extent(:) / 2
00544     ENDIF
00545 
00546   END SUBROUTINE compute_pgf_properties
00547 ! *****************************************************************************
00556   INTEGER FUNCTION cost_model(lb_cube,ub_cube,fraction,lmax,is_ortho)
00557     INTEGER, DIMENSION(3), INTENT(IN)       :: lb_cube, ub_cube
00558     REAL(KIND=dp), INTENT(IN)               :: fraction
00559 
00560     INTEGER :: cmax,lmax
00561     LOGICAL :: is_ortho
00562     REAL(KIND=dp) :: v1,v2,v3,v4,v5
00563 
00564     cmax=MAXVAL(((ub_cube-lb_cube)+1)/2)
00565 
00566     IF (is_ortho) THEN
00567        v1=1.504760E+00_dp
00568        v2=3.126770E+00_dp
00569        v3=5.074106E+00_dp
00570        v4=1.091568E+00_dp
00571        v5=1.070187E+00_dp
00572     ELSE
00573        v1=7.831105E+00_dp
00574        v2=2.675174E+00_dp
00575        v3=7.546553E+00_dp
00576        v4=6.122446E-01_dp
00577        v5=3.886382E+00_dp
00578     ENDIF
00579     cost_model = CEILING(((lmax+v1)*(cmax+v2)**3*v3*fraction+v4+v5*lmax**7)/1000.0_dp)
00580 
00581   END FUNCTION cost_model
00582 ! *****************************************************************************
00596   SUBROUTINE pgf_to_tasks(tasks,dist_ab,ntasks,curr_tasks,&
00597                           rab,iatom,jatom,iset,jset,ipgf,jpgf,natoms,maxset,maxpgf,la_max,lb_max,&
00598                           rs_desc,igrid_level,n_levels,cube_center,lb_cube,ub_cube,error)
00599 
00600     INTEGER(KIND=int_8), DIMENSION(:, :), 
00601       POINTER                                :: tasks
00602     REAL(KIND=dp), DIMENSION(:, :), POINTER  :: dist_ab
00603     INTEGER, INTENT(INOUT)                   :: ntasks, curr_tasks
00604     REAL(KIND=dp), DIMENSION(3), INTENT(IN)  :: rab
00605     INTEGER, INTENT(IN)                      :: iatom, jatom, iset, jset, 
00606                                                 ipgf, jpgf, natoms, maxset, 
00607                                                 maxpgf, la_max, lb_max
00608     TYPE(realspace_grid_desc_type), POINTER  :: rs_desc
00609     INTEGER, INTENT(IN)                      :: igrid_level, n_levels
00610     INTEGER, DIMENSION(3), INTENT(IN)        :: cube_center, lb_cube, ub_cube
00611     TYPE(cp_error_type), INTENT(inout)       :: error
00612 
00613     INTEGER, PARAMETER                       :: add_tasks = 1000
00614     REAL(kind=dp), PARAMETER                 :: mult_tasks = 2.0_dp
00615 
00616     INTEGER                                  :: added_tasks, cost, j, lmax
00617     LOGICAL                                  :: is_ortho
00618     REAL(KIND=dp)                            :: fraction
00619 
00620     ntasks = ntasks + 1
00621     IF ( ntasks > curr_tasks ) THEN
00622        curr_tasks = (curr_tasks+add_tasks)*mult_tasks
00623        CALL reallocate(tasks,1,6,1,curr_tasks)
00624     END IF
00625 
00626     IF(rs_desc%distributed) THEN
00627 
00628        ! finds the node(s) that need to process this task
00629        ! on exit tasks(4,:) is 1 for distributed tasks and 2 for generalised tasks
00630        CALL rs_find_node(rs_desc,igrid_level,n_levels,cube_center, &
00631             ntasks=ntasks,tasks=tasks,lb_cube=lb_cube,ub_cube=ub_cube,added_tasks=added_tasks,error=error)
00632 
00633     ELSE
00634        tasks(1,ntasks) = encode_rank(rs_desc % my_pos, igrid_level, n_levels)
00635        tasks(4,ntasks) = 0
00636        tasks(6,ntasks) = 0
00637        added_tasks = 1
00638     ENDIF
00639 
00640     IF (SIZE(dist_ab,2).NE.SIZE(tasks,2)) THEN
00641        CALL reallocate(dist_ab,1,3,1,SIZE(tasks,2))
00642     ENDIF
00643 
00644     lmax=la_max+lb_max
00645     is_ortho=(tasks(4,ntasks)==0 .OR. tasks(4,ntasks)==1) .AND. rs_desc%orthorhombic
00646     ! we assume the load is shared equally between processes dealing with a generalised Gaussian.
00647     ! this could be refined in the future
00648     fraction=1.0_dp/added_tasks
00649 
00650     cost=cost_model(lb_cube,ub_cube,fraction,lmax,is_ortho)
00651 
00652     DO j= 1, added_tasks
00653 
00654        tasks (2,ntasks-added_tasks+j) = encode_rank(rs_desc % my_pos, igrid_level, n_levels)
00655        tasks (5,ntasks-added_tasks+j) = cost
00656 
00657        !encode the atom pairs and basis info as a single long integer
00658        CALL pair2int(tasks(3,ntasks-added_tasks+j),igrid_level,&
00659             iatom,jatom,iset,jset,ipgf,jpgf,natoms,maxset,maxpgf)
00660 
00661        dist_ab (1,ntasks-added_tasks+j) = rab(1)
00662        dist_ab (2,ntasks-added_tasks+j) = rab(2)
00663        dist_ab (3,ntasks-added_tasks+j) = rab(3)
00664 
00665     ENDDO
00666 
00667   END SUBROUTINE pgf_to_tasks
00668 
00669 ! *****************************************************************************
00680   SUBROUTINE pair2int(res,ilevel,iatom,jatom,iset,jset,ipgf,jpgf,natom,maxset,maxpgf)
00681     INTEGER(KIND=int_8), INTENT(OUT)         :: res
00682     INTEGER, INTENT(IN)                      :: ilevel, iatom, jatom, iset, 
00683                                                 jset, ipgf, jpgf, natom, 
00684                                                 maxset, maxpgf
00685 
00686     INTEGER(KIND=int_8)                      :: maxpgf8, maxset8, natom8
00687 
00688     natom8=natom ; maxset8=maxset ; maxpgf8=maxpgf
00689     !
00690     ! this encoding yields the right order of the tasks for collocation after the sort
00691     ! in distribute_tasks. E.g. for a atom pair, all sets and pgfs are computed in one go.
00692     ! The exception is the gridlevel. Tasks are first ordered wrt to grid_level. This implies
00693     ! that a given density matrix block will be decontracted several times, but cache effects on the
00694     ! grid make up for this.
00695     !
00696     res=ilevel*natom8**2*maxset8**2*maxpgf8**2+&
00697         ((iatom-1)*natom8+jatom-1)*maxset8**2*maxpgf8**2 + &
00698         ((iset-1)*maxset8+jset-1)*maxpgf8**2+ &
00699          (ipgf-1)*maxpgf8+jpgf-1
00700 
00701   END SUBROUTINE pair2int
00702 
00703 ! *****************************************************************************
00704   SUBROUTINE int2pair(res,ilevel,iatom,jatom,iset,jset,ipgf,jpgf,natom,maxset,maxpgf)
00705     INTEGER(KIND=int_8), INTENT(IN)          :: res
00706     INTEGER, INTENT(OUT)                     :: ilevel, iatom, jatom, iset, 
00707                                                 jset, ipgf, jpgf
00708     INTEGER, INTENT(IN)                      :: natom, maxset, maxpgf
00709 
00710     INTEGER(KIND=int_8)                      :: iatom8, ijatom, ijpgf, ijset, 
00711                                                 ipgf8, iset8, jatom8, jpgf8, 
00712                                                 jset8, maxpgf8, maxset8, 
00713                                                 natom8, tmp
00714 
00715     natom8=natom ; maxset8=maxset ; maxpgf8=maxpgf
00716 
00717     ilevel=res/(natom8**2*maxset8**2*maxpgf8**2)
00718     tmp=MOD(res,natom8**2*maxset8**2*maxpgf8**2)
00719     ijatom=tmp/(maxpgf8**2*maxset8**2)
00720     iatom8=ijatom/natom8+1
00721     jatom8=MOD(ijatom,natom8)+1
00722     tmp=MOD(tmp,maxpgf8**2*maxset8**2)
00723     ijset=tmp/maxpgf8**2
00724     iset8=ijset/maxset8+1
00725     jset8=MOD(ijset,maxset8)+1
00726     ijpgf=MOD(tmp,maxpgf8**2)
00727     ipgf8=ijpgf/maxpgf8+1
00728     jpgf8=MOD(ijpgf,maxpgf8)+1
00729 
00730     iatom=iatom8 ; jatom=jatom8; iset=iset8 ; jset=jset8
00731     ipgf=ipgf8 ; jpgf=jpgf8
00732 
00733   END SUBROUTINE int2pair
00734 
00735 ! *****************************************************************************
00740   SUBROUTINE load_balance_distributed (tasks, ntasks, rs_descs, grid_level, natoms, maxset,maxpgf, error)
00741 
00742     INTEGER(KIND=int_8), DIMENSION(:, :), 
00743       POINTER                                :: tasks
00744     INTEGER                                  :: ntasks
00745     TYPE(realspace_grid_desc_p_type), 
00746       DIMENSION(:), POINTER                  :: rs_descs
00747     INTEGER                                  :: grid_level, natoms, maxset, 
00748                                                 maxpgf
00749     TYPE(cp_error_type), INTENT(inout)       :: error
00750 
00751     CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_distributed', 
00752       routineP = moduleN//':'//routineN
00753 
00754     INTEGER                                  :: handle, stat
00755     INTEGER, DIMENSION(:, :, :), POINTER     :: list
00756     LOGICAL                                  :: failure
00757 
00758     CALL timeset(routineN,handle)
00759 
00760     failure=.FALSE.
00761     NULLIFY(list)
00762     ! here we create for each cpu (0:ncpu-1) a list of possible destinations.
00763     ! if a destination would not be in this list, it is a bug
00764     CALL create_destination_list(list,rs_descs,grid_level,error)
00765 
00766     ! now, walk over the tasks, filling in the loads of each destination
00767     CALL compute_load_list(list,rs_descs,grid_level,tasks,ntasks,natoms, maxset,maxpgf, create_list=.TRUE., error=error)
00768 
00769     ! optimize loads & fluxes
00770     CALL optimize_load_list(list,rs_descs(1)%rs_desc%group,rs_descs(1)%rs_desc%my_pos, error)
00771 
00772     ! now, walk over the tasks, using the list to set the destinations
00773     CALL compute_load_list(list,rs_descs,grid_level, tasks,ntasks,natoms, maxset,maxpgf, create_list=.FALSE., error=error)
00774 
00775     DEALLOCATE (list,STAT=stat)
00776     CPPrecondition(stat==0,cp_failure_level,routineP,error,failure)
00777 
00778     CALL timestop(handle)
00779 
00780   END SUBROUTINE load_balance_distributed
00781 
00782 
00783 ! *****************************************************************************
00789   SUBROUTINE balance_global_list(list_global)
00790     INTEGER, DIMENSION(:, :, 0:)             :: list_global
00791 
00792     CHARACTER(LEN=*), PARAMETER :: routineN = 'balance_global_list', 
00793       routineP = moduleN//':'//routineN
00794     INTEGER, PARAMETER                       :: Max_Iter = 100
00795     REAL(KIND=dp), PARAMETER                 :: Tolerance_factor = 0.005_dp
00796 
00797     INTEGER                                  :: dest, handle, icpu, idest, 
00798                                                 iflux, ilocal, k, maxdest, 
00799                                                 Ncpu, Nflux
00800     INTEGER, ALLOCATABLE, DIMENSION(:, :)    :: flux_connections
00801     LOGICAL                                  :: solution_ok, solution_optimal
00802     REAL(KIND=dp)                            :: average, load_shift, 
00803                                                 max_load_shift, tolerance
00804     REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: load, optimized_flux, 
00805                                                 optimized_load
00806     REAL(KIND=dp), ALLOCATABLE, 
00807       DIMENSION(:, :)                        :: flux_limits
00808 
00809     CALL timeset(routineN,handle)
00810 
00811      Ncpu=SIZE(list_global,3)
00812      maxdest=SIZE(list_global,2)
00813      ALLOCATE (load(0:Ncpu-1))
00814      load=0.0_dp
00815      ALLOCATE (optimized_load(0:Ncpu-1))
00816 
00817 
00818      ! figure out the number of fluxes
00819      ! we assume that the global_list is symmetric
00820      Nflux=0
00821      DO icpu=0,ncpu-1
00822         DO idest=1,maxdest
00823            dest=list_global(1,idest,icpu)
00824            IF (dest<ncpu .AND. dest>icpu) Nflux=Nflux+1
00825         ENDDO
00826      ENDDO
00827      ALLOCATE (optimized_flux(Nflux))
00828      ALLOCATE (flux_limits(2,Nflux))
00829      ALLOCATE (flux_connections(2,Nflux))
00830 
00831      ! reorder data
00832      flux_limits=0
00833      Nflux=0
00834      DO icpu=0,ncpu-1
00835         load(icpu)=SUM(list_global(2,:,icpu))
00836         DO idest=1,maxdest
00837            dest=list_global(1,idest,icpu)
00838            IF (dest<ncpu) THEN
00839               IF (dest.NE.icpu) THEN
00840                  IF (dest>icpu) THEN
00841                     Nflux=Nflux+1
00842                     flux_limits(2,Nflux)=list_global(2,idest,icpu)
00843                     flux_connections(1,Nflux)=icpu
00844                     flux_connections(2,Nflux)=dest
00845                  ELSE
00846                     DO iflux=1,Nflux
00847                        IF (flux_connections(1,iflux)==dest .AND. flux_connections(2,iflux)==icpu) THEN
00848                           flux_limits(1,iflux)=-list_global(2,idest,icpu)
00849                           EXIT
00850                        ENDIF
00851                     ENDDO
00852                  ENDIF
00853               ENDIF
00854            ENDIF
00855         ENDDO
00856      ENDDO
00857 
00858      solution_ok=.TRUE.
00859      solution_optimal=.FALSE.
00860      optimized_flux=0.0_dp
00861 
00862      ! an iterative solver, if iterated till convergence the maximum load is minimal
00863      ! we terminate before things are fully converged, since this does show up in the timings
00864      ! once the largest shift becomes less than a small fraction of the average load, we're done
00865      ! we're perfectly happy if the load balance is within 1 percent or so
00866      ! the maximum load normally converges even faster
00867      average=SUM(load)/SIZE(load)
00868      tolerance=Tolerance_factor*average
00869 
00870      optimized_load=load
00871      DO k=1,Max_iter
00872        max_load_shift=0.0_dp
00873        DO iflux=1,Nflux
00874           load_shift=(optimized_load(flux_connections(1,iflux))-optimized_load(flux_connections(2,iflux)))/2
00875           load_shift=MAX(flux_limits(1,iflux)-optimized_flux(iflux),load_shift)
00876           load_shift=MIN(flux_limits(2,iflux)-optimized_flux(iflux),load_shift)
00877           max_load_shift=MAX(ABS(load_shift),max_load_shift)
00878           optimized_load(flux_connections(1,iflux))=optimized_load(flux_connections(1,iflux))-load_shift
00879           optimized_load(flux_connections(2,iflux))=optimized_load(flux_connections(2,iflux))+load_shift
00880           optimized_flux(iflux)=optimized_flux(iflux)+load_shift
00881        ENDDO
00882        IF (max_load_shift<tolerance) THEN
00883           solution_optimal=.TRUE.
00884           EXIT
00885        ENDIF
00886      ENDDO
00887 
00888      ! now adjust the load list to reflect the optimized fluxes
00889      ! reorder data
00890      Nflux=0
00891      DO icpu=0,ncpu-1
00892         DO idest=1,maxdest
00893            IF (list_global(1,idest,icpu)==icpu) ilocal=idest
00894         ENDDO
00895         DO idest=1,maxdest
00896            dest=list_global(1,idest,icpu)
00897            IF (dest<ncpu) THEN
00898               IF (dest.NE.icpu) THEN
00899                  IF (dest>icpu) THEN
00900                     Nflux=Nflux+1
00901                     IF (optimized_flux(Nflux)>0) THEN
00902                        list_global(2,ilocal,icpu)=list_global(2,ilocal,icpu)+&
00903                                                   list_global(2,idest,icpu)-optimized_flux(Nflux)
00904                        list_global(2,idest,icpu)=optimized_flux(Nflux)
00905                     ELSE
00906                        list_global(2,ilocal,icpu)=list_global(2,ilocal,icpu)+&
00907                                                   list_global(2,idest,icpu)
00908                        list_global(2,idest,icpu)=0
00909                     ENDIF
00910                  ELSE
00911                     DO iflux=1,Nflux
00912                        IF (flux_connections(1,iflux)==dest .AND. flux_connections(2,iflux)==icpu) THEN
00913                           IF (optimized_flux(iflux)>0) THEN
00914                              list_global(2,ilocal,icpu)=list_global(2,ilocal,icpu)+&
00915                                                         list_global(2,idest,icpu)
00916                              list_global(2,idest,icpu)=0
00917                           ELSE
00918                              list_global(2,ilocal,icpu)=list_global(2,ilocal,icpu)+&
00919                                                         list_global(2,idest,icpu)+optimized_flux(iflux)
00920                              list_global(2,idest,icpu)=-optimized_flux(iflux)
00921                           ENDIF
00922                           EXIT
00923                        ENDIF
00924                     ENDDO
00925                  ENDIF
00926               ENDIF
00927            ENDIF
00928         ENDDO
00929      ENDDO
00930 
00931     CALL timestop(handle)
00932 
00933   END SUBROUTINE balance_global_list
00934 
00935 ! *****************************************************************************
00941   SUBROUTINE optimize_load_list(list,group,my_pos,error)
00942     INTEGER, DIMENSION(:, :, 0:)             :: list
00943     INTEGER, INTENT(IN)                      :: group, my_pos
00944     TYPE(cp_error_type), INTENT(inout)       :: error
00945 
00946     CHARACTER(LEN=*), PARAMETER :: routineN = 'optimize_load_list', 
00947       routineP = moduleN//':'//routineN
00948 
00949     INTEGER                                  :: handle, i, icpu, idest, itmp, 
00950                                                 jcpu, maxdest, ncpu, stat
00951     INTEGER, ALLOCATABLE, DIMENSION(:)       :: load_all, load_per_source
00952     INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: list_global
00953     LOGICAL                                  :: failure
00954 
00955     CALL timeset(routineN,handle)
00956 
00957     failure=.FALSE.
00958 
00959     ncpu=SIZE(list,3)
00960     maxdest=SIZE(list,2)
00961 
00962     ALLOCATE (load_all(maxdest*ncpu),STAT=stat)
00963     CPPrecondition(stat==0,cp_failure_level,routineP,error,failure)
00964     !
00965     ! XXXXX this allocation can cause an out-of-memory if the number of MPI tasks is sufficient
00966     !
00967     ALLOCATE (load_per_source(maxdest*ncpu*ncpu),STAT=stat)
00968     CPPrecondition(stat==0,cp_failure_level,routineP,error,failure)
00969     ALLOCATE (list_global(2,maxdest,ncpu),STAT=stat)
00970     CPPrecondition(stat==0,cp_failure_level,routineP,error,failure)
00971 
00972     load_all=RESHAPE(list(2,:,:),(/maxdest*ncpu/))
00973     CALL mp_gather(load_all,load_per_source,0,group)
00974 
00975     IF (my_pos==0) THEN
00976 
00977        ! collect the load of all CPUs
00978        load_all=0
00979        DO icpu=1,ncpu
00980           load_all=load_all+load_per_source(maxdest*ncpu*(icpu-1)+1:maxdest*ncpu*icpu)
00981        ENDDO
00982        list_global(1,:,:)=list(1,:,:)
00983        list_global(2,:,:)=RESHAPE(load_all,(/maxdest,ncpu/))
00984 
00985        ! optimize the fluxes
00986        CALL balance_global_list(list_global)
00987 
00988        ! figure out the amount of data a given CPU can send to a destination
00989        !
00990        i=0
00991        DO jcpu=1,ncpu
00992          DO  icpu=1,ncpu
00993            DO idest=1,maxdest
00994               i=i+1
00995               itmp=MIN(list_global(2,idest,icpu),load_per_source(i))
00996               load_per_source(i)=itmp
00997               list_global(2,idest,icpu)=list_global(2,idest,icpu)-itmp
00998            ENDDO
00999          ENDDO
01000        ENDDO
01001 
01002     ENDIF
01003 
01004     CALL mp_scatter(load_per_source,load_all,0,group)
01005     list(2,:,:)=RESHAPE(load_all,(/maxdest,ncpu/))
01006 
01007     DEALLOCATE (load_all,STAT=stat)
01008     CPPrecondition(stat==0,cp_failure_level,routineP,error,failure)
01009     DEALLOCATE (load_per_source,STAT=stat)
01010     CPPrecondition(stat==0,cp_failure_level,routineP,error,failure)
01011     DEALLOCATE (list_global,STAT=stat)
01012     CPPrecondition(stat==0,cp_failure_level,routineP,error,failure)
01013 
01014     CALL timestop(handle)
01015 
01016   END SUBROUTINE optimize_load_list
01017 ! *****************************************************************************
01027   SUBROUTINE compute_load_list(list,rs_descs,grid_level,tasks,ntasks,natoms, maxset,maxpgf, create_list, error)
01028     INTEGER, DIMENSION(:, :, 0:)             :: list
01029     TYPE(realspace_grid_desc_p_type), 
01030       DIMENSION(:), POINTER                  :: rs_descs
01031     INTEGER                                  :: grid_level
01032     INTEGER(KIND=int_8), DIMENSION(:, :), 
01033       POINTER                                :: tasks
01034     INTEGER                                  :: ntasks, natoms, maxset, maxpgf
01035     LOGICAL                                  :: create_list
01036     TYPE(cp_error_type), INTENT(inout)       :: error
01037 
01038     CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_load_list', 
01039       routineP = moduleN//':'//routineN
01040 
01041     INTEGER :: dest, handle, i, iatom, ilevel, iopt, ipair_old, ipgf, iset, 
01042       itask, itask_start, itask_stop, jatom, jpgf, jset, li, maxdest, ncpu, 
01043       ndest_pair, nopt, nshort, rank
01044     INTEGER(KIND=int_8)                      :: bit_pattern, ipair, natom8
01045     INTEGER(KIND=int_8), ALLOCATABLE, 
01046       DIMENSION(:)                           :: loads
01047     INTEGER, ALLOCATABLE, DIMENSION(:)       :: all_dests, index
01048     INTEGER, DIMENSION(6)                    :: options
01049     LOGICAL                                  :: failure
01050 
01051     CALL timeset(routineN,handle)
01052 
01053     ALLOCATE (loads(0:rs_descs(grid_level)%rs_desc%group_size-1))
01054     CALL get_current_loads(loads, rs_descs, grid_level, ntasks, natoms, maxset, maxpgf, &
01055                            tasks, use_reordered_ranks=.FALSE. , error=error)
01056 
01057     failure=.FALSE.
01058 
01059     maxdest=SIZE(list,2)
01060     ncpu=SIZE(list,3)
01061     natom8 =natoms
01062 
01063     ! first find the tasks that deal with the same atom pair
01064     itask_stop=0
01065     ipair_old=HUGE(ipair_old)
01066     ALLOCATE (all_dests(0))
01067     ALLOCATE (INDEX(0))
01068 
01069     DO
01070 
01071        ! first find the range of tasks that deal with the same atom pair
01072        itask_start=itask_stop+1
01073        itask_stop =itask_start
01074        IF(itask_stop>ntasks) EXIT
01075        CALL int2pair(tasks(3,itask_stop),ilevel,iatom,jatom,iset,jset,ipgf,jpgf,natoms,maxset,maxpgf)
01076        ipair_old=(iatom-1)*natom8+(jatom-1)
01077        DO
01078           IF(itask_stop+1>ntasks) EXIT
01079           CALL int2pair(tasks(3,itask_stop+1),ilevel,iatom,jatom,iset,jset,ipgf,jpgf,natoms,maxset,maxpgf)
01080           ipair=(iatom-1)*natom8+(jatom-1)
01081           IF (ipair==ipair_old) THEN
01082              itask_stop=itask_stop+1
01083           ELSE
01084              EXIT
01085           ENDIF
01086        ENDDO
01087        ipair=ipair_old
01088        nshort=itask_stop-itask_start+1
01089 
01090        ! find the unique list of destinations on this grid level only
01091        DEALLOCATE (all_dests)
01092        ALLOCATE (all_dests(nshort))
01093        DEALLOCATE (index)
01094        ALLOCATE (INDEX(nshort))
01095        DO i=1,nshort
01096          CALL int2pair(tasks(3,itask_start + i - 1),ilevel,iatom,jatom,iset,jset,ipgf,jpgf,natoms,maxset,maxpgf)
01097            IF (ilevel .EQ. grid_level) THEN
01098              all_dests(i)=decode_rank(tasks(1,itask_start + i - 1), SIZE(rs_descs))
01099            ELSE
01100              all_dests(i)=HUGE(all_dests(i))
01101            END IF
01102        ENDDO
01103        CALL sort(all_dests,nshort,index)
01104        ndest_pair=1
01105        DO i=2,nshort
01106           IF ((all_dests(ndest_pair).NE.all_dests(i) ) .AND. (all_dests(i) .NE. HUGE(all_dests(i)))) THEN
01107              ndest_pair=ndest_pair+1
01108              all_dests(ndest_pair)=all_dests(i)
01109           ENDIF
01110        ENDDO
01111 
01112        DO itask=itask_start,itask_stop
01113 
01114 
01115           dest=decode_rank(tasks(1,itask), SIZE(rs_descs)) ! notice that dest can be changed
01116           CALL int2pair(tasks(3,itask),ilevel,iatom,jatom,iset,jset,ipgf,jpgf,natoms,maxset,maxpgf)
01117           ! Only proceed with tasks which are on this grid level
01118           IF (ilevel .NE. grid_level) CYCLE
01119           ipair=(iatom-1)*natom8+(jatom-1)
01120 
01121           SELECT CASE(tasks(4,itask))
01122           CASE(1)
01123             bit_pattern=tasks(6,itask)
01124             nopt=0
01125             IF (BTEST(bit_pattern,0)) THEN
01126                rank=rs_grid_locate_rank(rs_descs(ilevel)%rs_desc,dest,(/-1,0,0/))
01127                IF (ANY(all_dests(1:ndest_pair).EQ.rank)) THEN
01128                   nopt=nopt+1
01129                   options(nopt)=rank
01130                ENDIF
01131             ENDIF
01132             IF (BTEST(bit_pattern,1)) THEN
01133                rank=rs_grid_locate_rank(rs_descs(ilevel)%rs_desc,dest,(/+1,0,0/))
01134                IF (ANY(all_dests(1:ndest_pair).EQ.rank)) THEN
01135                   nopt=nopt+1
01136                   options(nopt)=rank
01137                ENDIF
01138             ENDIF
01139             IF (BTEST(bit_pattern,2)) THEN
01140                rank=rs_grid_locate_rank(rs_descs(ilevel)%rs_desc,dest,(/0,-1,0/))
01141                IF (ANY(all_dests(1:ndest_pair).EQ.rank)) THEN
01142                   nopt=nopt+1
01143                   options(nopt)=rank
01144                ENDIF
01145             ENDIF
01146             IF (BTEST(bit_pattern,3)) THEN
01147                rank=rs_grid_locate_rank(rs_descs(ilevel)%rs_desc,dest,(/0,+1,0/))
01148                IF (ANY(all_dests(1:ndest_pair).EQ.rank)) THEN
01149                   nopt=nopt+1
01150                   options(nopt)=rank
01151                ENDIF
01152             ENDIF
01153             IF (BTEST(bit_pattern,4)) THEN
01154                rank=rs_grid_locate_rank(rs_descs(ilevel)%rs_desc,dest,(/0,0,-1/))
01155                IF (ANY(all_dests(1:ndest_pair).EQ.rank)) THEN
01156                   nopt=nopt+1
01157                   options(nopt)=rank
01158                ENDIF
01159             ENDIF
01160             IF (BTEST(bit_pattern,5)) THEN
01161                rank=rs_grid_locate_rank(rs_descs(ilevel)%rs_desc,dest,(/0,0,+1/))
01162                IF (ANY(all_dests(1:ndest_pair).EQ.rank)) THEN
01163                   nopt=nopt+1
01164                   options(nopt)=rank
01165                ENDIF
01166             ENDIF
01167             IF (nopt>0) THEN
01168               ! set it to the rank with the lowest load
01169               rank=options(1)
01170               DO iopt=2,nopt
01171                  IF (loads(rank)>loads(options(iopt))) rank=options(iopt)
01172               ENDDO
01173             ELSE
01174               rank=dest
01175             ENDIF
01176             li=list_index(list,rank,dest)
01177             IF (create_list) THEN
01178                list(2,li,dest)=list(2,li,dest)+tasks(5,itask)
01179             ELSE
01180                IF (list(1,li,dest)==dest) THEN
01181                    tasks(1,itask)=encode_rank(dest, ilevel, SIZE(rs_descs))
01182                ELSE
01183                  IF (list(2,li,dest)>=tasks(5,itask)) THEN
01184                      list(2,li,dest)=list(2,li,dest)-tasks(5,itask)
01185                      tasks(1,itask)=encode_rank(list(1,li,dest), ilevel, SIZE(rs_descs))
01186                  ELSE
01187                      tasks(1,itask)=encode_rank(dest, ilevel, SIZE(rs_descs))
01188                  ENDIF
01189                ENDIF
01190             ENDIF
01191           CASE(2) ! generalised
01192             li=list_index(list,dest,dest)
01193             IF (create_list) THEN
01194                list(2,li,dest)=list(2,li,dest)+tasks(5,itask)
01195             ELSE
01196                IF (list(1,li,dest)==dest) THEN
01197                    tasks(1,itask)=encode_rank(dest, ilevel, SIZE(rs_descs))
01198                ELSE
01199                  IF (list(2,li,dest)>=tasks(5,itask)) THEN
01200                      list(2,li,dest)=list(2,li,dest)-tasks(5,itask)
01201                      tasks(1,itask)=encode_rank(list(1,li,dest), ilevel, SIZE(rs_descs))
01202                  ELSE
01203                      tasks(1,itask)=encode_rank(dest, ilevel, SIZE(rs_descs))
01204                  ENDIF
01205                ENDIF
01206             ENDIF
01207           CASE DEFAULT
01208             CPPrecondition(.FALSE.,cp_failure_level,routineP,error,failure)
01209           END SELECT
01210 
01211 
01212        ENDDO
01213 
01214     ENDDO
01215 
01216     CALL timestop(handle)
01217 
01218   END SUBROUTINE compute_load_list
01219 ! *****************************************************************************
01225   INTEGER FUNCTION list_index(list,rank,dest)
01226     INTEGER, DIMENSION(:, :,0:), INTENT(IN)  :: list
01227     INTEGER, INTENT(IN)                      :: rank,dest
01228     list_index=1
01229     DO
01230       IF (list(1,list_index,dest)==rank) EXIT
01231       list_index=list_index+1
01232     ENDDO
01233   END FUNCTION list_index
01234 ! *****************************************************************************
01241   SUBROUTINE create_destination_list(list,rs_descs,grid_level,error)
01242     INTEGER, DIMENSION(:, :, :), POINTER     :: list
01243     TYPE(realspace_grid_desc_p_type), 
01244       DIMENSION(:), POINTER                  :: rs_descs
01245     INTEGER, INTENT(IN)                      :: grid_level
01246     TYPE(cp_error_type), INTENT(inout)       :: error
01247 
01248     CHARACTER(LEN=*), PARAMETER :: routineN = 'create_destination_list', 
01249       routineP = moduleN//':'//routineN
01250 
01251     INTEGER                                  :: handle, i, icpu, j, maxcount, 
01252                                                 ncpu, stat, ultimate_max
01253     INTEGER, ALLOCATABLE, DIMENSION(:)       :: index, sublist
01254     INTEGER, DIMENSION(3)                    :: coord
01255     LOGICAL                                  :: failure
01256 
01257     failure=.FALSE.
01258 
01259     CALL timeset(routineN,handle)
01260 
01261     CPPrecondition(.NOT.ASSOCIATED(list),cp_failure_level,routineP,error,failure)
01262     ncpu=rs_descs(grid_level)%rs_desc%group_size
01263     ultimate_max=7
01264 
01265     ALLOCATE (list(2,ultimate_max,0:ncpu-1),STAT=stat)
01266     CPPrecondition(stat==0,cp_failure_level,routineP,error,failure)
01267 
01268     ALLOCATE (INDEX(ultimate_max),STAT=stat)
01269     CPPrecondition(stat==0,cp_failure_level,routineP,error,failure)
01270     ALLOCATE (sublist(ultimate_max),STAT=stat)
01271     CPPrecondition(stat==0,cp_failure_level,routineP,error,failure)
01272     sublist=HUGE(sublist)
01273 
01274     maxcount=1
01275     DO icpu=0,ncpu-1
01276        sublist(1)=icpu
01277        coord=rs_descs(grid_level)%rs_desc%rank2coord(:,icpu)
01278        sublist(2)=rs_grid_locate_rank(rs_descs(grid_level)%rs_desc,icpu,(/-1,0,0/))
01279        sublist(3)=rs_grid_locate_rank(rs_descs(grid_level)%rs_desc,icpu,(/+1,0,0/))
01280        sublist(4)=rs_grid_locate_rank(rs_descs(grid_level)%rs_desc,icpu,(/0,-1,0/))
01281        sublist(5)=rs_grid_locate_rank(rs_descs(grid_level)%rs_desc,icpu,(/0,+1,0/))
01282        sublist(6)=rs_grid_locate_rank(rs_descs(grid_level)%rs_desc,icpu,(/0,0,-1/))
01283        sublist(7)=rs_grid_locate_rank(rs_descs(grid_level)%rs_desc,icpu,(/0,0,+1/))
01284        ! only retain unique values of the destination
01285        CALL sort(sublist,ultimate_max,index)
01286        j=1
01287        DO i=2,7
01288           IF(sublist(i).NE.sublist(j)) THEN
01289              j=j+1
01290              sublist(j)=sublist(i)
01291           ENDIF
01292        ENDDO
01293        maxcount=MAX(maxcount,j)
01294        sublist(j+1:ultimate_max)=HUGE(sublist)
01295        list(1,:,icpu)=sublist
01296        list(2,:,icpu)=0
01297     ENDDO
01298 
01299     CALL reallocate(list,1,2,1,maxcount,0,ncpu-1)
01300 
01301     CALL timestop(handle)
01302 
01303   END SUBROUTINE create_destination_list
01304 
01305 ! *****************************************************************************
01313   SUBROUTINE get_current_loads(loads, rs_descs, grid_level, ntasks, natom, maxset, maxpgf, tasks, use_reordered_ranks, error)
01314     INTEGER(KIND=int_8), DIMENSION(:)        :: loads
01315     TYPE(realspace_grid_desc_p_type), 
01316       DIMENSION(:), POINTER                  :: rs_descs
01317     INTEGER                                  :: grid_level, ntasks, natom, 
01318                                                 maxset, maxpgf
01319     INTEGER(KIND=int_8), DIMENSION(:, :), 
01320       POINTER                                :: tasks
01321     LOGICAL, INTENT(IN)                      :: use_reordered_ranks
01322     TYPE(cp_error_type), INTENT(inout)       :: error
01323 
01324     CHARACTER(LEN=*), PARAMETER :: routineN = 'get_current_loads', 
01325       routineP = moduleN//':'//routineN
01326 
01327     INTEGER                                  :: handle, i, iatom, ilevel, 
01328                                                 ipgf, iset, jatom, jpgf, 
01329                                                 jset, stat
01330     INTEGER(KIND=int_8)                      :: total_cost_local
01331     INTEGER(KIND=int_8), ALLOCATABLE, 
01332       DIMENSION(:)                           :: recv_buf_i, send_buf_i
01333     TYPE(realspace_grid_desc_type), POINTER  :: desc
01334 
01335     CALL timeset(routineN,handle)
01336 
01337 
01338     desc => rs_descs(grid_level)%rs_desc
01339 
01340     ! allocate local arrays
01341     ALLOCATE (send_buf_i(desc%group_size),STAT=stat)
01342     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01343                                     "send_buf_i",int_8_size*desc%group_size)
01344     ALLOCATE (recv_buf_i(desc%group_size),STAT=stat)
01345     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01346                                     "recv_buf_i",int_8_size*desc%group_size)
01347 
01348     ! communication step 1 : compute the total local cost of the tasks
01349     !                        each proc needs to know the amount of work he will receive
01350 
01351     ! send buffer now contains for each target the cost of the tasks it will receive
01352     send_buf_i=0
01353     DO i = 1, ntasks
01354        CALL int2pair(tasks(3,i),ilevel,iatom,jatom,iset,jset,ipgf,jpgf,natom,maxset,maxpgf)
01355        IF (ilevel .NE. grid_level) CYCLE
01356        IF (use_reordered_ranks) THEN
01357          send_buf_i( rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(1,i), SIZE(rs_descs))) +1 ) = &
01358          send_buf_i( rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(1,i), SIZE(rs_descs))) +1 ) &
01359                    + tasks(5,i)
01360        ELSE
01361          send_buf_i( decode_rank(tasks(1,i), SIZE(rs_descs)) +1 ) = &
01362          send_buf_i( decode_rank(tasks(1,i), SIZE(rs_descs)) +1 ) &
01363                    + tasks(5,i)
01364        END IF
01365     END DO
01366     CALL mp_alltoall(send_buf_i, recv_buf_i, 1, desc % group)
01367 
01368     ! communication step 2 : compute the global cost of the tasks
01369     total_cost_local = SUM(recv_buf_i)
01370 
01371     ! after this step, the recv buffer contains the local cost for each CPU
01372     CALL mp_allgather(total_cost_local, loads, desc % group)
01373 
01374     CALL timestop(handle)
01375 
01376   END SUBROUTINE get_current_loads
01377 ! *****************************************************************************
01385   SUBROUTINE load_balance_replicated ( rs_descs, ntasks, tasks, natoms,maxset,maxpgf, error)
01386 
01387     TYPE(realspace_grid_desc_p_type), 
01388       DIMENSION(:), POINTER                  :: rs_descs
01389     INTEGER                                  :: ntasks
01390     INTEGER(KIND=int_8), DIMENSION(:, :), 
01391       POINTER                                :: tasks
01392     INTEGER, INTENT(IN)                      :: natoms, maxset, maxpgf
01393     TYPE(cp_error_type), INTENT(inout)       :: error
01394 
01395     CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_replicated', 
01396       routineP = moduleN//':'//routineN
01397 
01398     INTEGER :: handle, i, iatom, ilevel, ipgf, iset, j, jatom, jpgf, jset, 
01399       no_overloaded, no_underloaded, proc_receiving, stat
01400     INTEGER(KIND=int_8)                      :: average_cost, cost_task_rep, 
01401                                                 count, offset, 
01402                                                 total_cost_global
01403     INTEGER(KIND=int_8), ALLOCATABLE, 
01404       DIMENSION(:)                           :: load_imbalance, loads, 
01405                                                 recv_buf_i
01406     INTEGER, ALLOCATABLE, DIMENSION(:)       :: index
01407     TYPE(realspace_grid_desc_type), POINTER  :: desc
01408 
01409     CALL timeset(routineN,handle)
01410 
01411     desc=>rs_descs(1)%rs_desc
01412 
01413     ! allocate local arrays
01414     ALLOCATE (recv_buf_i(desc%group_size),STAT=stat)
01415     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01416                                     "recv_buf_i",int_8_size*desc%group_size)
01417     ALLOCATE (loads(desc%group_size),STAT=stat)
01418     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01419                                     "loads",int_8_size*desc%group_size)
01420 
01421     recv_buf_i = 0
01422     DO i = 1, SIZE(rs_descs)
01423       CALL get_current_loads ( loads, rs_descs, i, ntasks, natoms,maxset,maxpgf, tasks, use_reordered_ranks=.TRUE. ,error=error )
01424       recv_buf_i = recv_buf_i + loads
01425     END DO
01426 
01427     total_cost_global = SUM(recv_buf_i)
01428     average_cost = total_cost_global / desc%group_size
01429 
01430 !
01431 ! compute how to redistribute the replicated tasks so that the average cost is reached
01432 !
01433 
01434     ! load imbalance measures the load of a given CPU relative to the optimal load distribution (load=average)
01435     ALLOCATE (load_imbalance(desc%group_size),STAT=stat)
01436     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01437                                     "load_imbalance",int_8_size*desc%group_size)
01438     ALLOCATE (INDEX(desc%group_size),STAT=stat)
01439     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01440                                     "index",int_size*desc%group_size)
01441 
01442     load_imbalance = recv_buf_i - average_cost
01443     no_overloaded = 0
01444     no_underloaded = 0
01445 
01446     DO i = 1, desc%group_size
01447        IF (load_imbalance(i) .GT. 0 ) no_overloaded = no_overloaded + 1
01448        IF (load_imbalance(i) .LT. 0 ) no_underloaded = no_underloaded + 1
01449     ENDDO
01450 
01451     ! sort the recv_buffer on number of tasks, gives us index which provides a
01452     ! mapping between processor ranks and how overloaded the processor
01453     CALL sort(recv_buf_i,SIZE(recv_buf_i),index)
01454 
01455     ! find out the number of replicated tasks each proc has
01456     ! but only those tasks which have not yet been assigned
01457     cost_task_rep = 0
01458     DO i = 1, ntasks
01459        IF(tasks(4,i) .EQ. 0 .AND. decode_rank(tasks(1,i),SIZE(rs_descs))==decode_rank(tasks(2,i), SIZE(rs_descs)) ) THEN
01460           cost_task_rep = cost_task_rep + tasks(5,i)
01461        ENDIF
01462     ENDDO
01463 
01464     ! now, correct the load imbalance for the overloaded CPUs
01465     ! they will send away not more than the total load of replicated tasks
01466     CALL mp_allgather(cost_task_rep,recv_buf_i,desc % group)
01467 
01468     DO i = 1, desc%group_size
01469        ! At the moment we can only offload replicated tasks
01470        IF(load_imbalance(i) .GT. 0)&
01471             load_imbalance(i) = MIN( load_imbalance(i),recv_buf_i(i))
01472     ENDDO
01473 
01474     ! simplest algorithm I can think of of is that the processor with the most
01475     ! excess tasks fills up the process needing most, then moves on to next most.
01476     ! At the moment if we've got less replicated tasks than we're overloaded then
01477     ! task balancing will be incomplete
01478 
01479     ! only need to do anything if I've excess tasks
01480     IF (load_imbalance( desc%my_pos + 1 ) .GT. 0 ) THEN
01481 
01482        count = 0 ! weighted amount of tasks offloaded
01483        offset = 0 ! no of underloaded processes already filled by other more overloaded procs
01484 
01485        ! calculate offset
01486        DO i = desc%group_size, desc%group_size-no_overloaded+1, -1
01487           IF ( INDEX(i) .EQ. desc%my_pos+1 ) THEN
01488              EXIT
01489           ELSE
01490              offset = offset + load_imbalance ( INDEX (i) )
01491           ENDIF
01492        ENDDO
01493 
01494        ! find my starting processor to send to
01495        proc_receiving = HUGE(proc_receiving)
01496        DO i = 1, no_underloaded
01497           offset = offset + load_imbalance ( INDEX (i) )
01498           IF ( offset .LE. 0 ) THEN
01499              proc_receiving = i
01500              EXIT
01501           ENDIF
01502        ENDDO
01503 
01504        ! offset now contains minus the number of tasks proc_receiving requires
01505        ! we fill this up by adjusting the destination of tasks on the replicated grid, then move to next most underloaded proc
01506        DO j = 1, ntasks
01507           IF (tasks(4,j) .EQ. 0 .AND. decode_rank(tasks(1,j),SIZE(rs_descs))==decode_rank(tasks(2,j),SIZE(rs_descs)) ) THEN
01508              !just avoid sending to non existing procs due to integer truncation in the computation of the average
01509              IF(proc_receiving .GT. no_underloaded) EXIT
01510              ! set new destination
01511              CALL int2pair(tasks(3,j),ilevel,iatom,jatom,iset,jset,ipgf,jpgf,natoms,maxset,maxpgf)
01512              tasks (1,j) = encode_rank( INDEX ( proc_receiving ) - 1, ilevel, SIZE(rs_descs))
01513              offset = offset + tasks(5,j)
01514              count = count + tasks(5,j)
01515              IF(count .GE. load_imbalance (desc%my_pos + 1 ) ) EXIT
01516              IF(offset .GT. 0 ) THEN
01517                 proc_receiving = proc_receiving + 1
01518                 !just avoid sending to non existing procs due to integer truncation in the computation of the average
01519                 IF(proc_receiving .GT. no_underloaded) EXIT
01520                 offset = load_imbalance ( INDEX ( proc_receiving ) )
01521              ENDIF
01522           ENDIF
01523        ENDDO
01524     ENDIF
01525 
01526     DEALLOCATE (index,STAT=stat)
01527     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"index")
01528     DEALLOCATE (load_imbalance,STAT=stat)
01529     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"load_imbalance")
01530 
01531     CALL timestop(handle)
01532 
01533   END SUBROUTINE load_balance_replicated
01534 
01535 ! *****************************************************************************
01542   SUBROUTINE create_local_tasks ( rs_descs, ntasks, tasks, rval, ntasks_recv, tasks_recv, rval_recv, error)
01543 
01544     TYPE(realspace_grid_desc_p_type), 
01545       DIMENSION(:), POINTER                  :: rs_descs
01546     INTEGER                                  :: ntasks
01547     INTEGER(KIND=int_8), DIMENSION(:, :), 
01548       POINTER                                :: tasks
01549     REAL(KIND=dp), DIMENSION(:, :), POINTER  :: rval
01550     INTEGER                                  :: ntasks_recv
01551     INTEGER(KIND=int_8), DIMENSION(:, :), 
01552       POINTER                                :: tasks_recv
01553     REAL(KIND=dp), DIMENSION(:, :), POINTER  :: rval_recv
01554     TYPE(cp_error_type), INTENT(inout)       :: error
01555 
01556     CHARACTER(LEN=*), PARAMETER :: routineN = 'create_local_tasks', 
01557       routineP = moduleN//':'//routineN
01558 
01559     INTEGER                                  :: handle, i, j, k, l, rank, 
01560                                                 stat, task_dim
01561     INTEGER(KIND=int_8), ALLOCATABLE, 
01562       DIMENSION(:)                           :: recv_buf_i, send_buf_i
01563     INTEGER, ALLOCATABLE, DIMENSION(:)       :: recv_disps, recv_sizes, 
01564                                                 send_disps, send_sizes
01565     REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: recv_buf_r, send_buf_r
01566     TYPE(realspace_grid_desc_type), POINTER  :: desc
01567 
01568     CALL timeset(routineN,handle)
01569 
01570     desc=>rs_descs(1)%rs_desc
01571 
01572     ! allocate local arrays
01573     ALLOCATE (send_sizes(desc%group_size),STAT=stat)
01574     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01575                                     "send_sizes",int_size*desc%group_size)
01576     ALLOCATE (recv_sizes(desc%group_size),STAT=stat)
01577     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01578                                     "recv_sizes",int_size*desc%group_size)
01579     ALLOCATE (send_disps(desc%group_size),STAT=stat)
01580     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01581                                     "send_disps",int_size*desc%group_size)
01582     ALLOCATE (recv_disps(desc%group_size),STAT=stat)
01583     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01584                                     "recv_disps",int_size*desc%group_size)
01585     ALLOCATE (send_buf_i(desc%group_size),STAT=stat)
01586     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01587                                     "send_buf_i",int_8_size*desc%group_size)
01588     ALLOCATE (recv_buf_i(desc%group_size),STAT=stat)
01589     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01590                                     "recv_buf_i",int_8_size*desc%group_size)
01591 
01592     ! fill send buffer, now counting how many tasks will be send
01593     send_buf_i=0
01594     DO i = 1, ntasks
01595        rank = rs_descs(decode_level(tasks(1,i), SIZE(rs_descs)))%rs_desc%virtual2real(decode_rank(tasks(1,i), SIZE(rs_descs)))
01596        send_buf_i( rank+1 ) = send_buf_i( rank+1 ) + 1
01597     END DO
01598 
01599     CALL mp_alltoall(send_buf_i, recv_buf_i, 1, desc % group)
01600 
01601     ! pack the tasks, and send them around
01602 
01603     task_dim = SIZE(tasks,1)
01604 
01605     send_sizes = 0
01606     send_disps = 0
01607     recv_sizes = 0
01608     recv_disps = 0
01609 
01610     send_sizes(1) = send_buf_i(1) * task_dim
01611     recv_sizes(1) = recv_buf_i(1) * task_dim
01612     DO i = 2,desc%group_size
01613        send_sizes(i) = send_buf_i(i) * task_dim
01614        recv_sizes(i) = recv_buf_i(i) * task_dim
01615        send_disps(i)=send_disps(i-1)+send_sizes(i-1)
01616        recv_disps(i)=recv_disps(i-1)+recv_sizes(i-1)
01617     ENDDO
01618 
01619     ! deallocate old send/recv buffers
01620     DEALLOCATE (send_buf_i,STAT=stat)
01621     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"send_buf_i")
01622     DEALLOCATE (recv_buf_i,STAT=stat)
01623     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"recv_buf_i")
01624 
01625     ! allocate them with new sizes
01626     ALLOCATE (send_buf_i(SUM(send_sizes)),STAT=stat)
01627     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01628                                     "send_buf_i",int_8_size*SUM(send_sizes))
01629     ALLOCATE (recv_buf_i(SUM(recv_sizes)),STAT=stat)
01630     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01631                                     "recv_buf_i",int_8_size*SUM(recv_sizes))
01632 
01633     ! do packing
01634     send_buf_i = 0
01635     send_sizes = 0
01636     DO j = 1,ntasks
01637        i = rs_descs(decode_level(tasks(1,j), SIZE(rs_descs)))%rs_desc%virtual2real(decode_rank(tasks(1,j), SIZE(rs_descs))) +1
01638        DO k=1,task_dim
01639           send_buf_i(send_disps(i)+send_sizes(i)+k)=tasks(k,j)
01640        ENDDO
01641        send_sizes(i)=send_sizes(i) + task_dim
01642     ENDDO
01643 
01644     ! do communication
01645     CALL mp_alltoall(send_buf_i, send_sizes, send_disps, recv_buf_i, recv_sizes, recv_disps, desc % group)
01646 
01647     DEALLOCATE (send_buf_i,STAT=stat)
01648     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"send_buf_i")
01649 
01650     ntasks_recv=SUM(recv_sizes)/task_dim
01651     ALLOCATE (tasks_recv(task_dim,ntasks_recv),STAT=stat)
01652     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01653                                     "tasks_recv",int_8_size*task_dim*ntasks_recv)
01654 
01655     ! do unpacking
01656     l = 0
01657     DO i = 1, desc % group_size
01658        DO j = 0,recv_sizes(i)/task_dim-1
01659           l = l + 1
01660           DO k=1,task_dim
01661              tasks_recv(k,l)=recv_buf_i(recv_disps(i)+j*task_dim+k)
01662           ENDDO
01663        ENDDO
01664     ENDDO
01665 
01666     DEALLOCATE (recv_buf_i,STAT=stat)
01667     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"recv_buf_i")
01668 
01669     ! send rvals (to be removed :-)
01670 
01671     ! take care of the new task_dim in the send_sizes
01672     send_sizes=(send_sizes/task_dim)*SIZE(rval,1)
01673     recv_sizes=(recv_sizes/task_dim)*SIZE(rval,1)
01674     send_disps=(send_disps/task_dim)*SIZE(rval,1)
01675     recv_disps=(recv_disps/task_dim)*SIZE(rval,1)
01676     task_dim=SIZE(rval,1)
01677 
01678     ALLOCATE (send_buf_r(SUM(send_sizes)),STAT=stat)
01679     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01680                                     "send_buf_r",dp_size*SUM(send_sizes))
01681     ALLOCATE (recv_buf_r(SUM(recv_sizes)),STAT=stat)
01682     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01683                                     "recv_buf_r",dp_size*SUM(recv_sizes))
01684 
01685     !do packing
01686     send_sizes = 0
01687     DO j = 1,ntasks
01688        i = rs_descs(decode_level(tasks(1,j), SIZE(rs_descs)))%rs_desc%virtual2real(decode_rank(tasks(1,j), SIZE(rs_descs))) +1
01689        DO k=1,task_dim
01690           send_buf_r(send_disps(i)+send_sizes(i)+k)=rval(k,j)
01691        ENDDO
01692        send_sizes(i)=send_sizes(i) + task_dim
01693     ENDDO
01694 
01695     ! do communication
01696     CALL mp_alltoall(send_buf_r, send_sizes, send_disps,&
01697          recv_buf_r, recv_sizes, recv_disps, desc % group)
01698 
01699     DEALLOCATE (send_buf_r,STAT=stat)
01700     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"send_buf_r")
01701     ALLOCATE (rval_recv(task_dim,SUM(recv_sizes)/task_dim),STAT=stat)
01702     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01703                                     "rval_recv",dp_size*task_dim*SUM(recv_sizes)/task_dim)
01704 
01705     ! do unpacking
01706     l = 0
01707     DO i = 1,desc % group_size
01708        DO j = 0,recv_sizes(i)/task_dim-1
01709           l = l + 1
01710           DO k=1,task_dim
01711              rval_recv(k,l)=recv_buf_r(recv_disps(i)+j*task_dim+k)
01712           ENDDO
01713        ENDDO
01714     ENDDO
01715 
01716     DEALLOCATE (recv_buf_r,STAT=stat)
01717     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"recv_buf_r")
01718     DEALLOCATE (send_sizes,STAT=stat)
01719     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"send_sizes")
01720     DEALLOCATE (recv_sizes,STAT=stat)
01721     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"recv_sizes")
01722     DEALLOCATE (send_disps,STAT=stat)
01723     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"send_disps")
01724     DEALLOCATE (recv_disps,STAT=stat)
01725     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"recv_disps")
01726 
01727     CALL timestop(handle)
01728 
01729   END SUBROUTINE create_local_tasks
01730 
01731 ! *****************************************************************************
01740   SUBROUTINE distribute_tasks ( rs_descs, ntasks, natoms,&
01741        maxset,maxpgf,  tasks, rval, atom_pair_send, atom_pair_recv,&
01742        symmetric, reorder_rs_grid_ranks, skip_load_balance_distributed, error)
01743 
01744     TYPE(realspace_grid_desc_p_type), 
01745       DIMENSION(:), POINTER                  :: rs_descs
01746     INTEGER                                  :: ntasks, natoms, maxset, maxpgf
01747     INTEGER(KIND=int_8), DIMENSION(:, :), 
01748       POINTER                                :: tasks
01749     REAL(KIND=dp), DIMENSION(:, :), POINTER  :: rval
01750     INTEGER(KIND=int_8), DIMENSION(:), 
01751       POINTER                                :: atom_pair_send, atom_pair_recv
01752     LOGICAL, INTENT(IN)                      :: symmetric, 
01753                                                 reorder_rs_grid_ranks, 
01754                                                 skip_load_balance_distributed
01755     TYPE(cp_error_type), INTENT(inout)       :: error
01756 
01757     CHARACTER(LEN=*), PARAMETER :: routineN = 'distribute_tasks', 
01758       routineP = moduleN//':'//routineN
01759 
01760     INTEGER                                  :: handle, igrid_level, irank, 
01761                                                 k, load_gap, max_load, 
01762                                                 ntasks_recv, replicated_load, 
01763                                                 stat
01764     INTEGER(KIND=int_8), ALLOCATABLE, 
01765       DIMENSION(:)                           :: taskid, total_loads, 
01766                                                 total_loads_tmp, trial_loads
01767     INTEGER(KIND=int_8), DIMENSION(:, :), 
01768       POINTER                                :: loads, tasks_recv
01769     INTEGER, ALLOCATABLE, DIMENSION(:)       :: index, real2virtual, 
01770                                                 total_index
01771     LOGICAL                                  :: distributed_grids, 
01772                                                 fixed_first_grid
01773     REAL(KIND=dp), DIMENSION(:, :), POINTER  :: rval_recv
01774     TYPE(realspace_grid_desc_type), POINTER  :: desc
01775 
01776     CALL timeset(routineN,handle)
01777 
01778     IF ( .NOT. ASSOCIATED ( tasks ) ) &
01779        CALL stop_program(routineN,moduleN,__LINE__,"Tasks not associated")
01780 
01781     ! *** figure out if we have distributed grids
01782     distributed_grids=.FALSE.
01783     DO igrid_level=1,SIZE(rs_descs)
01784        IF ( rs_descs(igrid_level)%rs_desc%distributed ) THEN
01785           distributed_grids=.TRUE.
01786        ENDIF
01787     END DO
01788     desc=>rs_descs(1)%rs_desc
01789 
01790     IF (distributed_grids) THEN
01791 
01792        ALLOCATE (loads(0:desc%group_size-1,SIZE(rs_descs)),STAT=stat)
01793        IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01794                                        "loads",int_8_size*desc%group_size*SIZE(rs_descs))
01795        ALLOCATE (total_loads(0:desc%group_size-1),STAT=stat)
01796        IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01797                                        "total_loads",int_8_size*desc%group_size)
01798        total_loads = 0
01799 
01800        ! First round of balancing on the distributed grids
01801        ! we just balance each of the distributed grids independently
01802        ! XXXX this call to load_balance_distributed can be a memory hog XXXX
01803        DO igrid_level=1,SIZE(rs_descs)
01804          IF ( rs_descs(igrid_level)%rs_desc%distributed ) THEN
01805 
01806            IF (.NOT.skip_load_balance_distributed) &
01807                CALL load_balance_distributed(tasks, ntasks, rs_descs, igrid_level, natoms, maxset,maxpgf, error)
01808 
01809            CALL get_current_loads( loads(:,igrid_level), rs_descs, igrid_level, ntasks, natoms, maxset,maxpgf, &
01810                                    tasks, use_reordered_ranks=.FALSE. , error=error)
01811            total_loads=total_loads+loads(:,igrid_level)
01812 
01813          END IF
01814        END DO
01815 
01816        ! calculate the total load of replicated tasks, so we can decide if it is worth
01817        ! reordering the distributed grid levels
01818 
01819        replicated_load=0
01820        DO igrid_level=1,SIZE(rs_descs)
01821         IF (.NOT. rs_descs(igrid_level)%rs_desc%distributed) THEN
01822           CALL get_current_loads( loads(:,igrid_level), rs_descs, igrid_level, ntasks, natoms, maxset,maxpgf, &
01823                                      tasks, use_reordered_ranks=.FALSE., error=error)
01824           replicated_load = replicated_load + SUM(loads(:,igrid_level))
01825         END IF
01826        END DO
01827 
01828        !IF (desc%my_pos==0) THEN
01829        ! WRITE(6,*) "Total replicated load is ",replicated_load
01830        !END IF
01831 
01832        ! Now we adjust the rank ordering based on the current loads
01833        ! we leave the first distributed level and all the replicated levels in the default order
01834        IF (reorder_rs_grid_ranks) THEN
01835          fixed_first_grid=.FALSE.
01836          DO igrid_level=1,SIZE(rs_descs)
01837            IF (rs_descs(igrid_level)%rs_desc%distributed ) THEN
01838              IF (fixed_first_grid .EQV. .FALSE.) THEN
01839                total_loads = loads(:,igrid_level)
01840                fixed_first_grid = .TRUE.
01841              ELSE
01842                ALLOCATE (trial_loads(0:desc%group_size-1),STAT=stat)
01843                IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01844                                                "trial_loads",int_8_size*desc%group_size)
01845 
01846                trial_loads = total_loads + loads(:,igrid_level)
01847                max_load = MAXVAL(trial_loads)
01848                load_gap = 0
01849                DO irank=0, desc%group_size-1
01850                  load_gap = load_gap + max_load - trial_loads(irank)
01851                END DO
01852 
01853                ! If there is not enough replicated load to load balance well enough
01854                ! then we will reorder this grid level
01855                IF (load_gap > replicated_load * 1.05_dp) THEN
01856 
01857                  ALLOCATE (INDEX(0:desc%group_size-1),STAT=stat)
01858                  IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01859                                                  "index",int_size*desc%group_size)
01860                  ALLOCATE (total_index(0:desc%group_size-1),STAT=stat)
01861                  IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01862                                                  "total_index",int_size*desc%group_size)
01863                  ALLOCATE (total_loads_tmp(0:desc%group_size-1),STAT=stat)
01864                  IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01865                                                  "total_loads_tmp",int_8_size*desc%group_size)
01866                  ALLOCATE (real2virtual(0:desc%group_size-1),STAT=stat)
01867                  IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01868                                                  "real2virtual",int_size*desc%group_size)
01869 
01870                  total_loads_tmp = total_loads
01871                  CALL sort(total_loads_tmp, desc%group_size, total_index)
01872                  CALL sort(loads(:,igrid_level), desc%group_size, index)
01873 
01874                  ! Reorder so that the rank with smallest load on this grid level is paired with
01875                  ! the highest load in total
01876                  DO irank=0, desc%group_size-1
01877                    total_loads(total_index(irank)-1) = total_loads(total_index(irank)-1) + &
01878                                loads(desc%group_size - irank - 1, igrid_level)
01879                    real2virtual(total_index(irank)-1)=INDEX(desc%group_size - irank - 1) - 1
01880                  END DO
01881 
01882                  CALL rs_grid_reorder_ranks(rs_descs(igrid_level)%rs_desc, real2virtual, error)
01883 
01884                  DEALLOCATE (index,STAT=stat)
01885                  IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"index")
01886                  DEALLOCATE (total_index,STAT=stat)
01887                  IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"total_index")
01888                  DEALLOCATE (total_loads_tmp,STAT=stat)
01889                  IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"total_loads_tmp")
01890                  DEALLOCATE (real2virtual,STAT=stat)
01891                  IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"real2virtual")
01892                ELSE
01893                  total_loads = trial_loads
01894                ENDIF
01895 
01896                DEALLOCATE (trial_loads,STAT=stat)
01897                IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"trial_loads")
01898 
01899              ENDIF
01900            ENDIF
01901          END DO
01902        END IF
01903 
01904        ! Now we use the replicated tasks to balance out the rest of the load
01905        CALL load_balance_replicated(rs_descs, ntasks, tasks, natoms,maxset,maxpgf, error)
01906 
01907        !total_loads = 0
01908        !DO igrid_level=1,SIZE(rs_descs)
01909        !  CALL get_current_loads(loads(:,igrid_level), rs_descs, igrid_level, ntasks, natoms, maxset,maxpgf, &
01910        !                         tasks, use_reordered_ranks=.TRUE. , error=error)
01911        !  total_loads = total_loads + loads(:, igrid_level)
01912        !END DO
01913 
01914        !IF (desc%my_pos==0) THEN
01915        !  WRITE(6,*) ""
01916        !  WRITE(6,*) "At the end of the load balancing procedure"
01917        !  WRITE(6,*) "Maximum  load:",MAXVAL(total_loads)
01918        !  WRITE(6,*) "Average  load:",SUM(total_loads)/SIZE(total_loads)
01919        !  WRITE(6,*) "Minimum  load:",MINVAL(total_loads)
01920        !ENDIF
01921 
01922        ! given a list of tasks, this will do the needed reshuffle so that all tasks will be local
01923        CALL create_local_tasks ( rs_descs, ntasks, tasks, rval, ntasks_recv, tasks_recv, rval_recv, error)
01924 
01925        !
01926        ! tasks list are complete, we can compute the list of atomic blocks (atom pairs)
01927        ! we will be sending. These lists are needed for redistribute_matrix.
01928        !
01929        ALLOCATE (atom_pair_send(ntasks),STAT=stat)
01930        IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01931                                        "atom_pair_send",int_8_size*ntasks)
01932        CALL get_atom_pair ( atom_pair_send, tasks, send=.TRUE.,&
01933             symmetric=symmetric, natoms=natoms, maxset=maxset, maxpgf=maxpgf, rs_descs=rs_descs )
01934 
01935 
01936        ! natom_send=SIZE(atom_pair_send)
01937        ! CALL mp_sum(natom_send,desc%group)
01938        ! IF (desc%my_pos==0) THEN
01939        !     WRITE(6,*) ""
01940        !     WRITE(6,*) "Total number of atomic blocks to be send:",natom_send
01941        ! ENDIF
01942 
01943        ALLOCATE (atom_pair_recv(ntasks_recv),STAT=stat)
01944        IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01945                                        "atom_pair_recv",int_8_size*ntasks_recv)
01946        CALL get_atom_pair ( atom_pair_recv, tasks_recv, send=.FALSE.,&
01947             symmetric=symmetric, natoms=natoms, maxset=maxset, maxpgf=maxpgf, rs_descs=rs_descs )
01948 
01949        ! cleanup, at this point we  don't need the original tasks & rvals anymore
01950        DEALLOCATE (tasks,STAT=stat)
01951        IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"tasks")
01952        DEALLOCATE (rval,STAT=stat)
01953        IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"rval")
01954        DEALLOCATE (loads,STAT=stat)
01955        IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"loads")
01956        DEALLOCATE (total_loads,STAT=stat)
01957        IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"total_loads")
01958 
01959     ELSE
01960 
01961        tasks_recv =>tasks
01962        ntasks_recv=ntasks
01963        rval_recv=>rval
01964 
01965     ENDIF
01966 
01967     !
01968     ! here we sort the task list we will process locally.
01969     ! the sort is determined by pair2int
01970     !
01971     rval => rval_recv
01972 
01973     ALLOCATE (taskid(ntasks_recv),STAT=stat)
01974     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01975                                     "taskid",int_8_size*ntasks_recv)
01976     ALLOCATE (INDEX(ntasks_recv),STAT=stat)
01977     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
01978                                     "index",int_size*ntasks_recv)
01979 
01980     taskid=tasks_recv(3,1:ntasks_recv)
01981     CALL sort(taskid,SIZE(taskid),index)
01982 
01983     DO k=1,SIZE(tasks_recv,1)
01984        tasks_recv(k,1:ntasks_recv)=tasks_recv(k,index)
01985     ENDDO
01986 
01987     DO k=1,SIZE(rval,1)
01988        rval(k,1:ntasks_recv)=rval(k,index)
01989     ENDDO
01990 
01991     DEALLOCATE (taskid,STAT=stat)
01992     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"task_id")
01993     DEALLOCATE (index,STAT=stat)
01994     IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"index")
01995 
01996     !
01997     ! final lists are ready
01998     !
01999 
02000     tasks=>tasks_recv
02001     ntasks=ntasks_recv
02002 
02003     CALL timestop(handle)
02004 
02005   END SUBROUTINE distribute_tasks
02006 
02007 ! *****************************************************************************
02008   SUBROUTINE get_atom_pair ( atom_pair, my_tasks, send, symmetric, natoms, maxset, maxpgf, rs_descs)
02009 
02010     INTEGER(KIND=int_8), DIMENSION(:), 
02011       POINTER                                :: atom_pair
02012     INTEGER(KIND=int_8), DIMENSION(:, :), 
02013       POINTER                                :: my_tasks
02014     LOGICAL                                  :: send, symmetric
02015     INTEGER                                  :: natoms, maxset, maxpgf
02016     TYPE(realspace_grid_desc_p_type), 
02017       DIMENSION(:), POINTER                  :: rs_descs
02018 
02019     CHARACTER(LEN=*), PARAMETER :: routineN = 'get_atom_pair', 
02020       routineP = moduleN//':'//routineN
02021 
02022     INTEGER                                  :: acol, arow, i, iatom, ilevel, 
02023                                                 ipgf, iset, j, jatom, jpgf, 
02024                                                 jset, stat
02025     INTEGER(KIND=int_8)                      :: natom8
02026     INTEGER, DIMENSION(:), POINTER           :: index
02027 
02028 ! calculate list of atom pairs
02029 ! fill pair list taking into acount symmetry
02030 
02031   natom8=natoms
02032 
02033   atom_pair = 0
02034   DO i = 1,SIZE(atom_pair)
02035      CALL int2pair(my_tasks(3,i),ilevel,iatom,jatom,iset,jset,ipgf,jpgf,natoms,maxset,maxpgf)
02036      IF( symmetric ) THEN
02037         IF(iatom.LE.jatom) THEN
02038            arow = iatom
02039            acol = jatom
02040         ELSE
02041            arow = jatom
02042            acol = iatom
02043         ENDIF
02044      ELSE
02045         arow = iatom
02046         acol = jatom
02047      ENDIF
02048      IF ( send ) THEN
02049         ! If sending, we need to use the 'real rank' as the pair has to be sent to the process which
02050         ! actually has the correct part of the rs_grid to do the mapping
02051         atom_pair(i) = rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(my_tasks(1,i),SIZE(rs_descs))) &
02052                         *natom8*natom8+(arow-1)*natom8 + (acol-1)
02053      ELSE
02054         ! If we are recieving, then no conversion is needed as the rank is that of the process with the
02055         ! required matrix block, and the ordering of the rs grid is irrelevant
02056         atom_pair(i) = decode_rank(my_tasks(2,i),SIZE(rs_descs)) &
02057                         *natom8*natom8+(arow-1)*natom8 + (acol-1)
02058      ENDIF
02059   ENDDO
02060 
02061   ALLOCATE (INDEX(SIZE(atom_pair)),STAT=stat)
02062   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
02063                                   "index",int_size*SIZE(atom_pair))
02064 
02065   ! find unique atom pairs that I'm sending/receiving
02066   CALL sort(atom_pair,SIZE(atom_pair),index)
02067 
02068   DEALLOCATE (index,STAT=stat)
02069   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"index")
02070 
02071   IF (SIZE(atom_pair)>1) THEN
02072      j=1
02073      ! first atom pair must be allowed
02074      DO i = 2,SIZE(atom_pair)
02075         IF( atom_pair(i) .GT. atom_pair(i-1) ) THEN
02076            j = j + 1
02077            atom_pair(j) = atom_pair(i)
02078         ENDIF
02079      ENDDO
02080      ! reallocate the atom pair list
02081      CALL  reallocate(atom_pair,1,j)
02082   ENDIF
02083 
02084 END SUBROUTINE get_atom_pair
02085 
02086 ! *****************************************************************************
02091 SUBROUTINE rs_distribute_matrix( rs_descs, pmat, atom_pair_send, atom_pair_recv, &
02092      natoms, scatter, error, hmat )
02093 
02094 !$ USE omp_lib
02095 
02096     TYPE(realspace_grid_desc_p_type), 
02097       DIMENSION(:), POINTER                  :: rs_descs
02098     TYPE(cp_dbcsr_type), POINTER             :: pmat
02099     INTEGER(KIND=int_8), DIMENSION(:), 
02100       POINTER                                :: atom_pair_send, atom_pair_recv
02101     INTEGER                                  :: natoms
02102     LOGICAL                                  :: scatter
02103     TYPE(cp_error_type), INTENT(inout)       :: error
02104     TYPE(cp_dbcsr_type), OPTIONAL, POINTER   :: hmat
02105 
02106     CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_distribute_matrix', 
02107       routineP = moduleN//':'//routineN
02108 
02109     INTEGER :: acol, arow, handle, i, j, k, l, me, nblkcols_total, 
02110       nblkrows_total, ncol, nrow, nthread, nthread_left, stat
02111     INTEGER(KIND=int_8)                      :: natom8, pair
02112     INTEGER, ALLOCATABLE, DIMENSION(:) :: first_col, first_row, last_col, 
02113       last_row, recv_disps, recv_pair_count, recv_pair_disps, recv_sizes, 
02114       send_disps, send_pair_count, send_pair_disps, send_sizes
02115     LOGICAL                                  :: failure, found
02116     REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: recv_buf_r, send_buf_r
02117     REAL(KIND=dp), DIMENSION(:, :), POINTER  :: h_block, p_block
02118     TYPE(realspace_grid_desc_type), POINTER  :: desc
02119 
02120 !$  INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
02121 
02122   CALL timeset(routineN,handle)
02123 
02124   IF (.not.scatter) THEN
02125     failure = .FALSE.
02126     CPPrecondition(PRESENT(hmat),cp_failure_level,routineP,error,failure)
02127   END IF
02128 
02129   desc => rs_descs(1)%rs_desc
02130   me = desc%my_pos+1
02131 
02132   ! allocate local arrays
02133   ALLOCATE (send_sizes(desc%group_size),STAT=stat)
02134   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
02135                                   "send_sizes",int_size*desc%group_size)
02136   ALLOCATE (recv_sizes(desc%group_size),STAT=stat)
02137   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
02138                                   "recv_sizes",int_size*desc%group_size)
02139   ALLOCATE (send_disps(desc%group_size),STAT=stat)
02140   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
02141                                   "send_disps",int_size*desc%group_size)
02142   ALLOCATE (recv_disps(desc%group_size),STAT=stat)
02143   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
02144                                   "recv_disps",int_size*desc%group_size)
02145   ALLOCATE (send_pair_count(desc%group_size),STAT=stat)
02146   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
02147                                   "send_pair_count",int_size*desc%group_size)
02148   ALLOCATE (recv_pair_count(desc%group_size),STAT=stat)
02149   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
02150                                   "recv_pair_count",int_size*desc%group_size)
02151   ALLOCATE (send_pair_disps(desc%group_size),STAT=stat)
02152   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
02153                                   "send_pair_disps",int_size*desc%group_size)
02154   ALLOCATE (recv_pair_disps(desc%group_size),STAT=stat)
02155   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
02156                                   "recv_pair_disps",int_size*desc%group_size)
02157 
02158   CALL cp_dbcsr_get_info (pmat, &
02159        nblkrows_total=nblkrows_total,&
02160        nblkcols_total=nblkcols_total)
02161   ALLOCATE (first_row(nblkrows_total), last_row(nblkrows_total),&
02162             first_col(nblkcols_total), last_col(nblkcols_total))
02163   CALL convert_sizes_to_offsets (array_data (cp_dbcsr_row_block_sizes (pmat)), first_row, last_row)
02164   CALL convert_sizes_to_offsets (array_data (cp_dbcsr_col_block_sizes (pmat)), first_col, last_col)
02165   ! set up send buffer sizes
02166   natom8=natoms
02167 
02168   send_sizes = 0
02169   send_pair_count = 0
02170   DO i = 1, SIZE(atom_pair_send)
02171 
02172      ! proc we're sending this block to
02173      k = atom_pair_send(i) / natom8**2 + 1
02174 
02175      pair = MOD(atom_pair_send(i),natom8**2)
02176 
02177      arow = pair / natom8 + 1
02178      acol = MOD(pair, natom8) + 1
02179 
02180      nrow = last_row(arow) - first_row(arow) + 1
02181      ncol = last_col(acol) - first_col(acol) + 1
02182 
02183      send_sizes(k) = send_sizes(k) + nrow * ncol
02184      send_pair_count(k) = send_pair_count(k) + 1
02185 
02186   ENDDO
02187 
02188   send_disps = 0
02189   send_pair_disps = 0
02190   DO i = 2, desc % group_size
02191      send_disps(i) = send_disps(i-1) + send_sizes(i-1)
02192      send_pair_disps(i) = send_pair_disps(i-1) + send_pair_count(i-1)
02193   ENDDO
02194 
02195 
02196   ALLOCATE (send_buf_r(SUM(send_sizes)),STAT=stat)
02197   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
02198                                   "send_buf_r",dp_size*SUM(send_sizes))
02199 
02200   ! set up recv buffer
02201 
02202   recv_sizes = 0
02203   recv_pair_count = 0
02204   DO i = 1, SIZE(atom_pair_recv)
02205 
02206      ! proc we're receiving this data from
02207      k = atom_pair_recv(i) / natom8**2 + 1
02208 
02209      pair = MOD(atom_pair_recv(i),natom8**2)
02210 
02211      arow = pair / natom8 + 1
02212      acol = MOD(pair,natom8) + 1
02213 
02214      nrow = last_row(arow) - first_row(arow) + 1
02215      ncol = last_col(acol) - first_col(acol) + 1
02216 
02217      recv_sizes(k) = recv_sizes(k) + nrow * ncol
02218      recv_pair_count(k) = recv_pair_count(k) + 1
02219 
02220   ENDDO
02221 
02222   recv_disps = 0
02223   recv_pair_disps = 0
02224   DO i = 2, desc % group_size
02225      recv_disps(i) = recv_disps(i-1) + recv_sizes(i-1)
02226      recv_pair_disps(i) = recv_pair_disps(i-1) + recv_pair_count(i-1)
02227   ENDDO
02228   ALLOCATE (recv_buf_r(SUM(recv_sizes)),STAT=stat)
02229   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,&
02230                                   "recv_buf_r",dp_size*SUM(recv_sizes))
02231 
02232 !$omp parallel default(none), &
02233 !$omp          shared(desc,send_pair_count,send_pair_disps,natom8),&
02234 !$omp          shared(last_row,first_row,last_col,first_col),&
02235 !$omp          shared(pmat,send_buf_r,send_disps,send_sizes),&
02236 !$omp          shared(atom_pair_send,me,hmat,nblkrows_total),&
02237 !$omp          shared(atom_pair_recv,recv_buf_r,scatter,recv_pair_disps), &
02238 !$omp          shared(recv_sizes,recv_disps,recv_pair_count,locks), &
02239 !$omp          private(i,pair,arow,acol,nrow,ncol,p_block,found,j,k,l),&
02240 !$omp          private(nthread,h_block,error,nthread_left)
02241 
02242   nthread = 1
02243 !$ nthread = omp_get_num_threads()
02244   nthread_left = 1
02245 !$ nthread_left = MAX(1, nthread-1)
02246 
02247   ! do packing
02248 !$omp do schedule(guided)
02249   DO l =1, desc%group_size
02250    IF (l .EQ. me) CYCLE
02251     send_sizes(l)=0
02252     DO i = 1, send_pair_count(l)
02253      pair = MOD(atom_pair_send(send_pair_disps(l)+i),natom8**2)
02254 
02255      arow = pair / natom8 + 1
02256      acol = MOD(pair, natom8) + 1
02257 
02258      nrow = last_row(arow)-first_row(arow) + 1
02259      ncol = last_col(acol)-first_col(acol) + 1
02260 
02261      CALL cp_dbcsr_get_block_p(matrix=pmat,row=arow,col=acol,&
02262           block=p_block,found=found)
02263      IF ( .NOT. ASSOCIATED ( p_block ) ) THEN
02264         CALL stop_program(routineN,moduleN,__LINE__,"Matrix block not found")
02265      ENDIF
02266 
02267      DO k = 1, ncol
02268         DO j = 1, nrow
02269            send_buf_r(send_disps(l)+send_sizes(l)+j+(k-1)*nrow) = p_block(j,k)
02270         ENDDO
02271      ENDDO
02272      send_sizes(l)=send_sizes(l)+nrow*ncol
02273     ENDDO
02274   ENDDO
02275 !$omp end do
02276 
02277 !$omp master
02278   ! do communication
02279   CALL mp_alltoall(send_buf_r, send_sizes, send_disps,&
02280        recv_buf_r, recv_sizes, recv_disps, desc % group)
02281 !$omp end master
02282 
02283   ! If this is a scatter, then no need to copy local blocks,
02284   ! If not, we sum them directly into H (bypassing the alltoall)
02285   IF (.NOT. scatter) THEN
02286 
02287   ! We need locks to protect concurrent summation into H
02288 !$omp single
02289 !$  ALLOCATE (locks(nthread*10))
02290 !$omp end single
02291 
02292 !$omp do
02293 !$  do i=1,nthread*10
02294 !$    call omp_init_lock(locks(i))
02295 !$  end do
02296 !$omp end do
02297 
02298   ! Distribute work over remaining threads assuming one is still in the alltoall
02299 !$omp do schedule(dynamic,MAX(1,send_pair_count(me)/nthread_left))
02300     DO i = 1, send_pair_count(me)
02301      pair = MOD(atom_pair_send(send_pair_disps(me)+i),natom8**2)
02302 
02303      arow = pair / natom8 + 1
02304      acol = MOD(pair, natom8) + 1
02305 
02306      nrow = last_row(arow) - first_row(arow) + 1
02307      ncol = last_col(acol) - first_col(acol) + 1
02308 
02309      CALL cp_dbcsr_get_block_p(matrix=hmat,row=arow,col=acol,&
02310           BLOCK=h_block,found=found)
02311      CALL cp_dbcsr_get_block_p(matrix=pmat,row=arow,col=acol,&
02312           BLOCK=p_block,found=found)
02313 
02314 !$  call omp_set_lock(locks((arow-1)*nthread*10/nblkrows_total+1))
02315         DO k = 1, ncol
02316            DO j = 1, nrow
02317               h_block(j,k) = h_block(j,k) + p_block(j,k)
02318            ENDDO
02319         ENDDO
02320 !$  call omp_unset_lock(locks((arow-1)*nthread*10/nblkrows_total+1))
02321     ENDDO
02322 !$omp end do
02323   END IF
02324 
02325   IF (scatter) THEN
02326     ! We will insert new blocks into P, so create mutable work matrices
02327     CALL cp_dbcsr_work_create(pmat,work_mutable=.TRUE.,&
02328       nblks_guess=SIZE(atom_pair_recv)/nthread,sizedata_guess=SIZE(recv_buf_r)/nthread,&
02329       n=nthread,error=error)
02330 !$omp barrier
02331   END IF
02332 
02333   !do unpacking
02334 !$omp do schedule(guided)
02335   DO l = 1, desc%group_size
02336     IF (l .EQ. me) CYCLE
02337     recv_sizes(l) = 0
02338     DO i = 1, recv_pair_count(l)
02339      pair = MOD(atom_pair_recv(recv_pair_disps(l)+i),natom8**2)
02340 
02341      arow = pair / natom8 + 1
02342      acol = MOD(pair, natom8) + 1
02343 
02344      nrow = last_row(arow)-first_row(arow) + 1
02345      ncol = last_col(acol)-first_col(acol) + 1
02346 
02347      CALL cp_dbcsr_get_block_p(matrix=pmat,row=arow,col=acol,&
02348           BLOCK=p_block,found=found)
02349 
02350      IF ( PRESENT ( hmat ) ) THEN
02351      CALL cp_dbcsr_get_block_p(matrix=hmat,row=arow,col=acol,&
02352           BLOCK=h_block,found=found)
02353      ENDIF
02354 
02355      IF(scatter .AND. .NOT. ASSOCIATED(p_block)) THEN
02356         CALL cp_dbcsr_put_block(pmat, arow, acol,&
02357            block=recv_buf_r(recv_disps(l)+recv_sizes(l)+1:recv_disps(l)+recv_sizes(l)+nrow*ncol))
02358      END IF
02359      IF (.not.scatter) THEN
02360 !$  call omp_set_lock(locks((arow-1)*nthread*10/nblkrows_total+1))
02361         DO k = 1, ncol
02362            DO j = 1, nrow
02363               h_block(j,k) = h_block(j,k) + recv_buf_r( recv_disps(l) + recv_sizes(l)+ j + (k-1)*nrow )
02364            ENDDO
02365         ENDDO
02366 !$  call omp_unset_lock(locks((arow-1)*nthread*10/nblkrows_total+1))
02367       ENDIF
02368      recv_sizes(l)=recv_sizes(l)+nrow*ncol
02369     ENDDO
02370   ENDDO
02371 !$omp end do
02372 
02373 !$ IF (.not.scatter) THEN
02374 !$omp do
02375 !$  do i=1,nthread*10
02376 !$    call omp_destroy_lock(locks(i))
02377 !$  end do
02378 !$omp end do
02379 !$ END IF
02380 
02381   IF (scatter) THEN
02382     ! Blocks were added to P
02383     CALL cp_dbcsr_finalize(pmat, error=error)
02384   END IF
02385 !$omp end parallel
02386 
02387 !$ IF (.not.scatter) THEN
02388 !$  DEALLOCATE (locks,STAT=stat)
02389 !$  IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"locks")
02390 !$ END IF
02391 
02392   DEALLOCATE (send_buf_r,STAT=stat)
02393   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"send_buf_r")
02394   DEALLOCATE (recv_buf_r,STAT=stat)
02395   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"recv_buf_r")
02396 
02397   DEALLOCATE (send_sizes,STAT=stat)
02398   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"send_sizes")
02399   DEALLOCATE (recv_sizes,STAT=stat)
02400   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"recv_sizes")
02401   DEALLOCATE (send_disps,STAT=stat)
02402   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"send_disps")
02403   DEALLOCATE (recv_disps,STAT=stat)
02404   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"recv_disps")
02405   DEALLOCATE (send_pair_count,STAT=stat)
02406   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"send_pair_count")
02407   DEALLOCATE (recv_pair_count,STAT=stat)
02408   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"recv_pair_count")
02409   DEALLOCATE (send_pair_disps,STAT=stat)
02410   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"send_pair_disps")
02411   DEALLOCATE (recv_pair_disps,STAT=stat)
02412   IF (stat /= 0) CALL stop_memory(routineN,moduleN,__LINE__,"recv_pair_disps")
02413 
02414   DEALLOCATE (first_row, last_row, first_col, last_col)
02415   CALL timestop(handle)
02416 END SUBROUTINE rs_distribute_matrix
02417 
02418 ! *****************************************************************************
02425   SUBROUTINE rs_find_node(rs_desc,igrid_level,n_levels,cube_center,ntasks,tasks,lb_cube,ub_cube,added_tasks,error)
02426 
02427     TYPE(realspace_grid_desc_type), POINTER  :: rs_desc
02428     INTEGER, INTENT(IN)                      :: igrid_level, n_levels
02429     INTEGER, DIMENSION(3), INTENT(IN)        :: cube_center
02430     INTEGER, INTENT(INOUT)                   :: ntasks
02431     INTEGER(KIND=int_8), DIMENSION(:, :), 
02432       POINTER                                :: tasks
02433     INTEGER, DIMENSION(3), INTENT(IN)        :: lb_cube, ub_cube
02434     INTEGER, INTENT(OUT)                     :: added_tasks
02435     TYPE(cp_error_type), INTENT(inout)       :: error
02436 
02437     CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_find_node', 
02438       routineP = moduleN//':'//routineN
02439     INTEGER, PARAMETER                       :: add_tasks = 1000
02440     REAL(kind=dp), PARAMETER                 :: mult_tasks = 2.0_dp
02441 
02442     INTEGER :: bit_index, coord(3), curr_tasks, dest, i, icoord(3), idest, 
02443       itask, ix, iy, iz, lb_coord(3), lb_domain(3), lbc(3), ub_coord(3), 
02444       ub_domain(3), ubc(3)
02445     INTEGER(KIND=int_8)                      :: bit_pattern
02446     LOGICAL                                  :: dir_periodic(3), failure
02447 
02448     coord(1)=rs_desc % x2coord(cube_center(1))
02449     coord(2)=rs_desc % y2coord(cube_center(2))
02450     coord(3)=rs_desc % z2coord(cube_center(3))
02451     dest=rs_desc%coord2rank(coord(1),coord(2),coord(3))
02452 
02453     failure=.FALSE.
02454 
02455     ! the real cube coordinates
02456     lbc=lb_cube+cube_center
02457     ubc=ub_cube+cube_center
02458 
02459     IF ( ALL ( (rs_desc % lb_global(:,dest)-rs_desc%border) .LE. lbc )  .AND. &
02460          ALL ( (rs_desc % ub_global(:,dest)+rs_desc%border) .GE. ubc )) THEN
02461        !standard distributed collocation/integration
02462        tasks(1,ntasks)=encode_rank(dest, igrid_level, n_levels)
02463        tasks(4,ntasks)=1
02464        tasks(6,ntasks)=0
02465        added_tasks = 1
02466 
02467        ! here we figure out if there is an alternate location for this task
02468        ! i.e. even though the cube_center is not in the real local domain,
02469        ! it might fully fit in the halo of the neighbor
02470        ! if its radius is smaller than the maximum radius
02471        ! the list of possible neighbors is stored here as a bit pattern
02472        ! to reconstruct what the rank of the neigbor is,
02473        ! we can use (note this requires the correct rs_grid)
02474        !  IF (BTEST(bit_pattern,0)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/-1,0,0/))
02475        !  IF (BTEST(bit_pattern,1)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/+1,0,0/))
02476        !  IF (BTEST(bit_pattern,2)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/0,-1,0/))
02477        !  IF (BTEST(bit_pattern,3)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/0,+1,0/))
02478        !  IF (BTEST(bit_pattern,4)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/0,0,-1/))
02479        !  IF (BTEST(bit_pattern,5)) rank=rs_grid_locate_rank(rs_desc,tasks(1,ntasks),(/0,0,+1/))
02480        bit_index=0
02481        bit_pattern=0
02482        DO i=1,3
02483           IF (rs_desc % perd ( i ) ==  1) THEN
02484              bit_pattern=IBCLR(bit_pattern,bit_index)
02485              bit_index=bit_index+1
02486              bit_pattern=IBCLR(bit_pattern,bit_index)
02487              bit_index=bit_index+1
02488           ELSE
02489              ! fits the left neighbor ?
02490              IF (ubc(i)<=rs_desc%lb_global(i,dest)-1+rs_desc%border) THEN
02491                 bit_pattern=IBSET(bit_pattern,bit_index)
02492                 bit_index=bit_index+1
02493              ELSE
02494                 bit_pattern=IBCLR(bit_pattern,bit_index)
02495                 bit_index=bit_index+1
02496              ENDIF
02497              ! fits the right neighbor ?
02498              IF (lbc(i)>=rs_desc%ub_global(i,dest)+1-rs_desc%border) THEN
02499                 bit_pattern=IBSET(bit_pattern,bit_index)
02500                 bit_index=bit_index+1
02501              ELSE
02502                 bit_pattern=IBCLR(bit_pattern,bit_index)
02503                 bit_index=bit_index+1
02504              ENDIF
02505           ENDIF
02506        ENDDO
02507        tasks(6,ntasks)=bit_pattern
02508 
02509     ELSE
02510        !generalised collocation/integration needed
02511        ! first we figure out how many neighbors we have to add to include the lbc/ubc
02512        ! in the available domains (inclusive of halo points)
02513        ! first we 'ignore' periodic boundary conditions
02514        ! i.e. ub_coord-lb_coord+1 might be larger than group_dim
02515        lb_coord=coord
02516        ub_coord=coord
02517        lb_domain=rs_desc % lb_global(:,dest) - rs_desc % border
02518        ub_domain=rs_desc % ub_global(:,dest) + rs_desc % border
02519        DO i=1,3
02520           ! only if the grid is not periodic in this direction we need to take care of adding neighbors
02521           IF (rs_desc%perd(i)==0) THEN
02522              ! if the domain lower bound is greater than the lbc we need to add the size of the neighbor domain
02523              DO
02524                IF (lb_domain(i)>lbc(i)) THEN
02525                   lb_coord(i)=lb_coord(i)-1
02526                   icoord=MODULO(lb_coord,rs_desc%group_dim)
02527                   idest=rs_desc%coord2rank(icoord(1),icoord(2),icoord(3))
02528                   lb_domain(i)=lb_domain(i)-(rs_desc % ub_global(i,idest)-rs_desc % lb_global(i,idest)+1)
02529                ELSE
02530                   EXIT
02531                ENDIF
02532              ENDDO
02533              ! same for the upper bound
02534              DO
02535                IF (ub_domain(i)<ubc(i)) THEN
02536                   ub_coord(i)=ub_coord(i)+1
02537                   icoord=MODULO(ub_coord,rs_desc%group_dim)
02538                   idest=rs_desc%coord2rank(icoord(1),icoord(2),icoord(3))
02539                   ub_domain(i)=ub_domain(i)+(rs_desc % ub_global(i,idest)-rs_desc % lb_global(i,idest)+1)
02540                ELSE
02541                   EXIT
02542                ENDIF
02543              ENDDO
02544           ENDIF
02545        ENDDO
02546 
02547        ! some care is needed for the periodic boundaries
02548        DO i=1,3
02549           IF (ub_domain(i)-lb_domain(i)+1>=rs_desc%npts(i)) THEN
02550              dir_periodic(i)=.TRUE.
02551              lb_coord(i)=0
02552              ub_coord(i)=rs_desc%group_dim(i)-1
02553           ELSE
02554              dir_periodic(i)=.FALSE.
02555           ENDIF
02556        ENDDO
02557 
02558        added_tasks=PRODUCT(ub_coord-lb_coord+1)
02559        itask=ntasks
02560        ntasks=ntasks+added_tasks-1
02561        IF ( ntasks > SIZE(tasks,2)) THEN
02562           curr_tasks = (SIZE(tasks,2)+add_tasks)*mult_tasks
02563           CALL reallocate(tasks,1,6,1,curr_tasks)
02564        END IF
02565        DO iz=lb_coord(3),ub_coord(3)
02566        DO iy=lb_coord(2),ub_coord(2)
02567        DO ix=lb_coord(1),ub_coord(1)
02568           icoord=MODULO((/ix,iy,iz/),rs_desc%group_dim)
02569           idest=rs_desc%coord2rank(icoord(1),icoord(2),icoord(3))
02570           tasks(1,itask)=encode_rank(idest, igrid_level, n_levels)
02571           tasks(4,itask)=2
02572           tasks(6,itask)=0
02573           ! encode the domain size for this task
02574           ! if the bit is set, we need to add the border in that direction
02575           IF (ix==lb_coord(1).AND..NOT.dir_periodic(1)) tasks(6,itask)=IBSET(tasks(6,itask),0)
02576           IF (ix==ub_coord(1).AND..NOT.dir_periodic(1)) tasks(6,itask)=IBSET(tasks(6,itask),1)
02577           IF (iy==lb_coord(2).AND..NOT.dir_periodic(2)) tasks(6,itask)=IBSET(tasks(6,itask),2)
02578           IF (iy==ub_coord(2).AND..NOT.dir_periodic(2)) tasks(6,itask)=IBSET(tasks(6,itask),3)
02579           IF (iz==lb_coord(3).AND..NOT.dir_periodic(3)) tasks(6,itask)=IBSET(tasks(6,itask),4)
02580           IF (iz==ub_coord(3).AND..NOT.dir_periodic(3)) tasks(6,itask)=IBSET(tasks(6,itask),5)
02581           itask=itask+1
02582        ENDDO
02583        ENDDO
02584        ENDDO
02585     ENDIF
02586 
02587   END SUBROUTINE rs_find_node
02588 
02589 ! *****************************************************************************
02597   FUNCTION encode_rank (rank, grid_level, n_levels) RESULT(encoded_int)
02598 
02599     INTEGER, INTENT(IN)                      :: rank, grid_level, n_levels
02600     INTEGER(KIND=int_8)                      :: encoded_int
02601 
02602 ! ordered so can still sort by rank
02603 
02604     encoded_int = rank*n_levels + grid_level - 1
02605 
02606   END FUNCTION
02607 
02608   FUNCTION decode_rank (encoded_int, n_levels) RESULT(rank)
02609 
02610     INTEGER(KIND=int_8), INTENT(IN)          :: encoded_int
02611     INTEGER, INTENT(IN)                      :: n_levels
02612     INTEGER                                  :: rank
02613 
02614     rank = encoded_int / n_levels
02615 
02616   END FUNCTION
02617 
02618   FUNCTION decode_level (encoded_int, n_levels) RESULT(grid_level)
02619 
02620     INTEGER(KIND=int_8), INTENT(IN)          :: encoded_int
02621     INTEGER, INTENT(IN)                      :: n_levels
02622     INTEGER                                  :: grid_level
02623 
02624     INTEGER(KIND=int_8)                      :: n_levels8
02625 
02626     n_levels8 = n_levels
02627 
02628     grid_level = MODULO (encoded_int, n_levels8) + 1
02629 
02630   END FUNCTION decode_level
02631 
02632 END MODULE task_list_methods