/*
DFT++ is a density functional package developed by the research group
of Professor Tomas Arias
Copyright 1996-2003 Sohrab Ismail-Beigi
This file is part of DFT++.
DFT++ is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
DFT++ is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with DFT++; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Please see the file CREDITS for a list of authors.
For academic users, we request that publications using results obtained with
this software reference
"New algebraic formulation of density functional calculation," by Sohrab Ismail-Beigi
and T.A. Arias, Computer Physics Communications 128:1-2, 1-45 (June 2000).
and, if using the wavelet basis, further reference
"Multiresolution analysis of electronic structure: semicardinal and wavelet bases,"
T.A. Arias, Reviews of Modern Physics 71:1, 267-311 (January 1999).
and
"Robust ab initio calculation of condensed matter: transparent convergence through
semicardinal multiresolution analysis,'' I.P. Daykov, T.A. Arias, and
Torkel D. Engeness, Physical Review Letters, 90:21, 216402 (May 2003).
For your convenience, preprints of the above articles may be obtained from
http://arXiv.org/abs/cond-mat/9909130, 9805262, and 0204411, respectively.
*/
/*
* Sohrab Ismail-Beigi, Dec. 1996, modified Mar 1997.
*
* Calculates the diagonals of various products of column_bundles
* and diagonal matrices in an efficient manner.
*
*/
/* $Id: diaginnerouter.cpp,v 1.14.2.16 2003/05/29 18:54:20 ivan Exp $ */
#include "header.h"
#include "parallel.h"
/*
* Returns diag(F*X^Y) where X^ is the hermetian adjoint of X.
*
* Useful for the calculation of the Kinetic energy.
*/
diag_matrix
diaginner(const diag_matrix &F, ColumnBundle &X, const ColumnBundle &Y)
{
#ifdef DFT_PROFILING
timerOn(7); // Turn on the diaginner timer
#endif
if (X.col_length != Y.col_length || X.tot_ncols != Y.tot_ncols ||
X.tot_ncols != F.n)
die("Size mismatch in diaginner: F.n=%d, X is %d by %d, Y is %d by %d\n",
F.n,X.col_length,X.tot_ncols,Y.col_length,Y.tot_ncols);
//this is what we'll return
diag_matrix diagFXdagY(F.n);
diagFXdagY.zero_out();
#ifdef DFT_MPI
// localresult it is a temporary result in MPI case
diag_matrix localresult(F.n);
localresult.zero_out();
#else
// or it points to diagFXdagY (serial/thread cases)
diag_matrix &localresult = diagFXdagY;
#endif
// do the work...
for (int i=0; i < X.my_ncols; i++) {
//dft_log("col: %d X[i]^Y[i]=%1.12lf 2 %1.12lf i\n",i,tmp.x, tmp.y);
localresult.c[X.start_ncol+i] =
F.c[X.start_ncol+i] * ( (*X.col[i])^(*Y.col[i]) );
}
#ifdef DFT_MPI
// If this is an MPI case, we have to sum over all results from
// all processors before begin done...
#ifdef DFT_PROFILING
timerOn(38); // Turn on diaginner MPI_Allreduce timer
counterIncr(17); // counter for reduces
counterIncr(18, SCALAR_SIZE*(double)localresult.n); // bytes reduced
#endif
MPI_Allreduce( &localresult.c[0], &diagFXdagY.c[0],
localresult.n*SCALAR_SIZE, // change SCALAR_SIZE to datasize
MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
#ifdef DFT_PROFILING
timerOff(38); // Turn off diaginner MPI_Allreduce timer
#endif
#endif // DFT_MPI
#ifdef DFT_PROFILING
timerOff(7); // Turn off the diaginner timer
#endif
return diagFXdagY;
}
//
// This local function does the work of diagouterI (i.e. calculating
// electron density): it calculates the cumulative density of
// n_cols_todo in the column_bundle X starting at column start_col.
// Notice that these column indices are referenced to the columns
// "owned" by X, i.e. they must be between 0 and X->my_ncols.
//
static void
do_diagouterI_some_columns(const diag_matrix &F,
const ColumnBundle &X,
int start_col_loc,
int n_cols_todo,
RealSpaceScalarFieldColumn &n_local)
{
/* ipd 08/27/02 -replace the code with copy constructor
appears to be causing problems for denser WL grids
// Stores the result of I on a single column of X
RealSpaceScalarFieldColumn IXcol;
// IPD: the actual data allocation happens in init() - somewhat ugly
IXcol.init_scalarfield(*X.col[0]->basis, REALSPACE);
*/
RealSpaceScalarFieldColumn IXcol(n_local);
// loop over the columns of X and accumulate the resulting densities
// into n_local
for (int i=start_col_loc; i < start_col_loc+n_cols_todo; i++){
/* Apply I (i.e. FFT3D) */
apply_I(*X.col[i],IXcol);
/* Take square of IXcol, scale it by the filling f_i
and accumulate into n_local. */
//ipd: consider a function add_scale_abs2 to avoid cost of temp
// n_local += F(X.start_ncol+i) * abs2(IXcol);
add_scale_abs2(F(X.start_ncol+i), IXcol, n_local);
}
}
#ifdef DFT_THREAD
//
// This routine is what the threads run when spawned. Used when
// threads are running. This routine decodes what's been passed to it
// and has that thread run the portion of diagouterI() belonging to
// it by calling the routine above.
//
// the threading is along the column number
static void *
diagouterI_calc_n_thread(void *arg)
{
// Decode what's been passed to this thread
dft_thread_data *data = (dft_thread_data *)arg;
diag_matrix *F = (diag_matrix *)data->p1;
ColumnBundle *X = (ColumnBundle *)data->p2;
int start_col = data->start;
int n_cols_todo = data->n;
int my_id = data->id;
RealSpaceScalarFieldColumn **n_thread_array = (RealSpaceScalarFieldColumn **)data->p3;
RealSpaceScalarFieldColumn &n_local = *n_thread_array[my_id];
// Do the work assigned to this thread
do_diagouterI_some_columns(*F,*X,start_col,n_cols_todo,n_local);
// Free up the data-passing structure and exit
myfree(arg);
return NULL;
}
// ipd: threaded along the column length, while the previous function
// was threaded along the column number of n_thread_array
static void *
diagouterI_sum_n_thread(void *arg)
{
// Decode what's been passed to this thread
dft_thread_data *data = (dft_thread_data *)arg;
RealSpaceScalarFieldColumn **n_thread_array = (RealSpaceScalarFieldColumn **)data->p1;
RealSpaceScalarFieldColumn *n_local = (RealSpaceScalarFieldColumn *)data->p2;
int N_threads = System::Get_N_threads();
int i_start = data->start;
int n_todo = data->n;
// Sum over all columns, and for each column, accumulate into the
// total electron density the contributions from the various threads
for (int t=0; t < N_threads; t++)
for (int i=i_start; i < i_start+n_todo; i++)
n_local->data.d[i] += n_thread_array[t]->data.d[i];
// We're done!
myfree(arg);
return NULL;
}
#endif // DFT_THREAD
/*
* Returns diag((I*X)*F*(I*X)^) where X^ is the hermetian adjoint of X.
*
* The routine does the calculation by applying I to 'howmany' columns of
* X.
*
* It's prime use is in the calculation of the charge-density.
*
* The two routines below do this calculation in parallel by using
* threads.
*
*/
RealSpaceScalarFieldColumn
diagouterI(const diag_matrix &F,const ColumnBundle &X)
{
#ifdef DFT_PROFILING
timerOn(6); // diagouterI timer
#endif
if (F.n != X.tot_ncols)
die("In diagouterI, F.n != X.tot_ncols\n");
/* Stores the final result */
RealSpaceScalarFieldColumn diagIXFIXdag;
//actual allocation of the data
diagIXFIXdag.init_scalarfield(*X.col[0]->basis, REALSPACE);
diagIXFIXdag.data.zero_out();
#ifdef DFT_MPI
/* Stores intermediate result for columns owned by this MPI node */
RealSpaceScalarFieldColumn n_local(diagIXFIXdag);
#else
/* If we're serial/thread,
* we don't need any other places to store the result...
* just take n_local to be the same as the final result */
RealSpaceScalarFieldColumn &n_local = diagIXFIXdag;
#endif
// Now calculate the electron density for the bands
// owned by this process. For thread case, we should
// distribute the work among the threads...
// The final accumulated result ends up in n_local.
#ifdef DFT_THREAD
// Threads! We have to distribute the work.
// First, we need an array of temporary vectors to hold
// the results from each thread. Allocate N_thread vectors
// and initialize them to the right size
int N_threads = System::Get_N_threads();
/*
// ipd 08/20/02: this appears to cause memory leaks - can't find it
// switching (back?) to the ** array
RealSpaceScalarFieldColumn *n_thread_array =
(RealSpaceScalarFieldColumn *)mymalloc(N_threads*sizeof(RealSpaceScalarFieldColumn),
"n_thread_array","diagouterI");
int t;
for (t=0; t < N_threads; t++){
n_thread_array[t].fat = TRUE; //ipd: will hold data but ugly like this
// consider loop and new instead of malloc
n_thread_array[t].init_scalarfield(*n_local.basis, REALSPACE);
n_thread_array[t].zero_out();
}
*/
RealSpaceScalarFieldColumn **n_thread_array=
(RealSpaceScalarFieldColumn **)
mymalloc(N_threads*sizeof(RealSpaceScalarFieldColumn *),
"n_thread_array", "diagouterI");
int t;
for(t=0; t<N_threads; t++){
// allocate them via copy constructor
n_thread_array[t]=new RealSpaceScalarFieldColumn(diagIXFIXdag);
n_thread_array[t]->zero_out();
}
// Now launch the threads. Each thread calculates the electron density
// of some of the columns and accumulates into its n_local_array vector
dft_call_threads(X.my_ncols,
(void *)&F,(void *)&X,(void *)n_thread_array,NULL, NULL,
0,0,0,0,0,0,
diagouterI_calc_n_thread);
// Now sum up all the resulting temporary vectors using threads by
// distributing the pieces of the vectors to be summed across processors
dft_call_threads(n_local.length,
(void *)n_thread_array,(void *)&n_local,NULL,NULL, NULL,
0,0,0,0,0,0,
diagouterI_sum_n_thread);
// Free the temporary space and we're done
// ipd: now this should be authomatic via the destructor
for (t=0; t < N_threads; t++)
delete n_thread_array[t];
myfree(n_thread_array);
#else
// Serial (no threads): just call the calculating routine
// and tell it to do all the work in one big lump.
do_diagouterI_some_columns(F,X,0,X.my_ncols,n_local);
#endif
#ifdef DFT_MPI
/* If running MPI, the we have to do a global sum of all n_locals
* on different nodes into the final results diagIXFIXdag */
#ifdef DFT_PROFILING
timerOn(39); // Turn on diagouterI MPI_Allreduce timer
counterIncr(19); // reduce count
counterIncr(20,diagIXFIXdag.data.datasize*
(double)diagIXFIXdag.length); // bytes reduced
#endif
MPI_Allreduce( &(n_local.data.d[0]), &(diagIXFIXdag.data.d[0]),
diagIXFIXdag.length*SCALAR_SIZE, DFT_MPI_REAL,
MPI_SUM, MPI_COMM_WORLD );
#ifdef DFT_PROFILING
timerOff(39); // Turn off diagouterI MPI_Allreduce timer
#endif
#endif // DFT_MPI
#ifdef DFT_PROFILING
timerOff(6); // diagouterI timer
#endif
return diagIXFIXdag;
}
//diag_matrix
RealSpaceScalarFieldColumn
diagouterI_one_column(const diag_matrix &F,const ColumnBundle &X,
int col)
{
if (F.n != X.tot_ncols)
die("In diagouterI, F.n != X.tot_ncols\n");
/* Stores the final result */
RealSpaceScalarFieldColumn diagIXFIXdag;
//actuall allocation of the data
diagIXFIXdag.init_scalarfield(*X.col[0]->basis, REALSPACE);
diagIXFIXdag.data.zero_out();
do_diagouterI_some_columns(F,X,col,1,diagIXFIXdag);
return diagIXFIXdag;
}
syntax highlighted by Code2HTML, v. 0.9.1