/* DFT++ is a density functional package developed by the research group of Professor Tomas Arias Copyright 1996-2003 Sohrab Ismail-Beigi This file is part of DFT++. DFT++ is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. DFT++ is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with DFT++; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Please see the file CREDITS for a list of authors. For academic users, we request that publications using results obtained with this software reference "New algebraic formulation of density functional calculation," by Sohrab Ismail-Beigi and T.A. Arias, Computer Physics Communications 128:1-2, 1-45 (June 2000). and, if using the wavelet basis, further reference "Multiresolution analysis of electronic structure: semicardinal and wavelet bases," T.A. Arias, Reviews of Modern Physics 71:1, 267-311 (January 1999). and "Robust ab initio calculation of condensed matter: transparent convergence through semicardinal multiresolution analysis,'' I.P. Daykov, T.A. Arias, and Torkel D. Engeness, Physical Review Letters, 90:21, 216402 (May 2003). For your convenience, preprints of the above articles may be obtained from http://arXiv.org/abs/cond-mat/9909130, 9805262, and 0204411, respectively. */ /* * Sohrab Ismail-Beigi, Dec. 1996, modified Mar 1997. * * Calculates the diagonals of various products of column_bundles * and diagonal matrices in an efficient manner. * */ /* $Id: diaginnerouter.cpp,v 1.14.2.16 2003/05/29 18:54:20 ivan Exp $ */ #include "header.h" #include "parallel.h" /* * Returns diag(F*X^Y) where X^ is the hermetian adjoint of X. * * Useful for the calculation of the Kinetic energy. */ diag_matrix diaginner(const diag_matrix &F, ColumnBundle &X, const ColumnBundle &Y) { #ifdef DFT_PROFILING timerOn(7); // Turn on the diaginner timer #endif if (X.col_length != Y.col_length || X.tot_ncols != Y.tot_ncols || X.tot_ncols != F.n) die("Size mismatch in diaginner: F.n=%d, X is %d by %d, Y is %d by %d\n", F.n,X.col_length,X.tot_ncols,Y.col_length,Y.tot_ncols); //this is what we'll return diag_matrix diagFXdagY(F.n); diagFXdagY.zero_out(); #ifdef DFT_MPI // localresult it is a temporary result in MPI case diag_matrix localresult(F.n); localresult.zero_out(); #else // or it points to diagFXdagY (serial/thread cases) diag_matrix &localresult = diagFXdagY; #endif // do the work... for (int i=0; i < X.my_ncols; i++) { //dft_log("col: %d X[i]^Y[i]=%1.12lf 2 %1.12lf i\n",i,tmp.x, tmp.y); localresult.c[X.start_ncol+i] = F.c[X.start_ncol+i] * ( (*X.col[i])^(*Y.col[i]) ); } #ifdef DFT_MPI // If this is an MPI case, we have to sum over all results from // all processors before begin done... #ifdef DFT_PROFILING timerOn(38); // Turn on diaginner MPI_Allreduce timer counterIncr(17); // counter for reduces counterIncr(18, SCALAR_SIZE*(double)localresult.n); // bytes reduced #endif MPI_Allreduce( &localresult.c[0], &diagFXdagY.c[0], localresult.n*SCALAR_SIZE, // change SCALAR_SIZE to datasize MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); #ifdef DFT_PROFILING timerOff(38); // Turn off diaginner MPI_Allreduce timer #endif #endif // DFT_MPI #ifdef DFT_PROFILING timerOff(7); // Turn off the diaginner timer #endif return diagFXdagY; } // // This local function does the work of diagouterI (i.e. calculating // electron density): it calculates the cumulative density of // n_cols_todo in the column_bundle X starting at column start_col. // Notice that these column indices are referenced to the columns // "owned" by X, i.e. they must be between 0 and X->my_ncols. // static void do_diagouterI_some_columns(const diag_matrix &F, const ColumnBundle &X, int start_col_loc, int n_cols_todo, RealSpaceScalarFieldColumn &n_local) { /* ipd 08/27/02 -replace the code with copy constructor appears to be causing problems for denser WL grids // Stores the result of I on a single column of X RealSpaceScalarFieldColumn IXcol; // IPD: the actual data allocation happens in init() - somewhat ugly IXcol.init_scalarfield(*X.col[0]->basis, REALSPACE); */ RealSpaceScalarFieldColumn IXcol(n_local); // loop over the columns of X and accumulate the resulting densities // into n_local for (int i=start_col_loc; i < start_col_loc+n_cols_todo; i++){ /* Apply I (i.e. FFT3D) */ apply_I(*X.col[i],IXcol); /* Take square of IXcol, scale it by the filling f_i and accumulate into n_local. */ //ipd: consider a function add_scale_abs2 to avoid cost of temp // n_local += F(X.start_ncol+i) * abs2(IXcol); add_scale_abs2(F(X.start_ncol+i), IXcol, n_local); } } #ifdef DFT_THREAD // // This routine is what the threads run when spawned. Used when // threads are running. This routine decodes what's been passed to it // and has that thread run the portion of diagouterI() belonging to // it by calling the routine above. // // the threading is along the column number static void * diagouterI_calc_n_thread(void *arg) { // Decode what's been passed to this thread dft_thread_data *data = (dft_thread_data *)arg; diag_matrix *F = (diag_matrix *)data->p1; ColumnBundle *X = (ColumnBundle *)data->p2; int start_col = data->start; int n_cols_todo = data->n; int my_id = data->id; RealSpaceScalarFieldColumn **n_thread_array = (RealSpaceScalarFieldColumn **)data->p3; RealSpaceScalarFieldColumn &n_local = *n_thread_array[my_id]; // Do the work assigned to this thread do_diagouterI_some_columns(*F,*X,start_col,n_cols_todo,n_local); // Free up the data-passing structure and exit myfree(arg); return NULL; } // ipd: threaded along the column length, while the previous function // was threaded along the column number of n_thread_array static void * diagouterI_sum_n_thread(void *arg) { // Decode what's been passed to this thread dft_thread_data *data = (dft_thread_data *)arg; RealSpaceScalarFieldColumn **n_thread_array = (RealSpaceScalarFieldColumn **)data->p1; RealSpaceScalarFieldColumn *n_local = (RealSpaceScalarFieldColumn *)data->p2; int N_threads = System::Get_N_threads(); int i_start = data->start; int n_todo = data->n; // Sum over all columns, and for each column, accumulate into the // total electron density the contributions from the various threads for (int t=0; t < N_threads; t++) for (int i=i_start; i < i_start+n_todo; i++) n_local->data.d[i] += n_thread_array[t]->data.d[i]; // We're done! myfree(arg); return NULL; } #endif // DFT_THREAD /* * Returns diag((I*X)*F*(I*X)^) where X^ is the hermetian adjoint of X. * * The routine does the calculation by applying I to 'howmany' columns of * X. * * It's prime use is in the calculation of the charge-density. * * The two routines below do this calculation in parallel by using * threads. * */ RealSpaceScalarFieldColumn diagouterI(const diag_matrix &F,const ColumnBundle &X) { #ifdef DFT_PROFILING timerOn(6); // diagouterI timer #endif if (F.n != X.tot_ncols) die("In diagouterI, F.n != X.tot_ncols\n"); /* Stores the final result */ RealSpaceScalarFieldColumn diagIXFIXdag; //actual allocation of the data diagIXFIXdag.init_scalarfield(*X.col[0]->basis, REALSPACE); diagIXFIXdag.data.zero_out(); #ifdef DFT_MPI /* Stores intermediate result for columns owned by this MPI node */ RealSpaceScalarFieldColumn n_local(diagIXFIXdag); #else /* If we're serial/thread, * we don't need any other places to store the result... * just take n_local to be the same as the final result */ RealSpaceScalarFieldColumn &n_local = diagIXFIXdag; #endif // Now calculate the electron density for the bands // owned by this process. For thread case, we should // distribute the work among the threads... // The final accumulated result ends up in n_local. #ifdef DFT_THREAD // Threads! We have to distribute the work. // First, we need an array of temporary vectors to hold // the results from each thread. Allocate N_thread vectors // and initialize them to the right size int N_threads = System::Get_N_threads(); /* // ipd 08/20/02: this appears to cause memory leaks - can't find it // switching (back?) to the ** array RealSpaceScalarFieldColumn *n_thread_array = (RealSpaceScalarFieldColumn *)mymalloc(N_threads*sizeof(RealSpaceScalarFieldColumn), "n_thread_array","diagouterI"); int t; for (t=0; t < N_threads; t++){ n_thread_array[t].fat = TRUE; //ipd: will hold data but ugly like this // consider loop and new instead of malloc n_thread_array[t].init_scalarfield(*n_local.basis, REALSPACE); n_thread_array[t].zero_out(); } */ RealSpaceScalarFieldColumn **n_thread_array= (RealSpaceScalarFieldColumn **) mymalloc(N_threads*sizeof(RealSpaceScalarFieldColumn *), "n_thread_array", "diagouterI"); int t; for(t=0; tzero_out(); } // Now launch the threads. Each thread calculates the electron density // of some of the columns and accumulates into its n_local_array vector dft_call_threads(X.my_ncols, (void *)&F,(void *)&X,(void *)n_thread_array,NULL, NULL, 0,0,0,0,0,0, diagouterI_calc_n_thread); // Now sum up all the resulting temporary vectors using threads by // distributing the pieces of the vectors to be summed across processors dft_call_threads(n_local.length, (void *)n_thread_array,(void *)&n_local,NULL,NULL, NULL, 0,0,0,0,0,0, diagouterI_sum_n_thread); // Free the temporary space and we're done // ipd: now this should be authomatic via the destructor for (t=0; t < N_threads; t++) delete n_thread_array[t]; myfree(n_thread_array); #else // Serial (no threads): just call the calculating routine // and tell it to do all the work in one big lump. do_diagouterI_some_columns(F,X,0,X.my_ncols,n_local); #endif #ifdef DFT_MPI /* If running MPI, the we have to do a global sum of all n_locals * on different nodes into the final results diagIXFIXdag */ #ifdef DFT_PROFILING timerOn(39); // Turn on diagouterI MPI_Allreduce timer counterIncr(19); // reduce count counterIncr(20,diagIXFIXdag.data.datasize* (double)diagIXFIXdag.length); // bytes reduced #endif MPI_Allreduce( &(n_local.data.d[0]), &(diagIXFIXdag.data.d[0]), diagIXFIXdag.length*SCALAR_SIZE, DFT_MPI_REAL, MPI_SUM, MPI_COMM_WORLD ); #ifdef DFT_PROFILING timerOff(39); // Turn off diagouterI MPI_Allreduce timer #endif #endif // DFT_MPI #ifdef DFT_PROFILING timerOff(6); // diagouterI timer #endif return diagIXFIXdag; } //diag_matrix RealSpaceScalarFieldColumn diagouterI_one_column(const diag_matrix &F,const ColumnBundle &X, int col) { if (F.n != X.tot_ncols) die("In diagouterI, F.n != X.tot_ncols\n"); /* Stores the final result */ RealSpaceScalarFieldColumn diagIXFIXdag; //actuall allocation of the data diagIXFIXdag.init_scalarfield(*X.col[0]->basis, REALSPACE); diagIXFIXdag.data.zero_out(); do_diagouterI_some_columns(F,X,col,1,diagIXFIXdag); return diagIXFIXdag; }