/*
    DFT++ is a density functional package developed by the research group
    of Professor Tomas Arias

    Copyright 1996-2003 Sohrab Ismail-Beigi

    This file is part of DFT++.

    DFT++ is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    DFT++ is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with DFT++; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Please see the file CREDITS for a list of authors.

    For academic users, we request that publications using results obtained with
    this software reference

    "New algebraic formulation of density functional calculation," by Sohrab Ismail-Beigi
    and T.A. Arias, Computer Physics Communications 128:1-2, 1-45 (June 2000).

    and, if using the wavelet basis, further reference

    "Multiresolution analysis of electronic structure: semicardinal and wavelet bases,"
    T.A. Arias, Reviews of Modern Physics 71:1, 267-311 (January 1999).

    and 

    "Robust ab initio calculation of condensed matter: transparent convergence through
    semicardinal multiresolution analysis,'' I.P. Daykov, T.A. Arias, and
    Torkel D. Engeness, Physical Review Letters, 90:21, 216402 (May 2003).

    For your convenience, preprints of the above articles may be obtained from
    http://arXiv.org/abs/cond-mat/9909130, 9805262, and 0204411, respectively.
*/

/*
 * Sohrab Ismail-Beigi,  Dec. 1996, modified Mar 1997.
 *
 * Calculates the diagonals of various products of column_bundles 
 * and diagonal matrices in an efficient manner.
 *
 */

/* $Id: diaginnerouter.cpp,v 1.14.2.16 2003/05/29 18:54:20 ivan Exp $ */

#include "header.h"
#include "parallel.h"

/*
 * Returns diag(F*X^Y) where X^ is the hermetian adjoint of X.
 *
 * Useful for the calculation of the Kinetic energy.
 */
diag_matrix
diaginner(const diag_matrix &F, ColumnBundle &X, const ColumnBundle &Y)
{
#ifdef DFT_PROFILING
  timerOn(7);  // Turn on the diaginner timer
#endif

  if (X.col_length != Y.col_length || X.tot_ncols != Y.tot_ncols ||
      X.tot_ncols != F.n)
    die("Size mismatch in diaginner: F.n=%d, X is %d by %d, Y is %d by %d\n",
        F.n,X.col_length,X.tot_ncols,Y.col_length,Y.tot_ncols);

  //this is what we'll return
  diag_matrix diagFXdagY(F.n);
  diagFXdagY.zero_out();

#ifdef DFT_MPI
  // localresult it is a temporary result in MPI case
  diag_matrix localresult(F.n);
  localresult.zero_out();
#else
  // or it points to diagFXdagY (serial/thread cases)
  diag_matrix &localresult = diagFXdagY;
#endif

  // do the work...
  for (int i=0; i < X.my_ncols; i++) {
    //dft_log("col: %d   X[i]^Y[i]=%1.12lf 2 %1.12lf i\n",i,tmp.x, tmp.y);
    localresult.c[X.start_ncol+i] =
      F.c[X.start_ncol+i] * ( (*X.col[i])^(*Y.col[i]) );
  }

#ifdef DFT_MPI
  // If this is an MPI case, we have to sum over all results from
  // all processors before begin done...

#ifdef DFT_PROFILING
  timerOn(38);  // Turn on diaginner MPI_Allreduce timer
  counterIncr(17); // counter for reduces
  counterIncr(18, SCALAR_SIZE*(double)localresult.n); // bytes reduced
#endif
  
  MPI_Allreduce( &localresult.c[0], &diagFXdagY.c[0],
                 localresult.n*SCALAR_SIZE, // change SCALAR_SIZE to datasize
		 MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
    
#ifdef DFT_PROFILING
  timerOff(38);  // Turn off diaginner MPI_Allreduce timer
#endif

#endif // DFT_MPI

#ifdef DFT_PROFILING
  timerOff(7);  // Turn off the diaginner timer
#endif

  return diagFXdagY;
}


//
// This local function does the work of diagouterI (i.e. calculating
// electron density):  it calculates the cumulative density of 
// n_cols_todo in the column_bundle X starting at column start_col.
// Notice that these column indices are referenced to the columns
// "owned" by X, i.e. they must be between 0 and X->my_ncols.
//
static void
do_diagouterI_some_columns(const diag_matrix &F,
			   const ColumnBundle &X,
			   int start_col_loc,
			   int n_cols_todo,
			   RealSpaceScalarFieldColumn &n_local)
{
/* ipd 08/27/02 -replace the code with copy constructor
   appears to be causing problems for denser WL grids
   
  // Stores the result of I on a single column of X
  RealSpaceScalarFieldColumn IXcol;

  // IPD: the actual data allocation happens in init() - somewhat ugly
  IXcol.init_scalarfield(*X.col[0]->basis, REALSPACE);
*/
  RealSpaceScalarFieldColumn IXcol(n_local);

  // loop over the columns of X and accumulate the resulting densities
  // into n_local
  for (int i=start_col_loc; i < start_col_loc+n_cols_todo; i++){
      /* Apply I (i.e. FFT3D) */
      apply_I(*X.col[i],IXcol);
      
      /* Take square of IXcol, scale it  by the filling f_i
         and accumulate into n_local. */
      //ipd: consider a function add_scale_abs2 to avoid cost of temp
      // n_local += F(X.start_ncol+i) * abs2(IXcol);
      add_scale_abs2(F(X.start_ncol+i), IXcol, n_local);
  }
}



#ifdef DFT_THREAD
//
// This routine is what the threads run when spawned.  Used when
// threads are running.  This routine decodes what's been passed to it
// and has that thread run the portion of diagouterI() belonging to
// it by calling the routine above.
//
// the threading is along the column number
static void *
diagouterI_calc_n_thread(void *arg)
{
  // Decode what's been passed to this thread
  dft_thread_data *data = (dft_thread_data *)arg;

  diag_matrix *F = (diag_matrix *)data->p1;
  ColumnBundle *X = (ColumnBundle *)data->p2;
  int start_col = data->start;
  int n_cols_todo = data->n;
  int my_id = data->id;
  RealSpaceScalarFieldColumn **n_thread_array = (RealSpaceScalarFieldColumn **)data->p3;
  RealSpaceScalarFieldColumn &n_local = *n_thread_array[my_id];

  // Do the work assigned to this thread
  do_diagouterI_some_columns(*F,*X,start_col,n_cols_todo,n_local);

  // Free up the data-passing structure and exit
  myfree(arg);
  return NULL;
}

// ipd: threaded along the column length, while the previous function
// was threaded along the column number of n_thread_array
static void *
diagouterI_sum_n_thread(void *arg)
{
  // Decode what's been passed to this thread
  dft_thread_data *data = (dft_thread_data *)arg;

  RealSpaceScalarFieldColumn **n_thread_array = (RealSpaceScalarFieldColumn **)data->p1;
  RealSpaceScalarFieldColumn *n_local = (RealSpaceScalarFieldColumn *)data->p2;
  int N_threads = System::Get_N_threads();
  int i_start = data->start;
  int n_todo = data->n;

  // Sum over all columns, and for each column, accumulate into the
  // total electron density the contributions from the various threads
  for (int t=0; t < N_threads; t++)
    for (int i=i_start; i < i_start+n_todo; i++)
      n_local->data.d[i] += n_thread_array[t]->data.d[i];
  
  // We're done!
  myfree(arg);
  return NULL;
}
#endif  // DFT_THREAD



/*
 * Returns diag((I*X)*F*(I*X)^) where X^ is the hermetian adjoint of X.
 *
 * The routine does the calculation by applying I to 'howmany' columns of
 * X.
 *
 * It's prime use is in the calculation of the charge-density.
 *
 * The two routines below do this calculation in parallel by using
 * threads.
 *
 */
RealSpaceScalarFieldColumn
diagouterI(const diag_matrix &F,const ColumnBundle &X)
{
#ifdef DFT_PROFILING
  timerOn(6); // diagouterI timer
#endif
  
  if (F.n != X.tot_ncols)
    die("In diagouterI, F.n != X.tot_ncols\n");

  /* Stores the final result */
  RealSpaceScalarFieldColumn diagIXFIXdag;
  //actual allocation of the data
  diagIXFIXdag.init_scalarfield(*X.col[0]->basis, REALSPACE);
  diagIXFIXdag.data.zero_out();
  
#ifdef DFT_MPI
  /* Stores intermediate result for columns owned by this MPI node */
  RealSpaceScalarFieldColumn n_local(diagIXFIXdag);
#else
  /* If we're serial/thread,
   * we don't need any other places to store the result...
   * just take n_local to be the same as the final result */
  RealSpaceScalarFieldColumn &n_local = diagIXFIXdag;
#endif
  
  // Now calculate the electron density for the bands
  // owned by this process.  For thread case, we should
  // distribute the work among the threads...
  // The final accumulated result ends up in n_local.
#ifdef DFT_THREAD
  // Threads!  We have to distribute the work.
  // First, we need an array of temporary vectors to hold
  // the results from each thread.  Allocate N_thread vectors
  // and initialize them to the right size
  int N_threads = System::Get_N_threads();


  /*
  // ipd 08/20/02: this appears to cause memory leaks - can't find it
  // switching (back?) to the ** array
  RealSpaceScalarFieldColumn *n_thread_array =
    (RealSpaceScalarFieldColumn *)mymalloc(N_threads*sizeof(RealSpaceScalarFieldColumn),
                                           "n_thread_array","diagouterI");
  int t;
  for (t=0; t < N_threads; t++){
    n_thread_array[t].fat = TRUE; //ipd: will hold data but ugly like this
    // consider loop and new instead of malloc
    n_thread_array[t].init_scalarfield(*n_local.basis, REALSPACE);
    n_thread_array[t].zero_out();
  }
  */
  
  RealSpaceScalarFieldColumn **n_thread_array=
    (RealSpaceScalarFieldColumn **)
    mymalloc(N_threads*sizeof(RealSpaceScalarFieldColumn *),
             "n_thread_array", "diagouterI");
  
  int t;
  for(t=0; t<N_threads; t++){
    // allocate them via copy constructor
    n_thread_array[t]=new RealSpaceScalarFieldColumn(diagIXFIXdag);
    n_thread_array[t]->zero_out();
  }

  // Now launch the threads.  Each thread calculates the electron density
  // of some of the columns and accumulates into its n_local_array vector
  dft_call_threads(X.my_ncols,
		   (void *)&F,(void *)&X,(void *)n_thread_array,NULL, NULL,
		   0,0,0,0,0,0,
		   diagouterI_calc_n_thread);

  // Now sum up all the resulting temporary vectors using threads by
  // distributing the pieces of the vectors to be summed across processors
  dft_call_threads(n_local.length,
		   (void *)n_thread_array,(void *)&n_local,NULL,NULL, NULL,
		   0,0,0,0,0,0,
		   diagouterI_sum_n_thread);
  
  // Free the temporary space and we're done
  // ipd: now this should be authomatic via the destructor
  for (t=0; t < N_threads; t++)
    delete n_thread_array[t];
    
  myfree(n_thread_array);

#else
  // Serial (no threads):  just call the calculating routine
  // and tell it to do all the work in one big lump.
  do_diagouterI_some_columns(F,X,0,X.my_ncols,n_local);
#endif

#ifdef DFT_MPI
/* If running MPI, the we have to do a global sum of all n_locals
 * on different nodes into the final results diagIXFIXdag */

#ifdef DFT_PROFILING
  timerOn(39);  // Turn on diagouterI MPI_Allreduce timer
  counterIncr(19); // reduce count
  counterIncr(20,diagIXFIXdag.data.datasize*
              (double)diagIXFIXdag.length); // bytes reduced
#endif
  MPI_Allreduce( &(n_local.data.d[0]), &(diagIXFIXdag.data.d[0]),
		 diagIXFIXdag.length*SCALAR_SIZE, DFT_MPI_REAL,
		 MPI_SUM, MPI_COMM_WORLD );
#ifdef DFT_PROFILING
  timerOff(39);  // Turn off diagouterI MPI_Allreduce timer
#endif

#endif // DFT_MPI

#ifdef DFT_PROFILING
  timerOff(6); // diagouterI timer
#endif

  return diagIXFIXdag;
}




//diag_matrix
RealSpaceScalarFieldColumn
diagouterI_one_column(const diag_matrix &F,const ColumnBundle &X, 
                      int col)
{

    if (F.n != X.tot_ncols)
        die("In diagouterI, F.n != X.tot_ncols\n");
    
    /* Stores the final result */
  RealSpaceScalarFieldColumn diagIXFIXdag;
  //actuall allocation of the data
  diagIXFIXdag.init_scalarfield(*X.col[0]->basis, REALSPACE);
  diagIXFIXdag.data.zero_out();
  
  do_diagouterI_some_columns(F,X,col,1,diagIXFIXdag);
  
  return diagIXFIXdag;
}


syntax highlighted by Code2HTML, v. 0.9.1