/*******************************************************************************
* Copyright (C) 2014 Intel Corporation
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra     (dongarra@eecs.utk.edu)
// Piotr Luszczek    (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

/*!
 @file GenerateCoarseProblem.cpp

 HPCG routine
 */

#include <cassert>
#include <cstdlib>
#include <new>

#include "GenerateCoarseProblem.hpp"
#include "GenerateGeometry.hpp"
#include "GenerateProblem.hpp"
#include "SetupHalo.hpp"
#include "UsmUtil.hpp"

/*!
  Routine to construct a prolongation/restriction operator for a given fine grid matrix
  solution (as computed by a direct solver).

  @param[inout]  Af - The known system matrix, on output its coarse operator, fine-to-coarse operator and auxiliary vectors will be defined.

  Note that the matrix Af is considered const because the attributes we are modifying are declared as mutable.

*/

void GenerateCoarseProblem(const SparseMatrix & Af, sycl::queue & main_queue, int runRealRef) {

  // Make local copies of geometry information.  Use global_int_t since the RHS products in the calculations
  // below may result in global range values.
  global_int_t nxf = Af.geom->nx;
  global_int_t nyf = Af.geom->ny;
  global_int_t nzf = Af.geom->nz;

  local_int_t nxc, nyc, nzc; //Coarse nx, ny, nz
  assert(nxf%2==0); assert(nyf%2==0); assert(nzf%2==0); // Need fine grid dimensions to be divisible by 2
  nxc = nxf/2; nyc = nyf/2; nzc = nzf/2;
     // A pointer to this array is stored in mgData, so it is not freed before returning from function.
  local_int_t *f2cOperator = (local_int_t *) sparse_malloc_device( Af.localNumberOfRows*sizeof(local_int_t), main_queue);
  local_int_t localNumberOfRows = nxc*nyc*nzc; // This is the size of our subblock

  // If this assert fails, it most likely means that the local_int_t is set to int and should be set to long long
  assert(localNumberOfRows>0); // Throw an exception of the number of rows is less than zero (can happen if "int" overflows)

  // Use a parallel loop to do initial assignment:
  // distributes the physical placement of arrays of pointers across the memory system

  auto ev = main_queue.submit([&](sycl::handler &cgh) {
      const local_int_t total_size = round_up_next_multiple(localNumberOfRows, 256);
      auto kernel = [=](sycl::nd_item<1> item) {
          local_int_t row = item.get_global_id(0);
          if(row<localNumberOfRows) {
            f2cOperator[row] = 0;
          }
      };
      cgh.parallel_for<class GenerateCoarseProblemClass1>(sycl::nd_range<1>(total_size, 256), kernel);
  });

  ev = main_queue.submit ([&](sycl::handler &cgh) {
      cgh.depends_on(ev);
//      const local_int_t wi = 4;
      const local_int_t zw = nzc; //ceil_div(static_cast<local_int_t>(nzc), wi);
      const local_int_t yw = nyc; //ceil_div(static_cast<local_int_t>(nyc), wi);
      const local_int_t xw = nxc; //ceil_div(static_cast<local_int_t>(nxc), wi);

      auto kernel = [=](sycl::item<3> item) {
          local_int_t zck = item.get_id(0);
          local_int_t yck = item.get_id(1);
          local_int_t xck = item.get_id(2);

          if(zck < nzc && yck < nyc && xck < nxc) {
              local_int_t xfk = 2*xck;
              local_int_t yfk = 2*yck;
              local_int_t zfk = 2*zck;
              local_int_t currentCoarseRow = zck*nyc*nxc + yck*nxc + xck;
              local_int_t currentFineRow   = zfk*nyf*nxf + yfk*nxf + xfk;
              f2cOperator[currentCoarseRow] = currentFineRow;
          }
      };
      cgh.parallel_for<class GenerateCoarseProblemClass2>(sycl::range<3>(zw,yw,xw), kernel);
  });

  // Construct the geometry and linear system
  Geometry * geomc = new Geometry;
  local_int_t zlc = 0; // Coarsen nz for the lower block in the z processor dimension
  local_int_t zuc = 0; // Coarsen nz for the upper block in the z processor dimension
  int pz = Af.geom->pz;
  if (pz>0) {
    zlc = Af.geom->partz_nz[0]/2; // Coarsen nz for the lower block in the z processor dimension
    zuc = Af.geom->partz_nz[1]/2; // Coarsen nz for the upper block in the z processor dimension
  }
  GenerateGeometry(Af.geom->size, Af.geom->rank, Af.geom->numThreads, Af.geom->pz, zlc, zuc, nxc, nyc, nzc, Af.geom->npx, Af.geom->npy, Af.geom->npz, geomc, main_queue);

  SparseMatrix * Ac = new SparseMatrix;
  InitializeSparseMatrix(*Ac, geomc);
  GenerateProblem(*Ac, 0, 0, 0, main_queue, runRealRef);
  SetupHalo(*Ac, main_queue);
  Vector *rc = new Vector;
  Vector *xc = new Vector;
  Vector * Axf = new Vector;
  InitializeVectorDevice(*rc, Ac->localNumberOfRows, main_queue);
  InitializeVectorDevice(*xc, Ac->localNumberOfColumns, main_queue);
  InitializeVectorDevice(*Axf, Af.localNumberOfColumns, main_queue);
  auto zero_ev1 = ZeroVector(*rc, main_queue);
  auto zero_ev2 = ZeroVector(*xc, main_queue);
  auto zero_ev3 = ZeroVector(*Axf, main_queue);

  // Wait for zero_ev1, zero_ev2, zero_ev3
  main_queue.ext_oneapi_submit_barrier({ev, zero_ev1, zero_ev2, zero_ev3}).wait();

  Af.Ac = Ac;
  MGData * mgData = new MGData;
  InitializeMGData(f2cOperator, rc, xc, Axf, *mgData);
  Af.mgData = mgData;

  if (runRealRef) {
     Vector *rc_host = new Vector;
     Vector *xc_host = new Vector;
     Vector *Axf_host = new Vector;
     InitializeVector(*rc_host, Ac->localNumberOfRows);
     InitializeVector(*xc_host, Ac->localNumberOfColumns);
     InitializeVector(*Axf_host, Af.localNumberOfColumns);
     ZeroVector(*rc_host);
     ZeroVector(*xc_host);
     ZeroVector(*Axf_host);

     // A pointer to this array is stored in mgData_host, so it is not freed before returning from function.
     local_int_t *f2cOperator_host = (local_int_t *) malloc( Af.localNumberOfRows*sizeof(local_int_t));
     main_queue.memcpy(f2cOperator_host, f2cOperator, sizeof(local_int_t)*Af.localNumberOfRows).wait();

     MGData * mgData_host = new MGData;
     InitializeMGData(f2cOperator_host, rc_host, xc_host, Axf_host, *mgData_host);
     Af.mgData_host = mgData_host;
  }


  return;
}
