#include <random>
#include <stdio.h>
#include <mpi.h>
#include <csmethodsMPI.hpp>
#include <buffs.hpp>
#include <latpack.hpp>

using namespace std;

template<class AnyClass, class SystemInfo, class PackFunc> /*Algorithm can be used for a class that defines any system*/
void cloning(double tmax, int nc, vector<int> dim, double dt, double s, char obs, char proc, vector<double> rates, int iseed, double repeats, int clonemethod)
{
        void (*selectclones)(int*, int*, int*, double*, int, int, int, double, SystemInfo**, int, int);
        if(clonemethod==1)
	{
	  selectclones = csm1MPI<SystemInfo*>;
	}
	if(clonemethod==2)
	{
	  selectclones = csm2MPI<SystemInfo*>;
	}

        int sLL = int(round(s*double(dim[0])*double(dim[0])));
	int logt = int(round(log10(tmax)));
	int lognc = int(round(log10(nc)));
	int dtprint = int(round(dt));

	int myid, numproc;
	MPI_Comm_size(MPI_COMM_WORLD, &numproc);
	MPI_Comm_rank(MPI_COMM_WORLD, &myid);
	MPI_Status status;
	
	FILE * pFile;
	char FILENAME[50];
	sprintf(FILENAME,"%dLessComms%d,%d,%d,%d,%d,%d,%d.txt",iseed,clonemethod,numproc,logt,dim[0],lognc,sLL,dtprint);
	pFile = fopen(FILENAME,"w");
        double repcount;
	for(repcount=0;repcount<repeats;repcount++)
	{
	double start = MPI_Wtime();       
	
	int iter;
	/*Initialise Variables*/
	int i, k; //For loop indices
	double j;
	double r; //Random number for picking lattices
	int c; //Holds indices of lattices to be cloned

	/*Generate seeds for each system's internal twister*/
	int *seeds; //Seeds for lattices
	seeds = new int[nc]();
	mt19937 twister(iseed); //Twister for generating seeds
	std::uniform_int_distribution<int> dist(0, pow(10, 7)); //Distribution for twister seeds
	for (i = 0; i < nc; i++) //Initialise seeds to be in order
	{
		seeds[i] = dist(twister);
	}

	///***Variables for cloning process***///
	double *upsilon;
	upsilon = new double[nc](); //Holds cloning/pruning rates on lattices local to this processor
	double *globalups;
	globalups = new double[nc](); //Holds all cloning/pruning rates

	int *clones_local;
	clones_local = new int[nc](); //Holds indices of replacement clones to replace clones local to processor
	int* clones_global;
	clones_global = new int[nc](); //Holds indices of all replacement clones
	int* procs_local;
	procs_local = new int[nc](); //Holds processor of replacement clones to replacement clones local to processor
	int* procs_global;
	procs_global = new int[nc](); //Holds processors of all replacement clones

	int* proc_comm_local;
	proc_comm_local = new int[numproc](); //Holds how many systems are received
	int* proc_comm;
	proc_comm = new int[numproc*numproc]();

	int* index; //Holds local index of each lattice
	index = new int[nc]();

	for(i=0; i<nc; i++)
	{
	  index[i] = int(floor(double(i)) / double(numproc));
	}

	int cpp = int(ceil(double(nc)/double(numproc))); //Size of each array of systems
	int ctp = cpp; //Clones stored on this processor
	if((cpp-1)*numproc + myid >= nc)
	{
	  ctp = ctp-1; //Some processors have one less system than others
	}
	int copycount; //Number of systems to be copied internally

	//Note: Two sets of clones prevent over-writing during cloning
	int arrswitch = 0; //Switch between two sets of lattices so one one not overwritten during cloning
	AnyClass *systems[2 * cpp]; //Vector that holds two sets of lattices
	SystemInfo *infos[2 * cpp]; //Vector that holds information about observables
	mt19937 *twisters[nc];
	buffs *buffers[numproc];

	for(k=0; k<numproc; k++)
	{
	  buffers[k] = new buffs(0,0);
	}

	double totups = 0; //Holds total of cloning/pruning factors
	double X = 1; //Holds product of cloning factors
	double lnX = 0; //Log value easier to store

	/*'Key' is ued in the binary search*/
	double *key; //Holds cumulative upsilon values to be used as a key in a binary search
	key = new double[nc + 1]();
	key[0] = 0; //First element of key is always 0

	///***Generate Systems, Info Classes and Random Number Generators***///
	for (i = 0; i < cpp; i++)
	{
	        k = i*numproc + myid;
		if (k<nc)
		{
       		  twisters[i] = new mt19937(seeds[k]);
       		  systems[i] = new AnyClass(dim); //Fill vector a with newly constructed lattices
       		  systems[i + cpp] = new AnyClass(dim); //Fill vector b with newly constructed lattices
       		  infos[i] = new SystemInfo(dim, s, obs, proc, rates, twisters[i]); //Fill vector a with newly constucted classes that hold information about processes
       		  infos[i + cpp] = new SystemInfo(dim, s, obs, proc, rates, twisters[i]); //Fill vector b with info classes
		}
    	}
	PackFunc Pack(dim);

       	///***Initialise each lattice***///
	for (k = 0; k < nc; k++)
	{
	  i = k*numproc + myid;
	  if(i<nc)
	  {
	        infos[k]->init(systems[k]);
		//Note: Only systems in set A are initialised. Systems in set B carry on process from systems in set A after cloning
	  }
	}

	int size = infos[0]->size;
	char *buff = new char[size];
	char *recbuff = new char[size];
	int position;

	int *tcount = new int[numproc];

	vector<vector<int>> sendcomms(numproc);
	vector<vector<int>> recvcomms(numproc);
	vector<int> copycomms;

	int rsize, msize;
	int recvsys, sendsys;

        for (j = 0; j < tmax / dt; j++) //At each time step
	{
	        totups = 0; //Reset the sum of upsilons to 0
		for (k = 0; k < cpp; k++) //On each system
		{
		  i = k*numproc + myid;
		  if(i<nc)
		  {
		    (infos[(arrswitch*cpp)+k]->*(infos[(arrswitch*cpp)+k]->dynamics))(dt, systems[(arrswitch*cpp)+k]); //Run Dynamics for one cloning interval
		    (infos[(arrswitch*cpp)+k]->*(infos[(arrswitch*cpp)+k]->calcups))(&upsilon[i]); //Calculate Cloning Factor Upsilon Y
		  }
	    	}

		///***Generate Key for Binary Search***///
		MPI_Allreduce(upsilon, globalups, nc, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); //Collect all upsilon values

		        for (i = 0; i < nc; i++)
			{
			  key[i + 1] = key[i] + globalups[i]; //Generate key for selecting clones
			}
			totups = key[nc]; //Extract total of upsilons
			X = double(totups) / double(nc); //Calculate cloning factor and store as log to avoid infintely large values
			lnX = lnX + log(X);

		///***Decide which lattices to clone***///
		selectclones(clones_local, procs_local, proc_comm_local, key, nc, numproc, cpp, totups, infos, myid, arrswitch);
		
		///***Communicate Cloning Process***///
		MPI_Allreduce(clones_local, clones_global, nc, MPI_INT, MPI_MAX, MPI_COMM_WORLD); //Communicate to all processors which systems are being cloned to where
		MPI_Allreduce(procs_local, procs_global, nc, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
		MPI_Allgather(proc_comm_local, numproc, MPI_INT, proc_comm, numproc, MPI_INT, MPI_COMM_WORLD);

		for(i=0; i<numproc; i++)
		{
		  tcount[i] = (proc_comm[i*numproc+myid])-(proc_comm[myid*numproc+i]); //Reduce the number of messages sent between each pair of processors
		}

		copycount = ctp; //Initially assume every processor is copied internally

		for(i=0; i<numproc; i++)
		{
                  if(tcount[i]>0 && i!=myid)
		  {
		    sendcomms[i].reserve(tcount[i]); //If sending systems to processor i reserve space in communicator
		  }
		  if(tcount[i]<0 && i!=myid)
		  {
		    recvcomms[i].reserve(abs(tcount[i])); //If receiving systems from processor i reserve space in communicator
		    copycount += tcount[i]; //and reduce the number of systems cloned internally
		  }
  		}
		sendcomms[myid].reserve(copycount);
		recvcomms[myid].reserve(copycount);
		tcount[myid]=copycount;

		k=0; //Start at processor 0
		for(i=0; i<nc; i++) //Fill send communicator
		{
		  if(procs_global[i]==myid) //If this processor was sending a system to index i under the simple communications
		  {
		    if(tcount[k]<1)
		    {
		      while(tcount[k]<1)
		      {
			k+=1; //Find a processor 'k' that I need to send a system to under the reduced communications
		      }
		    }
		    sendcomms[k].push_back(i); //Add to the communicator that the system will be sent to processor 'k' instead
		    tcount[k]-=1; //Record that we need to find one less system to send to processor 'k'
		  }
		}

		tcount[myid]=-1*copycount;
		
		k=0; //Start at processor 0
		for(i=0; i<nc; i++) //Fill receive communicator
		{
		  if(int(i%numproc)==myid) //If this processor holds system with index i
		  {
		    if(tcount[k]>-1)
		    {
		      while(tcount[k]>-1)
		      {
			k+=1; //Find a processor 'k' that I will receive a system from under the reduced communications
		      }
		    }
		    recvcomms[k].push_back(i); //Add to the communicator that the system will be received from processor 'k'
		    tcount[k]+=1; //Record that we need to find one less system to receive from processor 'k'
		  }
		}

		int iter;
				
		///***Packing***///
		for(i=0; i<numproc; i++)
		{
		  if(myid != i) //Nothing MPI sent to self
		  {
		    rsize = size * recvcomms[i].size();
		    msize = size * sendcomms[i].size();

		    delete buffers[i];
		    buffers[i] = new buffs(rsize,msize);

		    k = int(sendcomms[i].size());
		    position=0;

		    if(msize>0)
		    {
		      for(iter=0; iter<k; iter++)
		      {
			position=size*iter;
			sendsys=sendcomms[i][k-iter-1];
			Pack.pack(systems[arrswitch*cpp+index[clones_global[sendsys]]], buffers[i]->sendarr, &position, msize);
		      }
		    }
		  }
		}

		///***Sending and Receiving***///
		for(iter=1; iter<numproc; iter++)
		{
		  i = (myid+iter)%numproc; //Thread to send to
		  k = (myid+numproc-iter)%numproc; //Thread to receive from
		  rsize = size*sendcomms[i].size(); //Size of receive array
		  msize = size*recvcomms[k].size(); //Size of send array

		  if(k < myid) //If receiving before sending
		  {
		    MPI_Recv(buffers[k]->recvarr, msize, MPI_PACKED, k, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); //Receive Systems
		    MPI_Send(buffers[i]->sendarr, rsize, MPI_PACKED, i, 0, MPI_COMM_WORLD); //Then Send
		  }
		  else
		  {
		    MPI_Send(buffers[i]->sendarr, rsize, MPI_PACKED, i, 0, MPI_COMM_WORLD); //Send Systems
		    MPI_Recv(buffers[k]->recvarr, msize, MPI_PACKED, k, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); //Then receive
		  }
		}

		///***Unpacking***///
		for(i=0; i<numproc; i++)
		{
		  if(myid != i) //If receving from a different processor 
		  {
		    k = recvcomms[i].size();
		    for(iter=0; iter<k; iter++)
		    {
		      position=size*iter;
		      recvsys = recvcomms[i][k-iter-1];
		      Pack.unpack(systems[(1-arrswitch)*cpp + index[recvsys]], buffers[i]->recvarr, &position, k); 
		    }
		  }
		}

		k=recvcomms[myid].size();
		for(iter=0; iter<k; iter++)
		{
		  recvsys=recvcomms[myid][k-iter-1];
		  sendsys=sendcomms[myid][k-iter-1];
		  //Put cloned systems in the other set of systems
		  *systems[(1 - arrswitch)*cpp + index[recvsys]] = *systems[(arrswitch*cpp) + index[clones_global[sendsys]]]; //Copy System
		}

		for(i=0; i<numproc; i++)
		{
		  recvcomms[i].erase(recvcomms[i].begin(), recvcomms[i].end());
		  sendcomms[i].erase(sendcomms[i].begin(), sendcomms[i].end());
		  proc_comm_local[i]=0;
		}

		arrswitch = 1 - arrswitch; //Set the other vector of lattices to be used in next time loop		 
	}

	/*Calculate psi for this value of s*/
	double psi;
	psi = double(lnX) / double(j*dt);

	if(myid==0)
	{
	  fprintf(pFile,"\n%f", psi);
	}

	/*Delete Arrays*/
	delete upsilon;
	delete key;
	delete seeds;

	for(i=0;i<nc;i++)
	{
	  //delete infos[i];
	  //delete infos[nc+i];
	  //delete systems[i];
	  //delete systems[nc+i];
	  //delete twisters[i];
	}

	iseed+=1;

	if(myid==0)
	{
	  double end = MPI_Wtime();
	  fprintf(pFile, "\n%f", end-start);
	}
        }
}

/**
 * Program entry point.
 *
 * Command line: T nc L dt s seed repeats clonemethod
 *   T           - total simulated time (integer)
 *   nc          - number of clones
 *   L           - number of sites; the particle number is L/2
 *   dt          - cloning interval
 *   s           - divided by L*L before being passed to cloning()
 *   seed        - initial seed (converted to int)
 *   repeats     - number of repeats
 *   clonemethod - clone-selection method handed to cloning()
 *
 * Returns 0 on success and 1 when the arguments are missing or invalid.
 */
int main(int argc, char** argv)
{
	// Initialise MPI before touching argv so the MPI library gets the chance
	// to process the command line first.
	MPI_Init(&argc, &argv);
	int myid;
	MPI_Comm_rank(MPI_COMM_WORLD, &myid);

	if (argc < 9)
	{
		if (myid == 0)
		{
			fprintf(stderr, "Usage: %s T nc L dt s seed repeats clonemethod\n", (argc > 0 ? argv[0] : "cloning"));
		}
		MPI_Finalize();
		return 1;
	}

	const int T = atoi(argv[1]);
	const int nc = atoi(argv[2]);
	const int L = atoi(argv[3]);
	const double dt = atof(argv[4]);

	// L == 0 would divide by zero below and dt <= 0 would never finish the
	// time loop, so reject them here on every rank.
	if (nc <= 0 || L <= 0 || dt <= 0)
	{
		if (myid == 0)
		{
			fprintf(stderr, "Invalid arguments: nc, L and dt must all be positive\n");
		}
		MPI_Finalize();
		return 1;
	}

	std::vector<int> dim;
	dim.push_back(L);
	const int N = 0.5*L; //Half filling, truncated towards zero
	dim.push_back(N);

	const double s = double(atof(argv[5])) / double(L*L);
	std::vector<double> rates;
	rates.push_back(1);
	rates.push_back(1);
	const int seed = static_cast<int>(atof(argv[6])); //cloning() takes an int seed
	const double repeats = atof(argv[7]);
	const int clonemethod = atoi(argv[8]);

	if (myid == 0)
	{
		printf("Less Comms: %d units of time, %d clones %d sites %d particles %f repeats, s = %f and Cloning Interval of %f units of time", T, nc, L, N, repeats, s, dt);
	}
	cloning<lattice, latticeinfo, latticepack>(T, nc, dim, dt, s, 'h', 'A', rates, seed, repeats, clonemethod);
	MPI_Finalize();
	return 0;
}
