#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>

#include <amp.h>
#include <amp_math.h>

#include <algorithm>
#include <vector>
#include <iostream>
#include <iomanip>
#include <cmath>
#include <chrono>
#include <ctime>
#include <array>
#include <numeric>


#undef min
#undef max

const double SteppingMul = 0.1;
const size_t SamplesCount = 8 ;	//vice uz neda pamet GPU

void SquareRange(double* firstIndex, double* lastIndex, double stepsince, double stepping) restrict(cpu, amp) {	
	for (auto iter = firstIndex; iter != lastIndex; iter++) {
		*iter = concurrency::precise_math::sqrt(1.0 - stepsince*stepsince);
		stepsince += stepping;
	}
}

double IntegrateRange(double* secondIndex, double* lastIndex, double stepping) restrict(cpu, amp) {
	double sum = 0.0;
	for (auto iter = secondIndex; iter != lastIndex; iter++) {
			//Riemannuv integral
		double a = *(iter - 1);
		double b = *iter;
		double nh = a;				//nepouzijme ternarni operator, protoze se < a > nestrida nahodne
		if (b < a) nh = b;			//resp. procesor se pro nas problem nauci, ze toto se nikdy neprovede

		sum += stepping*(nh + 0.5*concurrency::precise_math::fabs(a - b));
	}

	return sum;
}


class CTbbSquare {
	std::vector<double> &mAxis;
	double mStepping;
public:
	CTbbSquare(std::vector<double> &axis, double stepping) : mAxis(axis), mStepping(stepping) {};
	void operator() (const tbb::blocked_range<int> &r) const {		
		SquareRange(mAxis.data() + r.begin(), mAxis.data() + r.end(), ((double)r.begin())*mStepping, mStepping);		
	}
};

class CTbbIntegrate {
	std::vector<double> &mAxis;
	double mStepping;
public:
	double mArea;

	CTbbIntegrate(std::vector<double> &axis, double stepping) : mAxis(axis), mStepping(stepping),  mArea(0.0) {};
	CTbbIntegrate(const CTbbIntegrate &other, tbb::split) : mAxis(other.mAxis), mStepping(other.mStepping), mArea(0.0) {};


	void operator() (const tbb::blocked_range<int> &r) {

		mArea += IntegrateRange(mAxis.data() + r.begin(), mAxis.data() + r.end(), mStepping);
	}

	void join(const CTbbIntegrate &other) {
		mArea += other.mArea;
	}

};



int main() {

	concurrency::accelerator defdevice;
	std::wcout << "Zarizeni: " << defdevice.get_description() << std::endl;
	if (defdevice == concurrency::accelerator(concurrency::accelerator::direct3d_ref))
		std::cout << "Pozor: GPGPU vypocty pobezi na (pomalem) emulatoru, ktery je urceny pro ladeni." << std::endl;

	std::vector<double> XAxis;
	double Stepping = 0.1;


	std::cout << "Iter.\tSerial area\tTBB area\tGPGPU area\tTBB save GPGPU save" << std::endl;	

	for (size_t i = 0; i != SamplesCount; i++) {				
		XAxis.resize((size_t) ceil(1.0 / Stepping));
		

		auto clockstart = std::chrono::system_clock::now();
		SquareRange(XAxis.data(), XAxis.data()+XAxis.size(), 0, Stepping);
		double SerialArea = IntegrateRange(XAxis.data()+1, XAxis.data() + XAxis.size(), Stepping);
		auto clockstop = std::chrono::system_clock::now();		
		auto SerialLasted = std::chrono::duration_cast<std::chrono::microseconds>(clockstop - clockstart).count();


		clockstart = std::chrono::system_clock::now();		
		tbb::parallel_for(tbb::blocked_range<int>(0, (int) XAxis.size()), CTbbSquare(XAxis, Stepping));
		CTbbIntegrate reducer(XAxis, Stepping);
		tbb::parallel_reduce(tbb::blocked_range<int>(1, (int) XAxis.size()), reducer);
		double TBBArea = reducer.mArea;
		clockstop = std::chrono::system_clock::now();
		auto TBBLasted = std::chrono::duration_cast<std::chrono::microseconds>(clockstop - clockstart).count();


		clockstart = std::chrono::system_clock::now();
		concurrency::array_view<double, 1> XAxisView((int) XAxis.size(), XAxis.data());		

		concurrency::parallel_for_each(XAxisView.extent, [=](concurrency::index<1> idx) restrict(amp) {
			SquareRange(XAxisView.data() + idx[0], XAxisView.data() + idx[0] + 1, 0, Stepping);
						//inlinuje se na toto:
							//double step = idx[0] * Stepping;
							//XAxisView[idx] = concurrency::precise_math::sqrt(1.0 - step*step);
		});

		XAxisView.synchronize();
		reducer.mArea = 0.0;
		tbb::parallel_reduce(tbb::blocked_range<int>(1, (int)XAxis.size()), reducer);
			//jak to udělat efektivně na GPGPU? Viz přednáška o výpočtu součtů prefixů.
		double GPGPUArea = reducer.mArea;

		clockstop = std::chrono::system_clock::now();
		auto GPGPULasted = std::chrono::duration_cast<std::chrono::microseconds>(clockstop - clockstart).count();

		//print results
		std::cout << i+1 << "\t";
		std::cout << std::fixed << std::setprecision(12);
		std::cout << 4.0*SerialArea << "\t";		//*4.0 protoze jsme spocitali ctvrinu obsahu kruhu polomeru 1
		std::cout << 4.0*TBBArea << "\t";			//=> ma se nam vypocitat Pi
		std::cout << 4.0*GPGPUArea << "\t";			//ovsem pro SamplesCount = 8 jenom na 7 desetinnych mist
		std::cout << std::fixed;
		std::cout << (SerialLasted - TBBLasted) << "\t";
		std::cout << (SerialLasted - GPGPULasted);	//bude vychazet horsi nez jenom s TBB, protoze vypocetne je problem prilis malo narocny


		std::cout << std::endl;

		Stepping *= SteppingMul;
	}

	std::cout << std::endl << "Pi:     3.141592653589  3.141592653589  3.141592653589" << std::endl;
			//Muzou se vysledky lisit? Ano, protoze cisla v plovouci carce maji jenom omezeny pocet mist
			//a tudiz aritmeticke operace na FPU nejsou vzdy asociativni, a s paralelnim zpracovanim
			//se tim padem mohou odehrat v jinem poradi nez u serioveho kodu.

	return 0;
}

/*

Mozny vystup:

Zarizeni: Intel(R) HD Graphics 4600
Iter.   Serial area     TBB area        GPGPU area      TBB save GPGPU save
1       3.017340347378  3.017340347378  3.017340347378  -31251  -140583
2       3.137595684583  3.137595684583  3.137595684583  0       0
3       3.141466046555  3.141466046555  3.141466046555  0       0
4       3.141588649255  3.141588649255  3.141588649255  0       0
5       3.141592526964  3.141592526960  3.141592526960  0       0
6       3.141592649574  3.141592649585  3.141592649585  15625   15625
7       3.141592654059  3.141592653464  3.141592653464  31266   15609
8       3.141592648261  3.141592653577  3.141592653577  359391  203127

Pi:     3.141592653589  3.141592653589  3.141592653589

*/