#include #include #include #include #include #include #include #include #include #include #include #include #include #undef min #undef max const double SteppingMul = 0.1; const size_t SamplesCount = 8 ; //vice uz neda pamet GPU void SquareRange(double* firstIndex, double* lastIndex, double stepsince, double stepping) restrict(cpu, amp) { for (auto iter = firstIndex; iter != lastIndex; iter++) { *iter = concurrency::precise_math::sqrt(1.0 - stepsince*stepsince); stepsince += stepping; } } double IntegrateRange(double* secondIndex, double* lastIndex, double stepping) restrict(cpu, amp) { double sum = 0.0; for (auto iter = secondIndex; iter != lastIndex; iter++) { //Riemannuv integral double a = *(iter - 1); double b = *iter; double nh = a; //nepouzijme ternarni operator, protoze se < a > nestrida nahodne if (b < a) nh = b; //resp. procesor se pro nas problem nauci, ze toto se nikdy neprovede sum += stepping*(nh + 0.5*concurrency::precise_math::fabs(a - b)); } return sum; } class CTbbSquare { std::vector &mAxis; double mStepping; public: CTbbSquare(std::vector &axis, double stepping) : mAxis(axis), mStepping(stepping) {}; void operator() (const tbb::blocked_range &r) const { SquareRange(mAxis.data() + r.begin(), mAxis.data() + r.end(), ((double)r.begin())*mStepping, mStepping); } }; class CTbbIntegrate { std::vector &mAxis; double mStepping; public: double mArea; CTbbIntegrate(std::vector &axis, double stepping) : mAxis(axis), mStepping(stepping), mArea(0.0) {}; CTbbIntegrate(const CTbbIntegrate &other, tbb::split) : mAxis(other.mAxis), mStepping(other.mStepping), mArea(0.0) {}; void operator() (const tbb::blocked_range &r) { mArea += IntegrateRange(mAxis.data() + r.begin(), mAxis.data() + r.end(), mStepping); } void join(const CTbbIntegrate &other) { mArea += other.mArea; } }; int main() { concurrency::accelerator defdevice; std::wcout << "Zarizeni: " << defdevice.get_description() << std::endl; if (defdevice == concurrency::accelerator(concurrency::accelerator::direct3d_ref)) std::cout << "Pozor: GPGPU vypocty pobezi na (pomalem) emulatoru, ktery je urceny pro ladeni." << std::endl; std::vector XAxis; double Stepping = 0.1; std::cout << "Iter.\tSerial area\tTBB area\tGPGPU area\tTBB save GPGPU save" << std::endl; for (size_t i = 0; i != SamplesCount; i++) { XAxis.resize((size_t) ceil(1.0 / Stepping)); auto clockstart = std::chrono::system_clock::now(); SquareRange(XAxis.data(), XAxis.data()+XAxis.size(), 0, Stepping); double SerialArea = IntegrateRange(XAxis.data()+1, XAxis.data() + XAxis.size(), Stepping); auto clockstop = std::chrono::system_clock::now(); auto SerialLasted = std::chrono::duration_cast(clockstop - clockstart).count(); clockstart = std::chrono::system_clock::now(); tbb::parallel_for(tbb::blocked_range(0, (int) XAxis.size()), CTbbSquare(XAxis, Stepping)); CTbbIntegrate reducer(XAxis, Stepping); tbb::parallel_reduce(tbb::blocked_range(1, (int) XAxis.size()), reducer); double TBBArea = reducer.mArea; clockstop = std::chrono::system_clock::now(); auto TBBLasted = std::chrono::duration_cast(clockstop - clockstart).count(); clockstart = std::chrono::system_clock::now(); concurrency::array_view XAxisView((int) XAxis.size(), XAxis.data()); concurrency::parallel_for_each(XAxisView.extent, [=](concurrency::index<1> idx) restrict(amp) { SquareRange(XAxisView.data() + idx[0], XAxisView.data() + idx[0] + 1, 0, Stepping); //inlinuje se na toto: //double step = idx[0] * Stepping; //XAxisView[idx] = concurrency::precise_math::sqrt(1.0 - step*step); }); XAxisView.synchronize(); reducer.mArea = 0.0; tbb::parallel_reduce(tbb::blocked_range(1, (int)XAxis.size()), reducer); //jak to udělat efektivně na GPGPU? Viz přednáška o výpočtu součtů prefixů. double GPGPUArea = reducer.mArea; clockstop = std::chrono::system_clock::now(); auto GPGPULasted = std::chrono::duration_cast(clockstop - clockstart).count(); //print results std::cout << i+1 << "\t"; std::cout << std::fixed << std::setprecision(12); std::cout << 4.0*SerialArea << "\t"; //*4.0 protoze jsme spocitali ctvrinu obsahu kruhu polomeru 1 std::cout << 4.0*TBBArea << "\t"; //=> ma se nam vypocitat Pi std::cout << 4.0*GPGPUArea << "\t"; //ovsem pro SamplesCount = 8 jenom na 7 desetinnych mist std::cout << std::fixed; std::cout << (SerialLasted - TBBLasted) << "\t"; std::cout << (SerialLasted - GPGPULasted); //bude vychazet horsi nez jenom s TBB, protoze vypocetne je problem prilis malo narocny std::cout << std::endl; Stepping *= SteppingMul; } std::cout << std::endl << "Pi: 3.141592653589 3.141592653589 3.141592653589" << std::endl; //Muzou se vysledky lisit? Ano, protoze cisla v plovouci carce maji jenom omezeny pocet mist //a tudiz aritmeticke operace na FPU nejsou vzdy asociativni, a s paralelnim zpracovanim //se tim padem mohou odehrat v jinem poradi nez u serioveho kodu. return 0; } /* Mozny vystup: Zarizeni: Intel(R) HD Graphics 4600 Iter. Serial area TBB area GPGPU area TBB save GPGPU save 1 3.017340347378 3.017340347378 3.017340347378 -31251 -140583 2 3.137595684583 3.137595684583 3.137595684583 0 0 3 3.141466046555 3.141466046555 3.141466046555 0 0 4 3.141588649255 3.141588649255 3.141588649255 0 0 5 3.141592526964 3.141592526960 3.141592526960 0 0 6 3.141592649574 3.141592649585 3.141592649585 15625 15625 7 3.141592654059 3.141592653464 3.141592653464 31266 15609 8 3.141592648261 3.141592653577 3.141592653577 359391 203127 Pi: 3.141592653589 3.141592653589 3.141592653589 */