#include <mpi.h> int MPI_Init_thread(int *argc, char ***argv, int required, int *provided)
argc C/C++ only: Pointer to the number of arguments. argv C/C++ only: Argument vector. required Desired level of thread support (integer). provided Available level of thread support (integer).
required 的可选值有四个,取值分别是 0、1、2、3,含义如下:
1 2 3 4 5 6 7 8
MPI_THREAD_SINGLE Only one thread will execute. MPI_THREAD_FUNNELED If the process is multithreaded, only the thread that called MPI_Init_thread will make MPI calls. MPI_THREAD_SERIALIZED If the process is multithreaded, only one thread will make MPI library calls at one time. MPI_THREAD_MULTIPLE If the process is multithreaded, multiple threads may call MPI at once with no restrictions.
Time taken is 21.364595 6.066717 EnforceLabelConnectivityComputing
时间细划,初始化omp
细分之后发现:malloc size 大小的空间并不耗时,耗时的是把它初始化为 -1。
1 2 3 4
Time taken is 16.963094 0.000025 EnforceLabelConnectivity numlable Time taken is 17.395982 0.432887 EnforceLabelConnectivity xvec yvec Time taken is 22.668060 5.272079 EnforceLabelConnectivity iteration Time taken is 23.001499 0.333438 EnforceLabelConnectivity klabelsComputing
修改后
1 2 3 4
Time taken is 16.063057 0.000026 EnforceLabelConnectivity numlable Time taken is 16.095485 0.032428 EnforceLabelConnectivity xvec yvec Time taken is 21.116599 5.021114 EnforceLabelConnectivity iteration Time taken is 21.237874 0.121275 EnforceLabelConnectivity klabelsComputing
Time taken is 16.144375 13.062605 PerformSuperpixelSegmentation_VariableSandM 循环 Time taken is 16.144399 0.000025 EnforceLabelConnectivity numlable Time taken is 16.177300 0.032901 EnforceLabelConnectivity xvec yvec Time taken is 48.978709 32.801409 EnforceLabelConnectivity iteration Time taken is 49.086252 0.107543 EnforceLabelConnectivity klabelsComputing time=49086 ms There are 86475718 points' labels are different from original file.
不知道哪里错了,需要debug。简单debug,发现小问题。
1 2 3 4 5
Time taken is 15.670141 0.000024 EnforceLabelConnectivity numlable Time taken is 15.718014 0.047873 EnforceLabelConnectivity xvec yvec Time taken is 22.103680 6.385666 EnforceLabelConnectivity iteration Time taken is 22.219160 0.115480 EnforceLabelConnectivity klabelsComputing time=22219 ms There are 0 points' labels are different from original file.
但是尴尬的是并没有快。哭哭哭~~~~。
优化一下变量,快了3秒,大胜利!!!
1 2 3 4 5
Time taken is 16.203514 0.000029 EnforceLabelConnectivity numlable Time taken is 16.234977 0.031463 EnforceLabelConnectivity xvec yvec Time taken is 18.428990 2.194013 EnforceLabelConnectivity iteration Time taken is 18.527664 0.098674 EnforceLabelConnectivity klabelsComputing time=18527 ms There are 0 points' labels are different from original file.
Time taken is 16.226124 0.000024 EnforceLabelConnectivity numlable Time taken is 16.258697 0.032573 EnforceLabelConnectivity xvec yvec Time taken is 26.320222 10.061525 EnforceLabelConnectivity iteration Time taken is 26.401399 0.081177 EnforceLabelConnectivity klabelsComputing time=26401 ms There are 86588716 points' labels are different from original file.
Time taken is 15.743455 0.000025 EnforceLabelConnectivity numlable Time taken is 15.773654 0.030198 EnforceLabelConnectivity xvec yvec Time taken is 26.348979 10.575326 EnforceLabelConnectivity iteration Time taken is 26.442129 0.093150 EnforceLabelConnectivity klabelsComputing time=26442 ms There are 0 points' labels are different from original file.
现在的想法是要保证先后顺序:先把 (x,y) 这一行都处理完,再发射 task;或者采取延迟发射的方式。
延迟发射
把待发射的任务 (x+delay,y) 用队列存储,每次循环 check 一下;循环结束后,再全部发射。
或者标记(x+delay,y)发射(x,y)。但是对于循环结束后的,不好处理。
1 2 3 4 5
Time taken is 17.344073 0.000027 EnforceLabelConnectivity numlable Time taken is 17.377535 0.033462 EnforceLabelConnectivity xvec yvec Time taken is 28.461901 11.084366 EnforceLabelConnectivity iteration Time taken is 28.544698 0.082797 EnforceLabelConnectivity klabelsComputing time=28544 ms There are 86588716 points' labels are different from original file.
Time taken is 15.538704 0.000026 EnforceLabelConnectivity numlable Time taken is 15.577671 0.038968 EnforceLabelConnectivity xvec yvec Time taken is 28.233859 12.656188 EnforceLabelConnectivity iteration Time taken is 28.332256 0.098396 EnforceLabelConnectivity klabelsComputing time=28332 ms
delay = 20 快了一点,哭
1 2 3 4 5
Time taken is 15.631368 0.000025 EnforceLabelConnectivity numlable Time taken is 15.661496 0.030128 EnforceLabelConnectivity xvec yvec Time taken is 26.788105 11.126609 EnforceLabelConnectivity iteration Time taken is 26.869487 0.081382 EnforceLabelConnectivity klabelsComputing time=26869 ms There are 0 points' labels are different from original file.
逆向优化分析
打上时间戳
1 2 3 4
end Time 84 32839 taken is 0.000000 dxy4 end Time 84 32839 taken is 0.000000 threadcount end Time 84 32839 taken is 0.031285 core end Time 84 32839 taken is 0.000023 count
Time taken is 36.212269 32.876626 PerformSuperpixelSegmentation_VariableSandM 循环 Time taken is 36.212297 0.000028 EnforceLabelConnectivity numlable Time taken is 36.247536 0.035239 EnforceLabelConnectivity xvec yvec Time taken is 106.097341 69.849805 EnforceLabelConnectivity iteration Time taken is 106.204154 0.106813 EnforceLabelConnectivity klabelsComputing time=106204 ms There are 0 points' labels are different from original file.
感觉原因是一开始只有 1 个任务,之后每个任务一般也就再产生 1~2 个任务。把初始任务数改成 64 个就行。
但是如何一开始启动64个呢,我又提前不知道任务。
常驻64线程
写完又是segFault,debug
[64][64][10000] 太大了,每次的队列应该没有这么多元素,改成 [64][64][100]。
对于结束条件的统计需要同步一下,要加 critical。这样结果就对了,但是这也太慢了。
1 2 3 4 5
Time taken is 28.219408 0.000017 EnforceLabelConnectivity numlable Time taken is 28.271994 0.052586 EnforceLabelConnectivity xvec yvec Time taken is 83.591540 55.319546 EnforceLabelConnectivity iteration Time taken is 83.696990 0.105450 EnforceLabelConnectivity klabelsComputing time=83696 ms There are 0 points' labels are different from original file.
int count(1); #pragma omp parallel num_threads(64) { #pragma omp single { int c = 0; while(c < count) { for( ; c < count; c++ ) { #pragma omp task{ for( int n = 0; n < 4; n++ ) { int x = xvec[c] + dx4[n]; int y = yvec[c] + dy4[n];
if( (x >= 0 && x < width) && (y >= 0 && y < height) ) { int nindex = y*width + x;
double result = 0; #pragma omp parallel num_threads(ndata) { double local_result; int num = omp_get_thread_num(); if (num==0) local_result = f(x); elseif (num==1) local_result = g(x); elseif (num==2) local_result = h(x); #pragma omp critical result += local_result; }
double result = 0; #pragma omp parallel { double local_result; #pragma omp for for (i=0; i<N; i++) { local_result = f(x,i); #pragma omp critical result += local_result; } // end of for loop }
/*
 * Combiner for the user-defined reduction below.
 *
 * r: the already reduced value
 * n: the new value
 * Returns the larger of the two.
 */
int mymax(int r, int n) {
  int m;
  if (n > r) {
    m = n;
  } else {
    m = r;
  }
  return m;
}

/* User-defined reduction `rwz` on int: partial results are combined with
   mymax, and each private copy starts at INT_MIN. */
#pragma omp declare reduction \
  (rwz:int:omp_out=mymax(omp_out,omp_in)) \
  initializer(omp_priv=INT_MIN)

/*
 * Return the maximum of data[0 .. ndata-1] using the `rwz` reduction.
 * Returns INT_MIN when ndata <= 0, since the loop body never runs.
 */
int reduce_max(const int *data, int ndata) {
  int m = INT_MIN;
#pragma omp parallel for reduction(rwz:m)
  for (int idata = 0; idata < ndata; idata++)
    m = mymax(m, data[idata]);
  return m;
}
cd <compdir>
tar -xvf aocc-compiler-<ver>.tar
cd aocc-compiler-<ver>
bash install.sh
# It will install the compiler and display the AOCC setup instructions.
source <compdir>/setenv_AOCC.sh # This will setup the shell environment for using AOCC C, C++, and Fortran compiler where the command is executed.