etc:common_activities:intel_students_cup:tour2
Differences
This shows you the differences between two versions of the page.
| Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
| etc:common_activities:intel_students_cup:tour2 [2007/10/31 23:08] – jcmvbkbc | etc:common_activities:intel_students_cup:tour2 [2008/01/03 02:32] (current) – external edit 127.0.0.1 | ||
|---|---|---|---|
| Line 1: | Line 1: | ||
| ====== Исследование вопроса ====== | ====== Исследование вопроса ====== | ||
| - | пока я выложу свои | + | |
| + | * [[tour2_tasks|Текущие задачи]] | ||
| + | * [[algo_details|Детали алгоритма]] | ||
| ===== План ===== | ===== План ===== | ||
| Line 150: | Line 152: | ||
| | | ||
| </ | </ | ||
| + | |||
| ==== Менее очевидные шаги ==== | ==== Менее очевидные шаги ==== | ||
| + | === Использование Math Kernel Library === | ||
| + | [[http:// | ||
| + | * библиотека в основном хорошо реализует матричные операции; | ||
| + | Для исследования Intel Math Kernel Library написал следующее: | ||
| + | |||
| + | Код " | ||
| + | <code cpp> | ||
| + | #include < | ||
| + | #include < | ||
| + | |||
| + | int main() | ||
| + | { | ||
| + | double X[32][32]; | ||
| + | double F[32][32]; | ||
| + | |||
| + | for(int i=0; i<32; ++i) | ||
| + | for(int j=0; j<32; ++j) | ||
| + | X[i][j] = rand()%1024; | ||
| + | |||
| + | for(int x=0; | ||
| + | for(int y=0; | ||
| + | for(int i=0; | ||
| + | for(int j=0; | ||
| + | | ||
| + | return 0; | ||
| + | } | ||
| + | </ | ||
| + | |||
| + | Код программы, | ||
| + | В нем применил функцию, | ||
| + | <code cpp> | ||
| + | #include " | ||
| + | #include < | ||
| + | |||
| + | int main() | ||
| + | { | ||
| + | double X[32][32]; | ||
| + | double F[32][32]; | ||
| + | |||
| + | for(int i=0; i<32; ++i) | ||
| + | for(int j=0; j<32; ++j) | ||
| + | X[i][j] = rand()%1024; | ||
| + | |||
| + | for(int x=0; | ||
| + | for(int y=0; | ||
| + | vdSin(32*32, | ||
| + | return 0; | ||
| + | } | ||
| + | </ | ||
| + | |||
| + | MakeFile: | ||
| + | < | ||
| + | default: stupid fast | ||
| + | |||
| + | main.o: main.cpp | ||
| + | g++ main.cpp -c -o main.o | ||
| + | |||
| + | stupid: main.o | ||
| + | g++ main.o -o stupid | ||
| + | clean: | ||
| + | rm -f main.o stupid | ||
| + | fast: imkl_main.o | ||
| + | g++ -L/ | ||
| + | |||
| + | imkl_main.o: | ||
| + | g++ -I/ | ||
| + | </ | ||
| + | Третьим шагом было изменение в " | ||
| + | Результат запуска: | ||
| + | <code bash> | ||
| + | make && time ./stupid && time ./fast && time ./floatfast | ||
| + | |||
| + | real 0m41.288s | ||
| + | user 0m39.579s | ||
| + | sys | ||
| + | |||
| + | real 0m18.878s | ||
| + | user 0m18.158s | ||
| + | sys | ||
| + | |||
| + | real 0m7.799s | ||
| + | user 0m7.412s | ||
| + | sys | ||
| + | </ | ||
| + | |||
| + | Помимо всего прочего, MKL имеет реализацию __одновременного__ вычисления синуса и косинуса в одной функции | ||
| + | |||
| + | Очевидно, | ||
| + | |||
| + | В случае же с MKL всё интересней. Далее сравнительные времена выполнения двух программ с применением MKL: | ||
| + | - вызываются vsSin и vsCos | ||
| + | - вызывается vsSinCos | ||
| + | < | ||
| + | real 0m16.126s | ||
| + | user 0m15.261s | ||
| + | sys | ||
| + | |||
| + | real 0m13.289s | ||
| + | user 0m12.670s | ||
| + | sys | ||
| + | </ | ||
| + | |||
| + | === Использование Intel C Compiler === | ||
| + | [[http:// | ||
| + | * должен позволить автоматически использовать SIMD команды (SSE, SSE2...) для оптимизации вычислений в основном в циклах; | ||
| + | * возможно, подскажет, где что можно ещё распараллелить; | ||
| + | |||
| + | [[icc|Результаты]] | ||
| + | |||
| + | === Использование Integrated Performance Primitives === | ||
| + | [[http:// | ||
| + | * как заявляется, производительность растёт в том числе и за счёт оптимизации библиотеки под различные модели процессоров; | ||
| + | * можно попробовать использовать оттуда не только тригонометрию, но и функции работы с изображениями 2D; | ||
| + | P.S. Менеджер проекта этой библиотеки из Нижнего Новгорода, откуда и сам sunset :) | ||
| + | |||
| + | тестовая программка, | ||
| + | <code cpp> | ||
| + | #include < | ||
| + | #ifdef USE_IPP | ||
| + | #include < | ||
| + | #endif | ||
| + | #include < | ||
| + | #include < | ||
| + | |||
| + | #define COUNT(a) (sizeof(a)/ | ||
| + | |||
| + | float a[1024]; | ||
| + | float r1[1024]; | ||
| + | float r2[1024]; | ||
| + | |||
| + | int main() | ||
| + | { | ||
| + | for(size_t i=0; | ||
| + | a[i]=(drand48()-.5)*20; | ||
| + | |||
| + | #ifdef USE_IPP | ||
| + | for(int i=0; | ||
| + | ippsSin_32f_A21(a, | ||
| + | #else | ||
| + | for(int i=0; | ||
| + | for(size_t j=0; | ||
| + | r2[j]=sinf(a[j]); | ||
| + | #endif | ||
| + | |||
| + | #if 0 | ||
| + | double s=0; | ||
| + | for(size_t i=0; | ||
| + | { | ||
| + | s+=fabs(r1[i]-r2[i]); | ||
| + | } | ||
| + | printf(" | ||
| + | #endif | ||
| + | } | ||
| + | </ | ||
| + | < | ||
| + | default: stupid fast | ||
| + | |||
| + | stupid: main.o | ||
| + | g++ main.o -o stupid | ||
| + | clean: | ||
| + | rm -f main.o stupid | ||
| + | fast: ipp_main.o | ||
| + | g++ -L/ | ||
| + | |||
| + | ipp_main.o: main.cpp | ||
| + | g++ -I/ | ||
| + | </ | ||
| + | |||
| + | результаты: | ||
| + | < | ||
| + | $ time ./fast && time ./stupid | ||
| + | |||
| + | real 0m3.666s | ||
| + | user 0m3.661s | ||
| + | sys | ||
| + | |||
| + | real 0m37.244s | ||
| + | user 0m37.095s | ||
| + | sys | ||
| + | </ | ||
| + | итого -- ускорение в 10 раз. замена A21 на A11 дает | ||
| + | < | ||
| + | $ time ./fast && time ./stupid | ||
| + | |||
| + | real 0m2.975s | ||
| + | user 0m2.964s | ||
| + | sys | ||
| + | |||
| + | real 0m36.754s | ||
| + | user 0m36.675s | ||
| + | sys | ||
| + | </ | ||
| - | - ускорить счет синуса. например | + | Для тригонометрии |
| < | < | ||
| - | diff -ruN src-org/ | + | diff -ruN src-org/ |
| + | --- src-org/ | ||
| + | +++ src1/ | ||
| + | @@ -56,19 +56,19 @@ | ||
| + | | ||
| + | | ||
| + | | ||
| + | -CINC = -I$(SRC_DIR) | ||
| + | +CINC = -I$(SRC_DIR) -I/ | ||
| + | | ||
| + | | ||
| + | -CDEFOPT | ||
| + | +CDEFOPT | ||
| + | | ||
| + | -CFLAGS | ||
| + | +CFLAGS | ||
| + | | ||
| + | |||
| + | | ||
| + | | ||
| + | -LDFLAGS | ||
| + | +LDFLAGS | ||
| + | | ||
| + | -LIBS = -lm -lc | ||
| + | +LIBS = -lm -lc -lippcore -lippvm | ||
| + | | ||
| + | |||
| + | | ||
| + | diff -ruN src-org/ | ||
| --- src-org/ | --- src-org/ | ||
| - | +++ src/ | + | +++ src1/ |
| - | @@ -84,6 +84,27 @@ | + | @@ -45,6 +45,7 @@ |
| - | return a * (float)k; | + | #include < |
| - | } | + | #endif |
| + | # | ||
| + | +#include < | ||
| - | +# | + | # |
| - | +#define PI2 (2*3.141592653589793f) | + | #define MAX(x,y) (((x) < (y)) ? (y) : (x)) |
| - | +float g_sinTab[2*SIN_TAB_SZ+1]; | + | @@ -747,9 +748,10 @@ |
| - | + | + | |
| - | +void fillSinTab() | + | |
| - | +{ | + | |
| - | + for(int i=0; | + | |
| - | + { | + | |
| - | + g_sinTab[i]=sinf((i-SIN_TAB_SZ)*PI2/ | + | |
| - | + } | + | |
| - | +} | + | |
| - | + | + | |
| - | +inline float tab_sinf(float v) | + | |
| - | +{ | + | |
| - | + float i=fmodf(v,PI2)*SIN_TAB_SZ/ | + | |
| - | + size_t idx=i; | + | |
| - | + float d=i-idx; | + | |
| - | + | + | |
| - | + return g_sinTab[idx]*(1-d)+g_sinTab[idx+1]*d; | + | |
| - | +} | + | |
| - | + | + | |
| - | /* memory allocation function for two-dimensional arrays */ | + | |
| - | | + | |
| - | { | + | |
| - | @@ -320,6 +341,8 @@ | + | |
| - | / | + | |
| - | | + | |
| - | { | + | |
| - | + fillSinTab(); | + | |
| - | + | + | |
| - | | + | |
| - | | + | |
| - | @@ -749,7 +772,7 @@ | + | |
| - | #pragma ivdep | + | - |
| - | | + | - |
| - pFlTmp[t] = (float)sinf(pFlTmp[t]); | - pFlTmp[t] = (float)sinf(pFlTmp[t]); | ||
| - | + pFlTmp[t] = (float)tab_sinf(pFlTmp[t]); | + | + |
| + | + //#pragma ivdep | ||
| + | + //for(t=0; t< | ||
| + | + // | ||
| /* initialize the values of derivation */ | /* initialize the values of derivation */ | ||
| | | ||
| </ | </ | ||
| + | Результат -- 10.044/ | ||
| + | |||
| + | |||
| + | zps. | ||
| + | Максимальный результат: | ||
| + | diff -ruN src-org src-ipp | ||
| + | < | ||
| + | diff src-org/ | ||
| + | 59c59 | ||
| + | < CINC = -I$(SRC_DIR) | ||
| + | --- | ||
| + | > CINC = -I$(SRC_DIR) -I/ | ||
| + | 62c62,63 | ||
| + | < CDEFOPT | ||
| + | --- | ||
| + | > # | ||
| + | > CDEFOPT | ||
| + | 64c65 | ||
| + | < CFLAGS | ||
| + | --- | ||
| + | > CFLAGS | ||
| + | 69c70 | ||
| + | < LDFLAGS | ||
| + | --- | ||
| + | > LDFLAGS | ||
| + | 71c72 | ||
| + | < LIBS = -lm -lc | ||
| + | --- | ||
| + | > LIBS = -lm -lc -lippcore -lippvm -lguide -lipps -lippm | ||
| + | diff src-org/ | ||
| + | 48c48,51 | ||
| + | < | ||
| + | --- | ||
| + | > #include < | ||
| + | > #include < | ||
| + | > #include < | ||
| + | > #include < | ||
| + | 307c310,311 | ||
| + | < | ||
| + | --- | ||
| + | > | ||
| + | > | ||
| + | 339a344,346 | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | 357c364,366 | ||
| + | < | ||
| + | --- | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | 362a372 | ||
| + | > | ||
| + | 657a668,669 | ||
| + | > | ||
| + | > float xxx[iAngleHarmNum]; | ||
| + | 663c675 | ||
| + | < | ||
| + | --- | ||
| + | > | ||
| + | 679a692 | ||
| + | > | ||
| + | 732c745,749 | ||
| + | < */ | ||
| + | --- | ||
| + | > */// std::cerr << " | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | 735, | ||
| + | < | ||
| + | < KX1 = flK[t] * flDecartX[i][j]; | ||
| + | < KY1 = flK[t] * flDecartY[i][j]; | ||
| + | < | ||
| + | --- | ||
| + | > kx1 = KX1[t]; | ||
| + | > ky1 = KY1[t]; | ||
| + | > ot = OT[t]; | ||
| + | > int len = t * iAngleHarmNum; | ||
| + | 741, | ||
| + | < | ||
| + | < | ||
| + | < KX1 * flAzimuthCosFi[l] - KY1 * flAzimuthSinFi[l] + | ||
| + | < | ||
| + | < } /* end for l */ | ||
| + | --- | ||
| + | > | ||
| + | > | ||
| + | > kx1*flAzimuthCosFi[l] - ky1*flAzimuthSinFi[l] + flRandomPhase[iSinIndex1]; | ||
| + | > } /* end for l */ | ||
| + | 747c763,776 | ||
| + | < | ||
| + | --- | ||
| + | > #if 0 | ||
| + | > float * dest; | ||
| + | > for(t = 0; t < NKMAX; t++) | ||
| + | > | ||
| + | > dest = & | ||
| + | > kx1 = KX1[t]; | ||
| + | > ky1 = KY1[t]; | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | > } /* end for t */ | ||
| + | > dest = NULL; | ||
| + | > #endif | ||
| + | > // ippsAdd_32f_I(flRandomPhase, | ||
| + | 750, | ||
| + | < # | ||
| + | < | ||
| + | < | ||
| + | --- | ||
| + | > ippsSin_32f_A21(pFlTmp, | ||
| + | > | ||
| + | > //# | ||
| + | > // | ||
| + | > // | ||
| + | 758a790,801 | ||
| + | > #if 0 | ||
| + | > | ||
| + | > const float *pr[]={flAmplitudeX, | ||
| + | > float __r[2]; | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | > } | ||
| + | > #endif | ||
| + | > #if 1 | ||
| + | 767a811 | ||
| + | > #endif | ||
| + | Только в src-ipp: .sunset.cpp.swp | ||
| + | </ | ||
| + | |||
| + | результат: | ||
| + | <code bash> | ||
| + | Frame 1 of 16 ... frame time 8.534 | ||
| + | Frame 2 of 16 ... frame time 8.508 | ||
| + | Frame 3 of 16 ... frame time 8.167 | ||
| + | Frame 4 of 16 ... frame time 8.538 | ||
| + | Frame 5 of 16 ... frame time 8.467 | ||
| + | Frame 6 of 16 ... frame time 8.302 | ||
| + | Frame 7 of 16 ... frame time 8.565 | ||
| + | Frame 8 of 16 ... frame time 8.425 | ||
| + | Frame 9 of 16 ... frame time 8.549 | ||
| + | Frame 10 of 16 ... frame time 8.398 | ||
| + | Frame 11 of 16 ... frame time 8.425 | ||
| + | Frame 12 of 16 ... frame time 8.595 | ||
| + | Frame 13 of 16 ... frame time 8.419 | ||
| + | Frame 14 of 16 ... frame time 8.312 | ||
| + | Frame 15 of 16 ... frame time 8.471 | ||
| + | Frame 16 of 16 ... frame time 8.417 | ||
| + | ================================= | ||
| + | Timing: | ||
| + | Total time is 135.098 sec., average frame time is 8.444 sec. | ||
| + | |||
| + | Correctness check: | ||
| + | Max RGB difference is 11. | ||
| + | Number of different color pixels is 3564 (0.7%). | ||
| + | |||
| + | </ | ||
| + | |||
| + | {{: | ||
| + | Лучший из полученных результатов: | ||
| + | <code bash> | ||
| + | Frame 1 of 16 ... frame time 3.591 | ||
| + | Frame 2 of 16 ... frame time 3.492 | ||
| + | Frame 3 of 16 ... frame time 3.497 | ||
| + | Frame 4 of 16 ... frame time 3.499 | ||
| + | Frame 5 of 16 ... frame time 3.485 | ||
| + | Frame 6 of 16 ... frame time 3.492 | ||
| + | Frame 7 of 16 ... frame time 3.494 | ||
| + | Frame 8 of 16 ... frame time 3.492 | ||
| + | Frame 9 of 16 ... frame time 3.523 | ||
| + | Frame 10 of 16 ... frame time 3.491 | ||
| + | Frame 11 of 16 ... frame time 3.496 | ||
| + | Frame 12 of 16 ... frame time 3.495 | ||
| + | Frame 13 of 16 ... frame time 3.494 | ||
| + | Frame 14 of 16 ... frame time 3.492 | ||
| + | Frame 15 of 16 ... frame time 3.503 | ||
| + | Frame 16 of 16 ... frame time 3.495 | ||
| + | ================================= | ||
| + | Timing: | ||
| + | Total time is 56.036 sec., average frame time is 3.502 sec. | ||
| - | Даже такая простецкая реализация дает время 23.724/кадр, или ускорение в 1.5 раза. При этом количество отличающихся пикселов растет по сравнению с оригинальным | + | Correctness check: |
| - | Полагаю, более тонкая (ассемблерная?) реализация синуса даст еще небольшой прирост скорости. | + | Max RGB difference is 20. |
| + | Number of different color pixels is 5437 (1.1%). | ||
| + | </ | ||
| + | Достигнут: | ||
| + | указанием ключа fast при компиляции | ||
| + | | ||
| + | выносом | ||
| + | ==== Совсем неочевидные/ | ||
| + | * [[tab_sin|Табличный синус]] (плавающие числа) -- тупиковая ветвь. | ||
| + | * Целочисленная реализация алгоритма расчета вектора нормали -- выигрыша по скорости нет, по точности -- проигрыш. Тупиковая ветвь. | ||
| + | * [[harm_reduction|Уменьшение количества гармоник]] -- тупиковая ветвь. | ||
etc/common_activities/intel_students_cup/tour2.1193861307.txt.gz · Last modified: 2008/01/03 02:32 (external edit)