etc:common_activities:intel_students_cup:tour2
Differences
This shows you the differences between two versions of the page.
| Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
| etc:common_activities:intel_students_cup:tour2 [2007/11/03 21:52] – zps | etc:common_activities:intel_students_cup:tour2 [2008/01/03 02:32] (current) – external edit 127.0.0.1 | ||
|---|---|---|---|
| Line 1: | Line 1: | ||
| ====== Исследование вопроса ====== | ====== Исследование вопроса ====== | ||
| - | пока я выложу свои | + | |
| + | * [[tour2_tasks|Текущие задачи]] | ||
| + | * [[algo_details|Детали алгоритма]] | ||
| ===== План ===== | ===== План ===== | ||
| Line 150: | Line 152: | ||
| | | ||
| </ | </ | ||
| + | |||
| ==== Менее очевидные шаги ==== | ==== Менее очевидные шаги ==== | ||
| Line 258: | Line 261: | ||
| * должен позволить автоматически использовать SIMD команды (SSE, SSE2...) для оптимизации вычислений в основном в циклах; | * должен позволить автоматически использовать SIMD команды (SSE, SSE2...) для оптимизации вычислений в основном в циклах; | ||
| * возможно подскажет где что можно ещё распараллелить; | * возможно подскажет где что можно ещё распараллелить; | ||
| + | |||
| + | [[icc|Результаты]] | ||
| === Использование Integrated Performance Primitives === | === Использование Integrated Performance Primitives === | ||
| Line 264: | Line 269: | ||
| * можно попробовать использовать оттуда не только тригонометрию, но и функции работы с изображениями 2D; | * можно попробовать использовать оттуда не только тригонометрию, но и функции работы с изображениями 2D; | ||
| P.S. Менеджер проекта этой библиотеки из Нижнего Новгорода, откуда и сам sunset :) | P.S. Менеджер проекта этой библиотеки из Нижнего Новгорода, откуда и сам sunset :) | ||
| + | |||
| + | тестовая программка, | ||
| + | <code cpp> | ||
| + | #include < | ||
| + | #ifdef USE_IPP | ||
| + | #include < | ||
| + | #endif | ||
| + | #include < | ||
| + | #include < | ||
| + | |||
| + | #define COUNT(a) (sizeof(a)/ | ||
| + | |||
| + | float a[1024]; | ||
| + | float r1[1024]; | ||
| + | float r2[1024]; | ||
| + | |||
| + | int main() | ||
| + | { | ||
| + | for(size_t i=0; | ||
| + | a[i]=(drand48()-.5)*20; | ||
| + | |||
| + | #ifdef USE_IPP | ||
| + | for(int i=0; | ||
| + | ippsSin_32f_A21(a, | ||
| + | #else | ||
| + | for(int i=0; | ||
| + | for(size_t j=0; | ||
| + | r2[j]=sinf(a[j]); | ||
| + | #endif | ||
| + | |||
| + | #if 0 | ||
| + | double s=0; | ||
| + | for(size_t i=0; | ||
| + | { | ||
| + | s+=fabs(r1[i]-r2[i]); | ||
| + | } | ||
| + | printf(" | ||
| + | #endif | ||
| + | } | ||
| + | </ | ||
| + | < | ||
| + | default: stupid fast | ||
| + | |||
| + | stupid: main.o | ||
| + | g++ main.o -o stupid | ||
| + | clean: | ||
| + | rm -f main.o stupid | ||
| + | fast: ipp_main.o | ||
| + | g++ -L/ | ||
| + | |||
| + | ipp_main.o: main.cpp | ||
| + | g++ -I/ | ||
| + | </ | ||
| + | |||
| + | результаты: | ||
| + | < | ||
| + | $ time ./fast && time ./stupid | ||
| + | |||
| + | real 0m3.666s | ||
| + | user 0m3.661s | ||
| + | sys | ||
| + | |||
| + | real 0m37.244s | ||
| + | user 0m37.095s | ||
| + | sys | ||
| + | </ | ||
| + | итого -- ускорение в 10 раз. замена A21 на A11 дает | ||
| + | < | ||
| + | $ time ./fast && time ./stupid | ||
| + | |||
| + | real 0m2.975s | ||
| + | user 0m2.964s | ||
| + | sys | ||
| + | |||
| + | real 0m36.754s | ||
| + | user 0m36.675s | ||
| + | sys | ||
| + | </ | ||
| Для тригонометрии в hotspot 2 использовал ippsSin. | Для тригонометрии в hotspot 2 использовал ippsSin. | ||
| Line 291: | Line 374: | ||
| | | ||
| -LIBS = -lm -lc | -LIBS = -lm -lc | ||
| - | +LIBS = -lm -lc -lippcore -lippvm | + | +LIBS = -lm -lc -lippcore -lippvm |
| | | ||
| Line 323: | Line 406: | ||
| Результат -- 10.044/ | Результат -- 10.044/ | ||
| + | |||
| + | zps. | ||
| + | Максимальный результат: | ||
| + | diff -ruN src-org src-ipp | ||
| + | < | ||
| + | diff src-org/ | ||
| + | 59c59 | ||
| + | < CINC = -I$(SRC_DIR) | ||
| + | --- | ||
| + | > CINC = -I$(SRC_DIR) -I/ | ||
| + | 62c62,63 | ||
| + | < CDEFOPT | ||
| + | --- | ||
| + | > # | ||
| + | > CDEFOPT | ||
| + | 64c65 | ||
| + | < CFLAGS | ||
| + | --- | ||
| + | > CFLAGS | ||
| + | 69c70 | ||
| + | < LDFLAGS | ||
| + | --- | ||
| + | > LDFLAGS | ||
| + | 71c72 | ||
| + | < LIBS = -lm -lc | ||
| + | --- | ||
| + | > LIBS = -lm -lc -lippcore -lippvm -lguide -lipps -lippm | ||
| + | diff src-org/ | ||
| + | 48c48,51 | ||
| + | < | ||
| + | --- | ||
| + | > #include < | ||
| + | > #include < | ||
| + | > #include < | ||
| + | > #include < | ||
| + | 307c310,311 | ||
| + | < | ||
| + | --- | ||
| + | > | ||
| + | > | ||
| + | 339a344,346 | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | 357c364,366 | ||
| + | < | ||
| + | --- | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | 362a372 | ||
| + | > | ||
| + | 657a668,669 | ||
| + | > | ||
| + | > float xxx[iAngleHarmNum]; | ||
| + | 663c675 | ||
| + | < | ||
| + | --- | ||
| + | > | ||
| + | 679a692 | ||
| + | > | ||
| + | 732c745,749 | ||
| + | < */ | ||
| + | --- | ||
| + | > */// std::cerr << " | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | 735, | ||
| + | < | ||
| + | < KX1 = flK[t] * flDecartX[i][j]; | ||
| + | < KY1 = flK[t] * flDecartY[i][j]; | ||
| + | < | ||
| + | --- | ||
| + | > kx1 = KX1[t]; | ||
| + | > ky1 = KY1[t]; | ||
| + | > ot = OT[t]; | ||
| + | > int len = t * iAngleHarmNum; | ||
| + | 741, | ||
| + | < | ||
| + | < | ||
| + | < KX1 * flAzimuthCosFi[l] - KY1 * flAzimuthSinFi[l] + | ||
| + | < | ||
| + | < } /* end for l */ | ||
| + | --- | ||
| + | > | ||
| + | > | ||
| + | > kx1*flAzimuthCosFi[l] - ky1*flAzimuthSinFi[l] + flRandomPhase[iSinIndex1]; | ||
| + | > } /* end for l */ | ||
| + | 747c763,776 | ||
| + | < | ||
| + | --- | ||
| + | > #if 0 | ||
| + | > float * dest; | ||
| + | > for(t = 0; t < NKMAX; t++) | ||
| + | > | ||
| + | > dest = & | ||
| + | > kx1 = KX1[t]; | ||
| + | > ky1 = KY1[t]; | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | > } /* end for t */ | ||
| + | > dest = NULL; | ||
| + | > #endif | ||
| + | > // ippsAdd_32f_I(flRandomPhase, | ||
| + | 750, | ||
| + | < # | ||
| + | < | ||
| + | < | ||
| + | --- | ||
| + | > ippsSin_32f_A21(pFlTmp, | ||
| + | > | ||
| + | > //# | ||
| + | > // | ||
| + | > // | ||
| + | 758a790,801 | ||
| + | > #if 0 | ||
| + | > | ||
| + | > const float *pr[]={flAmplitudeX, | ||
| + | > float __r[2]; | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | > | ||
| + | > } | ||
| + | > #endif | ||
| + | > #if 1 | ||
| + | 767a811 | ||
| + | > #endif | ||
| + | Только в src-ipp: .sunset.cpp.swp | ||
| + | </ | ||
| + | |||
| + | результат: | ||
| + | <code bash> | ||
| + | Frame 1 of 16 ... frame time 8.534 | ||
| + | Frame 2 of 16 ... frame time 8.508 | ||
| + | Frame 3 of 16 ... frame time 8.167 | ||
| + | Frame 4 of 16 ... frame time 8.538 | ||
| + | Frame 5 of 16 ... frame time 8.467 | ||
| + | Frame 6 of 16 ... frame time 8.302 | ||
| + | Frame 7 of 16 ... frame time 8.565 | ||
| + | Frame 8 of 16 ... frame time 8.425 | ||
| + | Frame 9 of 16 ... frame time 8.549 | ||
| + | Frame 10 of 16 ... frame time 8.398 | ||
| + | Frame 11 of 16 ... frame time 8.425 | ||
| + | Frame 12 of 16 ... frame time 8.595 | ||
| + | Frame 13 of 16 ... frame time 8.419 | ||
| + | Frame 14 of 16 ... frame time 8.312 | ||
| + | Frame 15 of 16 ... frame time 8.471 | ||
| + | Frame 16 of 16 ... frame time 8.417 | ||
| + | ================================= | ||
| + | Timing: | ||
| + | Total time is 135.098 sec., average frame time is 8.444 sec. | ||
| + | |||
| + | Correctness check: | ||
| + | Max RGB difference is 11. | ||
| + | Number of different color pixels is 3564 (0.7%). | ||
| + | |||
| + | </ | ||
| + | |||
| + | {{: | ||
| + | Лучший из полученных результатов: | ||
| + | <code bash> | ||
| + | Frame 1 of 16 ... frame time 3.591 | ||
| + | Frame 2 of 16 ... frame time 3.492 | ||
| + | Frame 3 of 16 ... frame time 3.497 | ||
| + | Frame 4 of 16 ... frame time 3.499 | ||
| + | Frame 5 of 16 ... frame time 3.485 | ||
| + | Frame 6 of 16 ... frame time 3.492 | ||
| + | Frame 7 of 16 ... frame time 3.494 | ||
| + | Frame 8 of 16 ... frame time 3.492 | ||
| + | Frame 9 of 16 ... frame time 3.523 | ||
| + | Frame 10 of 16 ... frame time 3.491 | ||
| + | Frame 11 of 16 ... frame time 3.496 | ||
| + | Frame 12 of 16 ... frame time 3.495 | ||
| + | Frame 13 of 16 ... frame time 3.494 | ||
| + | Frame 14 of 16 ... frame time 3.492 | ||
| + | Frame 15 of 16 ... frame time 3.503 | ||
| + | Frame 16 of 16 ... frame time 3.495 | ||
| + | ================================= | ||
| + | Timing: | ||
| + | Total time is 56.036 sec., average frame time is 3.502 sec. | ||
| + | |||
| + | Correctness check: | ||
| + | Max RGB difference is 20. | ||
| + | Number of different color pixels is 5437 (1.1%). | ||
| + | </ | ||
| + | Достигнут: | ||
| + | указанием ключа fast при компиляции | ||
| + | заменой дублирующих вычислений в циклах в очевидных местах | ||
| + | выносом вычисления синусов и косинусов (ipp) полярных координат | ||
| + | |||
| + | ==== Совсем неочевидные/ | ||
| + | * [[tab_sin|Табличный синус]] (плавающие числа) -- тупиковая ветвь. | ||
| + | * Целочисленная реализация алгоритма расчета вектора нормали -- выигрыша по скорости нет, по точности -- проигрыш. Тупиковая ветвь. | ||
| + | * [[harm_reduction|Уменьшение количества гармоник]] -- тупиковая ветвь. | ||
etc/common_activities/intel_students_cup/tour2.1194115969.txt.gz · Last modified: 2008/01/03 02:32 (external edit)