Программирование для intel xeon phi

57
Нижегородский государственный университет им. Н.И.Лобачевского Факультет Вычислительной математики и кибернетики Лабораторная работа №4 Оптимизация расчетов на примере задачи вычисления справедливой цены опциона Европейского типа Программирование для Intel Xeon Phi Мееров И.Б., Сысоев А.В. Кафедра математического обеспечения ЭВМ При поддержке компании Intel

Upload: arista

Post on 04-Jan-2016

85 views

Category:

Documents


5 download

DESCRIPTION

Лабораторная работа № 4 Оптимизация расчетов на примере задачи вычисления справедливой цены опциона Европейского типа. Программирование для Intel Xeon Phi. При поддержке компании Intel. Мееров И.Б. , Сысоев А.В. Кафедра математического обеспечения ЭВМ. Вы думаете, все так просто? - PowerPoint PPT Presentation

TRANSCRIPT

4

4 Intel Xeon Phi .., .. Intel . .. 1, ? , . *

. .*

*

#. , 2013 .

Xeon Xeon Phi:

#. , 2013 .

1. #. , 2013 .

.: . . Intel Xeon Intel Xeon Phi. .

#. , 2013 .

2 Intel Xeon E5-2690 (2.9 GHz)2 Intel Xeon Phi 7110X (61 )64 GB Linux CentOS 6.2, Intel Parallel Studio XE 2013 SP1#. , 2013 .

2. #. , 2013 .

- (1) (2) - t - t - ( ) - ( ) - ( ) - (E=0 P=1, , Wt Ws ~ N(0, t-s), s < t), Wt () P=1. - .

#. , 2013 .

-

. , S ( ). , S ( ). , .

#. , 2013 .

-,

:

(3)

( ) (, -), Wt N(0, t).

#. , 2013 .

?

tS , Wt - . 1 , .

#. , 2013 .

P1 P2, P2 t P1 P1 K, . P2 () C P1. K (, strike price), C .

#. , 2013 .

- . P1 P2. C T ( , maturity, ) : K . ST K. ST < K, , C, C. ST > K, K, ( C ST K).

#. , 2013 .

, / . P2: (4)

T , (1. t = T 1. t = 0)#. , 2013 .

- . t=0 (F ):

(5)#. , 2013 .

3. #. , 2013 .

. , , , . , , . .

#. , 2013 .

#. , 2013 .

4. #. , 2013 .

(AoS - Array of Structures)

(SoA - Structure of Arrays): , . .#. , 2013 .

. int numThreads = 1;int N = 60000000;int main(int argc, char *argv[]){ int version; if (argc < 2) { printf("Usage: size version [#of_threads]\n"); return 1; } N = atoi(argv[1]); version = atoi(argv[2]); if (argc > 3) numThreads = atoi(argv[3]);

//

float res = GetOptionPrice(); printf("%.8f;\n", res); return 0;}#. , 2013 .

. const float sig = 0.2f;const float r = 0.05f;const float T = 3.0f;const float S0 = 100.0f;const float K = 100.0f;

float GetOptionPrice() { float C, d1, d2, p1, p2; d1 = (logf(S0 / K) + (r + sig * sig * 0.5f) * T) / (sig * sqrtf(T)); d2 = (logf(S0 / K) + (r - sig * sig * 0.5f) * T) / (sig * sqrtf(T)); p1 = cdfnormf(d1); p2 = cdfnormf(d2); C = S0 * p1 - K * expf((-1.0f) * r * T) * p2; return C;}#. , 2013 .

. __declspec(noinline) void GetOptionPricesV0( float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, p1, p2; for (i = 0; i < N; i++) { d1 = (log(pS0[i] / pK[i]) + (r + sig * sig * 0.5) * pT[i]) / (sig * sqrt(pT[i])); d2 = (log(pS0[i] / pK[i]) + (r - sig * sig * 0.5) * pT[i]) / (sig * sqrt(pT[i])); p1 = cdfnormf(d1); p2 = cdfnormf(d2); pC[i] = pS0[i] * p1 - pK[i] * exp((-1.0) * r * pT[i]) * p2; }}#. , 2013 .

. __declspec(noinline) void GetOptionPricesV0( float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, p1, p2; for (i = 0; i < N; i++) { d1 = (log(pS0[i] / pK[i]) + (r + sig * sig * 0.5) * pT[i]) / (sig * sqrt(pT[i])); d2 = (log(pS0[i] / pK[i]) + (r - sig * sig * 0.5) * pT[i]) / (sig * sqrt(pT[i])); p1 = cdfnormf(d1); p2 = cdfnormf(d2); pC[i] = pS0[i] * p1 - pK[i] * exp((-1.0) * r * pT[i]) * p2; }}#. , 2013 .

. N60000000120000000180000000240000000 V0 ()17,00234,00451,00867,970 !

#. , 2013 .

1. __declspec(noinline) void GetOptionPricesV0( float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, p1, p2; for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); p1 = cdfnormf(d1); p2 = cdfnormf(d2); pC[i] = pS0[i] * p1 - pK[i] * expf((-1.0f) * r * pT[i]) * p2; }}#. , 2013 .

1. N60000000120000000180000000240000000 V0()17,00234,00451,00867,970 V1()16,77633,54950,33766,989 ? 3

#. , 2013 .

2. : cdfnorm() vs. erf() erff() cdfnormf(), .

: ?

#. , 2013 .

2. : cdfnorm() vs. erf() __declspec(noinline) void GetOptionPricesV2(float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, erf1, erf2; for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 / sqrtf(2.0f)); erf2 = 0.5f + 0.5f * erff(d2 / sqrtf(2.0f)); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}#. , 2013 .

2. : cdfnorm() vs. erf()N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 !

#. , 2013 .

3. : restrict ?

restrict?

restrict?

restrict, . ?#. , 2013 .

3. : restrict ? -vec-report3 -vec-report6 (Linux) /Qvec-report3 /Qvec-report6 (Windows) -mavx ( SSE, AVX).

, ? , .

#. , 2013 .

3. : restrict __declspec(noinline) void GetOptionPricesV3( float * restrict pT, float * restrict pK, float * restrict pS0, float * restrict pC) { int i; float d1, d2, erf1, erf2; for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 / sqrtf(2.0f)); erf2 = 0.5f + 0.5f * erff(d2 / sqrtf(2.0f)); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}#. , 2013 .

4. : simd__declspec(noinline) void GetOptionPricesV4(float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, erf1, erf2;#pragma simd for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 / sqrtf(2.0f)); erf2 = 0.5f + 0.5f * erff(d2 / sqrtf(2.0f)); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}#. , 2013 .

4*. : ivdep vector always__declspec(noinline) void GetOptionPricesV4(float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, erf1, erf2;#pragma ivdep#pragma vector always for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 / sqrtf(2.0f)); erf2 = 0.5f + 0.5f * erff(d2 / sqrtf(2.0f)); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}#. , 2013 .

3-4. N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,0671. : loop was vectorized (SIMD loop was vectorized)2. 3 , . 3. 8 , 5,43. .. 2 3!

#. , 2013 .

__declspec(noinline) void GetOptionPricesV4(float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, erf1, erf2;#pragma simd // Intel . ivdep for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 / sqrtf(2.0f)); erf2 = 0.5f + 0.5f * erff(d2 / sqrtf(2.0f)); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}5.43 #. , 2013 .

5. const float invsqrt2 = 0.707106781f;__declspec(noinline) void GetOptionPricesV5(float *pT, float *pK, float *pS0, float *pC){ int i; float d1, d2, erf1, erf2;#pragma simd for (i = 0; i < N; i++) { d1 = (logf(pS0[i] / pK[i]) + (r + sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); d2 = (logf(pS0[i] / pK[i]) + (r - sig * sig * 0.5f) * pT[i]) / (sig * sqrtf(pT[i])); erf1 = 0.5f + 0.5f * erff(d1 * invsqrt2); erf2 = 0.5f + 0.5f * erff(d2 * invsqrt2); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }} . ? , #. , 2013 .

5. N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,067 V50,5271,0471,5802,085

#. , 2013 .

6. . __declspec(noinline) void GetOptionPricesV6(float *pT, float *pK, float *pS0, float *pC) { int i; float d1, d2, erf1, erf2, invf; float sig2 = sig * sig;#pragma simd for (i = 0; i < N; i++) { invf = invsqrtf(sig2 * pT[i]); d1 = (logf(pS0[i] / pK[i]) + (r + sig2 * 0.5f) * pT[i]) * invf; d2 = (logf(pS0[i] / pK[i]) + (r - sig2 * 0.5f) * pT[i]) * invf; erf1 = 0.5f + 0.5f * erff(d1 * invsqrt2); erf2 = 0.5f + 0.5f * erff(d2 * invsqrt2); pC[i] = pS0[i] * erf1 - pK[i] * expf((-1.0f) * r * pT[i]) * erf2; }}#. , 2013 .

6. . N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,067 V50,5271,0471,5802,085 V60,5381,0711,6142,133 . .

#. , 2013 .

6.1. SSE: 16, AVX: 32, Xeon Phi: 64 memalign() -> _mm_malloc() Windows: __declspec(align(XX)) float T[N]; Linux: float T[N] __attribute__((aligned(64))); #pragma vector aligned, __assume_aligned, __assume int main(int argc, char *argv[]){ pT = (float *)memalign(32, 4 * N * sizeof(float));// pT = new float[4 * N]; ... free(pT);// delete [] pT; return 0;} #. , 2013 .

6.2. .icc ... -fimf-precision=low -fimf-domain-exclusion=31 N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,067 V50,5271,0471,5802,085 V60,5381,0711,6142,133 V6.10,5391,0721,6172,135 V6.20,4380,8711,3141,724#. , 2013 .

7. #pragma omp parallel for private(invf, d1, d2, erf1, erf2)N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,067 V50,5271,0471,5802,085 V60,5381,0711,6142,133 V6.10,5391,0721,6172,135 V6.20,4380,8711,3141,724 V7(16 )0,0580,0840,1260,153#. , 2013 .

7.1. . .

: ?#. , 2013 .

7.1. N60000000120000000180000000240000000 V017,00234,00451,00867,970 V116,77633,54950,33766,989 V22,8715,7278,64911,230 V30,5221,0491,5832,091 V40,5211,0361,5662,067 V50,5271,0471,5802,085 V60,5381,0711,6142,133 V6.10,5391,0721,6172,135 V6.20,4380,8711,3141,724 V70,0580,0840,1260,153 V6.30,4090,8121,2261,603 V7.10,0330,0620,0910,118#. , 2013 .

7.1. 7.1 6.3 , 7 6.2, 12.54 (60 . ) 13,61 (240 . ). 6.3 7,5% , 6.2, 7.1 , 7 ( 60 . , 70%, ).#. , 2013 .

Xeon Phi , 7. , , . , 6, Xeon. Xeon Phi -mmic. memalign() 32 64. . .

#. , 2013 .

Xeon Phi. Xeon Phi ( 6.2) 23%, , 2,3 . ( 60%) , , , . 6.3. . N60000000120000000180000000240000000 V61,5443,0894,6336,174 V6.11,5453,0914,6346,179 V6.20,6761,3522,0272,703 V6.30,4220,8451,2691,690#. , 2013 .

Xeon Phi. N60000000120000000180000000240000000 V70,1340,1490,1640,175S(V6.2/V7)5,03369,05012,33115,437 V7.10,0080,0170,0250,033S(V6.3/V7.1)50,58551,17851,78351,546N60000000120000000180000000240000000 V70,2340,2550,2570,255S(V6.2/V7)2,8855,3037,88310,590 V7.10,0070,0140,0210,028S(V6.3/V7.1)59,42259,58760,38959,839N60000000120000000180000000240000000 V70,5320,5270,5330,558S(V6.2/V7)1,2692,5643,8004,842 V7.10,0080,0160,0240,031S(V6.3/V7.1)53,28654,24853,96953,96460120240#. , 2013 .

Xeon Phi. 7 . , . ( 7.1) ( 50,5 60,4). 120 , 60 , Xeon Phi.

- ?#. , 2013 .

8. - 4 (pT, pK, pS0, pC). 3 , (pC) . , pC, . , ( pC nontemporal data). , , , nontemporal data, streaming stores, . #pragma vector nontemporal #. , 2013 .

8. -N60000000120000000180000000240000000 V80,0090,0180,0270,035S(V6.3/V8)46,87847,50547,61047,832N60000000120000000180000000240000000 V80,0070,0130,0190,026S(V6.3/V8)63,87365,04865,42665,887N60000000120000000180000000240000000 V80,0070,0130,0190,026S(V6.3/V8)63,22264,76865,37965,42060120240 54 65.

#. , 2013 .

Xeon vs. Xeon PhiN60000000120000000180000000240000000Xeon0,0300,0610,0900,116Xeon Phi0,0070,0130,0190,026#. , 2013 .

, . , Call Put. , Xeon Xeon Phi, , Xeon Phi . Xeon Phi . , .#. , 2013 .

. , 2. . 3- . .: , 2007. . 832. .. . , 2004. 1076. .., .., .., .. . . . 4 . : - , 2013. 1394 .#. , 2013 .

, ..., , . . [email protected] ,..., [email protected]

#. , 2013 .

12.012.42.882.62.542.362.242.832.572.651.342.482.472.532.142.832.292.142.091.82.242.332.942.882.562.172.092.562.82.352.772.62.212.992.152.492.62.42.022.142.212.12.592.742.972.051.42.262.852.452.332.692.642.72.91.82.352.462.012.492.822.122.562.913.82.842.322.752.672.962.242.42.062.12.162.282.242.52.172.122.632.583.42.482.722.012.592.042.712.8821.82.532.662.882.962.012.142.522.213.82.282.472.752.052.372.392.12.082.092.282.522.992.632.592.952.362.732.952.722.862.832.752.332.462.022.1232.192.932.942.042.152.482.312.752.22.742.432.252.282.6132.662.012.882.152.262.562.352.472.692.152.352.852.112.622.822.762.442.042.562.432.52.012.072.922.552.72.342.322.022.22.172.892.372.382.722.572.45

1 2 3 4 5 6 7 8 9

1 1 2 3 4 5 6 7 8 902.012.42.882.62.542.362.242.832.57t12.651.342.482.472.532.142.832.292.14t22.091.82.242.332.942.882.562.172.09t32.562.82.352.772.62.212.992.152.49t42.62.42.022.142.212.12.592.742.97t52.051.42.262.852.452.332.692.642.7t62.91.82.352.462.012.492.822.122.56t72.913.82.842.322.752.672.962.242.4t82.062.12.162.282.242.52.172.122.63t92.583.42.482.722.012.592.042.712.88t1021.82.532.662.882.962.012.142.52t112.213.82.282.472.752.052.372.392.1t122.082.092.282.522.992.632.592.952.36t132.732.952.722.862.832.752.332.462.02t142.1232.192.932.942.042.152.482.31t152.752.22.742.432.252.282.6132.66t162.012.882.152.262.562.352.472.692.15t172.352.852.112.622.822.762.442.042.56t182.432.52.012.072.922.552.72.342.32T2.022.22.172.892.372.382.722.572.45