optimisations faustiennes

Optimisations Faustiennes

Réalisé par :

Ramzi DARMOUL

Encadré par :

M. Pierre JOUVELOT (CRI)

M. Karim BARKATI (CRI)

M. Moncef TEMANI (ISI)

20 septembre 2010

Nourchène Elleuch Ben Ayed

Contexte – Optimisations Faustiennes :

Étude expérimentale

Sous-projet 5 du projet ANR ASTREE

Nature multi-équipe du stage :

Équipe FAUST : Pierre JOUVELOT, Karim BARKATI, Yann ORLAREY, Stéphane LETZ…

Équipe PIPS : Corinne ANCOURT, François IRIGOIN, Serge GUELTON, Mehdi AMINI…

Membres du CRI : Antoniu POP, Laurent DAVERIO, Claire MEDRALA, Samuel BENVENISTE…

2/22

Nourchène Elleuch Ben Ayed 3/22

Plan

1. Présentation de FAUST

2. Présentation de PIPS

3. Problématique

4. Workflow

5. Transformations de code

6. Résultats de la vectorisation

7. Résultats de la parallélisation

8. Conclusion et perspectives


FAUST (Functional Audio Stream) (1/2)

Langage de traitement de signal audio en temps réel :

programmation fonctionnelle

algèbre de bloc-diagramme

Compilateur qui génère du C++

choix d’architectures de déploiement

calcul DSP au niveau des échantillons

4/22

Syntaxe Priorité Description

expression ~ expression 4 Composition récursive

expression , expression 3 Composition parallèle

expression : expression 2 Composition séquentielle

expression <: expression 1 Composition de division

expression :> expression 1 Composition de regroupement


FAUST (Functional Audio Stream) (2/2)

Fonctions GTK, JACK, etc.…

class mydsp : public dsp {… // fonction de DSP

void compute(int count, FAUSTFLOAT** input, FAUSTFLOAT** output) { float fSlow0 = (4.656613e-10f *

fslider0); FAUSTFLOAT* output0 = output[0];

for (int i=0; i<count; i++) { iRec0[0] = (12345+(1103515245*

iRec0[1])); output0[i]=(FAUSTFLOAT)(fSlow0*

iRec0[0]); iRec0[1] = iRec0[0]; } } };

noise.dsp

noise.cpp

random = +(12345) ~ *(1103515245);

noise = random /2147483647.0;

process = noise *

vslider("noise[style:knob]",0,0,100,0.1)/100;

5/22

PIPS (Paralléliseur interprocédural de programmes scientifiques)

Compilateur source-à-source :

analyses sémantiques

optimisations de code

parallélisation automatique

Source C ou FORTRAN

Script TPIPS

Source transformé

PIPS

6/22


Problématique


Workflow


Réponses impulsionnelles

freeverb = vgroup("Freeverb", fxctrl(fixedgain, wetSlider, stereoReverb(combfeed, allpassfeed, dampSlider, stereospread)));

process = 1-1' <: freeverb;

freeverb = vgroup("Freeverb", fxctrl(fixedgain, wetSlider, stereoReverb(combfeed, allpassfeed, dampSlider, stereospread)));

process = freeverb;

...virtual int getNumInputs() { return 2; }virtual int getNumOutputs() { return 2;}...

...virtual int getNumInputs() { return 0; }virtual int getNumOutputs() { return 2;}...

freeverb.cpp

freeverb.dsp

Freeverb-impulse.dsp

Freeverb-impulse.cpp


Transformations de code (1/2)

int fRec0[3];

int fRec1[2];

int IOTA;

...

void compute (int count, FAUSTFLOAT** input, FAUSTFLOAT** output)

{

FAUSTFLOAT* output0 = output[0];

for (int i=0; i<count; i++)

{

...

fRec0[0] = fVec1[IOTA-iSlow4&511]* fRec0[1] * fRec0[2];

fRec1[0] = (fTemp0 - floorf(fTemp0));

output0[i] = (FAUSTFLOAT)ftbl0[int((65536.0f * fRec1[0]))];

// post processing

fRec0[2]=fRec0[1]; fRec0[1]=fRec0[0];

fRec1[1] = fRec1[0];

...

IOTA = IOTA+1;

...

}

}


Transformations de code (2/2)

float floorf(float x) {...}

void compute (int count, FAUSTFLOAT input[0][0], FAUSTFLOAT output[1][1024])

{

FAUSTFLOAT output0[1024];

int i;

int _iota=0;

int __scalar__0, __scalar__1; ... ;

for (i=0; i<count; i++)

{

output0[i]=output[0][i];

}

for (i=0; i<count; i++)

{ ...

__scalar__0= fVec1[_iota-iSlow4&511]* __scalar__1* __scalar__2;

__scalar__3 = (fTemp0 - floorf(fTemp0));

output0[i] = (FAUSTFLOAT)ftbl0[(int)((65536.0f* __scalar__3))];

output[0][i]=output0[i];

// post processing

__scalar__2=__scalar__1; __scalar__1=__scalar__0;

__scalar__4=__scalar__3;

_iota = i+1; ...

}

int getNumInputs() { return 0; }int getNumOutputs() { return 1; }


Vectorisation de SAC

void compute(int count, float input[0][0], float output[1][256])

{ ...

// PIPS generated variable

float F_03;

// PIPS : SAC generated variable

int aligned0[3+1] = {0, 0, 0, 0}, aligned1[3+1] = {0, 0, 0, 0};

// PIPS : SAC generated double vector(s)

__m128 v2df_vec1;

...

// PIPS SIMD_COMMENT_1

F_03 = fSlow2-fVec0[1];

v2df_vec1=_mm_loadu_ps(F_03);

v2di_vec3=_mm_cmpgt_ps(v2df_vec4, v2sf_vec5);

v2di_vec3=_mm_storeu_ps(aligned3[0]);

SIMD_MULPD(vec18, vec19, vec20);

SIMD_STORE_V2DF_TO_V2SF(vec18, &aligned6[0]);

SIMD_ADDPS(vec21, vec22, vec23);

...

}


Validation des transformations de PIPS


Résultats de vectorisation (1/4)


Résultats de vectorisation (2/4)

Nombre total de défauts dans le cache de données L1


Résultats de vectorisation-FAUST (3/4)

// boucles non vectorisées avec GCC après pré-vectorisation de FAUST

// LOOP 0x1d46270

for (int i=0; i<4; i++) {fRec7_tmp[i]=fRec7_perm[i];}

for (int i=0; i<count; i++) {

fRec7[i] = (0 - (((fRec3[i] * fRec7[i-2]) + (fRec2[i] *

fRec7[i-1])) - ((float)input4[i] *fRec1[i])));

}

for (int i=0; i<4; i++) {fRec7_perm[i]=fRec7_tmp[count+i];}

// LOOP 0x1d493b0

for (int i=0; i<4; i++) {fRec9_tmp[i]=fRec9_perm[i];}


fRec9[i] = (0 - (((fRec3[i] * fRec9[i-2]) + (fRec2[i] * fRec9[i-1])) - ((float)input6[i] * fRec1[i])));

}

// boucles vectorisées avec GCC après pré-vectorisation de FAUST


output0[i] = (FAUSTFLOAT)(fRec0[i] - fRec0[i-1]);

}


output1[i] = (FAUSTFLOAT)(fRec4[i] - fRec4[i-1]);

}


Résultats de vectorisation-PIPS (4/4)

SIMD_STORE_V4SF(vec21, &aligned7[0]);

SIMD_ADDPS(vec24, vec25, vec26);

SIMD_ADDPD(vec27, vec4, vec29);

SIMD_STORE_GENERIC_V2DF(vec27, &fRec1[2+LU_IND0], &fRec2[2+LU_IND0]);

aligned9[1] = (float) input0[1+LU_IND0]*fRec1[1+LU_IND0];

SIMD_LOAD_V4SF(vec32, &aligned9[0]);

SIMD_SUBPS(vec30, vec21, vec32);

SIMD_LOAD_V4SF(vec35, &aligned11[0]);

SIMD_SUBPS(vec33, vec24, vec35);

SIMD_UMINPS(vec36, vec30);

SIMD_STORE_GENERIC_V4SF(vec36, &fRec10[1+LU_IND0], &fRec0[1+LU_IND0], &fRec4[1+LU_IND0], &fRec5[1+LU_IND0]);

SIMD_UMINPS(vec38, vec33);

SIMD_STORE_GENERIC_V4SF(vec38, &fRec6[1+LU_IND0], &fRec7[1+LU_IND0], &fRec8[0], &fRec9[1+LU_IND0]);

// Boucles non vectorisées avec PIPS !!!

output0[1+LU_IND0] = (float) (fRec0[1+LU_IND0]-fRec0[LU_IND0]);

...

output1[1+LU_IND0] = (float) (fRec4[1+LU_IND0]-fRec4[LU_IND0]);


Résultats de parallélisation-PIPS (1/3)

#pragma omp parallel for

for (int i=0; i<4; i++) fXec0_tmp[i]=fXec0_perm[i];



fXec0[i] = fbutton0;

}


for (int i=0; i<4; i++) fXec0_perm[i]=fXec0_tmp[count+i];


for (int i=0; i<4; i++) fRec1_tmp[i]=fRec1_perm[i];

// exec code


fRec1[i] = ((((fXec0[i] - fXec0[i-1]) > 0.0f) + fRec1[i-1]) - (fSlow1 * (fRec1[i-1] > 0.0f)));

}


Résultats de parallélisation (2/3)

19/22


Résultats de parallélisation-FAUST (3/3)

#pragma omp parallel\

firstprivate(fSlow0, fSlow1, fXec0, fRec1, iRec2, fSlow2, iSlow3, fRec0)

{

for (int index = 0; index < fullcount; index += 32) {

// SECTION : 1

#pragma omp single

{ // LOOP 0xa08de20

for (int i=0; i<4; i++) fXec0_tmp[i]=fXec0_perm[i];

for (int i=0; i<count; i++) fXec0[i] = fbutton0;

for (int i=0; i<4; i++) fXec0_perm[i]=fXec0_tmp[count+i];}

// SECTION : 2

#pragma omp sections

{ #pragma omp section

{ // LOOP 0xa08d480

for (int i=0; i<4; i++) fRec1_tmp[i]=fRec1_perm[i];


Conclusion

Difficultés rencontrées :

le grand nombre d’outils (C, C++, FAUST, PIPS, jack, qjackctl, alsa, makefile, shell, R, ggplot2, octave, OProfile, Valgrind, gcc, icc, OpenMP, SSE, emacs, LaTeX, …)

certains bugs de PIPS et de SAC

Contributions :

Vectorisation et parallélisation des exemples DSP de FAUST via PIPS

Détection de certains bugs dans PIPS

Comparaison, analyse et interprétation des causes des différences de performances

Propositions d’optimisations pour FAUST


Perspectives

Propositions formulées pour PIPS :

Étendre la branche OpenMP

Stabiliser la branche SAC

Propositions formulées pour FAUST :

Générer des tableaux au lieu de pointeurs

Générer du code C

S’inspirer de l’algorithme de SAC de PIPS pour réaliser une vectorisation automatique

Combiner le parallélisme de tâches de FAUST avec le parallélisme de données de PIPS

optimisations faustiennes

Documents

inourchne elleuch ben

faustfloat input00

faustfloat output11024

fslider0 faustfloat

output0for int i

void compute int

pips generated variable

transformations de code