programming language -final project- “project 4”
DESCRIPTION
Programming Language -Final Project- “Project 4”. 學生:林郁凱 謝侑娟 學號: 19709046 19709053 指導教師:鍾翊方 孫行人. Project 4: Identification of Monotonic Genes by H-test. Find all ascending and descending monotonic genes - PowerPoint PPT Presentation
TRANSCRIPT
Programming Language -Final Project-
“Project 4”
學生:林郁凱 謝侑娟 學號: 19709046 19709053
指導教師:鍾翊方 孫行人
2
Project 4: Identification of Monotonic Genes by H-test
1. Find all ascending and descending monotonic genes
a) For each gene, the mean of the gene expression values in each group is calculated.
b) Let the mean for gene $i$ in group $j$ be $\mu_{ij}$, $j = 1, \dots, k$, where $k$ is the number of groups. We sort the $\mu_{ij}$ in increasing (or decreasing) order.
c) If the order of the sorted $\mu_{ij}$ is not the same as that of the unsorted $\mu_{ij}$, then we remove the gene from the list.
2. Perform H-test and rank genes by their H-value
3. Plot the results
1. Find all ascending and descending monotonic genes
a) mean of gene expression
b) sort, increasing / decreasing order
ESCN_Nor
Increase (mean of expression)
Ascending
• #Monotonic gene • open (FILE, "ESCN_Nor.txt") || die "file open
error!\n";$line_1=<FILE>;@monotonic=<FILE>;close(FILE);
• # 分成 5 群 (ESC EB d10 d17 NSC)• @groups=(3, 3, 6, 6, 9);• foreach (@monotonic){• @line_each=split(/\s+/,$_);• • $value_ref=[ • [@line_each[1..3]],• [@line_each[4..6]],• [@line_each[7..12]],• [@line_each[13..18]],• [@line_each[19..27]]• ];
• # 求 mean of gene expression for($i=0;$i<5;$i++){
• for($d=0;$d< $groups[$i] ;$d++){• $num_1=$value_ref->[$i]->[$d];• $total_1+= $num_1;• }• $total_1=($total_1/$groups[$i]);• $dd_1=$total_1;• $total_1=();$num_1=();• push(@aa_1,$dd_1);• }• @cc_1=@aa_1;• $dd_1=();@aa_1=();• unshift(@cc_1,$line_each[0]);
• #sorting (Ascending & Descending) for($t=1;$t<6;$t++){
• if($cc_1[$t]>$cc_1[$t+1]){• $count2++;• }• }• if($count2==1){• $strings1=join" ",@cc_1;• push(@qq,$strings1);• }• elsif($count2==5){• $strings2=join" ",@cc_1;• push(@rr,$strings2);• }• @cc_1=();$count2=();
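* Note: “undef” evaluates to 0 in a numeric comparison. The loop's last comparison ($cc_1[5] > $cc_1[6]) reads past the five group means, so $cc_1[6] is undef and the test is true whenever the last mean is positive. Hence, for positive expression values, $count2 == 1 marks an ascending gene and $count2 == 5 a descending gene.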
• foreach (@qq){• @line_increase=split(/\s+/,$_);• $h_increase=$table{$line_increase[0]};• $table_increase->{$line_increase[0]}• =$h_increase;• }
• foreach (@rr){• @line_decrease=split(/\s+/,$_);•
$h_decrease=$table{$line_decrease[0]};• $table_decrease->{$line_decrease[0]}• =$h_decrease;• }
2. Perform H-test and rank genes by their H-value
a) Kruskal-Wallis H test
b) H-test (Modify~tied observations)
12
H-test (Monotonic Genes)
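Kruskal-Wallis H test: $H = \frac{12}{N(N+1)} \sum_{i=1}^{C} \frac{R_i^2}{N_i} - 3(N+1)$
$R_i$ = sum of the ranks in group $i$
$N_i$ = number of samples in group $i$ (e.g. [ESC] has 3 samples)
$C$ = number of groups (e.g. [ESC EB d10 d17 NSC] is 5 groups)
$N$ = total number of samples (e.g. [ESCN_Nor] has 27 samples)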
排名 (Rank)
probe_ID 「 ESC . . EB . . D10 . . . . d17 . . . . . NSC . . . . . . . . 」~ gene expression
value frequency
Repeat (value & frequency)
取中間值
• # 將 sample 從大到小排序 • for($u=1; $u<=$#line_each; $u++){• for($v=1; $v<=$#line_each; $v++){• if($line_each[$u] <
$line_each[$v]){• $count++;• }• }• $count=$count+1;• push(@hh,$count);• @h_test=@hh;• $count=();• }• @hh=();
probe_ID 「 ESC . . EB . … D10 …. . . d17 . . NSC
• # 找出重複的表現數值及頻率 • foreach (@h_test){• if(exists $seen{$_}){• $seen{$_} += 1;• }• else{• $seen{$_} = 1;• }• }• foreach $target (sort keys %seen){• $frequency= $seen{$target};• • push(@targets,$target);• push(@frequencys,$frequency);• }• %seen=();$target=();$frequency=();• @ties=@frequencys;
value frequency
• # 計算重複值的平均次序 ( 找中間值 )• for($g=0;$g<=$#targets;$g++){ $inter_value=$targets[$g]+
($frequencys[$g]-1)/2;
• push(@inters,$inter_value);• }• @frequencys=();$inter_value=();
• foreach (@h_test){• for($k=0;$k<=$#inters;$k++){• if($_ == $targets[$k]){• $_=" ";• $_=~ s/(\s+)/$inters[$k]/;• }• }• }• @targets=();@inters=();• unshift(@h_test,$line_each[0]);
ESC EB d10 d17 NSC
71 52 128 82 45
Ri 值 (1)
ESC EB d10 d17 NSC
Ri 值 (2)
ESC EB d10 d17 NSC
Ri 平方
• $h_test_ref=[
• [@h_test[1..3]],
• [@h_test[4..6]],
• [@h_test[7..12]],
• [@h_test[13..18]],
• [@h_test[19..27]]
• ];
• for($i=0;$i<5;$i++){• for($d=0;$d< $groups[$i] ;$d++){• $num_2=$h_test_ref->[$i]->[$d];
$total_2+= $num_2;• }• $dd_2=$total_2**2; • $total_2=();$num_2=();• push(@aa_2,$dd_2);• }• @cc_2=@aa_2;• $dd_2=();@aa_2=();
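H-test (modified for tied observations): $H' = \dfrac{H}{1 - \frac{\sum T}{N^3 - N}}$
$R_i$ = sum of the ranks in group $i$
$N_i$ = number of samples in group $i$ (e.g. [ESC] has 3 samples)
$C$ = number of groups (e.g. [ESC EB d10 d17 NSC] is 5 groups)
$N$ = total number of samples (e.g. [ESCN_Nor] has 27 samples)
$T = t^3 - t$ ($t$: number of tied observations sharing one value, i.e. its repeat frequency)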
value frequency
Repeat (value & frequency)
• # Kruskal-Wallis H test • $sample=$#line_each;• for($p=0; $p<=$#groups; $p++){•
$series_numer=($cc_2[$p]/$groups[$p]);• $all+= $series_numer;• }
• for(@ties){ • $series_denomin=($_**3 -$_);• $repeat+= $series_denomin;• }•
$T = t^3 - t$
• $h_numerator= (12/($sample*($sample+1)))*($all)-3*($sample+1) ;
• • $h_denominator=1-• ($repeat)/($sample**3 -$sample);
• $h=$h_numerator/$h_denominator;• $h =sprintf "%.3f",$h;
• $table{$line_each[0]}=$h;• $all=();$repeat=();$h_numerator=();• $h_denominator=();$h=();• }
• foreach (@qq){• @line_increase=split(/\s+/,$_);• $h_increase=$table{$line_increase[0]};• $table_increase->{$line_increase[0]}• =$h_increase;• }
• foreach (@rr){• @line_decrease=split(/\s+/,$_);•
$h_decrease=$table{$line_decrease[0]};• $table_decrease->{$line_decrease[0]}• =$h_decrease;• }
• #Ascending/Descending H-test • & Probe ID • @results=qw/check_1 check_2/;• @h_tables=($table_increase,
$table_decrease);
• for($n=0; $n<=$#results; $n++){• $filename ="$results[$n].txt";• open(FILE, ">$results[$n].txt")• || die"File open Error!\n";• foreach $item (• sort { $h_tables[$n]->{$b} <=>
$h_tables[$n]->{$a} or $a <=> $b } keys %{$h_tables[$n]})
• {• $aa=$h_tables[$n]->{$item};• print FILE $item."\t".$aa."\n";• }• print" 已對檔案『 $filename 』寫出。 \n";• close(FILE);• }
* Reference: $table_increase , $table_decrease
High value of H-test, gene expression
Probe_ID H-value
Ascending
Top 15 of H-test value, gene expression
「 ESC . . EB . . d10 . . . . . 」 ~gene expression Probe_ID
Ascending
• #H-test of Ascending/Descending • monotonic gene
• open (FILE, "check_1.txt") || die "file open error!\n";
• @increase_h_test=<FILE>;• close(FILE);
• open (FILE, "check_2.txt") || die "file open error!\n";
• @decrease_h_test=<FILE>;• close(FILE);
• #High value of H-test, for gene expression (Ascending & Descending)
• foreach (@increase_h_test){• @line1=split(/\s+/,$_);• foreach(@check::monotonic){• if( $_=~/($line1[0](\s+)(.*))/){• push(@probes1,$1);• push(@aa,@probes1);• }• @probes1=();• }• }• foreach (@decrease_h_test){• @line2=split(/\s+/,$_);• foreach(@check::monotonic){• if( $_=~/($line2[0](\s+)(.*))/){• push(@probes2,$1);• push(@bb,@probes2);• }• @probes2=();• }• }
• #Top 15 of H-test value, for gene expression (Ascending & Descending)
• $filename ="top15_increase.txt";• open(FILE, ">top15_increase.txt")||die"File
open Error!\n";• for($i=0; $i<15; $i++){• print FILE "$aa[$i]\n";• }• print" 已對檔案『 $filename 』寫出。 \n";• close(FILE);
• $filename ="top15_decrease.txt";• open(FILE, ">top15_decrease.txt")||die"File
open Error!\n";• for($i=0; $i<15; $i++){• print FILE "$bb[$i]\n";• }• print" 已對檔案『 $filename 』寫出。 \n";• close(FILE);
3. Plot the results
a) Setting “undef”
b) GD::Graph
Probe ID H_value
Probe ID ESC EB d10 …...
Top 5 of H-test value , Samples 對Gene Expression values (Ascending)
• #Top 5 of H-test value , Samples 對 Gene Expression values
• @all=([@aa], [@bb]);• @h_test=([@increase_h_test],
[@decrease_h_test]);• @name=qw/Image_top15_increase
Image_top15_decrease/;
• for($s=0; $s<2; $s++){• for($m=0; $m<=14; $m++){• @line_each=split(/\s+/,$all[$s]->[$m]);• @line3=split(/\s+/,$h_test[$s]->[$m]);• • $value_ref=[• [@line_each[1..3]],• [@line_each[4..6]],• [@line_each[7..12]],• [@line_each[13..18]],• [@line_each[19..27]]• ];
• for($u=0; $u<27; $u++){• push(@re_undef,undef);• }• @line=@re_undef;
• @line_0=@line;• @line_1=@line;• @line_2=@line;• @line_3=@line;• @line_4=@line;
• for($u=0; $u<3; $u++){• $line_0[$u]= $line_each[$u+1];• }• for($u=3; $u<6; $u++){• $line_1[$u]= $line_each[$u+1];• }• for($u=6; $u<12; $u++){• $line_2[$u]= $line_each[$u+1];• }• for($u=12; $u<18; $u++){• $line_3[$u]= $line_each[$u+1];• }• for($u=18; $u<27; $u++){• $line_4[$u]= $line_each[$u+1];• }• @re_undef=();
[1,2,3,un,un,un,un,un,un,un,un,un,un,un,un,un,un,…]
[un,un,un,4,5,6,un,un,un,un,un,un,un,un,un,un,un,…]
[un,un,un,un,un,un,7,8,9,10,11,12,un,un,un,un,un,…]
• use GD::Graph::points;• @data=(• [ undef ,undef , undef ,undef ,
5 ,undef ,undef , undef ,
• undef , 10 ,undef ,undef , • undef ,undef, 15 ,undef ,• undef ,undef ,undef , 20 ,• undef , undef ,undef ,undef ,• 25 ,undef , undef],• [@line_0],[@line_1],[@line_2],• [@line_3],[@line_4],• );• $chart= • new GD::Graph::points(700,500); • # 長、寬
• $chart-> set(• x_label => 'Samples', #x 軸 - 標題• y_label => 'Gene Expression Values',
• #y 軸 - 標題• title=> # 主題• "$line_each[0] H_value:
$line3[1]",
• marker_size => 2, #point 大小• markers => [ 1 ],
• x_label_skip => 1,• y_min_value => 0,• box_axis => 0,• x_min_value => 0,• );
• @legend_keys= • qw(ESC EB d10 d17
NSC);• #point 名稱
• $chart-> set_legend(@legend_keys);• $chart-> set( dclrs =>• [ qw(green pink blue yellow
red) ] );• @line_each=();
• $gd_object=$chart->plot(\@data);• @data=();
• @results=qw/image1 image2 image3 • image4 image5 image6• image7 image8 image9 • image10 image11 image12• image13 image14
image15/;
• $imagename =• "$name[$s]/$results[$m].png";
• open(IMAGE,">$name[$s]/• $results[$m].png") • or die "open >• $name[$s]/$results[$m].png:
$!";
• binmode IMAGE;• print IMAGE $gd_object->png;• print" 已對檔案『 $imagename 』• 寫出。
\n";• close IMAGE;• }• }
Probe ID H_value
Probe ID ESC EB d10 …...
Top 5 of H-test value , Samples 對Gene Expression values (Ascending)
Probe ID H_value
Probe ID ESC EB d10 …...
Top 5 of H-test value , Samples 對Gene Expression values (Ascending)
Probe ID H_value
Probe ID ESC EB d10 …...
Top 5 of H-test value , Samples 對Gene Expression values (Ascending)
Probe ID H_value
Probe ID ESC EB d10 …...
Top 5 of H-test value , Samples 對Gene Expression values (Ascending)
Thank you for listening!