#!/usr/bin/perl $expect_common_0=$expect_common_15=$expect_common_25=$expect_common_35=$expect_common_50= $expect_common_75=0; $real_common_0=$real_common_15=$real_common_25=$real_common_35=$real_common_50=$real_common_75=0; $total_uniqueS_control = $total_uniqueQ_control = 0; $total_slidS_control = $total_slidQ_control = $total_common_control = 0; $total_alignment_lengthQ = 0; $total_alignment_lengthS = 0; $control = 0; $check_eid = 0; %count_intronQ = (); %count_intronS = (); $total_commonQ_0 = 0; $total_commonQ_1 = 0; $total_commonQ_2 = 0; $total_commonS_0 = 0; $total_commonS_1 = 0; $total_commonS_2 = 0; $total_uniqueQ = 0; $total_uniqueS = 0; $total_slidingQ = 0; $total_slidingS = 0; $total_lengthQ = 0 ; $total_lengthS = 0; $count_gene_pairs = 0; $total_query_0=0; $total_query_1=0; $total_query_2=0; $total_sbjct_0=0; $total_sbjct_1=0; $total_sbjct_2=0; $count_unique_pairs = 0; $short_exonQ = $short_exonS = 0; open (OUTPUT, ">output") || die "Can't open output: $!\n"; open (OUT2, ">output2") || die "Can't open output2: $!\n"; open (OUT3, ">output3") || die "Can't open output3: $!\n"; &user_parameters; %phase = (); %position = (); %length = (); @position_s = @position_q = @phase_s = @phase_q = @length_s = @length_q = (); %distr_common = %distr_slid = %distribution_all = (); for $x (0..120) { $distr_common{$x}=0; $distr_slid{$x}=0; $distribution_all{$x}=0; } $control_intronsQ = $control_intronsS = 0; $counted_intronsQ = $counted_intronsS = 0; open (PEID, "$eid") || die "Can't open $eid: $!\n"; $/ = "\n\n"; while () { if (/^>\s+([0-9]+[A-Z]*_)([A-Z]+_[0-9]+)\s+/) { #modified 2/16/2005 $id_eid = $1 . $2; if (/position:([0-9,]+)/){ $position{$id_eid} = $1; } if (/phase:([0-2]+)/) { $phase{$id_eid} = $1; } if (/sum:(\d+)/) { $length{$id_eid} = $1; } $check_eid++; } } close (PEID); print "Hush is done!!! \n $check_eid\n\n"; #__________________________________Block 1: Analyzes and changes BLAST-alignments input @entry=(); @score=@identities=(); open(FH, "$ARGV[0]") || die "Can't open ARGV[0] : $!\n"; $/ = "Reference"; while () { push(@entry, $_); if (/Score += +([0-9]+)/){push(@score, $1);} else { push(@score, 0);} if (/Identities\s+=\s+\d+\/\d+\s+\((\d+)\%/) {push(@identities, $1);} #new addition 4/27/2002 else {push(@identities, 1);} } close (FH); $number_entries = @entry; print "Number entries at step 1 is ", $number_entries, "\n"; print OUTPUT "Number entries at step 1 is $#entry \n"; for ( $x=1; $x < $number_entries; $x++) { open (OUTFILE, ">entry$x") || die "Can't open entry$x: $!\n"; print OUTFILE "$entry[$x]"; close (OUTFILE); } undef (@entry); for ($y = 1; $y < $number_entries; $y++) { $W = 0; @sarray1=(); @sarray2=(); @qarray1=(); @qarray2=(); $s1=$s2=$q1=$q2=0; open(INONE, "entry$y") || die "Can't open entry$y: $!\n"; open (OUTONE, ">temp5_$y") || die "Can't open temp5_$y: $!\n"; $/ = ""; while () { if (/^Query/) {print OUTONE;} elsif (/^> [1-9]+/) {print OUTONE;} elsif (/Expect/) {print OUTONE;} elsif (/^Score/) {print OUTONE;} if (/Query=\s*([0-9]+[A-Z]*_[A-Z]+_)([A-Z0-9]+)\s+/) { #modified on 3/30/05 $id_query[$y] = $1 . $2; } if (/>\s*([0-9]+[A-Z]*_[A-Z]+_)([A-Z0-9]+)\s+/) { #modified on 3/30/05 $id_sbjct[$y] = $1 . $2; } } close (INONE); close (OUTONE); unlink ("entry$y"); open (INTWO, "temp5_$y") || die "Can't open temp5_$y: $!\n"; open (OUTTWO, ">alignment_$y") || die "Can't open alignment_$y: $!\n"; $/ = "Score ="; while () { if (/^ +([0-9]+)/) { if ($1 >= $threshold) { @align_blocks = split("\n\n", $_); print OUTTWO $align_blocks[0], "\n\n"; undef($first_line); undef($second_line); undef($third_line); undef($first_tail); undef($second_tail); undef($third_tail); for $z (1..$#align_blocks) { @lines = (); if ($align_blocks[$z] =~ /Query:/) { @lines = split("\n", $align_blocks[$z]); if ($lines[0] =~ /(Query:\s*\d+\s*)([A-Z\-]+)(\s*\d+\s*)/) { $left_margin = length($1); $body_size = length($2); $right_margin = length($3); if ($z != 1) { $lines[0] = substr($lines[0], $left_margin); # cut left margin $lines[1] = substr($lines[1], $left_margin); # cut left margin $lines[2] = substr($lines[2], $left_margin); # cut left margin $first_tail = substr($lines[0], $body_size); $third_tail = substr($lines[2], $body_size); $lines[0] = substr($lines[0], 0, $body_size); $lines[1] = substr($lines[1], 0, $body_size); $lines[2] = substr($lines[2], 0, $body_size); } else { $first_tail = substr($lines[0], ($left_margin + $body_size)); $third_tail = substr($lines[2], ($left_margin + $body_size)); $lines[0] = substr($lines[0], 0, ($left_margin + $body_size)); $lines[1] = substr($lines[1], 0, ($left_margin + $body_size)); $lines[2] = substr($lines[2], 0, ($left_margin + $body_size)); } $first_line .= $lines[0]; $second_line .= $lines[1]; $third_line .= $lines[2]; } } } $first_line .= $first_tail; $third_line .= $third_tail; print OUTTWO $first_line, "\n", $second_line, "\n", $third_line, "\n\n"; } } } close (INTWO); close (OUTTWO); unlink ("temp5_$y"); open (INTHREE, "alignment_$y") || die "Can't open alignment_$y: $!\n"; open (ALIGN, ">$ARGV[1]/align_$y") || die "Can't open align_$y: $!\n"; $/ = "\n\n"; while () { $remove = 1; if (/Sbjct:\s*(\d+)\s*([A-Z]|-)+\s*(\d+)/) { $s1 = $1; $s2 = $3; for $w (0..$W) { if ($sarray1[$w] <= $s1 && $s1 < $sarray2[$w] - $constant1) {$remove = 0;} if ($sarray1[$w] + $constant1 < $s2 && $s2 <= $sarray2[$w]) {$remove = 0;} if ($s1 <= $sarray1[$w] && $sarray1[$w] < $s2 - $constant1) {$remove = 0;} if ($s1 + $constant1 < $sarray2[$w] && $sarray2[$w] <= $s2) {$remove = 0;} } } if (/Query:\s*(\d+)\s*([A-Z]|-)+\s*(\d+)/) { $q1 = $1; $q2 = $3; for $w (0..$W) { if ($qarray1[$w] <= $q1 && $q1 < $qarray2[$w] - $constant1) {$remove = 0; } if ($qarray1[$w] + $constant1 < $q2 && $q2 <= $qarray2[$w]) {$remove = 0; } if ($q1 <= $qarray1[$w] && $qarray1[$w] < $q2 - $constant1) {$remove = 0; } if ($q1 + $constant1 < $qarray2[$w] && $qarray2[$w] <= $q2) {$remove = 0; } } } if ($rm_control) {$remove = 1;} if ($remove) { $W++; push (@sarray1, $s1); push (@sarray2, $s2); push (@qarray1, $q1); push (@qarray2, $q2); print ALIGN $_; } } close (ALIGN); close (INTHREE); unlink ("alignment_$y"); #________________________________________________________________end of BLAST-alignment modification $position_s[$y] = $position{$id_sbjct[$y]}; $position_q[$y] = $position{$id_query[$y]}; $phase_s[$y] = $phase{$id_sbjct[$y]}; $phase_q[$y] = $phase{$id_query[$y]}; $length_s[$y] = $length{$id_sbjct[$y]}; $length_q[$y] = $length{$id_query[$y]}; @positionS = split (",", $position_s[$y]); @phaseS = split ("", $phase_s[$y]); @positionQ = split (",", $position_q[$y]); @phaseQ = split ("", $phase_q[$y]); #12/1/2001 until this point program is OK! #________________________ Block 2 puts introns on the alignment and maps subject introns on a query sequence # This block as well counts the local gaps, introns at the alignment borders and resolve Single/Multiple intron counting # by giving introns unique lables after counting. $intr_numb_pair = 0; $first_block = 0; $current_lengthQ = 0; $current_lengthS = 0; open (ALIGN, "$ARGV[1]/align_$y") || die "Can't open align_$y: $!\n"; open (TEMPOUT, ">$ARGV[1]/temporary_$y") || die "Can't open temporary_$y: $!\n"; $/ = "\n\n"; while () { @intronsQ = (); @intronsS = (); @intronsS_old = (); $N = 0; @kursor_positionQ = @kursor_positionS = (); %gapsS = (); @gapsS = (); $gapS = 0; $gapsS{-1} = 0; $gapsQ{-1} = 0; %gapsQ = (); @gapsQ = (); $gapQ = 0; @middle_line = (); @alignment_lines = (); #__________________________subblock 2b: analysis of the query gene if ( /(Query: +)([0-9]+)( +)([A-Z-]+) +([0-9]+)/) { unless ($first_block) { print OUT2 $y, "\n", $id_query[$y], "\n", $id_sbjct[$y], "\n"; print TEMPOUT $y, "\tquery is ", $id_query[$y], "\tsubject is ", $id_sbjct[$y], "\n"; $first_block = 1; $control++; } $space_q = length($1) + length($2) + length($3); $N = $2; @seqQ = split ("", $4); $current_lengthQ += $5 - $2 + 1; $total_lengthQ += $5 - $2 + 1; for ($k = 0; $k <= $#seqQ; $k++) { if ($seqQ[$k] =~ /[A-Z]/) { $gapsQ[$k] = 0; $gapsQ{$k} = $gapsQ{($k-1)}} else {$gapQ++; $gapsQ{$k} = $gapQ; $gapsQ[$k] = 1;} } @alignment_lines = split("\n", $_); $alignment_lines[1] = substr($alignment_lines[1], $space_q); @middle_line = split("", $alignment_lines[1]); %identical_left =(); %identical_right =(); @conservative_aa=(); $identical_right{$#middle_line + 1} = $#middle_line + 1; $identical_right{$#middle_line + 2} = $#middle_line + 2; $identical_right{$#middle_line + 3} = $#middle_line + 3; $identical_left{-1} = -1; $identical_left{-2} = -2; $identical_left{-3} = -3; for $k (0..$#middle_line) { if ($middle_line[$k] =~ /[A-Z]/) {$identical_left{$k}= $k;} else {$identical_left{$k}= $identical_left{$k - 1}}; if ($middle_line[$k] =~ /\+/) {$conservative_aa[$k] = 1;} else {$conservative_aa[$k] = 0;} } for ($k = $#middle_line; $k > 0; $k--) { if ($middle_line[$k] =~ /[A-Z]/) {$identical_right{$k}= $k;} else {$identical_right{$k}= $identical_right{$k + 1}}; } for $n (1..$#positionQ+1) { if ($2 <= $positionQ[$n - 1] && $5 >= $positionQ[$n - 1]) { unless ($count_intronQ{$id_query[$y] . '#' . $n} && $single_intron eq "SINGLE") { $count_intronQ{$id_query[$y] . '#' . $n} = 1; $N_Q = $2 - 1; for ($k = 0; $k <= $#seqQ; $k++) { if ($seqQ[$k] =~ /[A-Z]/) {$N_Q++;} if ($N_Q == $positionQ[$n - 1]) { $k2 = $k; last;} } $kursor_position = $space_q + $k2 -1; $intron_position = $positionQ[$n - 1]; if ($phaseQ[$n - 1] == 1) {$intron_position += 0.34;} if ($phaseQ[$n - 1] == 2) {$intron_position += 0.67;} # ADDITION 5/20/2002____________________________________________________________________________________ for $zz (0..$kursor_position) { unless ($kursor_positionQ[$zz] eq '0' || $kursor_positionQ[$zz] eq '1' || $kursor_positionQ[$zz] eq '2') { if ($zz == $kursor_position) { $kursor_positionQ[$zz] = $phaseQ[$n - 1]; } else {$kursor_positionQ[$zz] = ' ';} } } # END OF ADDITION 5/20/2002____________________________________________________________________________________ # printf TEMPOUT "%-${kursor_position}s %-7s\n", "intronQ $n", $intron_position; if (($identical_left{$identical_left{$k2} - 1} >= 0 && $identical_right{$identical_right{$k2} + 1} <= $#middle_line) || $border_rm) { push(@intronsQ, $intron_position); $intr_numb_pair++; $control_intronsQ++; } else {print "BORDER INTRON_Q \n";} } else { $counted_intronsQ++; #print $id_query[$y], '#', $n, " intron_Q $n $positionQ[$n - 1] $phaseQ[$n - 1] has been counted\n"; # print TEMPOUT "intron $n $positionQ[$n - 1] $phaseQ[$n - 1] has been counted before\n"; } } } # ADDITION 5/20/2002____________________________________________________________________________________ @kursor_positionQ = (' ', @kursor_positionQ); $kursor_positionQ = join("", @kursor_positionQ); # END OF ADDITION 5/20/2002____________________________________________________________________________________ # print TEMPOUT $_; print OUT2 join('#', @intronsQ), '_'; } #__________________________subblock 2b: analysis of the subject gene if ( /(Sbjct: +)([0-9]+)( +)([A-Z-]+) +([0-9]+)/) { @seqS = split ("", $4); for ($k = 0; $k <=$#seqS; $k++) { if ($seqS[$k] =~ /[A-Z]/) {$gapsS[$k] = 0; $gapsS{$k} = $gapsS{($k-1)}} else {$gapS++; $gapsS{$k} = $gapS; $gapsS[$k] = 1;} } $space_s = length($1) + length($2) + length($3); $current_lengthS += $5 - $2 + 1; $total_lengthS += $5 - $2 + 1; for $n (1..$#positionS+1) { if ($2 <= $positionS[$n - 1] && $5 >= $positionS[$n - 1]) { unless ($count_intronS{$id_sbjct[$y] . '#' . $n} && $single_intron eq "SINGLE") { $count_intronS{$id_sbjct[$y] . '#' . $n} = 1; $N_S = $2 - 1; for ($k = 0; $k <=$#seqS; $k++) { if ($seqS[$k] =~ /[A-Z]/) {$N_S++;} if ($N_S == $positionS[$n - 1]) { $k2 = $k; last;} } $first_il = $identical_left{$k2}; $second_il = $identical_left{$first_il - 1}; $first_ir = $identical_right{$k2}; $second_ir = $identical_right{$first_ir + 1}; $gap_size_I = 0; $gap_size_II = 0; $conservative = 0; $local_length = $second_ir - $second_il; for $k ($first_il..$first_ir) { if ($gapsS[$k]) {$gap_size_I++;} if ($gapsQ[$k]) {$gap_size_I++;} } for $k ($second_il..$first_il) { if ($gapsS[$k]) {$gap_size_II++;} if ($gapsQ[$k]) {$gap_size_II++;} } for $k ($first_ir..$second_ir) { if ($gapsS[$k]) {$gap_size_II++;} if ($gapsQ[$k]) {$gap_size_II++;} } for $k ($second_il..$second_ir) { if ($conservative_aa[$k]) {$conservative++;} } if($local_length) { $local_homology = (4 + $conserv_change*$conservative)/($local_length + 1); } $local_homology = sprintf("%.3f", $local_homology); $parameters = $gap_size_I . '_' . $gap_size_II . '_' . $local_homology; $kursor_position = $space_s + $k2 -1; $maped_position = $N + $k2 - $gapsQ{$k2}; $intron_position = $positionS[$n - 1]; if ($phaseS[$n - 1] == 1) {$intron_position += 0.34; $maped_position += 0.34;} if ($phaseS[$n - 1] == 2) {$intron_position += 0.67; $maped_position += 0.67;} # ADDITION 5/20/2002____________________________________________________________________________________ for $zz (0..$kursor_position) { unless ($kursor_positionS[$zz] eq '0' || $kursor_positionS[$zz] eq '1' || $kursor_positionS[$zz] eq '2' ) { if ($zz == $kursor_position) { $kursor_positionS[$zz] = $phaseS[$n - 1]; } else {$kursor_positionS[$zz] = ' ';} } } # END OF ADDITION 5/20/2002____________________________________________________________________________________ # printf TEMPOUT "%-${kursor_position}s %-7s %-7s %-10s \n", "intronS $n", $intron_position, $maped_position, $parameters; if ($gap_size_I <= $thr_i1 && $gap_size_II <= $thr_i2 && $local_homology > $thr_lg) { if (($second_il >= 0 && $second_ir <= $#middle_line) || $border_rm) { push(@intronsS, $maped_position); push(@intronsS_old, $intron_position); #10/12/2001 $intr_numb_pair++; $control_intronsS++; } else {print "BORDER INTRON_S \n";} } } else { $counted_intronsS++; # print $id_sbjct[$y], '#', $n, " intron_S $n $positionS[$n - 1] $phaseS[$n - 1] has been counted \n"; print TEMPOUT "intron $n $positionS[$n - 1] $phaseS[$n - 1] has been counted before\n"; } } } print TEMPOUT "\n\n"; print OUT2 join('#', @intronsS), '_'; print OUT2 join('#', @intronsS_old), "\n"; # ADDITION 5/20/2002____________________________________________________________________________________ @kursor_positionS = (' ', @kursor_positionS); $kursor_positionS = join("", @kursor_positionS); @align_lines = split("\n", $_); $line_Q = $align_lines[0]; $line_middle = $align_lines[1]; $line_S = $align_lines[2]; $size_align = length($line_Q); for ($zz = 0; $zz < $size_align; $zz += 100) { print TEMPOUT substr($kursor_positionQ, $zz, 100), "\n"; print TEMPOUT substr($line_Q, $zz, 100), "\n"; print TEMPOUT substr($line_middle, $zz, 100), "\n"; print TEMPOUT substr($line_S, $zz, 100), "\n"; print TEMPOUT substr($kursor_positionS, $zz, 100), "\n\n\n"; } # END OF ADDITION 5/20/2002____________________________________________________________________________________ } #_____________________________new addition 1/17/2001 $gene_common = $gene_slidingQ = $query_unique = $gene_slidingS = $sbjct_unique = 0; for $a (0..$#intronsQ) { $commonQ = 0; $slidingQ = 0; for $b (0..$#intronsS) { if (($intronsQ[$a] - $intronsS[$b]) == 0) {$commonQ = 1;} elsif (abs($intronsQ[$a] - $intronsS[$b]) <= $sliding_threshold) {$slidingQ = 1;} } if ($commonQ) { $gene_common++; $total_common_control++; } elsif ($slidingQ) { $gene_slidingQ++; $total_slidQ_control++; } else { $query_unique++; $total_uniqueQ_control++; } } for $b (0..$#intronsS) { $commonS = 0; $slidingS = 0; for $a (0..$#intronsQ) { if (($intronsQ[$a] - $intronsS[$b]) == 0) {$commonS = 1;} elsif (abs($intronsQ[$a] - $intronsS[$b]) <= $sliding_threshold) {$slidingS = 1;} } if ($commonS) {} elsif ($slidingS) { $gene_slidingS++; $total_slidS_control++; } else { $sbjct_unique++; $total_uniqueS_control++; } } if ($first_block) { print TEMPOUT "OLD_TYPE INTRON COUNTING \n"; print TEMPOUT " #common is $gene_common \n #slidingQ is $gene_slidingQ \n #slidingS is $gene_slidingS \n"; print TEMPOUT " #uniqueQ is $query_unique \n #uniqueS is $sbjct_unique \n\n\n"; } #_____________________________ END OF new addition 1/17/2001 } close (TEMPOUT); if ($first_block) { print OUT2 $current_lengthQ, "\n", $current_lengthS, "\n", $score[$y], "\n", $identities[$y], "\n\n"; $count_gene_pairs++; } close (ALIGN); unlink ("$ARGV[1]/align_$y"); } print "$count_gene_pairs is the total number of gene pairs \n"; print "summ of the query sequence length is $total_lengthQ \n"; print "summ of the sbjct sequence length is $total_lengthS \n"; close (OUT2); #____________Block 3: Finds unique pairs %seen = (); %intron_number = (); %introns_info = (); %lengthQ = (); %lengthS = (); %gene_info = (); %y = (); %identities = (); open (COMPARISON, "output2") || die "Can't open output2: $!\n"; $/ = "\n\n"; if ($rm_pairs == 0) { print OUT3 "Subject - first, Query - second \n"; while () { chomp; chomp; @chunk = split("\n", $_); $identities = pop(@chunk); chomp($identities); $intron_number = pop(@chunk); $lengthS = pop(@chunk); chomp($lengthS); $lengthQ = pop(@chunk); chomp($lengthQ); chomp($intron_number); $y = shift(@chunk); chomp($y); $query = shift(@chunk); $sbjct = shift(@chunk); if ($intron_number > 0.5) { if ($seen{$sbjct} == 0) { $seen{$sbjct} = 1; $intron_number{$sbjct} = $intron_number; $introns_info{$sbjct} = [@chunk]; $lengthQ{$sbjct} = $lengthQ; $lengthS{$sbjct} = $lengthS; $gene_info{$sbjct} = $query; $y{$sbjct} = $y; $identities{$sbjct} = $identities; } else { if ($intron_number{$sbjct} < $intron_number) { $intron_number{$sbjct} = $intron_number; $introns_info{$sbjct} = [@chunk]; $lengthQ{$sbjct} = $lengthQ; $lengthS{$sbjct} = $lengthS; $gene_info{$sbjct} = $query; $y{$sbjct} = $y; $identities{$sbjct} = $identities; } } } } } else { print OUT3 "Query - first, Subject - second \n"; while () { chomp; chomp; @chunk = split("\n", $_); $identities = pop(@chunk); chomp($identities); $intron_number = pop(@chunk); chomp($intron_number); $lengthS = pop(@chunk); chomp($lengthS); $lengthQ = pop(@chunk); chomp($lengthQ); $y = shift(@chunk); chomp($y); $query = shift(@chunk); $sbjct = shift(@chunk); if ($intron_number > 0.5) { $introns_info{$query} = [@chunk]; $lengthQ{$query} = $lengthQ; $lengthS{$query} = $lengthS; $gene_info{$query} = $sbjct; $y{$query} = $y; $identities{$sbjct} = $identities; } } } #_______________Block 4 counts common, unique and sliding introns foreach $x (keys %introns_info) { $single_aling_count = 0; $gene_slidingQ = $gene_slidingS = 0; $number_introns_in_alignmentQ = 0; $number_introns_in_alignmentS = 0; $gene_commonQ = 0; $gene_uniqueQ = 0; $gene_uniqueS = 0; $gene_slidingQ = 0; $gene_commonS = 0; $gene_slidingS = 0; $gene_commonQ_0 = $gene_commonQ_1 = $gene_commonQ_2 = 0; $gene_commonS_0 = $gene_commonS_1 = $gene_commonS_2 = 0; $query_0 = $query_1 = $query_2 = 0; $sbjct_0 = $sbjct_1 = $sbjct_2 = 0; foreach $z (@{$introns_info{$x}}) { $single_aling_count++; chomp $z; @Q_S = split("_", $z); @query = split("#", $Q_S[0]); @sbjct = split("#", $Q_S[1]); @sbjct_old = split("#", $Q_S[2]); #10/12/2001 $block_commonQ_0 = 0; $block_commonQ_1 = 0; $block_commonQ_2 = 0; $block_commonS_0 = 0; $block_commonS_1 = 0; $block_commonS_2 = 0; $block_slidingQ = 0; $block_slidingS = 0; $query_unique = 0; $sbjct_unique = 0; for $iq (0..$#query) { if ($query[$iq] =~ /\.34/) {$query_1++;} elsif ($query[$iq] =~ /\.67/) {$query_2++;} else {$query_0++;} if (abs($query[$iq + 1] - $query[$iq]) <= 2*$sliding_threshold){ $short_exonQ++; #print "Short exonQ $iq in the alignment $y{$x} \n"; } } for $is (0..$#sbjct) { if ($sbjct[$is] =~ /\.34/) { $sbjct_1++;} elsif ($sbjct[$is] =~ /\.67/) { $sbjct_2++;} else { $sbjct_0++;} if (abs($sbjct[$is + 1] - $sbjct[$is]) <= 2*$sliding_threshold){ $short_exonS++; #print "Short exonS $is in the alignment $y{$x} \n"; } } @common_positions = (); @s_temp = @sbjct; #10_27_01 addition for $iq (0..$#query) { $common_0 = 0; $common_1 = 0; $common_2 = 0; for $is (0..$#s_temp) { if (($query[$iq] - $s_temp[$is]) == 0) { push(@common_positions, $iq); $s_temp[$is] = -5; #10_27_01 addition if ($query[$iq] =~ /\.34/){$common_1 = 1;} elsif ($query[$iq] =~ /\.67/) {$common_2 = 1;} else {$common_0 = 1;} last; } } #8_24_2001 Here I should add the distribution of query introns along gene length $intron_position = int(100 * $query[$iq] / ($length_q[$y{$x}] + 1)); $distribution_all{$intron_position}++; $bin_numb = int($query[$iq] / 5); #ADDITION 11/2/2001 $bin_intron[$bin_numb]++; if ($common_0) {$block_commonQ_0++; $distr_common{$intron_position}++; $bin_common[$bin_numb]++;} elsif ($common_1) {$block_commonQ_1++; $distr_common{$intron_position}++; $bin_common[$bin_numb]++;} elsif ($common_2) {$block_commonQ_2++; $distr_common{$intron_position}++; $bin_common[$bin_numb]++;} undef($intron_position); } $gene_bin_numb = int($length_q[$y{$x}] / 5); #ADDITION 11/2/2001 if ($single_aling_count == 1) { for $nn (0..$gene_bin_numb) { $bin_gene[$nn]++; } } for $iq (0..$#query) { $sliding = 0; $skip = 1; foreach $zz (@common_positions) {if ($iq == $zz){$skip = 0;}} if ($skip) { for $is (0..$#s_temp) { #10_27_01 addition block if (abs($query[$iq] - $s_temp[$is]) <= $sliding_threshold) { $sliding = 1; $s_temp[$is] = 1000000; } } } $intron_position = int(100 * $query[$iq] / ($length_q[$y{$x}] + 1)); if ($sliding) {$block_slidingQ++; $distr_slid{$intron_position}++;} else {$query_unique++;} undef($intron_position); } @common_positions = (); @q_temp = @query; #10_27_01 addition for $is (0..$#sbjct) { $common_0 = 0; $common_1 = 0; $common_2 = 0; for $iq (0..$#q_temp) { if (($q_temp[$iq] - $sbjct[$is]) == 0) { $q_temp[$iq] = -50; #10_27_01 addition push(@common_positions, $is); if ($sbjct[$is] =~ /\.34/){$common_1 = 1;} elsif ($sbjct[$is] =~ /\.67/) {$common_2 = 1;} else {$common_0 = 1;} last; } } #10_12_2001 Here I should add the distribution of subject introns along gene length $intron_positionS = int(100 * $sbjct_old[$is] / ($length_s[$y{$x}] + 1)); $distribution_allS{$intron_positionS}++; $bin_numb_s = int($sbjct_old[$is] / 5); #ADDITION 11/2/2001 $bin_intron_s[$bin_numb_s]++; if ($common_0) {$block_commonS_0++;} elsif ($common_1) {$block_commonS_1++;} elsif ($common_2) {$block_commonS_2++;} } $gene_bin_numb_s = int($length_s[$y{$x}] / 5); #ADDITION 11/2/2001 if ($single_aling_count == 1) { for $nn (0..$gene_bin_numb_s) { $bin_gene_s[$nn]++; } } for $is (0..$#sbjct) { $sliding = 0; $skip = 1; foreach $zz (@common_positions) {if ($is == $zz){$skip = 0;}} if ($skip) { for $iq (0..$#q_temp) { #10_27_01 addition block if (abs($q_temp[$iq] - $sbjct[$is]) <= $sliding_threshold) { #Addition from 5/21/2002___________________________________________ if ($q_temp[$iq] =~ /\.34/){$q_phase = '1';} elsif ($q_temp[$iq] =~ /\.67/){$q_phase = '2';} else {$q_phase = 'z';} if ($sbjct[$is] =~ /\.34/){$s_phase = '1';} elsif ($sbjct[$is] =~ /\.67/){$s_phase = '2';} else {$s_phase = 'z';} $slid_phases = $q_phase . $s_phase; $slid_hash{$slid_phases}++; $nt_lenght = 3*abs($q_temp[$iq] - $sbjct[$is]); $nt_lenght = sprintf("%.0f", $nt_lenght); $slid_length{$nt_lenght}++; #END OF Addition from 5/21/2002___________________________________________ $sliding = 1; $q_temp[$iq] = 5000000; } } } if ($sliding) {$block_slidingS++;} else {$sbjct_unique++;} } #12/25/2001 addition to prog_CIP8. IMPORTANT! variable $query_unique is unique + common. SAME TRUE FOR subjct $gene_uniqueQ += $query_unique - $block_commonQ_0 - $block_commonQ_1 - $block_commonQ_2; $gene_uniqueS += $sbjct_unique - $block_commonS_0 - $block_commonS_1 - $block_commonS_2; $gene_commonQ += $block_commonQ_0 + $block_commonQ_1 + $block_commonQ_2; $gene_commonS += $block_commonS_0 + $block_commonS_1 + $block_commonS_2; $gene_slidingQ += $block_slidingQ; $gene_commonQ_0 += $block_commonQ_0; $gene_commonQ_1 += $block_commonQ_1; $gene_commonQ_2 += $block_commonQ_2; $gene_slidingS += $block_slidingS; $gene_commonS_0 += $block_commonS_0; $gene_commonS_1 += $block_commonS_1; $gene_commonS_2 += $block_commonS_2; } # 12/25/2001 ADDITION for prog_CIP8 $number_introns_in_alignmentQ = $gene_uniqueQ + $gene_slidingQ + $gene_commonQ; $number_introns_in_alignmentS = $gene_uniqueS + $gene_slidingS + $gene_commonS; if ($number_introns_in_alignmentQ && $number_introns_in_alignmentS) { $total_alignment_lengthQ += $lengthQ{$x}; $total_alignment_lengthS += $lengthS{$x}; $total_slidingQ += $gene_slidingQ; $total_slidingS += $gene_slidingS; $total_commonQ_0 += $gene_commonQ_0; $total_commonQ_1 += $gene_commonQ_1; $total_commonQ_2 += $gene_commonQ_2; $total_commonS_0 += $gene_commonS_0; $total_commonS_1 += $gene_commonS_1; $total_commonS_2 += $gene_commonS_2; $total_uniqueQ += $gene_uniqueQ; $total_uniqueS += $gene_uniqueS; $total_query_0 += $query_0; $total_query_1 += $query_1; $total_query_2 += $query_2; $total_sbjct_0 += $sbjct_0; $total_sbjct_1 += $sbjct_1; $total_sbjct_2 += $sbjct_2; #4/27/2002 ADDITION for prog_CIP9 identity_bins if ($identities{$x} == 1) { $expect_common_0 += (2 * $number_introns_in_alignmentQ * $number_introns_in_alignmentS / ($lengthQ{$x} + $lengthS{$x})); $real_common_0 += $gene_commonQ_0 + $gene_commonQ_1 + $gene_commonQ_2; $real_slid_0 += $gene_slidingS; } if ($identities{$x} >= 15 && $identities{$x} <= 25 ) { $expect_common_15 += (2 * $number_introns_in_alignmentQ * $number_introns_in_alignmentS / ($lengthQ{$x} + $lengthS{$x})); $real_common_15 += $gene_commonQ_0 + $gene_commonQ_1 + $gene_commonQ_2; $real_slid_15 += $gene_slidingS; } if ($identities{$x} > 25 && $identities{$x} <= 35 ) { $expect_common_25 += (2 * $number_introns_in_alignmentQ * $number_introns_in_alignmentS / ($lengthQ{$x} + $lengthS{$x})); $real_common_25 += $gene_commonQ_0 + $gene_commonQ_1 + $gene_commonQ_2; $real_slid_25 += $gene_slidingS; } if ($identities{$x} > 35 && $identities{$x} <= 45 ) { $expect_common_35 += (2 * $number_introns_in_alignmentQ * $number_introns_in_alignmentS / ($lengthQ{$x} + $lengthS{$x})); $real_common_35 += $gene_commonQ_0 + $gene_commonQ_1 + $gene_commonQ_2; $real_slid_35 += $gene_slidingS; } if ($identities{$x} > 45 && $identities{$x} <= 55 ) { $expect_common_45 += (2 * $number_introns_in_alignmentQ * $number_introns_in_alignmentS / ($lengthQ{$x} + $lengthS{$x})); $real_common_45 += $gene_commonQ_0 + $gene_commonQ_1 + $gene_commonQ_2; $real_slid_45 += $gene_slidingS; } if ($identities{$x} > 55 && $identities{$x} <= 65 ) { $expect_common_55 += (2 * $number_introns_in_alignmentQ * $number_introns_in_alignmentS / ($lengthQ{$x} + $lengthS{$x})); $real_common_55 += $gene_commonQ_0 + $gene_commonQ_1 + $gene_commonQ_2; $real_slid_55 += $gene_slidingS; } if ($identities{$x} > 65 ) { $expect_common_65 += (2 * $number_introns_in_alignmentQ * $number_introns_in_alignmentS / ($lengthQ{$x} + $lengthS{$x})); $real_common_65 += $gene_commonQ_0 + $gene_commonQ_1 + $gene_commonQ_2; $real_slid_65 += $gene_slidingS; } # END OF 4/27/2002 ADDITION for prog_CIP9 identity_bins $count_unique_pairs++; print OUT3 $gene_commonQ, "\t", $gene_slidingQ, "\t", $gene_uniqueQ, "\t", $gene_uniqueS, "\t", $lengthQ{$x}, "\t"; print OUT3 $x, "\t", $gene_info{$x}, "\t", $y{$x}, "\n"; } if ($gene_slidingQ != $gene_slidingS) { print "Warning sliding Q != S for $y{$x} \n"; $q_check = join('#', @q_temp); print "query introns $q_check \n"; $q_check2 = join('#', @query); print "query normal $q_check2 \n"; $s_check = join('#', @s_temp); print "subject introns $s_check \n"; $s_check2 = join('#', @sbjct); print "subject normal $s_check2 \n"; } } for $b (0..$number_entries) { $keep_it = 0; foreach $x (values %y) { if ($b == $x){ $keep_it = 1; last; } } unless ($keep_it || $b == 3572 || $b == 3587) {unlink ("$ARGV[1]/temporary_$b");} } $total_commonQ_sum = $total_commonQ_0 + $total_commonQ_1 + $total_commonQ_2; $total_commonS_sum = $total_commonS_0 + $total_commonS_1 + $total_commonS_2; print "commonQ0 $total_commonQ_0 commonQ1 $total_commonQ_1 commonQ2 $total_commonQ_2 \t sum is $total_commonQ_sum \n"; print "\tphase distribution for common query introns \n"; print 'phase 0 ', sprintf("%.2f", ($total_commonQ_0 / $total_commonQ_sum)), "\n"; print 'phase 1 ', sprintf("%.2f", ($total_commonQ_1 / $total_commonQ_sum)), "\n"; print 'phase 2 ', sprintf("%.2f", ($total_commonQ_2 / $total_commonQ_sum)), "\n\n"; print "commonS0 $total_commonS_0 commonS1 $total_commonS_1 commonS2 $total_commonS_2 \t sum is $total_commonS_sum \n"; print "\tphase distribution for common subject introns \n"; print 'phase 0 ', sprintf("%.2f", ($total_commonS_0 / $total_commonS_sum)), "\n"; print 'phase 1 ', sprintf("%.2f", ($total_commonS_1 / $total_commonS_sum)), "\n"; print 'phase 2 ', sprintf("%.2f", ($total_commonS_2 / $total_commonS_sum)), "\n\n"; $total_query = $total_query_0 + $total_query_1 + $total_query_2; print "number of phase 0 for query $total_query_0 \n"; print "number of phase 1 for query $total_query_1 \n"; print "number of phase 2 for query $total_query_2 \n"; print "sum is $total_query \n"; print "\tphase distribution for all query introns \n"; print 'phase 0 ', sprintf("%.3f", ($total_query_0 / $total_query)), "\n"; print 'phase 1 ', sprintf("%.3f", ($total_query_1 / $total_query)), "\n"; print 'phase 2 ', sprintf("%.3f", ($total_query_2 / $total_query)), "\n\n"; $total_sbjct = $total_sbjct_0 + $total_sbjct_1 + $total_sbjct_2; print "number of phase 0 for sbjct $total_sbjct_0 \n"; print "number of phase 1 for sbjct $total_sbjct_1 \n"; print "number of phase 2 for sbjct $total_sbjct_2 \n"; print "sum is $total_sbjct \n"; print "\tphase distribution for all subject introns \n"; print 'phase 0 ', sprintf("%.3f", ($total_sbjct_0 / $total_sbjct)), "\n"; print 'phase 1 ', sprintf("%.3f", ($total_sbjct_1 / $total_sbjct)), "\n"; print 'phase 2 ', sprintf("%.3f", ($total_sbjct_2 / $total_sbjct)), "\n\n"; print "number of unique query introns $total_uniqueQ \nnumber of unique subject introns $total_uniqueS \n\n"; print "number of sliding query introns with threshold $sliding_threshold is $total_slidingQ \n"; print "number of sliding subject introns with threshold $sliding_threshold is $total_slidingS \n\n"; print "Number of counted gene pairs is $count_unique_pairs \n"; print "total length of analyzed alignmentsQ is $total_alignment_lengthQ \n"; print "total length of analyzed alignmentsS is $total_alignment_lengthS \n\n"; #print "total number of intronsQ in all alignments is $control_intronsQ \n"; #print "total number of intronsS in all alignments is $control_intronsS \n"; #print "total number of query introns counted many times is $counted_intronsQ \n"; #print "total number of subject introns counted many times is $counted_intronsS \n"; print "number of short query exons is $short_exonQ \n"; print "number of short subject exons is $short_exonS \n"; #print "control: # of uniqueQ introns is $total_uniqueQ_control \n"; #print "control: # of uniqueS introns is $total_uniqueS_control \n"; #print "control: # of slidingS introns is $total_slidS_control \n"; #print "control: # of slidingQ introns is $total_slidQ_control \n"; #print "control: # of common introns is $total_common_control \n"; print "\n\n\nIDENTITIY DISTRIBUTION EXPECT REAL \n"; print "VERY LOW IDENTITIES\t $expect_common_0 \t $real_common_0 \t $real_slid_0 \n"; print "15-25pc IDENTITIES\t $expect_common_15 \t $real_common_15 \t $real_slid_15 \n"; print "25-35pc IDENTITIES\t $expect_common_25 \t $real_common_25 \t $real_slid_25 \n"; print "35-45pc IDENTITIES\t $expect_common_35 \t $real_common_35 \t $real_slid_35 \n"; print "45-55pc IDENTITIES\t $expect_common_45 \t $real_common_45 \t $real_slid_45 \n"; print "55-65pc IDENTITIES\t $expect_common_55 \t $real_common_55 \t $real_slid_55 \n"; print "65-100pc IDENTITIES\t $expect_common_65 \t $real_common_65 \t $real_slid_65 \n\n"; print "sliding phases 00 number of occurance is $slid_hash{'zz'} \n"; print "sliding phases 01 number of occurance is $slid_hash{'z1'} \n"; print "sliding phases 02 number of occurance is $slid_hash{'z2'} \n"; print "sliding phases 10 number of occurance is $slid_hash{'1z'} \n"; print "sliding phases 11 number of occurance is $slid_hash{'11'} \n"; print "sliding phases 12 number of occurance is $slid_hash{'12'} \n"; print "sliding phases 20 number of occurance is $slid_hash{'2z'} \n"; print "sliding phases 21 number of occurance is $slid_hash{'21'} \n"; print "sliding phases 22 number of occurance is $slid_hash{'22'} \n\n\n"; print "Distribution of sliding distances in nucleotides \n"; while ( ($k, $v) = each (%slid_length)) { print "$k \t $v \n"; } $sum_common = $sum_intron_q = $sum_intron_s = 0; open (OUT4, ">output4") || die "Can't open output4: $!\n"; print OUT4 "\n\n\n DISTRIBUTIONS \n percent\tcommon\tslid\tall_Q\tall_S\trel_common\trel_slid\n"; for $x (0..120) { print OUT4 $x, "\t", $distr_common{$x}, "\t", $distr_slid{$x}, "\t", $distribution_all{$x}, "\t", $distribution_allS{$x}, "\t"; if ($distribution_all{$x}) { print OUT4 sprintf("%.3f", $distr_common{$x}/$distribution_all{$x}), "\t"; print OUT4 sprintf("%.3f", $distr_slid{$x}/$distribution_all{$x}), "\t"; } print OUT4 "\n"; } print OUT4 "\n\n\n\n"; for $n (0..1000) { $sum_common +=$bin_common[$n]; $sum_intron_q +=$bin_intron[$n]; $sum_intron_s +=$bin_intron_s[$n]; print OUT4 $n, "\t", $bin_common[$n], "\t", $bin_intron[$n], "\t", $bin_intron_s[$n], "\t"; print OUT4 $bin_gene[$n],"\t", $bin_gene_s[$n], "\t"; if ($bin_gene[$n]) { print OUT4 sprintf("%.4f", $bin_common[$n]/$bin_gene[$n]), "\t"; print OUT4 sprintf("%.4f", $bin_intron[$n]/$bin_gene[$n]), "\t"; } else {print OUT4 '####', "\t", '####', "\t";} if ($bin_gene_s[$n]) { print OUT4 sprintf("%.4f", $bin_intron_s[$n]/$bin_gene_s[$n]), "\t"; } else {print OUT4 '####', "\t";} if ($bin_intron[$n]) { print OUT4 sprintf("%.4f", $bin_common[$n]/$bin_intron[$n]), "\t"; } else {print OUT4 '####', "\t";} if ($bin_intron_s[$n]) { print OUT4 sprintf("%.4f", $bin_common[$n]/$bin_intron_s[$n]), "\t"; } else {print OUT4 '####', "\t";} print OUT4 "\n"; } print "\t\t LAST CONTROL common introns $sum_common \t Q_introns $sum_intron_q \t S_introns $sum_intron_s \n"; sub user_parameters { if ($#ARGV != 1 ){ die ("USAGE: prog_Amir \n"); } if (! -d $ARGV[1]) { mkdir $ARGV[1], 0755 || die "Unable to create output directory"; } printf OUTPUT ("The date was %04d-%02d-%02d\n", sub {($_[5]+1900, $_[4]+1, $_[3])} ->(localtime)); print OUTPUT 'The name of input blastout file is ', $ARGV[0], "\n"; print OUTPUT 'The name of output directory is ', $ARGV[1], "\n"; print 'Give the threshold for alignment homology in BLAST SCORE BITS (integer)' , "\n"; $threshold = ; chomp($threshold); $constant1 = 10; print 'Do you want to remove pairs which genes have been counted ??' , "\n"; print "\t type y/n \n"; $answer4 = ; chomp($answer4); if ($answer4 =~ /[yY]/) { $rm_pairs = 0; } else { $rm_pairs = 1; } print 'Do you want to remove off overlapping alignments ??' , "\n"; print "\t type y/n \n"; $answer2 = ; chomp($answer2); if ($answer2 =~ /[yY]/) { $rm_control = 0; } else { $rm_control = 1; } print 'The variable rm_control is ', "$rm_control \n"; print OUTPUT 'The variable rm_control is ', "$rm_control \n"; print 'Do you want to count each intron only once ??' , "\n\n"; print "\t type y/n \n\n"; $answer3 = ; chomp($answer3); if ($answer3 =~ /[yY]/) { $single_intron = "SINGLE"; } else { $single_intron = "MULTIPLE"; } if ($single_intron eq "SINGLE" && $rm_pairs == 0) { print "WARNING! using counting only unique pair option, intron counting option must be MULTIPLE \n"; print "your choice has been changed to MULTIPLE \n"; $single_intron = "MULTIPLE"; } print 'Now the SINGLE/MULTIPLE status for the intron counting is', $single_intron, "\n\n\n"; print OUTPUT 'Now the SINGLE/MULTIPLE status for the intron counting is', $single_intron, "\n\n\n"; #print "Enter the sliding threshold for counting intron positions \n"; #$sliding_threshold = ; #chomp($sliding_threshold); $sliding_threshold = 2.1; #print 'Now you need to chose the effect of conservative aa changes on the calculation of protein homology level' , "\n\n"; #print 'in this program percent of homology is calculated by the forlmula:', "\n"; #print "\t\t", '%[HOMOLOGY] = %[IDENTITY] + %[CONSERVATIVE_AA_CHANGES]*[COEFFICIENT]', "\n\n"; #print 'You have to chose a value of the COEFFICIENT for conservative amino acid changes from 0.0 to 1.0', "\n"; #print "\t\t", '0.0 means that you will not count conservative aa changes at all'; #print "\n\t\t", '1.0 means that conservative aa changes will be equal to the identical aa in the alignments', "\n\n"; #$answer4 = ; chomp($answer4); #$conserv_change = $answer4; #print 'Now the value of the COEFFICIENT is ', $conserv_change, "\n\n"; #print OUTPUT 'Now the value of the COEFFICIENT is ', $conserv_change, "\n\n"; $conserv_change = 0.3; #print "Enter the threshold for size of a gap in interval I \n"; #$thr_i1 = ; chomp($thr_i1); #print "Enter the threshold for size of a gap in interval II \n"; #$thr_i2 = ; chomp($thr_i2); #print "Enter the threshold for the local homology level \n"; #$thr_lg = ; chomp($thr_lg); #print "You entered the following parameters 1- $thr_i1 \t 2- $thr_i2 \t 3- $thr_lg \n\n\n"; #print OUTPUT "You entered the following parameters 1- $thr_i1 \t 2- $thr_i2 \t3- $thr_lg \n\n\n"; #print "Do you want to remove intron positions at the alignment borders? y/n \n"; #$border_rm = ; #if ($border_rm =~ /y/) {$border_rm = 0;} #else {$border_rm = 1;} $thr_i1 = 100; $thr_i2 = 100; $thr_lg = 0.01; $border_rm = 1; #$eid = 'gb116dros.removed_noncan.pEID'; print "Please, enter the name and the path of the pEID file \n"; print 'EXAMPLE gb116dros.removed_noncan.pEID', "\n"; print 'EXAMPLE gb121.rm_noncan.pEID', "\n"; print "\t\t NOTE pEID file is very important \n"; print "\t\t all the information about intron phases and positions are taken from this file \n"; print "\t\t Therefore pEID should be updated if you change EID or any EID filter \n"; $eid = ; chomp($eid); print OUTPUT 'eid was', $eid, "\n"; @bin_intron = @bin_intron_s = @bin_gene_s = @bin_gene = @bin_common = (); for $n (0..2000) { $bin_intron[$n] = 0; $bin_gene[$n] = 0; $bin_common[$n] = 0; $bin_intron_s[$n] = 0; $bin_gene_s[$n] = 0; } %slid_hash = ( 'zz' => 0, 'z1' => 0, 'z2' => 0, '1z' => 0, '11' => 0, '12' => 0, '2z' => 0, '21' => 0, '22' => 0); %slid_length = (); }