#!/usr/bin/perl use warnings; # Perl script: AvMutInfo.pl # Authors: Vadim Filatov & Alexei Fedorov # Bioinformatics Lab, Department of Medicine # HSC, University of Toledo, Toledo, Ohio, USA # and DINOM LLC, Moscow, Russia # written July, 2010 # Contact alexei.fedorov@utoledo.edu # # LICENSE: GPL version 3 or later #AvMutInfo.pl executes algorithm published by Grosse et al 2000 in Pscific Symposium on Biocomiuting 5:611-620 #However, we changed the suggested formula (3) form AMI = (I(3) + 2*I(4))/3 into AMI = (I(3) + I(4) + I(5))/3 #because the initial version sometimes produces negative AMI values #the program takes as an input Fasta-fromated sequences ($ARGV[0]) and produces an output file ($ARGV[1]) in the two-column format #the first column represents the consecutive number of the current input sequence ($num_recs_read) #the second column represents AMI value for this sequence #the second column might, instead of AMI value, represent the following warnings or special (1 or -1) AMI values: # 1) non-ATCG characters (within current input seq, which makes no sence to calculate AMI) # 2) short seq (Note that differentiation of exons/introns by AMI strongly correlates with the length of the sequence window. So, w=50nt is several times worse than w=150nt.) #Thus, short seqs must be ommited. The minimal length of the input sequence are determined by a user with the $length_threshold variable (see below) # 3) AMI = 1, when "NT fequency is 0" (Pi-value from Grosse et al equal 0; This causes the illegal division by zero in formula (1).) # 4) AMI =-1. when $P3[$i][$j], or $P4[$i][$j], or $P5[$i][$j] is equal zero. This causes illegal log(0). It happens when one base (A,T,C, or G) is practilcally absent in the seq. #finally the program outputs on the screen the distribution of AMI values of the input seqs #parameters that a user can change: $length_threshold =44; #input seq must be longer than this threshold $cutoff = 151; #extra long exons and introns will be truncated by this procedure: $seq = substr($seq,20,$cutoff); ########################################################################## if (scalar(@ARGV) != 2) { print STDOUT "Usage: AvMutInfo.pl \n"; print STDOUT "\twhere is the name of a file containing sequences to be analyzed (in FASTA format),\n"; print STDOUT "\t is the name of a file to hold the output two-column table with AMI values,\n"; print STDOUT "\n"; exit 1; } @lines =(); $num_recs_read =0; %dist =(); $short_count=0; $count_nonATCG=0; for my $x (1..7) {$dist{$x}=0;} # This hash calculates distribution of AMI at the end of the script $/ = "\n>"; open( OUT, ">$ARGV[1]" ) || die "Can't open argv1: $!\n"; open( INFILE, "$ARGV[0]" ) || die "Can't open argv2: $!\n"; OUTER: while () { #initiation of arrays from Grosse et al algorithm. Here we convert NTs into digits: A=0; C=1; G=2; T=3. #This allows to keep number/freq of occurences of nts at particular redading frame positions in 2D arrays #First dimention of the arrays is the phase [0,1, or 2]; the second dimention is the NT type [0, 1, 2, or 3] @N =(); @P3 =(); @P4 = (); @P5=(); @F = (); @nf = (); $I3=0; $I4=0; $I5=0; $ami=0; $l=0; #input seq processing $num_recs_read++; $seq = ''; chop($_); chop($_); chomp($_); @lines = split( "\n", $_ ); chomp( @lines ); chomp( @lines ); for my $z ( 1 .. $#lines ) { $lines[$z] = uc($lines[$z]); $lines[$z] =~ s/\s//g; $seq .= $lines[$z]; } if ($seq =~ /[^AGTC]/) {print OUT $num_recs_read, "\t", "non-ATCG characters\n"; $count_nonATCG++; next OUTER; } if (length($seq) > 1000) {$seq = substr($seq,20,$cutoff);} if (length($seq) > $length_threshold) { #Grosse et al suggest that seq length must be a multiple of 3. Here we chop last NTs when nessesary to fit this rule. my $length = length($seq); my $remainder = $length % 3; if ($remainder ==1) {chop $seq;} if ($remainder ==2) {chop $seq; chop $seq;} $l =length($seq); #these 4 subs create @N -- NT occurances at different reading frames (phases) #step 1 from Grosse et al &composition($seq, 'A', 0); &composition($seq, 'C', 1); &composition($seq, 'G', 2); &composition($seq, 'T', 3); #these loops convert number of NT occurances into frequences (@F) #step 2 from Grosse et al for my $i (0..2){ for my $j (0..3) { unless ($N[$i][$j]) {$N[$i][$j] =0; } $F[$i][$j] = $N[$i][$j] *3 /$l; } } #step 3 from Grosse et al. for my $i (0..2){ for my $j (0..3) { $P3[$i][$j] = ($F[0][$i]*$F[0][$j] + $F[1][$i]*$F[1][$j] + $F[2][$i]*$F[2][$j]) /3; $P4[$i][$j] = ($F[0][$i]*$F[1][$j] + $F[1][$i]*$F[2][$j] + $F[2][$i]*$F[0][$j]) /3; $P5[$i][$j] = ($F[0][$i]*$F[2][$j] + $F[1][$i]*$F[0][$j] + $F[2][$i]*$F[1][$j]) /3; unless ($P3[$i][$j] ) {$ami = -1; print OUT $num_recs_read, "\t", $ami, "\n"; $dist{1}++; next OUTER;} unless ($P4[$i][$j] ) {$ami = -1; print OUT $num_recs_read, "\t", $ami, "\n"; $dist{1}++; next OUTER;} unless ($P5[$i][$j] ) {$ami = -1; print OUT $num_recs_read, "\t", $ami, "\n"; $dist{1}++; next OUTER;} } } #step 4 from Grosse et al. for my $i (0..3) { $nf[$i] = ($F[0][$i] + $F[1][$i] + $F[2][$i]) /3; } #step 5 from Grosse et al for my $i (0..2){ for my $j (0..3) { if ($nf[$i]*$nf[$j] > 0.00001) { $I3 += $P3[$i][$j] * (&log2($P3[$i][$j]/($nf[$i]*$nf[$j]))); $I4 += $P4[$i][$j] * (&log2($P4[$i][$j]/($nf[$i]*$nf[$j]))); $I5 += $P5[$i][$j] * (&log2($P5[$i][$j]/($nf[$i]*$nf[$j]))); } else { print "NT fequency is 0 \n", $seq, "\n"; $ami = 1; print OUT $num_recs_read, "\t", $ami, "\n"; $dist{7}++; next OUTER; } } } #step 6 from Grosse et al $ami = ($I3 + $I4 + $I5)/3; $ami = sprintf("%.5f", $ami); # Additional computation of AMI distribution if ( $ami <0.00001) {$dist{1}++;} elsif ($ami <0.0001) {$dist{2}++;} elsif ($ami <0.001) {$dist{3}++;} elsif ($ami <0.01) {$dist{4}++;} elsif ($ami <0.1) {$dist{5}++;} elsif ($ami <1) {$dist{6}++;} else {$dist{7}++; } #MAIN OUTPUT IS PRINTING HERE if ( $ami != 0 ) { $ami = &log2($ami); } print OUT $num_recs_read, "\t", $ami, "\n"; } else {print OUT $num_recs_read, "\t", "short seq \n"; $short_count++;} } print "\tDISTRIBUTION of AMI within seven intervals: 1= less than e5; 2