#!/usr/bin/perl # # indelDynamicMM.pl - Samuel Shepard - 1.2010 # Convert a nucleotide sequence to a binary representation based on purine-pyrimidine repeats. # Contact sammysheep@gmail.com # LICENSE: GPL version 3 or later # 1.22.10 - SS - Branched from "convertTrainTestMM.pl" # Reimplemented for dynamic indel testing. Takes new argument called "window". # Purine-pyrimidines are tested. # Get the input parameters. if ( scalar(@ARGV) < 5 ) { $message = "\nUsage:\n\tperl $0 [frame]\n"; die( $message ); } # process parameters if ( scalar(@ARGV) > 5 ) { $frameshift = $ARGV[5]; } else { $frameshift = 0; } # read samples open( ITRAIN, '<', "$ARGV[0]/training_intron.fa") or die("Cannot open $ARGV[0]/training_intron.fa\n"); open( ETRAIN, '<', "$ARGV[0]/training_exon.fa") or die("Cannot open $ARGV[0]/training_exon.fa\n"); open( ITEST, '<', "$ARGV[0]/test_intron.fa") or die("Cannot open $ARGV[0]/test_intron.fa\n"); open( ETEST, '<', "$ARGV[0]/test_exon.fa") or die("Cannot open $ARGV[0]/test_exon.fa\n"); if ( int($ARGV[2]) > 0 ) { $jump = $ARGV[2]; } else { die("Let JUMP be: int > 0\n"); } $mmOrder = $ARGV[3]; if ( int($ARGV[4]) > 0 ) { $repeatWindow = $ARGV[4]; } else { die("Let JUMP be: int > 0\n"); } # Initialize variables. @handles = ('ITRAIN','ETRAIN','ITEST','ETEST'); $/ = '>'; $i = 0; $itrain_start = $itrain_end = 0; $etrain_start = $etrain_end = 0; $itest_start = $itest_end = 0; $etest_start = $etest_end = 0; $initBits = $mmOrder; $transBits = $mmOrder + 1; $minimum_length = $repeatWindow * ($mmOrder+1); $remaining_length = 0; foreach $handle ( @handles ) { if ( $handle eq 'ITRAIN' ) { $itrain_start = $i; } elsif ( $handle eq 'ETRAIN' ) { $etrain_start = $i; } elsif ( $handle eq 'ITEST' ) { $itest_start = $i; } else { $etest_start = $i; } while( $record = <$handle> ) { @lines = split("\n", $record); $header = shift(@lines); chomp(@lines); $sequence = lc(join('',@lines)); $current_length = length($sequence); # if too short, skip it. $remaining_length = $current_length - $frameshift; if ( $remaining_length < $minimum_length ) { next; } else { $nt_count = ($sequence =~ tr/atgc//); if ( $current_length != $nt_count ) { next; } else { $sequences[$i] = substr($sequence, $frameshift, $remaining_length); $i++; } } } if ( $handle eq 'ITRAIN' ) { $itrain_end = $i; } elsif ( $handle eq 'ETRAIN' ) { $etrain_end = $i; } elsif ( $handle eq 'ITEST' ) { $itest_end = $i; } else { $etest_end = $i; } close($handle); } # Variables %initE = %initI = %transE = %transI = (); $patternsInit = 2**$initBits; $patternsTrans = 2**$transBits; open( OUT, '>', "$ARGV[1]") or die("Cannot open $ARGV[1].\n"); @bit_sequences = @scoresI = @scoresE = (); # initialize probabilities $template = '%0'.$initBits.'b'; for( $i = 0; $i < $patternsInit; $i++ ) { $key = sprintf($template, $i); $initE{$key} = $initI{$key} = 0; } $template = '%0'.$transBits.'b'; for( $i = 0; $i < $patternsTrans; $i++ ) { $key = sprintf($template, $i); $transE{$key} = $transI{$key} = 0; } $sumInitI = $sumInitE = $sumTransI = $sumTransE = 0; # convert sequences for ( $i = 0; $i < $etest_end; $i++ ) { $current_length = length($sequences[$i]); $bit_sequences[$i] = ''; for ( $pos = 0; $pos <= ($current_length - $repeatWindow); $pos += $jump ) { if ($pos>$current_length-$repeatWindow-$jump) {next;} $theWindow = substr($sequences[$i], $pos, $repeatWindow); $theNextWindow = substr($sequences[$i], $pos+$jump, $repeatWindow); $theWindow =~ s/c|t/y/g; $theWindow =~ s/a|g/r/g; $theNextWindow =~ s/c|t/y/g; $theNextWindow =~ s/a|g/r/g; if ( $theWindow eq $theNextWindow ) { $bit_sequences[$i] .= '1'; } else { $bit_sequences[$i] .= '0'; } } } # train data # INTRONS $totalI = 0; for( $i = $itrain_start; $i < $itrain_end; $i++ ) { $current_length = length($bit_sequences[$i]); for($pos = 0; $pos <= ($current_length-$initBits); $pos++) { $key = substr($bit_sequences[$i], $pos, $initBits); $initI{$key}++; $sumInitI++; } for($pos = 0; $pos <= ($current_length-$transBits); $pos++) { $key = substr($bit_sequences[$i], $pos, $transBits); $transI{$key}++; $sumTransI++; } } # EXONS $totalE = 0; for( $i = $etrain_start; $i < $etrain_end; $i++ ) { $current_length = length($bit_sequences[$i]); for($pos = 0; $pos <= ($current_length-$initBits); $pos++) { $key = substr($bit_sequences[$i], $pos, $initBits); $initE{$key}++; $sumInitE++; } for($pos = 0; $pos <= ($current_length-$transBits); $pos++) { $key = substr($bit_sequences[$i], $pos, $transBits); $transE{$key}++; $sumTransE++; } } # Take frequencies. foreach $key ( keys(%initE) ) { $initE{$key} /= $sumInitE; $initI{$key} /= $sumInitI; } $emptyProb = 0; $totalUses = 0; # test data for( $i = $itest_start; $i < $itest_end; $i++ ) { $current_length = length($bit_sequences[$i]); $which = $i - $itest_start; $scoresI[$which] = 0; $key = substr( $bit_sequences[$i], 0, $initBits); if ( $initI{$key} != 0 ) { $probI = log($initI{$key}); $totalUses++; } else { $emptyProb++; } if ( $initE{$key} ) { $probE = log($initE{$key}); $totalUses++; } else { $emptyProb++; } for($pos = 0; $pos <= ($current_length-$transBits); $pos++ ) { $key = substr( $bit_sequences[$i], $pos, $transBits); $key0 = substr( $key, 0, $initBits ) . '0'; $key1 = substr( $key, 0, $initBits ) . '1'; if( $transI{$key0} != 0 && $transI{$key1} != 0 ) { $totalUses++; $probI += log($transI{$key} / ($transI{$key0} + $transI{$key1} )); } else { $emptyProb++; } if( $transE{$key0} != 0 && $transE{$key1} != 0 ) { $totalUses++; $probE += log($transE{$key} / ($transE{$key0} + $transE{$key1} )); } else { $emptyProb++; } } $scoresI[$which] = $probE - $probI; } for( $i = $etest_start; $i < $etest_end; $i++ ) { $current_length = length($bit_sequences[$i]); $which = $i - $etest_start; $scoresE[$which] = 0; $key = substr( $bit_sequences[$i], 0, $initBits); if ( $initI{$key} != 0 ) { $totalUses++; $probI = log($initI{$key}); } else { $emptyProb++; } if ( $initE{$key} ) { $totalUses++; $probE = log($initE{$key}); } else { $emptyProb++; } for($pos = 0; $pos <= ($current_length-$transBits); $pos++ ) { $key = substr( $bit_sequences[$i], $pos, $transBits); $key0 = substr( $key, 0, $initBits ) . '0'; $key1 = substr( $key, 0, $initBits ) . '1'; if( $transI{$key0} != 0 && $transI{$key1} != 0 ) { $totalUses++; $probI += log($transI{$key} / ($transI{$key0} + $transI{$key1} )); } else { $emptyProb++; } if( $transE{$key0} != 0 && $transE{$key1} != 0 ) { $totalUses++; $probE += log($transE{$key} / ($transE{$key0} + $transE{$key1} )); } else { $emptyProb++; } } $scoresE[$which] = $probE - $probI; } #$exponent = log($totalUses+$emptyProb)/log(10); $possibleUses = $emptyProb + $totalUses; $knowledgeUsage = sprintf("%0.6f", ( $totalUses / $possibleUses ) ); print "$knowledgeUsage\t($emptyProb)\tW${repeatWindow}J$jump.\n"; print OUT "W${repeatWindow}J$jump\n",join(' ', @scoresI),"\n",join(' ', @scoresE),"\n\n"; close(OUT);